├── apps ├── miroflow-agent │ ├── src │ │ ├── __init__.py │ │ ├── io │ │ │ ├── __init__.py │ │ │ └── output_formatter.py │ │ ├── config │ │ │ └── __init__.py │ │ ├── core │ │ │ └── __init__.py │ │ ├── logging │ │ │ ├── __init__.py │ │ │ └── summary_time_cost.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── wrapper_utils.py │ │ └── llm │ │ │ ├── providers │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ ├── util.py │ │ │ └── factory.py │ ├── benchmarks │ │ ├── __init__.py │ │ ├── evaluators │ │ │ ├── __init__.py │ │ │ └── calculate_average_score.py │ │ └── check_progress │ │ │ ├── check_progress_hle.py │ │ │ ├── check_progress_frames.py │ │ │ ├── check_progress_seal-0.py │ │ │ ├── check_progress_aime2025.py │ │ │ ├── check_progress_browsecomp.py │ │ │ ├── check_progress_hle-text-500.py │ │ │ ├── check_progress_webwalkerqa.py │ │ │ ├── check_progress_browsecomp_zh.py │ │ │ ├── check_progress_hle-text-2158.py │ │ │ ├── check_progress_xbench_deepsearch.py │ │ │ ├── check_progress_gaia-validation.py │ │ │ └── check_progress_gaia-validation-text-103.py │ ├── conf │ │ ├── __init__.py │ │ ├── llm │ │ │ ├── gpt-5.yaml │ │ │ ├── claude-3-7.yaml │ │ │ ├── qwen-3.yaml │ │ │ └── default.yaml │ │ ├── benchmark │ │ │ ├── hle.yaml │ │ │ ├── debug.yaml │ │ │ ├── seal-0.yaml │ │ │ ├── frames.yaml │ │ │ ├── aime2025.yaml │ │ │ ├── futurex.yaml │ │ │ ├── browsecomp.yaml │ │ │ ├── collect_trace.yaml │ │ │ ├── hle-text-500.yaml │ │ │ ├── webwalkerqa.yaml │ │ │ ├── hle-text-2158.yaml │ │ │ ├── browsecomp_zh.yaml │ │ │ ├── gaia-validation.yaml │ │ │ ├── xbench_deepsearch.yaml │ │ │ ├── gaia-validation-text-103.yaml │ │ │ └── default.yaml │ │ ├── config.yaml │ │ └── agent │ │ │ ├── single_agent.yaml │ │ │ ├── single_agent_keep5.yaml │ │ │ ├── default.yaml │ │ │ ├── multi_agent.yaml │ │ │ └── multi_agent_os.yaml │ ├── README.md │ ├── .env.example │ ├── main.py │ ├── pyproject.toml │ └── scripts │ │ ├── run_evaluate_multiple_runs_hle.sh │ │ ├── run_evaluate_multiple_runs_debug.sh │ │ ├── run_evaluate_multiple_runs_frames.sh │ │ ├── run_evaluate_multiple_runs_seal-0.sh │ │ ├── run_evaluate_multiple_runs_aime2025.sh │ │ ├── run_evaluate_multiple_runs_browsecomp.sh │ │ ├── run_evaluate_multiple_runs_webwalkerqa.sh │ │ ├── run_evaluate_multiple_runs_browsecomp_zh.sh │ │ ├── run_evaluate_multiple_runs_hle-text-2158.sh │ │ ├── run_evaluate_multiple_runs_hle-text-500.sh │ │ ├── run_evaluate_multiple_runs_gaia-validation.sh │ │ ├── run_evaluate_multiple_runs_xbench_deepsearch.sh │ │ ├── run_evaluate_multiple_runs_gaia-validation-text-103.sh │ │ └── run_evaluate_multiple_runs_futurex.sh ├── visualize-trace │ ├── .python-version │ ├── requirements.txt │ ├── pyproject.toml │ ├── README.md │ └── run.py ├── collect-trace │ ├── pyproject.toml │ ├── utils │ │ ├── converters │ │ │ ├── __init__.py │ │ │ ├── system_prompts.py │ │ │ ├── example_usage.py │ │ │ └── convert_non_oai_to_chatml.py │ │ ├── merge_chatml_msgs_to_one_json.py │ │ └── process_logs.py │ ├── scripts │ │ ├── collect_trace_qwen3.sh │ │ ├── collect_trace_gpt41.sh │ │ ├── collect_trace_gpt5.sh │ │ └── collect_trace_claude37.sh │ └── README.md └── gradio-demo │ ├── pyproject.toml │ ├── .env.example │ └── utils.py ├── libs └── miroflow-tools │ ├── src │ ├── __init__.py │ └── miroflow_tools │ │ ├── mcp_servers │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── url_unquote.py │ │ ├── reasoning_mcp_server.py │ │ ├── vision_mcp_server.py │ │ ├── reasoning_mcp_server_os.py │ │ ├── browser_session.py │ │ ├── reading_mcp_server.py │ │ ├── vision_mcp_server_os.py 
│ │ └── serper_mcp_server.py │ │ ├── __init__.py │ │ └── dev_mcp_servers │ │ └── stateless_python_server.py │ └── pyproject.toml ├── assets ├── futurex-09-12.png ├── gaia_text_103.png ├── miro_thinker.png ├── miroflow_logo.png ├── miromind_wechat.png ├── MiroThinker_v1.0_Technical_Report.pdf ├── QA.md ├── qwen3_nonthinking.jinja ├── LOCAL-TOOL-DEPLOYMENT.md └── MiromindAI_H.svg ├── justfile ├── LICENSE ├── .github └── workflows │ └── run-ruff.yml └── .gitignore /apps/miroflow-agent/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/logging/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/visualize-trace/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/visualize-trace/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==2.3.3 2 | werkzeug==2.3.7 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # This file makes the conf directory a Python package 2 | -------------------------------------------------------------------------------- /assets/futurex-09-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/futurex-09-12.png 
-------------------------------------------------------------------------------- /assets/gaia_text_103.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/gaia_text_103.png -------------------------------------------------------------------------------- /assets/miro_thinker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/miro_thinker.png -------------------------------------------------------------------------------- /assets/miroflow_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/miroflow_logo.png -------------------------------------------------------------------------------- /assets/miromind_wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/miromind_wechat.png -------------------------------------------------------------------------------- /assets/MiroThinker_v1.0_Technical_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/MiroThinker_v1.0_Technical_Report.pdf -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from .manager import ToolManager 5 | 6 | __all__ = ["ToolManager"] 7 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/gpt-5.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/gpt-5.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | provider: "openai" 7 | model_name: "gpt-5-2025-08-07" 8 | base_url: https://api.openai.com/v1 9 | max_context_length: 65536 10 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/claude-3-7.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/claude-3-7.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | provider: "anthropic" 7 | model_name: "claude-3-7-sonnet-20250219" 8 | base_url: https://api.anthropic.com 9 | max_context_length: 65536 10 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .url_unquote import decode_http_urls_in_dict, safe_unquote, strip_markdown_links 2 | 3 | __all__ = [ 4 | "safe_unquote", 5 | "decode_http_urls_in_dict", 6 | "strip_markdown_links", 7 | ] 8 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/qwen-3.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/qwen-3.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | provider: "qwen" 7 | model_name: "qwen-3" 8 | base_url: "https://your-api.com/v1" 9 | max_context_length: 262144 10 | max_tokens: 16384 11 | top_p: 0.95 12 | repetition_penalty: 1.05 13 | 
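Each file under `conf/llm/` (such as `gpt-5.yaml`, `claude-3-7.yaml`, and `qwen-3.yaml` above) is a Hydra config group layered on top of `conf/llm/default.yaml`, and any field can also be overridden on the command line. A minimal sketch of selecting the Qwen config and overriding its endpoint at run time, following the pattern in the README and scripts elsewhere in this tree (the URL and key are placeholders, not working values):

```bash
# Run from apps/miroflow-agent: pick the qwen-3 LLM group and override fields inline.
uv run python main.py \
  llm=qwen-3 \
  llm.base_url=https://your-api.com/v1 \
  llm.api_key=YOUR_API_KEY \
  agent=single_agent_keep5 \
  benchmark=debug
```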
-------------------------------------------------------------------------------- /apps/visualize-trace/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "trace-dashboard" 3 | version = "1.0.0" 4 | description = "A web dashboard for analyzing trace JSON files" 5 | requires-python = ">=3.8" 6 | dependencies = [ 7 | "flask>=2.3.3", 8 | "werkzeug>=2.3.7", 9 | ] 10 | 11 | [tool.uv] 12 | dev-dependencies = [] -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from .anthropic_client import AnthropicClient 5 | from .openai_client import OpenAIClient 6 | 7 | __all__ = [ 8 | "AnthropicClient", 9 | "OpenAIClient", 10 | ] 11 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/hle.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/hle.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "hle" 7 | 8 | data: 9 | data_dir: "../../data/hle" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/debug.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/debug.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "debug" 7 | 8 | data: 9 | data_dir: "../../data/debug" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/seal-0.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/seal-0.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "seal-0" 7 | 8 | data: 9 | data_dir: "../../data/seal-0" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/frames.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/frames.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "frames" 7 | 8 | data: 9 | data_dir: "../../data/frames" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/aime2025.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/aime2025.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "aime2025" 7 | 8 | data: 9 | data_dir: "../../data/aime2025" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- 
/apps/miroflow-agent/conf/benchmark/futurex.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/futurex.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "futurex" 7 | 8 | data: 9 | data_dir: "../../data/futurex" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/browsecomp.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/browsecomp.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "browsecomp" 7 | 8 | data: 9 | data_dir: "../../data/browsecomp" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/collect_trace.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/collect_trace.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "collect_trace" 7 | 8 | data: 9 | data_dir: "../../data/debug" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/hle-text-500.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/hle-text-500.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "hle-text-500" 7 | 8 | data: 9 | data_dir: "../../data/hle-text-500" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/webwalkerqa.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/webwalkerqa.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "webwalkerqa" 7 | 8 | data: 9 | data_dir: "../../data/webwalkerqa" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/hle-text-2158.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/hle-text-2158.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "hle-text-2158" 7 | 8 | data: 9 | data_dir: "../../data/hle-text-2158" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/browsecomp_zh.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/browsecomp_zh.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "browsecomp_zh" 7 | 8 | data: 9 | data_dir: "../../data/browsecomp_zh" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | 
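The benchmark configs above all follow the same shape: a `name`, a `data.data_dir`, and an `execution` block inherited from `conf/benchmark/default.yaml`. As the collect-trace scripts later in this listing show, a benchmark run is selected and tuned entirely through Hydra overrides; a sketch with illustrative values (the concurrency and model choices here are examples, not required settings):

```bash
# Run from apps/miroflow-agent: evaluate browsecomp_zh with custom execution settings.
uv run python benchmarks/common_benchmark.py \
  benchmark=browsecomp_zh \
  benchmark.execution.max_concurrent=10 \
  benchmark.execution.pass_at_k=1 \
  llm=claude-3-7 \
  agent=multi_agent
```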
-------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/gaia-validation.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/gaia-validation.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "gaia-validation" 7 | 8 | data: 9 | data_dir: "../../data/gaia-2023-validation" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/xbench_deepsearch.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/xbench_deepsearch.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "xbench_deepsearch" 7 | 8 | data: 9 | data_dir: "../../data/xbench_deepsearch" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # conf/config.yaml 2 | defaults: 3 | - llm: default 4 | - agent: default 5 | - benchmark: default 6 | - _self_ # Allows variables to be defined at the top of this file 7 | 8 | hydra: 9 | run: 10 | dir: ../../logs/debug 11 | 12 | # You can define some top-level or default parameters here 13 | project_name: "miroflow-agent" 14 | debug_dir: "../../logs/debug" 15 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/default.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/default.yaml - Default LLM configuration 2 | provider: "anthropic" # openai, anthropic, qwen 3 | model_name: "claude-3-7-sonnet-20250219" 4 | async_client: false 5 | temperature: 0.3 6 | top_p: 1.0 7 | min_p: 0.0 8 | top_k: -1 9 | max_tokens: 4096 10 | api_key: "" 11 | base_url: https://api.anthropic.com 12 | keep_tool_result: -1 13 | repetition_penalty: 1.0 14 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/gaia-validation-text-103.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/gaia-validation-text-103.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "gaia-validation-text-103" 7 | 8 | data: 9 | data_dir: "../../data/gaia-2023-validation-text-103" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | from .base_client import BaseClient 5 | from .factory import ClientFactory 6 | from .providers import ( 7 | AnthropicClient, 8 | OpenAIClient, 9 | ) 10 | 11 | __all__ = [ 12 | "BaseClient", 13 | "ClientFactory", 14 | "AnthropicClient", 15 | "OpenAIClient", 16 | ] 17 | -------------------------------------------------------------------------------- /apps/collect-trace/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "collect-trace" 3 | version = "0.1.0" 4 | description = "Executes a user-defined agent loop for capturing multi-turn interaction traces" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | authors = [{ name = "MiroMind Team", email = "service@miromind.ai" }] 8 | dependencies = [ 9 | "miroflow-tools>=0.1.0", 10 | "dotenv>=0.9.9", 11 | "openai>=1.90.0", 12 | ] 13 | 14 | [tool.uv.sources] 15 | miroflow-tools = { path = "../../libs/miroflow-tools", editable = true } 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/single_agent.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/single_agent.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - search_and_scrape_webpage 11 | - jina_scrape_llm_summary 12 | - tool-python 13 | max_turns: 600 # Maximum number of turns for main agent execution 14 | 15 | sub_agents: 16 | 17 | keep_tool_result: -1 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/single_agent_keep5.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/single_agent_keep5.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - search_and_scrape_webpage 11 | - jina_scrape_llm_summary 12 | - tool-python 13 | max_turns: 600 # Maximum number of turns for main agent execution 14 | 15 | sub_agents: 16 | 17 | keep_tool_result: 5 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/default.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/default.yaml - Default benchmark configuration 2 | # This is a base configuration for benchmarks. Specific benchmarks can override these defaults. 
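# The field_mapping block below appears to map keys in standardized_data.jsonl
# (task id, question, ground truth, optional attached file) onto the fields the
# benchmark loader expects; none of the benchmark-specific configs in this
# directory override it.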
3 | name: "default" 4 | 5 | data: 6 | metadata_file: "standardized_data.jsonl" 7 | field_mapping: 8 | task_id_field: "task_id" 9 | task_question_field: "task_question" 10 | ground_truth_field: "ground_truth" 11 | file_name_field: "file_name" 12 | 13 | execution: 14 | max_tasks: null # null means no limit 15 | max_concurrent: 5 16 | pass_at_k: 1 17 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/default.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/default.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | main_agent: 5 | tools: 6 | - tool-python 7 | - tool-vqa 8 | - tool-transcribe 9 | - tool-reasoning 10 | - tool-reader 11 | max_turns: 20 # Maximum number of turns for main agent execution 12 | 13 | sub_agents: 14 | agent-browsing: 15 | tools: 16 | - tool-google-search 17 | - tool-vqa 18 | - tool-reader 19 | - tool-python 20 | max_turns: 20 21 | 22 | keep_tool_result: -1 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/multi_agent.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/multi_agent.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - tool-python 11 | - tool-vqa 12 | - tool-transcribe 13 | - tool-reasoning 14 | - tool-reader 15 | max_turns: 50 # Maximum number of turns for main agent execution 16 | 17 | sub_agents: 18 | agent-browsing: 19 | tools: 20 | - tool-google-search 21 | - tool-vqa 22 | - tool-reader 23 | - tool-python 24 | max_turns: 50 25 | 26 | keep_tool_result: -1 27 | 28 | -------------------------------------------------------------------------------- /apps/gradio-demo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "gradio-demo" 3 | version = "0.1.0" 4 | description = "Gradio Demo" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "pydantic>=2.10.0", 9 | "python-dotenv>=1.0.0", 10 | "hydra-core>=1.3.0", 11 | "miroflow-agent", 12 | "aiohttp>=3.12.15", 13 | "gradio>=5.42.0", 14 | ] 15 | 16 | [build-system] 17 | requires = ["hatchling"] 18 | build-backend = "hatchling.build" 19 | 20 | [tool.hatch.build.targets.wheel] 21 | packages = ["./"] 22 | 23 | [tool.uv.sources] 24 | miroflow-agent = { path = "../miroflow-agent", editable = true } 25 | 26 | [dependency-groups] 27 | dev = [ 28 | "pytest>=8.4.1", 29 | "pytest-asyncio>=1.0.0", 30 | "httpx>=0.28.1", 31 | ] 32 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/multi_agent_os.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/multi_agent_os.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - tool-python 11 | - tool-vqa-os 12 | - tool-transcribe-os 13 | - tool-reasoning-os 14 | - 
tool-reader 15 | max_turns: 50 # Maximum number of turns for main agent execution 16 | 17 | sub_agents: 18 | agent-browsing: 19 | tools: 20 | - tool-google-search 21 | - tool-vqa-os 22 | - tool-reader 23 | - tool-python 24 | max_turns: 50 25 | 26 | keep_tool_result: -1 27 | 28 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import asyncio 5 | import functools 6 | from typing import Awaitable, Callable, TypeVar 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | def with_timeout( 12 | timeout_s: float = 300.0, 13 | ) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: 14 | """ 15 | Decorator: wraps any *async* function in asyncio.wait_for(). 16 | Usage: 17 | @with_timeout(20) 18 | async def create_message_foo(...): ... 19 | """ 20 | 21 | def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: 22 | @functools.wraps(func) 23 | async def wrapper(*args, **kwargs) -> T: 24 | return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_s) 25 | 26 | return wrapper 27 | 28 | return decorator 29 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/utils/wrapper_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | 4 | class ErrorBox: 5 | def __init__(self, error_msg: str): 6 | self.error_msg = error_msg 7 | 8 | def __str__(self): 9 | return self.error_msg 10 | 11 | @staticmethod 12 | def is_error_box(something): 13 | return isinstance(something, ErrorBox) 14 | 15 | 16 | class ResponseBox: 17 | def __init__(self, response: Any, extra_info: dict = None): 18 | self.response = response 19 | self.extra_info = extra_info 20 | 21 | def __str__(self): 22 | return self.response 23 | 24 | @staticmethod 25 | def is_response_box(something): 26 | return isinstance(something, ResponseBox) 27 | 28 | def has_extra_info(self): 29 | return self.extra_info is not None 30 | 31 | def get_extra_info(self): 32 | return self.extra_info 33 | 34 | def get_response(self): 35 | return self.response 36 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | default: 2 | just --list 3 | 4 | # lint monorepo 5 | [group('precommit')] 6 | lint: 7 | uv tool run ruff@0.8.0 check --fix . 8 | 9 | # sort imports 10 | [group('precommit')] 11 | sort-imports: 12 | uv tool run ruff@0.8.0 check --select I --fix . 13 | 14 | # format monorepo 15 | [group('precommit')] 16 | format: 17 | uv tool run ruff@0.8.0 format . 18 | 19 | # check license 20 | [group('precommit')] 21 | check-license: 22 | uv run reuse lint 23 | 24 | # insert license for contributor 25 | insert-license: 26 | # https://reuse.readthedocs.io/en/stable/scripts.html#add-headers-to-staged-files-based-on-git-settings 27 | git diff --name-only --cached | xargs -I {} reuse annotate -c "$(git config --get user.name) <$(git config --get user.email)>" "{}" 28 | 29 | # format markdown files 30 | [group('precommit')] 31 | format-md: 32 | find . 
-name "*.md" -type f | xargs uv tool run mdformat@0.7.17 33 | 34 | # run precommit before PR 35 | [group('precommit')] 36 | precommit: lint sort-imports format-md format 37 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from .convert_non_oai_to_chatml import ( 5 | convert_to_json_chatml, 6 | extract_and_save_chat_history, 7 | ) 8 | from .convert_oai_to_chatml import ( 9 | extract_message_history_from_log, 10 | oai_tool_message_to_chat_message, 11 | process_log_file, 12 | save_chatml_to_files, 13 | ) 14 | from .convert_to_chatml_auto_batch import ( 15 | batch_process_files, 16 | determine_conversion_method, 17 | get_llm_provider, 18 | process_single_file, 19 | ) 20 | 21 | __all__ = [ 22 | # OAI conversion functions 23 | "oai_tool_message_to_chat_message", 24 | "extract_message_history_from_log", 25 | "save_chatml_to_files", 26 | "process_log_file", 27 | # Non-OAI conversion functions 28 | "convert_to_json_chatml", 29 | "extract_and_save_chat_history", 30 | # Auto batch conversion functions 31 | "get_llm_provider", 32 | "determine_conversion_method", 33 | "process_single_file", 34 | "batch_process_files", 35 | ] 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 MiroMind 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/run-ruff.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | 7 | jobs: 8 | lint: 9 | if: github.repository_owner == 'MiroMindAI' 10 | name: lint pull request 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: checkout code 14 | uses: actions/checkout@v4 15 | 16 | - name: Install uv 17 | uses: astral-sh/setup-uv@v5 18 | 19 | - name: Check static error 20 | run: | 21 | uv tool run ruff@0.8.0 check --show-fixes --output-format=github 22 | 23 | - name: Reformat code style 24 | run: | 25 | echo '## Reformat summary' >> $GITHUB_STEP_SUMMARY 26 | if diff_output="$(uv tool run ruff@0.8.0 format --diff 2>&1)"; then 27 | echo "$diff_output" 28 | echo '✅ Format check passed.' >> "$GITHUB_STEP_SUMMARY" 29 | else 30 | echo "$diff_output" 31 | echo '❌ Format issues detected.' >> "$GITHUB_STEP_SUMMARY" 32 | { 33 | echo '```diff' 34 | echo "$diff_output" 35 | echo '```' 36 | } >> "$GITHUB_STEP_SUMMARY" 37 | exit 1 38 | fi -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from typing import Optional 5 | 6 | from omegaconf import DictConfig, OmegaConf 7 | 8 | from ..logging.task_logger import TaskLog 9 | from .providers.anthropic_client import AnthropicClient 10 | from .providers.openai_client import OpenAIClient 11 | 12 | 13 | def ClientFactory( 14 | task_id: str, cfg: DictConfig, task_log: Optional[TaskLog] = None, **kwargs 15 | ): 16 | """ 17 | Automatically select provider and create LLM client based on configuration 18 | """ 19 | provider = cfg.llm.provider 20 | config = OmegaConf.merge(cfg, kwargs) 21 | 22 | client_creators = { 23 | "anthropic": lambda: AnthropicClient( 24 | task_id=task_id, task_log=task_log, cfg=config 25 | ), 26 | "qwen": lambda: OpenAIClient(task_id=task_id, task_log=task_log, cfg=config), 27 | "openai": lambda: OpenAIClient(task_id=task_id, task_log=task_log, cfg=config), 28 | } 29 | 30 | factory = client_creators.get(provider) 31 | if not factory: 32 | raise ValueError(f"Unsupported provider: {provider}") 33 | 34 | return factory() 35 | -------------------------------------------------------------------------------- /apps/visualize-trace/README.md: -------------------------------------------------------------------------------- 1 | # Trace Analysis Web Demo 2 | 3 | An interactive web interface for analyzing and visualizing trace JSON files. 4 | 5 | ## Installation and Running 6 | 7 | ### Method 1: Using Python (Recommended) 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | python run.py 12 | ``` 13 | 14 | The startup script will automatically check and install dependencies, then start the web application. Visit `http://127.0.0.1:5000` 15 | 16 | ### Method 2: Using uv 17 | 18 | ```bash 19 | uv run run.py 20 | ``` 21 | 22 | ## Usage 23 | 24 | 1. **Start the application**: After running, visit `http://127.0.0.1:5000` in your browser 25 | 26 | 1. **Load files**: 27 | 28 | - Select the trace JSON file to analyze from the dropdown menu in the top navigation bar 29 | - Click the "Load" button to load the file 30 | 31 | 1. 
**View analysis results**: 32 | 33 | - **Left panel**: Shows basic information, execution summary, and performance statistics 34 | - **Right panel**: Displays detailed execution flow 35 | - **Bottom panel**: Shows spans statistics and step logs statistics 36 | 37 | 1. **Interactive operations**: 38 | 39 | - Click on execution steps to expand/collapse detailed information 40 | - Use "Expand All"/"Collapse All" buttons to control all steps 41 | - Click "View Details" button to see complete message content 42 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_qwen3.sh: -------------------------------------------------------------------------------- 1 | # Get the directory where the current script is located 2 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 3 | echo "Current script directory: $SCRIPT_DIR" 4 | 5 | 6 | # Enter the apps/miroflow-agent directory 7 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 8 | echo "Target directory: $TARGET_DIR" 9 | cd $TARGET_DIR 10 | 11 | mkdir -p ../../logs 12 | LOG_DIR="../../logs/collect_trace_qwen3" 13 | echo "Log directory: $LOG_DIR" 14 | mkdir -p $LOG_DIR 15 | 16 | # Collect traces 17 | uv run python benchmarks/common_benchmark.py \ 18 | benchmark=collect_trace \ 19 | benchmark.data.data_dir="../../data/debug" \ 20 | benchmark.data.metadata_file="standardized_data.jsonl" \ 21 | llm=qwen-3 \ 22 | llm.provider=qwen \ 23 | llm.model_name=qwen-3-32b \ 24 | llm.api_key="" \ 25 | llm.base_url=https://your-api.com/v1 \ 26 | llm.async_client=true \ 27 | benchmark.execution.max_tasks=null \ 28 | benchmark.execution.max_concurrent=10 \ 29 | benchmark.execution.pass_at_k=1 \ 30 | agent=single_agent \ 31 | hydra.run.dir=$LOG_DIR \ 32 | 2>&1 | tee "$LOG_DIR/output.log" 33 | 34 | # Enter the apps/collect-trace directory 35 | TARGET_DIR="$SCRIPT_DIR/../" 36 | echo "Target directory: $TARGET_DIR" 37 | cd $TARGET_DIR 38 | 39 | # Process traces 40 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 41 | 42 | 43 | -------------------------------------------------------------------------------- /apps/miroflow-agent/README.md: -------------------------------------------------------------------------------- 1 | # MiroFlow Agent 2 | 3 | ## Quick Start 4 | 5 | The simplest way to run a case is using the default command: 6 | 7 | ```bash 8 | # Run Claude-3.7-Sonnet with single-agent configuration 9 | uv run python main.py llm=claude-3-7 agent=single_agent_keep5 benchmark=debug 10 | 11 | # Run GPT-5 with single-agent configuration 12 | uv run python main.py llm=gpt-5 agent=single_agent_keep5 benchmark=debug 13 | 14 | # Use a different benchmark configuration 15 | uv run python main.py llm=qwen-3 agent=single_agent_keep5 benchmark=debug llm.base_url= 16 | ``` 17 | 18 | This will execute the default task: "What is the title of today's arxiv paper in computer science?" 19 | 20 | ## Available Configurations 21 | 22 | - **LLM Models**: `claude-3-7`, `gpt-5`, `qwen-3` 23 | - **Agent Configs**: `single_agent`, `single_agent_keep5`, `multi_agent`, `multi_agent_os` 24 | - **Benchmark Configs**: `debug`, `browsecomp`, `frames`, etc. 25 | 26 | ### Customizing the Task 27 | 28 | To change the task description, you need to modify the `main.py` file directly: 29 | 30 | ```python 31 | # In main.py, change line 43: 32 | task_description = "Your custom task here" 33 | ``` 34 | 35 | ### Output 36 | 37 | The agent will: 38 | 39 | 1. Execute the task using available tools 40 | 1. 
Generate a final summary and boxed answer 41 | 1. Save logs to `../../logs/debug/` directory 42 | 1. Display the results in the terminal 43 | 44 | ### Troubleshooting 45 | 46 | - Make sure your API keys are set correctly 47 | - Check the logs in the `logs/debug/` directory for detailed execution information 48 | - Ensure all dependencies are installed with `uv sync` 49 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_gpt41.sh: -------------------------------------------------------------------------------- 1 | # Check if OPENAI_API_KEY is set 2 | if [ -z "$OPENAI_API_KEY" ]; then 3 | echo "Error: OPENAI_API_KEY is not set." 4 | exit 1 5 | else 6 | echo "OPENAI_API_KEY detected." 7 | fi 8 | 9 | # Get the directory where the current script is located 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | echo "Current script directory: $SCRIPT_DIR" 12 | 13 | 14 | # Enter the apps/miroflow-agent directory 15 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 16 | echo "Target directory: $TARGET_DIR" 17 | cd $TARGET_DIR 18 | 19 | mkdir -p ../../logs 20 | LOG_DIR="../../logs/collect_trace_gpt41" 21 | echo "Log directory: $LOG_DIR" 22 | mkdir -p $LOG_DIR 23 | 24 | # Collect traces 25 | uv run python benchmarks/common_benchmark.py \ 26 | benchmark=collect_trace \ 27 | benchmark.data.data_dir="../../data/debug" \ 28 | benchmark.data.metadata_file="standardized_data.jsonl" \ 29 | llm=gpt-5 \ 30 | llm.provider=openai \ 31 | llm.model_name=gpt-4.1-mini \ 32 | llm.api_key="$OPENAI_API_KEY" \ 33 | llm.base_url=https://api.openai.com/v1 \ 34 | llm.async_client=true \ 35 | benchmark.execution.max_tasks=null \ 36 | benchmark.execution.max_concurrent=10 \ 37 | benchmark.execution.pass_at_k=1 \ 38 | agent=single_agent \ 39 | hydra.run.dir=$LOG_DIR \ 40 | 2>&1 | tee "$LOG_DIR/output.log" 41 | 42 | # Enter the apps/collect-trace directory 43 | TARGET_DIR="$SCRIPT_DIR/../" 44 | echo "Target directory: $TARGET_DIR" 45 | cd $TARGET_DIR 46 | 47 | # Process traces 48 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 49 | 50 | 51 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_gpt5.sh: -------------------------------------------------------------------------------- 1 | # Check if OPENAI_API_KEY is set 2 | if [ -z "$OPENAI_API_KEY" ]; then 3 | echo "Error: OPENAI_API_KEY is not set." 4 | exit 1 5 | else 6 | echo "OPENAI_API_KEY detected." 
7 | fi 8 | 9 | # Get the directory where the current script is located 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | echo "Current script directory: $SCRIPT_DIR" 12 | 13 | 14 | # Enter the apps/miroflow-agent directory 15 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 16 | echo "Target directory: $TARGET_DIR" 17 | cd $TARGET_DIR 18 | 19 | mkdir -p ../../logs 20 | LOG_DIR="../../logs/collect_trace_gpt5" 21 | echo "Log directory: $LOG_DIR" 22 | mkdir -p $LOG_DIR 23 | 24 | # Collect traces 25 | uv run python benchmarks/common_benchmark.py \ 26 | benchmark=collect_trace \ 27 | benchmark.data.data_dir="../../data/debug" \ 28 | benchmark.data.metadata_file="standardized_data.jsonl" \ 29 | llm=gpt-5 \ 30 | llm.provider=openai \ 31 | llm.model_name=gpt-5-2025-08-07 \ 32 | llm.api_key="$OPENAI_API_KEY" \ 33 | llm.base_url=https://api.openai.com/v1 \ 34 | llm.async_client=true \ 35 | benchmark.execution.max_tasks=null \ 36 | benchmark.execution.max_concurrent=10 \ 37 | benchmark.execution.pass_at_k=1 \ 38 | agent=single_agent \ 39 | hydra.run.dir=$LOG_DIR \ 40 | 2>&1 | tee "$LOG_DIR/output.log" 41 | 42 | # Enter the apps/collect-trace directory 43 | TARGET_DIR="$SCRIPT_DIR/../" 44 | echo "Target directory: $TARGET_DIR" 45 | cd $TARGET_DIR 46 | 47 | # Process traces 48 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 49 | 50 | 51 | -------------------------------------------------------------------------------- /apps/gradio-demo/.env.example: -------------------------------------------------------------------------------- 1 | # API for Google Search (recommend) 2 | SERPER_API_KEY=your_serper_key 3 | SERPER_BASE_URL="https://google.serper.dev" 4 | 5 | # API for Web Scraping (recommend) 6 | JINA_API_KEY=your_jina_key 7 | JINA_BASE_URL="https://r.jina.ai" 8 | 9 | # API for Linux Sandbox (recommend) 10 | E2B_API_KEY=your_e2b_key 11 | 12 | # API for LLM-as-Judge (for benchmark testing) 13 | OPENAI_API_KEY=your_openai_key 14 | OPENAI_BASE_URL=https://api.openai.com/v1 15 | 16 | # API for Open-Source Audio Transcription Tool (for benchmark testing) 17 | WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" 18 | WHISPER_API_KEY=your_whisper_key 19 | WHISPER_BASE_URL="https://your_whisper_base_url/v1" 20 | 21 | # API for Open-Source VQA Tool (for benchmark testing) 22 | VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" 23 | VISION_API_KEY=your_vision_key 24 | VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions" 25 | 26 | # API for Open-Source Reasoning Tool (for benchmark testing) 27 | REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" 28 | REASONING_API_KEY=your_reasoning_key 29 | REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions" 30 | 31 | # API for Claude Sonnet 3.7 as Commercial Tools (optional) 32 | ANTHROPIC_API_KEY=your_anthropic_key 33 | ANTHROPIC_BASE_URL=https://api.anthropic.com 34 | 35 | # API for Sougou Search (optional) 36 | TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id 37 | TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key 38 | 39 | # API for Summary LLM (optional) 40 | SUMMARY_LLM_BASE_URL=your_summary_llm_base_url 41 | SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name 42 | SUMMARY_LLM_API_KEY=your_summary_llm_api_key -------------------------------------------------------------------------------- /apps/gradio-demo/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def contains_chinese(text): 5 | """ 6 | Detect if a string 
contains Chinese characters or Chinese punctuation 7 | 8 | Args: 9 | text (str): The string to detect 10 | 11 | Returns: 12 | bool: True if contains Chinese characters or punctuation, False otherwise 13 | """ 14 | # Chinese character Unicode ranges: 15 | # \u4e00-\u9fff: CJK Unified Ideographs 16 | # \u3400-\u4dbf: CJK Extension A 17 | # \uf900-\ufaff: CJK Compatibility Ideographs 18 | # \u3000-\u303f: CJK Symbols and Punctuation 19 | # \uff00-\uffef: Fullwidth ASCII, Fullwidth punctuation 20 | chinese_pattern = re.compile( 21 | r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3000-\u303f\uff00-\uffef]" 22 | ) 23 | return bool(chinese_pattern.search(text)) 24 | 25 | 26 | def replace_chinese_punctuation(text): 27 | # Handle single-character replacements with translate 28 | punctuation_map = str.maketrans( 29 | { 30 | ",": ",", 31 | "。": ".", 32 | "!": "!", 33 | "?": "?", 34 | ";": ";", 35 | ":": ":", 36 | "“": '"', 37 | "”": '"', 38 | "‘": "'", 39 | "’": "'", 40 | "(": "(", 41 | ")": ")", 42 | "【": "[", 43 | "】": "]", 44 | "《": "<", 45 | "》": ">", 46 | "、": ",", 47 | "—": "-", 48 | } 49 | ) 50 | # First, replace multi-character punctuation 51 | text = text.replace("……", "...") 52 | # Then apply single-character replacements 53 | return text.translate(punctuation_map) 54 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_claude37.sh: -------------------------------------------------------------------------------- 1 | # Check if ANTHROPIC_API_KEY is set 2 | if [ -z "$ANTHROPIC_API_KEY" ]; then 3 | echo "Error: ANTHROPIC_API_KEY is not set." 4 | exit 1 5 | else 6 | echo "ANTHROPIC_API_KEY detected." 7 | fi 8 | 9 | # Get the directory where the current script is located 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | echo "Current script directory: $SCRIPT_DIR" 12 | 13 | 14 | # Enter the apps/miroflow-agent directory 15 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 16 | echo "Target directory: $TARGET_DIR" 17 | cd $TARGET_DIR 18 | 19 | mkdir -p ../../logs 20 | LOG_DIR="../../logs/collect_trace_claude37" 21 | echo "Log directory: $LOG_DIR" 22 | mkdir -p $LOG_DIR 23 | 24 | # Collect traces 25 | uv run python benchmarks/common_benchmark.py \ 26 | benchmark=collect_trace \ 27 | benchmark.data.data_dir="../../data/debug" \ 28 | benchmark.data.metadata_file="standardized_data.jsonl" \ 29 | llm=claude-3-7 \ 30 | llm.provider=anthropic \ 31 | llm.model_name=claude-3-7-sonnet-20250219 \ 32 | llm.api_key="$ANTHROPIC_API_KEY" \ 33 | llm.base_url=https://api.anthropic.com \ 34 | llm.async_client=true \ 35 | benchmark.execution.max_tasks=null \ 36 | benchmark.execution.max_concurrent=10 \ 37 | benchmark.execution.pass_at_k=1 \ 38 | agent=single_agent \ 39 | hydra.run.dir=$LOG_DIR \ 40 | 2>&1 | tee "$LOG_DIR/output.log" 41 | 42 | # Enter the apps/collect-trace directory 43 | TARGET_DIR="$SCRIPT_DIR/../" 44 | echo "Target directory: $TARGET_DIR" 45 | cd $TARGET_DIR 46 | 47 | # Process traces 48 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 49 | 50 | 51 | -------------------------------------------------------------------------------- /apps/miroflow-agent/.env.example: -------------------------------------------------------------------------------- 1 | # API for Google Search (recommend) 2 | SERPER_API_KEY=your_serper_key 3 | SERPER_BASE_URL="https://google.serper.dev" 4 | 5 | # API for Web Scraping (recommend) 6 | JINA_API_KEY=your_jina_key 7 | JINA_BASE_URL="https://r.jina.ai" 8 | 
9 | # API for Linux Sandbox (recommend) 10 | E2B_API_KEY=your_e2b_key 11 | 12 | # API for LLM-as-Judge (for benchmark testing) 13 | OPENAI_API_KEY=your_openai_key 14 | OPENAI_BASE_URL=https://api.openai.com/v1 15 | 16 | # API for Open-Source Audio Transcription Tool (for benchmark testing) 17 | WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" 18 | WHISPER_API_KEY=your_whisper_key 19 | WHISPER_BASE_URL="https://your_whisper_base_url/v1" 20 | 21 | # API for Open-Source VQA Tool (for benchmark testing) 22 | VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" 23 | VISION_API_KEY=your_vision_key 24 | VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions" 25 | 26 | # API for Open-Source Reasoning Tool (for benchmark testing) 27 | REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" 28 | REASONING_API_KEY=your_reasoning_key 29 | REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions" 30 | 31 | # API for Claude Sonnet 3.7 as Commercial Tools (optional) 32 | ANTHROPIC_API_KEY=your_anthropic_key 33 | ANTHROPIC_BASE_URL=https://api.anthropic.com 34 | 35 | # API for Sougou Search (optional) 36 | TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id 37 | TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key 38 | 39 | # API for Summary LLM (optional) 40 | SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions" 41 | SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name 42 | SUMMARY_LLM_API_KEY=your_summary_llm_api_key -------------------------------------------------------------------------------- /apps/miroflow-agent/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import asyncio 5 | 6 | import hydra 7 | from omegaconf import DictConfig, OmegaConf 8 | 9 | # Import from the new modular structure 10 | from src.core.pipeline import ( 11 | create_pipeline_components, 12 | execute_task_pipeline, 13 | ) 14 | from src.logging.task_logger import bootstrap_logger 15 | 16 | # Configure logger and get the configured instance 17 | logger = bootstrap_logger() 18 | 19 | 20 | async def amain(cfg: DictConfig) -> None: 21 | """Asynchronous main function.""" 22 | 23 | logger.info(OmegaConf.to_yaml(cfg)) 24 | 25 | # Create pipeline components using the factory function 26 | main_agent_tool_manager, sub_agent_tool_managers, output_formatter = ( 27 | create_pipeline_components(cfg) 28 | ) 29 | 30 | # Define task parameters 31 | task_id = "task_example" 32 | task_description = "What is the title of today's arxiv paper in computer science?" 
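    # As the README's "Customizing the Task" section notes, edit this string to
    # change the task the agent runs, e.g.:
    # task_description = "Your custom task here"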
33 | task_file_name = "" 34 | 35 | # Execute task using the pipeline 36 | final_summary, final_boxed_answer, log_file_path = await execute_task_pipeline( 37 | cfg=cfg, 38 | task_id=task_id, 39 | task_file_name=task_file_name, 40 | task_description=task_description, 41 | main_agent_tool_manager=main_agent_tool_manager, 42 | sub_agent_tool_managers=sub_agent_tool_managers, 43 | output_formatter=output_formatter, 44 | log_dir=cfg.debug_dir, 45 | ) 46 | 47 | 48 | @hydra.main(config_path="conf", config_name="config", version_base=None) 49 | def main(cfg: DictConfig) -> None: 50 | asyncio.run(amain(cfg)) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /libs/miroflow-tools/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "miroflow-tools" 3 | version = "0.1.0" 4 | description = "Tool management and MCP server utilities for MiroFlow" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "MiroMind Team", email = "service@miromind.ai" } 8 | ] 9 | requires-python = ">=3.12" 10 | dependencies = [ 11 | "mcp>=1.0.0", 12 | "fastmcp>=0.1.0", 13 | "playwright>=1.40.0", 14 | "requests>=2.32.0", 15 | "e2b-code-interpreter==1.2.1", 16 | "wikipedia", 17 | "mutagen", 18 | "markitdown-mcp>=0.0.1a3", 19 | "google-genai", 20 | "aiohttp", 21 | "redis" 22 | ] 23 | 24 | [build-system] 25 | requires = ["hatchling"] 26 | build-backend = "hatchling.build" 27 | 28 | [tool.hatch.build.targets.wheel] 29 | packages = ["src/miroflow_tools"] 30 | 31 | [dependency-groups] 32 | dev = [ 33 | "pytest>=8.4.1", 34 | "pytest-asyncio>=1.0.0", 35 | "pytest-cov>=6.2.1", 36 | "pytest-html>=4.1.1", 37 | "pytest-xdist>=3.7.0", 38 | "pytest-mock>=3.10.0", 39 | "pytest-timeout>=2.1.0", 40 | "inline-snapshot>=0.23.2", 41 | ] 42 | 43 | [tool.pytest.ini_options] 44 | minversion = "8.3.5" 45 | testpaths = ["src/test"] 46 | asyncio_default_fixture_loop_scope = "function" 47 | addopts = [ 48 | "-rA", 49 | "--show-capture=stderr", 50 | "-n=auto", 51 | "--html=report.html", 52 | "--self-contained-html", 53 | "--cov=miroflow_tools", 54 | "--cov-report=html", 55 | "--strict-markers", 56 | "-v", 57 | ] 58 | markers = [ 59 | "integration: marks tests as integration tests (may be slow)", 60 | "unit: marks tests as unit tests", 61 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 62 | "requires_api_key: marks tests that require real API credentials", 63 | ] -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_hle.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "hle" 12 | BENCHMARK_NAME_STD = "HLE-2500" 13 | TASKS_PER_RUN = 2500 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_frames.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "frames" 12 | BENCHMARK_NAME_STD = "Frames" 13 | TASKS_PER_RUN = 824 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_seal-0.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "seal-0" 12 | BENCHMARK_NAME_STD = "SEAL-0" 13 | TASKS_PER_RUN = 111 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_aime2025.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "aime2025" 12 | BENCHMARK_NAME_STD = "AIME2025" 13 | TASKS_PER_RUN = 30 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "browsecomp" 12 | BENCHMARK_NAME_STD = "BrowseComp-EN" 13 | TASKS_PER_RUN = 1265 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-500.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "hle-text-500" 12 | BENCHMARK_NAME_STD = "HLE-Text-500" 13 | TASKS_PER_RUN = 500 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_webwalkerqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "webwalkerqa" 12 | BENCHMARK_NAME_STD = "WebWalkerQA" 13 | TASKS_PER_RUN = 680 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_task_id_(\d+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp_zh.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "browsecomp_zh" 12 | BENCHMARK_NAME_STD = "BrowseComp-ZH" 13 | TASKS_PER_RUN = 289 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-2158.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "hle-text-2158" 12 | BENCHMARK_NAME_STD = "HLE-Text-2158" 13 | TASKS_PER_RUN = 2158 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_xbench_deepsearch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "xbench_deepsearch" 12 | BENCHMARK_NAME_STD = "XBench-DeepSearch" 13 | TASKS_PER_RUN = 100 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import GAIAProgressChecker as ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "gaia-2023-validation" 12 | BENCHMARK_NAME_STD = "GAIA-Val-165" 13 | TASKS_PER_RUN = 165 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation-text-103.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
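# Usage (run from apps/miroflow-agent): uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py <path-to-gaia-text-103-log-or-extraction-dir>
# See assets/QA.md (Q1) for the extraction and re-grading workflow that produces such a directory; the placeholder path above is illustrative.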
3 | 4 | import argparse 5 | import os 6 | 7 | from common import GAIAProgressChecker as ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "gaia-2023-validation-text-103" 12 | BENCHMARK_NAME_STD = "GAIA-Text-103" 13 | TASKS_PER_RUN = 103 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import logging 5 | import os 6 | 7 | from anthropic import Anthropic 8 | from fastmcp import FastMCP 9 | 10 | logger = logging.getLogger("miroflow") 11 | 12 | ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") 13 | ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") 14 | 15 | # Initialize FastMCP server 16 | mcp = FastMCP("reasoning-mcp-server") 17 | 18 | 19 | @mcp.tool() 20 | async def reasoning(question: str) -> str: 21 | """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts. 22 | DO NOT use this tool for simple and obvious question. 23 | 24 | Args: 25 | question: The hard question. 26 | 27 | Returns: 28 | The answer to the question. 
29 | """ 30 | messages_for_llm = [ 31 | { 32 | "role": "user", 33 | "content": [ 34 | { 35 | "type": "text", 36 | "text": question, 37 | } 38 | ], 39 | } 40 | ] 41 | 42 | client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL) 43 | response = client.messages.create( 44 | model="claude-3-7-sonnet-20250219", 45 | max_tokens=21000, 46 | thinking={ 47 | "type": "enabled", 48 | "budget_tokens": 19000, 49 | }, 50 | messages=messages_for_llm, 51 | stream=False, 52 | ) 53 | 54 | try: 55 | return response.content[-1].text 56 | except Exception: 57 | logger.info("Reasoning Error: only thinking content is returned") 58 | return response.content[-1].thinking 59 | 60 | 61 | if __name__ == "__main__": 62 | mcp.run(transport="stdio") 63 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/stateless_python_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import os 5 | 6 | from e2b_code_interpreter import Sandbox 7 | from mcp.server.fastmcp import FastMCP 8 | 9 | # Initialize FastMCP server 10 | mcp = FastMCP("stateless-python-server") 11 | 12 | # API keys 13 | E2B_API_KEY = os.environ.get("E2B_API_KEY") 14 | 15 | # DEFAULT CONFS 16 | DEFAULT_TIMEOUT = 300 # seconds 17 | 18 | 19 | @mcp.tool() 20 | async def python(code: str) -> str: 21 | """Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files). 22 | When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output. 23 | IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time. 24 | 25 | Args: 26 | code: The python code to run. 27 | 28 | Returns: 29 | A string containing the execution result including stdout and stderr. 30 | """ 31 | sandbox = Sandbox.create( 32 | timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, template="1av7fdjfvcparqo8efq6" 33 | ) 34 | 35 | max_attempts = 2 36 | for attempt in range(1, max_attempts + 1): 37 | try: 38 | execution = sandbox.run_code(code) 39 | break 40 | except Exception as e: 41 | if attempt == max_attempts: 42 | raise e 43 | execution = sandbox.run_code(code) 44 | 45 | sandbox.kill() 46 | 47 | return str(execution) 48 | 49 | 50 | if __name__ == "__main__": 51 | mcp.run(transport="stdio") 52 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/merge_chatml_msgs_to_one_json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
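# Usage (run from apps/collect-trace): uv run utils/merge_chatml_msgs_to_one_json.py --input_dir <dir-with-per-task-chatml-json-files>
# Merges all *main_agent*.json and *agent-browsing*.json files found in that directory into main_agent_merged.json and agent-browsing_merged.json; the placeholder directory is illustrative.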
3 | 4 | import argparse 5 | import glob 6 | import json 7 | import os 8 | 9 | 10 | def merge_json_files(input_dir, type="main"): 11 | # List to store all messages 12 | all_conversations = [] 13 | 14 | # Get all JSON files matching the pattern 15 | json_files = glob.glob(os.path.join(input_dir, f"*{type}*.json")) 16 | 17 | # Read each JSON file and merge its content 18 | for json_file in json_files: 19 | try: 20 | with open(json_file, "r", encoding="utf-8") as f: 21 | data = json.load(f) 22 | conversation = { 23 | "messages": data, 24 | } 25 | all_conversations.append(conversation) 26 | print(f"Successfully processed: {json_file}") 27 | except Exception as e: 28 | print(f"Error processing {json_file}: {str(e)}") 29 | 30 | output_file = os.path.join(input_dir, f"{type}_merged.json") 31 | # Write the merged data to a new JSON file 32 | with open(output_file, "w", encoding="utf-8") as f: 33 | json.dump(all_conversations, f, ensure_ascii=False, indent=2) 34 | 35 | print( 36 | f"\nMerging complete! All {type} JSON files have been merged into {output_file}" 37 | ) 38 | print(f"Total number of files processed: {len(json_files)}") 39 | print(f"Total number of messages: {len(all_conversations)}") 40 | 41 | 42 | def main(): 43 | parser = argparse.ArgumentParser( 44 | description="Merge multiple JSON files which contain chat messages into a single file" 45 | ) 46 | parser.add_argument( 47 | "--input_dir", 48 | type=str, 49 | required=True, 50 | help="File pattern with wildcards to match JSON files (e.g., '*.json' or 'data/*main*.json')", 51 | ) 52 | 53 | args = parser.parse_args() 54 | 55 | merge_json_files(args.input_dir, type="main_agent") 56 | merge_json_files(args.input_dir, type="agent-browsing") 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /assets/QA.md: -------------------------------------------------------------------------------- 1 | # MiroFlow QA Documentation 2 | 3 | ## Q1: Can I extract GAIA-Text-103 results from existing GAIA-Validation evaluations? 4 | 5 | **Answer:** Yes! If you have completed GAIA-Validation evaluations, you can extract and re-grade the GAIA-Text-103 subset using our specialized tools. 6 | 7 | ### Step-by-Step Process 8 | 9 | 1. **Extract GAIA-Text-103 Tasks** 10 | 11 | ```bash 12 | # Extract text-103 tasks to a separate directory 13 | uv run benchmarks/subset_extraction/gaia-to-text-103-mover.py ../../logs/gaia-validation/0806/qwen_MiroThinker-32B-SFT_evaluation 14 | ``` 15 | 16 | This creates a new directory: `gaia-text-103-extraction/qwen_MiroThinker-32B-SFT_evaluation` 17 | 18 | 1. **Re-grade with GAIA-Text-103 Evaluator** 19 | 20 | ```bash 21 | # Apply GAIA-Text-103 specific grading 22 | uv run benchmarks/subset_extraction/gaia-text-103-grader.py ../../logs/gaia-validation/0806/gaia-text-103-extraction 23 | ``` 24 | 25 | 1. **Verify Results** 26 | 27 | ```bash 28 | # Check accuracy and generate statistics 29 | uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py ../../logs/gaia-validation/0806/gaia-text-103-extraction 30 | ``` 31 | 32 | ## Q2: Does the choice of judgment model affect evaluation performance? 33 | 34 | **Answer:** Yes, there is a measurable difference in evaluation outcomes between the two judgment models. 
35 | 36 | We have standardized on GPT-4.1-2025-04-14 as our primary judgment model for several practical reasons: 37 | 38 | - **Ease of deployment:** No need to host additional GPU-intensive models 39 | - **Consistency:** Aligns with evaluation standards used in other benchmarks (SimpleQA, BrowseComp) 40 | - **Reproducibility:** Provides a consistent baseline for cross-evaluation comparisons 41 | 42 | ## Code Quality Checks 43 | 44 | Before submitting a pull request, ensure your code meets our quality standards: 45 | 46 | ```bash 47 | # Fix linting issues automatically 48 | uv tool run ruff@0.8.0 check --fix . 49 | 50 | # Format code according to our style guidelines 51 | uv tool run ruff@0.8.0 format . 52 | ``` 53 | 54 | ## Known Issues 55 | 56 | - The context management component that runs before summarization requires further refinement to improve accuracy and reliability; the most likely cause is inaccurate context-length estimation. 57 | -------------------------------------------------------------------------------- /apps/collect-trace/README.md: -------------------------------------------------------------------------------- 1 | # Collect Trace 2 | 3 | > TL;DR: Treat an RLVR-format dataset (Question + verifiable answer) as a benchmark. Run the evaluation pipeline; use LLM-as-judge to verify correctness; then harvest the correct interaction traces as training data (for SFT / DPO). 4 | 5 | ## 📝 Overview 6 | 7 | Collect Trace is a key component in the MiroThinker training pipeline. Instead of hand-curating training samples, it reuses RLVR datasets as test sets, and collects multi-turn interaction traces only from items judged correct. 8 | 9 | Workflow: 10 | 11 | 1. Load each RLVR item’s question and verifiable answer. 12 | 13 | 1. Run the agent in the evaluation pipeline (with tool use / browsing as needed). 14 | 15 | 1. Verify the model’s answer with an LLM-as-judge against the RLVR reference answer. 16 | 17 | 1. Only for items judged correct, collect the full multi-turn trace and convert it into SFT / DPO-ready samples. 18 | 19 | ## 🚀 Quick Start 20 | 21 | ### Prerequisites 22 | 23 | - Python 3.12+ 24 | - [uv](https://github.com/astral-sh/uv) package manager 25 | - OpenAI API key (for LLM-based validation) 26 | - RLVR dataset (JSONL; contains question and a verifiable answer) 27 | 28 | ### Installation 29 | 30 | 1. **Navigate to the collect-trace directory**: 31 | 32 | ```bash 33 | cd apps/collect-trace 34 | ``` 35 | 36 | 1. **Install dependencies**: 37 | 38 | ```bash 39 | uv sync 40 | ``` 41 | 42 | 1.
**Set up environment variables**: 43 | 44 | ```bash 45 | # Create .env if missing (safe; won't overwrite existing file) 46 | [ -f ../../apps/miroflow-agent/.env ] || cp ../../apps/miroflow-agent/.env.example ../../apps/miroflow-agent/.env 47 | # (Alternative on macOS/Linux) cp -n ../../apps/miroflow-agent/.env.example ../../apps/miroflow-agent/.env || true 48 | 49 | # Edit .env and fill in your keys 50 | # Required: OPENAI_API_KEY (for LLM as judging) 51 | # Optional: other keys for specific tools 52 | ``` 53 | 54 | ### Basic Usage 55 | 56 | Run a benchmark evaluation to collect traces: 57 | 58 | ```bash 59 | # Using Claude-3.6 for trace collection 60 | bash scripts/collect_trace_claude37.sh 61 | 62 | # Using GPT-5 for trace collection 63 | bash scripts/collect_trace_gpt5.sh 64 | 65 | # Using Qwen-3 for trace collection 66 | bash scripts/collect_trace_qwen3.sh 67 | ``` 68 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/system_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | main_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.""" 5 | 6 | sub_agent_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.""" 7 | 8 | system_prompt_tool_instrcutions = """# Tool-Use Formatting Instructions \n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription: \nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n\nserver name here\ntool name here\n\n{\n\"param1\": \"value1\",\n\"param2\": \"value2 \\\"escaped string\\\"\"\n}\n\n\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. 
The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n""" 9 | -------------------------------------------------------------------------------- /apps/miroflow-agent/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "miroflow-agent" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "miroflow-tools>=0.1.0", 9 | "huggingface-hub>=0.28.0", 10 | "requests>=2.32.3", 11 | "rich>=13.9.4", 12 | "jinja2>=3.1.4", 13 | "pillow>=11.0.0", 14 | "markdownify>=0.14.1", 15 | "duckduckgo-search>=6.3.7", 16 | "python-dotenv", 17 | "pdfminer-six", 18 | "python-pptx", 19 | "puremagic", 20 | "pydub", 21 | "SpeechRecognition", 22 | "youtube_transcript_api", 23 | "mcp", 24 | "fastmcp", 25 | "anthropic", 26 | "e2b-code-interpreter==1.2.1", 27 | "jsonlines>=4.0.0", 28 | "mammoth>=1.9.0", 29 | "numpy>=2.2.5", 30 | "ipdb>=0.13.13", 31 | "datasets>=3.5.0", 32 | "openpyxl>=3.1.5", 33 | "markitdown-mcp>=0.0.1a3", 34 | "markitdown>=0.1.1", 35 | "regex>=2024.11.6", 36 | "openai>=1.78.1", 37 | "tenacity>=9.1.2", 38 | "transformers>=4.51.3", 39 | "omegaconf>=2.3.0", 40 | "wikipedia", 41 | "mutagen", 42 | "hydra-core", 43 | "google-genai", 44 | "tiktoken>=0.9.0", 45 | "aiohttp", 46 | "colorama>=0.4.6", 47 | "json-repair>=0.49.0", 48 | "tencentcloud-sdk-python>=3.0.1451" 49 | ] 50 | 51 | [build-system] 52 | requires = ["hatchling"] 53 | build-backend = "hatchling.build" 54 | 55 | [tool.hatch.build.targets.wheel] 56 | packages = ["src"] 57 | 58 | 59 | [tool.uv.sources] 60 | miroflow-tools = { path = "../../libs/miroflow-tools", editable = true } 61 | 62 | [dependency-groups] 63 | dev = [ 64 | "inline-snapshot>=0.23.2", 65 | "pyright>=1.1.403", 66 | "pytest>=8.4.1", 67 | "pytest-asyncio>=1.0.0", 68 | "pytest-cov>=6.2.1", 69 | "pytest-html>=4.1.1", 70 | "pytest-xdist>=3.7.0", 71 | "ty>=0.0.1a14", 72 | ] 73 | 74 | [tool.pytest.ini_options] 75 | # see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml 76 | minversion = "8.3.5" 77 | testpaths = ["tests"] 78 | # make warning go away 79 | # https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915 80 | asyncio_default_fixture_loop_scope = "function" 81 | addopts = [ 82 | # summary for failed AND passed tests 83 | "-rA", 84 | # only show stderr for test. stdlog can contain sensitive information 85 | "--show-capture=stderr", 86 | # use `pytest-xdist` to run tests in parallel 87 | "-n=auto", 88 | # use `pytest-html` to generate test report in html format 89 | "--html=report.html", 90 | "--self-contained-html", 91 | # use `pytest-testmon` to run tests on changed files only 92 | # "--testmon", 93 | # use `pytest-cov` to generate test coverage report 94 | "--cov=miroflow_agent", 95 | "--cov-report=html", 96 | ] 97 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
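# Exposes a visual_question_answering tool over FastMCP, backed by an Anthropic vision model; requires ANTHROPIC_API_KEY (and optionally ANTHROPIC_BASE_URL) to be set in the environment.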
3 | 4 | import base64 5 | import os 6 | 7 | from anthropic import Anthropic 8 | from fastmcp import FastMCP 9 | 10 | ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") 11 | ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") 12 | 13 | # Initialize FastMCP server 14 | mcp = FastMCP("vision-mcp-server") 15 | 16 | 17 | def guess_mime_media_type_from_extension(file_path: str) -> str: 18 | """Guess the MIME type based on the file extension.""" 19 | _, ext = os.path.splitext(file_path) 20 | ext = ext.lower() 21 | if ext in [".jpg", ".jpeg"]: 22 | return "image/jpeg" 23 | elif ext == ".png": 24 | return "image/png" 25 | elif ext == ".gif": 26 | return "image/gif" 27 | else: 28 | return "image/jpeg" # Default to JPEG if unknown 29 | 30 | 31 | @mcp.tool() 32 | async def visual_question_answering(image_path_or_url: str, question: str) -> str: 33 | """Ask question about an image or a video and get the answer with a vision language model. 34 | 35 | Args: 36 | image_path_or_url: The path of the image file locally or its URL. 37 | question: The question to ask about the image. 38 | 39 | Returns: 40 | The answer to the image-related question. 41 | """ 42 | messages_for_llm = [ 43 | { 44 | "role": "user", 45 | "content": [ 46 | { 47 | "type": "image", 48 | "source": None, 49 | }, 50 | { 51 | "type": "text", 52 | "text": question, 53 | }, 54 | ], 55 | } 56 | ] 57 | 58 | try: 59 | if os.path.exists(image_path_or_url): # Check if the file exists locally 60 | with open(image_path_or_url, "rb") as image_file: 61 | image_data = base64.b64encode(image_file.read()).decode("utf-8") 62 | messages_for_llm[0]["content"][0]["source"] = dict( 63 | type="base64", 64 | media_type=guess_mime_media_type_from_extension(image_path_or_url), 65 | data=image_data, 66 | ) 67 | else: # Otherwise, assume it's a URL 68 | messages_for_llm[0]["content"][0]["source"] = dict( 69 | type="url", url=image_path_or_url 70 | ) 71 | 72 | client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL) 73 | response = client.messages.create( 74 | model="claude-3-7-sonnet-20250219", 75 | max_tokens=1024, 76 | messages=messages_for_llm, 77 | ) 78 | except Exception as e: 79 | return f"Error: {e}" 80 | 81 | try: 82 | return response.content[0].text 83 | except (AttributeError, IndexError): 84 | return str(response) 85 | 86 | 87 | if __name__ == "__main__": 88 | mcp.run(transport="stdio") 89 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server_os.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import logging 5 | import os 6 | import random 7 | import time 8 | 9 | import requests 10 | from fastmcp import FastMCP 11 | 12 | logger = logging.getLogger("miroflow") 13 | 14 | REASONING_API_KEY = os.environ.get("REASONING_API_KEY") 15 | REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL") 16 | REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME") 17 | 18 | # Initialize FastMCP server 19 | mcp = FastMCP("reasoning-mcp-server-os") 20 | 21 | # Retry configuration 22 | MAX_RETRIES = 10 23 | BACKOFF_BASE = 1.0 # initial backoff in seconds 24 | BACKOFF_MAX = 30.0 # maximum backoff in seconds 25 | 26 | 27 | def post_with_retry(url, json, headers): 28 | """Send POST request with retry and exponential backoff. 
29 | Returns response object if success, otherwise None.""" 30 |     for attempt in range(1, MAX_RETRIES + 1): 31 |         try: 32 |             resp = requests.post(url, json=json, headers=headers, timeout=600) 33 |             if resp.status_code == 200: 34 |                 return resp 35 |             else: 36 |                 logger.warning( 37 |                     f"HTTP {resp.status_code} on attempt {attempt}: {resp.text[:200]}" 38 |                 ) 39 |         except requests.exceptions.RequestException as e: 40 |             logger.warning(f"Request failed on attempt {attempt}: {e}") 41 | 42 |         # Backoff before next retry 43 |         if attempt < MAX_RETRIES: 44 |             sleep_time = min(BACKOFF_BASE * (2 ** (attempt - 1)), BACKOFF_MAX) 45 |             # Add jitter to avoid thundering herd 46 |             sleep_time *= 0.8 + 0.4 * random.random() 47 |             logger.info(f"Retrying in {sleep_time:.1f}s...") 48 |             time.sleep(sleep_time) 49 | 50 |     logger.warning(f"All {MAX_RETRIES} retries failed for {url}") 51 |     return None 52 | 53 | 54 | @mcp.tool() 55 | async def reasoning(question: str) -> str: 56 |     """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts. 57 |     DO NOT use this tool for simple and obvious question. 58 | 59 |     Args: 60 |         question: The hard question. 61 | 62 |     Returns: 63 |         The answer to the question. 64 |     """ 65 |     payload = { 66 |         "model": REASONING_MODEL_NAME, 67 |         "messages": [{"role": "user", "content": question}], 68 |         "temperature": 0.6, 69 |         "top_p": 0.95, 70 |     } 71 |     headers = { 72 |         "Authorization": f"Bearer {REASONING_API_KEY}", 73 |         "Content-Type": "application/json", 74 |     } 75 | 76 |     response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers) 77 |     if response is None: 78 |         return "Reasoning service unavailable. Please try again later." 79 | 80 |     json_response = response.json() 81 |     try: 82 |         content = json_response["choices"][0]["message"]["content"] 83 |         if "</think>" in content: 84 |             content = content.split("</think>", 1)[1].strip() 85 |         return content 86 |     except Exception: 87 |         logger.info("Reasoning Error: only thinking content is returned") 88 |         return json_response["choices"][0]["message"]["reasoning_content"] 89 | 90 | 91 | if __name__ == "__main__": 92 |     mcp.run(transport="stdio") 93 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="hle" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..."
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/hle \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-1} 9 | BENCHMARK_NAME="debug" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-1} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/debug \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_frames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="frames" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/frames \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_seal-0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="seal-0" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/seal-0 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/browser_session.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
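# Maintains a persistent Playwright MCP client session (over stdio or SSE) so that successive browser tool calls, such as browser_navigate followed by browser_snapshot, share the same browser state.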
3 | 4 | import asyncio 5 | import json 6 | import logging 7 | 8 | from mcp import StdioServerParameters 9 | from mcp.client.session import ClientSession 10 | from mcp.client.sse import sse_client 11 | from mcp.client.stdio import stdio_client 12 | 13 | logger = logging.getLogger("miroflow") 14 | 15 | 16 | class PlaywrightSession: 17 | """Class to maintain a persistent Playwright MCP session.""" 18 | 19 | def __init__(self, server_params): 20 | self.server_params = server_params 21 | self.read = None 22 | self.write = None 23 | self.session = None 24 | self._client = None 25 | 26 | async def connect(self): 27 | """Connect to the MCP server and initialize the session.""" 28 | if self.session is None: 29 | if isinstance(self.server_params, StdioServerParameters): 30 | self._client = stdio_client(self.server_params) 31 | else: 32 | self._client = sse_client(self.server_params) 33 | self.read, self.write = await self._client.__aenter__() 34 | self.session = ClientSession(self.read, self.write, sampling_callback=None) 35 | await self.session.__aenter__() 36 | await self.session.initialize() 37 | logger.info("Connected to MCP server and initialized session") 38 | 39 | async def call_tool(self, tool_name, arguments=None): 40 | """Call a tool while maintaining the session.""" 41 | if self.session is None: 42 | await self.connect() 43 | 44 | logger.info(f"Calling tool '{tool_name}'") 45 | tool_result = await self.session.call_tool(tool_name, arguments=arguments) 46 | result_content = tool_result.content[0].text if tool_result.content else "" 47 | return result_content 48 | 49 | async def close(self): 50 | """Close the session and connection.""" 51 | if self.session: 52 | await self.session.__aexit__(None, None, None) 53 | self.session = None 54 | 55 | if self._client: 56 | await self._client.__aexit__(None, None, None) 57 | self._client = None 58 | self.read = None 59 | self.write = None 60 | logger.info("Closed MCP session") 61 | 62 | 63 | # Example usage: 64 | async def test_persistent_session(): 65 | # Create a persistent session 66 | mcp_session = PlaywrightSession("http://localhost:8931") 67 | 68 | try: 69 | # First call: Navigate to a website 70 | await mcp_session.call_tool("browser_navigate", {"url": "https://example.com"}) 71 | logger.info("Navigation complete") 72 | 73 | # Wait a moment for the page to load 74 | await asyncio.sleep(2) 75 | 76 | # Second call: Take a snapshot of the current page 77 | snapshot_result = await mcp_session.call_tool("browser_snapshot", {}) 78 | 79 | # Process and save the snapshot 80 | snapshot_json = json.loads(snapshot_result) 81 | logger.info(f"Snapshot taken of page: {snapshot_json.get('url')}") 82 | logger.info(f"Page title: {snapshot_json.get('title')}") 83 | 84 | with open("snapshot.json", "w") as f: 85 | json.dump(snapshot_json, f, indent=2, ensure_ascii=False) 86 | 87 | logger.info("Snapshot saved to snapshot.json") 88 | 89 | finally: 90 | # Close the session when done with all tool calls 91 | await mcp_session.close() 92 | 93 | 94 | if __name__ == "__main__": 95 | asyncio.run(test_persistent_session()) 96 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_aime2025.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | 
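# These configuration values (and LLM_MODEL / BASE_URL above) can be overridden via environment variables; an illustrative invocation from apps/miroflow-agent:
#   NUM_RUNS=8 MAX_CONCURRENT=5 LLM_MODEL=<model-name> BASE_URL=<inference-endpoint> bash scripts/run_evaluate_multiple_runs_aime2025.sh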
NUM_RUNS=${NUM_RUNS:-32} 9 | BENCHMARK_NAME="aime2025" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/aime2025 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
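# Illustrative invocation (not part of the original script): every parameter at
# the top of this script is an environment override with a default, and the
# relative benchmarks/ and ../../data paths assume the apps/miroflow-agent
# directory as the working directory. A typical launch might therefore look
# like the sketch below; the model name, endpoint, and key are placeholders.
#
#   NUM_RUNS=4 LLM_MODEL="my-mirothinker-checkpoint" \
#   BASE_URL="https://my-endpoint.example.com/v1" API_KEY="sk-..." \
#   bash scripts/run_evaluate_multiple_runs_aime2025.sh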
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="browsecomp" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/browsecomp \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_webwalkerqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="webwalkerqa" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/webwalkerqa \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="browsecomp_zh" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/browsecomp_zh \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-2158.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="hle-text-2158" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data_original.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/hle-text-2158 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 
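# The averaging step below scans ${RESULTS_DIR}/run_*/ for files named
# benchmark_results_pass_at_<k>_accuracy.txt (see
# benchmarks/evaluators/calculate_average_score.py), prints the per-run scores,
# and writes a summary file; with the default PASS_AT_K=1 that summary would be
# ${RESULTS_DIR}/average_scores_pass_at_1.txt.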
88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-500.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="hle-text-500" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data_original.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/hle-text-500 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 
84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="gaia-validation" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/gaia-2023-validation \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 
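# Note on the per-run status check inside the launch loop above: because the
# benchmark command is piped into tee, the `$?` that follows reflects tee's
# exit status rather than the Python process. A minimal sketch of one way to
# capture the real status (assuming bash) would be:
#
#   uv run python benchmarks/common_benchmark.py ... 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
#   status=${PIPESTATUS[0]}   # exit code of the python command, not tee
#   if [ "$status" -eq 0 ]; then echo "Run $i completed successfully"; fi
#
# Enabling `set -o pipefail` near the top of the script is an alternative.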
78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="xbench_deepsearch" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/xbench_deepsearch \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 
69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="gaia-validation-text-103" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/gaia-2023-validation-text-103 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? 
-eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/process_logs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import json 6 | import os 7 | import shutil 8 | 9 | 10 | def get_successful_log_paths(jsonl_file_path: str) -> list: 11 | """ 12 | Collects the paths of successful log files from a dataset. 13 | 14 | This function extracts log file paths of successful records based on 15 | the value of `final_judge_result`. If the dataset has been fully 16 | processed, it reads from a `benchmark_results.jsonl` file. Otherwise, 17 | if processing was interrupted, it falls back to scanning individual 18 | `.json` files in the given directory. 19 | 20 | Success is determined by: 21 | - `PASS_AT_K_SUCCESS` for records in JSONL files. 22 | - `CORRECT` for records in individual JSON files. 23 | 24 | Args: 25 | jsonl_file_path (str): Path to a JSONL file or a directory of JSON files. 26 | 27 | Returns: 28 | list: A list of log file paths for successful records. 
29 | """ 30 | log_paths = [] 31 | 32 | if jsonl_file_path.endswith(".jsonl"): 33 | with open(jsonl_file_path, "r", encoding="utf-8") as f: 34 | for line in f: 35 | line = line.strip() 36 | if line: 37 | try: 38 | data = json.loads(line) 39 | if data.get("final_judge_result") == "PASS_AT_K_SUCCESS": 40 | log_path = data.get("log_file_path") 41 | if log_path: 42 | log_paths.append(log_path) 43 | except json.JSONDecodeError: 44 | continue 45 | else: 46 | filenames = os.listdir(jsonl_file_path) 47 | filenames = [filename for filename in filenames if filename.endswith(".json")] 48 | for filename in filenames: 49 | filepath = os.path.join(jsonl_file_path, filename) 50 | try: 51 | data = json.load(open(filepath, "r")) 52 | except Exception: 53 | continue 54 | try: 55 | final_judge_result = data["final_judge_result"] 56 | except KeyError: 57 | print(data.keys()) 58 | continue 59 | if final_judge_result == "CORRECT": 60 | log_paths.append(filepath) 61 | 62 | return log_paths 63 | 64 | 65 | # Usage example 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser( 68 | description="Extract successful log paths from JSONL file" 69 | ) 70 | parser.add_argument( 71 | "file_path", help="Path to the JSONL file containing benchmark results" 72 | ) 73 | args = parser.parse_args() 74 | 75 | result = get_successful_log_paths(args.file_path) 76 | 77 | # Get the parent directory of args.file_path 78 | parent_dir = os.path.abspath(os.path.dirname(args.file_path)) 79 | 80 | # Create successful logs directory 81 | success_log_dir = parent_dir + "/successful_logs" 82 | success_chatml_log_dir = parent_dir + "/successful_chatml_logs" 83 | os.makedirs(success_log_dir, exist_ok=True) 84 | print(f"Successful logs directory: {success_log_dir}") 85 | 86 | for i, path in enumerate(result, 1): 87 | basename = os.path.basename(path) 88 | print(f"Copying file: {path} to {success_log_dir}/{basename}") 89 | shutil.copy(path, f"{success_log_dir}/{basename}") 90 | 91 | os.system( 92 | f"uv run utils/converters/convert_to_chatml_auto_batch.py {success_log_dir}/*.json -o {success_chatml_log_dir}" 93 | ) 94 | os.system( 95 | f"uv run utils/merge_chatml_msgs_to_one_json.py --input_dir {success_chatml_log_dir}" 96 | ) 97 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/reading_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import logging 6 | import sys 7 | 8 | from fastmcp import FastMCP 9 | from mcp import ClientSession, StdioServerParameters 10 | from mcp.client.stdio import stdio_client 11 | 12 | logger = logging.getLogger("miroflow") 13 | 14 | # Initialize FastMCP server 15 | mcp = FastMCP("reading-mcp-server") 16 | 17 | 18 | @mcp.tool() 19 | async def convert_to_markdown(uri: str) -> str: 20 | """Convert various types of resources (doc, ppt, pdf, excel, csv, zip file etc.) 21 | described by an file: or data: URI to markdown. 22 | 23 | Args: 24 | uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes. 25 | 26 | Returns: 27 | str: The converted markdown content, or an error message if conversion fails. 28 | """ 29 | if not uri or not uri.strip(): 30 | return "Error: URI parameter is required and cannot be empty." 
31 | 32 | # Validate URI scheme 33 | valid_schemes = ["http:", "https:", "file:", "data:"] 34 | if not any(uri.lower().startswith(scheme) for scheme in valid_schemes): 35 | return f"Error: Invalid URI scheme. Supported schemes are: {', '.join(valid_schemes)}" 36 | 37 | tool_name = "convert_to_markdown" 38 | arguments = {"uri": uri} 39 | 40 | server_params = StdioServerParameters( 41 | command=sys.executable, 42 | args=["-m", "markitdown_mcp"], 43 | ) 44 | 45 | result_content = "" 46 | try: 47 | async with stdio_client(server_params) as (read, write): 48 | async with ClientSession(read, write, sampling_callback=None) as session: 49 | await session.initialize() 50 | try: 51 | tool_result = await session.call_tool( 52 | tool_name, arguments=arguments 53 | ) 54 | result_content = ( 55 | tool_result.content[-1].text if tool_result.content else "" 56 | ) 57 | except Exception as tool_error: 58 | logger.info(f"Tool execution error: {tool_error}") 59 | return f"Error: Tool execution failed: {str(tool_error)}" 60 | except Exception as session_error: 61 | logger.info(f"Session error: {session_error}") 62 | return ( 63 | f"Error: Failed to connect to markitdown-mcp server: {str(session_error)}" 64 | ) 65 | 66 | return result_content 67 | 68 | 69 | if __name__ == "__main__": 70 | # Set up argument parser 71 | parser = argparse.ArgumentParser(description="Reading MCP Server") 72 | parser.add_argument( 73 | "--transport", 74 | choices=["stdio", "http"], 75 | default="stdio", 76 | help="Transport method: 'stdio' or 'http' (default: stdio)", 77 | ) 78 | parser.add_argument( 79 | "--port", 80 | type=int, 81 | default=8080, 82 | help="Port to use when running with HTTP transport (default: 8080)", 83 | ) 84 | parser.add_argument( 85 | "--path", 86 | type=str, 87 | default="/mcp", 88 | help="URL path to use when running with HTTP transport (default: /mcp)", 89 | ) 90 | 91 | # Parse command line arguments 92 | args = parser.parse_args() 93 | 94 | # Run the server with the specified transport method 95 | if args.transport == "stdio": 96 | mcp.run(transport="stdio") 97 | else: 98 | # For HTTP transport, include port and path options 99 | mcp.run(transport="streamable-http", port=args.port, path=args.path) 100 | -------------------------------------------------------------------------------- /apps/visualize-trace/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2025 MiroMind 3 | # This source code is licensed under the MIT License. 
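# Illustrative usage (not part of the original file): the entry point below
# checks that Flask is available, offers to install dependencies via `uv sync`
# (falling back to pip), and then serves the trace viewer. Assuming the
# visualize-trace directory as the working directory, a typical launch would be:
#
#   uv run python run.py              # default port 5000
#   uv run python run.py --port 8000  # custom port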
4 | 5 | import os 6 | import subprocess 7 | import sys 8 | 9 | 10 | def check_dependencies(): 11 | """Check if dependencies are installed""" 12 | try: 13 | import importlib.util 14 | 15 | if importlib.util.find_spec("flask") is not None: 16 | print("✓ Flask is installed") 17 | return True 18 | else: 19 | raise ImportError("Flask not found") 20 | except ImportError: 21 | print("✗ Flask is not installed") 22 | print("Please use the following commands to install dependencies:") 23 | print(" uv sync") 24 | print("or:") 25 | print(" uv pip install -r requirements.txt") 26 | return False 27 | 28 | 29 | def install_dependencies(): 30 | """Install dependencies (recommended to use uv)""" 31 | print("Installing dependencies...") 32 | try: 33 | # Try using uv first 34 | try: 35 | subprocess.check_call(["uv", "sync"]) 36 | print("✓ Dependencies installed successfully using uv") 37 | return True 38 | except (subprocess.CalledProcessError, FileNotFoundError): 39 | # Fallback to pip 40 | subprocess.check_call( 41 | [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"] 42 | ) 43 | print("✓ Dependencies installed successfully using pip") 44 | return True 45 | except subprocess.CalledProcessError: 46 | print("✗ Failed to install dependencies") 47 | print("Please manually run: uv sync or pip install -r requirements.txt") 48 | return False 49 | 50 | 51 | def main(): 52 | """Main function""" 53 | import argparse 54 | 55 | # Parse command line arguments 56 | parser = argparse.ArgumentParser(description="Trace Analysis Web Demo") 57 | parser.add_argument( 58 | "-p", 59 | "--port", 60 | type=int, 61 | default=5000, 62 | help="Specify port number (default: 5000)", 63 | ) 64 | args = parser.parse_args() 65 | 66 | print("=" * 50) 67 | print("Trace Analysis Web Demo") 68 | print("=" * 50) 69 | 70 | # Check dependencies 71 | if not check_dependencies(): 72 | print("\nInstalling dependencies...") 73 | if not install_dependencies(): 74 | print( 75 | "Please manually install dependencies: pip install -r requirements.txt" 76 | ) 77 | return 78 | 79 | # Check JSON files 80 | parent_dir = os.path.dirname(os.path.abspath(__file__)) 81 | json_files = [ 82 | f for f in os.listdir(os.path.join(parent_dir, "..")) if f.endswith(".json") 83 | ] 84 | 85 | if not json_files: 86 | print("\nWarning: No JSON files found in parent directory") 87 | print("Please ensure trace JSON files are in the trace_analyze/ directory") 88 | else: 89 | print(f"\nFound {len(json_files)} JSON files:") 90 | for file in json_files[:5]: # Only show first 5 91 | print(f" - {file}") 92 | if len(json_files) > 5: 93 | print(f" ... and {len(json_files) - 5} other files") 94 | 95 | # Start application 96 | print("\nStarting web application...") 97 | print(f"Application will run at http://localhost:{args.port}") 98 | print("Press Ctrl+C to stop the application") 99 | print("=" * 50) 100 | 101 | try: 102 | from app import app 103 | 104 | app.run(debug=True, host="0.0.0.0", port=args.port) 105 | except KeyboardInterrupt: 106 | print("\nApplication stopped") 107 | except Exception as e: 108 | print(f"\nFailed to start application: {e}") 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server_os.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
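# Configuration note (illustrative, not part of the original file): the module
# reads its endpoint settings from the environment at import time, and the
# request below posts an OpenAI-style chat payload directly to VISION_BASE_URL,
# so that variable should hold the full completions endpoint. The values shown
# are placeholders, not defaults shipped with the repository.
#
#   export VISION_API_KEY="sk-..."
#   export VISION_BASE_URL="https://your-endpoint.example.com/v1/chat/completions"
#   export VISION_MODEL_NAME="your-vision-model"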
3 | 4 | import base64 5 | import os 6 | 7 | import aiohttp 8 | import requests 9 | from fastmcp import FastMCP 10 | 11 | VISION_API_KEY = os.environ.get("VISION_API_KEY") 12 | VISION_BASE_URL = os.environ.get("VISION_BASE_URL") 13 | VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME") 14 | 15 | # Initialize FastMCP server 16 | mcp = FastMCP("vision-mcp-server-os") 17 | 18 | 19 | def guess_mime_media_type_from_extension(file_path: str) -> str: 20 | """Guess the MIME type based on the file extension.""" 21 | _, ext = os.path.splitext(file_path) 22 | ext = ext.lower() 23 | if ext in [".jpg", ".jpeg"]: 24 | return "image/jpeg" 25 | elif ext == ".png": 26 | return "image/png" 27 | elif ext == ".gif": 28 | return "image/gif" 29 | else: 30 | return "image/jpeg" # Default to JPEG if unknown 31 | 32 | 33 | @mcp.tool() 34 | async def visual_question_answering(image_path_or_url: str, question: str) -> str: 35 | """Ask question about an image or a video and get the answer with a vision language model. 36 | 37 | Args: 38 | image_path_or_url: The path of the image file locally or its URL. 39 | question: The question to ask about the image. 40 | 41 | Returns: 42 | The answer to the image-related question. 43 | """ 44 | messages_for_llm = [ 45 | { 46 | "role": "user", 47 | "content": [ 48 | {"type": "image_url", "image_url": {"url": None}}, 49 | { 50 | "type": "text", 51 | "text": question, 52 | }, 53 | ], 54 | } 55 | ] 56 | 57 | headers = { 58 | "Authorization": f"Bearer {VISION_API_KEY}", 59 | "Content-Type": "application/json", 60 | } 61 | 62 | try: 63 | if os.path.exists(image_path_or_url): # Check if the file exists locally 64 | with open(image_path_or_url, "rb") as image_file: 65 | image_data = base64.b64encode(image_file.read()).decode("utf-8") 66 | mime_type = guess_mime_media_type_from_extension(image_path_or_url) 67 | messages_for_llm[0]["content"][0]["image_url"]["url"] = ( 68 | f"data:{mime_type};base64,{image_data}" 69 | ) 70 | elif image_path_or_url.startswith(("http://", "https://")): 71 | async with aiohttp.ClientSession() as session: 72 | async with session.get(image_path_or_url) as resp: 73 | if resp.status == 200: 74 | image_bytes = await resp.read() 75 | mime_type = resp.headers.get( 76 | "Content-Type", "image/png" 77 | ) # fallback MIME type 78 | image_data = base64.b64encode(image_bytes).decode("utf-8") 79 | messages_for_llm[0]["content"][0]["image_url"]["url"] = ( 80 | f"data:{mime_type};base64,{image_data}" 81 | ) 82 | else: 83 | return f"Failed to fetch image from URL: {image_path_or_url}" 84 | else: 85 | messages_for_llm[0]["content"][0]["image_url"]["url"] = image_path_or_url 86 | 87 | payload = {"model": VISION_MODEL_NAME, "messages": messages_for_llm} 88 | 89 | response = requests.post(VISION_BASE_URL, json=payload, headers=headers) 90 | 91 | except Exception as e: 92 | return f"Error: {e}" 93 | 94 | try: 95 | return response.json()["choices"][0]["message"]["content"] 96 | except (AttributeError, IndexError): 97 | return response.json() 98 | 99 | 100 | if __name__ == "__main__": 101 | mcp.run(transport="stdio") 102 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/url_unquote.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import unquote 3 | 4 | from markdown_it import MarkdownIt 5 | 6 | # Reserved character encodings to be protected -> temporary placeholders 7 | PROTECT = { 8 | "%2F": "__SLASH__", 9 | "%2f": 
"__SLASH__", 10 | "%3F": "__QMARK__", 11 | "%3f": "__QMARK__", 12 | "%23": "__HASH__", 13 | "%26": "__AMP__", 14 | "%3D": "__EQUAL__", 15 | "%20": "__SPACE__", 16 | "%2B": "__PLUS__", 17 | "%25": "__PERCENT__", 18 | } 19 | 20 | # Reverse mapping: placeholder -> original %xx (use uppercase for uniform output) 21 | RESTORE = {v: k.upper() for k, v in PROTECT.items()} 22 | 23 | 24 | def safe_unquote(s: str, encoding="utf-8", errors="ignore") -> str: 25 | # 1. Replace with placeholders 26 | for k, v in PROTECT.items(): 27 | s = s.replace(k, v) 28 | # 2. Decode (only affects unprotected parts, e.g., Chinese characters) 29 | s = unquote(s, encoding=encoding, errors=errors) 30 | # 3. Replace placeholders back to original %xx 31 | for v, k in RESTORE.items(): 32 | s = s.replace(v, k) 33 | return s 34 | 35 | 36 | def decode_http_urls_in_dict(data): 37 | """ 38 | Traverse all values in the data structure: 39 | - If it's a string starting with http, apply urllib.parse.unquote 40 | - If it's a list, recursively process each element 41 | - If it's a dict, recursively process each value 42 | - Other types remain unchanged 43 | """ 44 | if isinstance(data, str): 45 | if "%" in data: 46 | return safe_unquote(data) 47 | else: 48 | return data 49 | elif isinstance(data, list): 50 | return [decode_http_urls_in_dict(item) for item in data] 51 | elif isinstance(data, dict): 52 | return {key: decode_http_urls_in_dict(value) for key, value in data.items()} 53 | else: 54 | return data 55 | 56 | 57 | md = MarkdownIt("commonmark") 58 | 59 | 60 | def strip_markdown_links(markdown: str) -> str: 61 | tokens = md.parse(markdown) 62 | 63 | def render(ts): 64 | out = [] 65 | for tok in ts: 66 | t = tok.type 67 | 68 | # 1) Links: drop the wrapper, keep inner text (children will be rendered) 69 | if t == "link_open" or t == "link_close": 70 | continue 71 | 72 | # 2) Images: skip the entire image block 73 | if t == "image": 74 | continue 75 | 76 | # 3) Line breaks and block closings 77 | if t == "softbreak": # inline single line break 78 | out.append("\n") 79 | continue 80 | if ( 81 | t == "hardbreak" 82 | ): # explicit line break (two spaces + newline in Markdown) 83 | out.append("\n") 84 | continue 85 | if t in ("paragraph_close", "heading_close", "blockquote_close"): 86 | out.append("\n\n") 87 | continue 88 | if t in ("list_item_close", "bullet_list_close", "ordered_list_close"): 89 | out.append("\n") 90 | continue 91 | if t == "hr": 92 | out.append("\n\n") 93 | continue 94 | 95 | # 4) Inline or nested tokens 96 | if tok.children: 97 | out.append(render(tok.children)) 98 | continue 99 | 100 | # Preserve inline code style 101 | if t == "code_inline": 102 | out.append(f"`{tok.content}`") 103 | else: 104 | out.append(tok.content or "") 105 | 106 | return "".join(out) 107 | 108 | text = render(tokens) 109 | 110 | # normalize excessive blank lines (avoid more than 2 consecutive newlines) 111 | text = re.sub(r"\n{3,}", "\n\n", text).rstrip() + "\n" 112 | 113 | return text.strip() 114 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/example_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import json 5 | import os 6 | import sys 7 | import tempfile 8 | from pathlib import Path 9 | 10 | # Add parent directory to Python path 11 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) 12 | 13 | from utils.converters import ( 14 | extract_and_save_chat_history, 15 | extract_message_history_from_log, 16 | ) 17 | 18 | 19 | def example_1_basic_conversion(): 20 | """Example 1: Basic conversion using Python API""" 21 | print("=== Example 1: Basic Conversion ===") 22 | 23 | # Sample log data 24 | log_data = { 25 | "main_agent_message_history": { 26 | "system_prompt": "You are a helpful assistant.", 27 | "message_history": [ 28 | { 29 | "role": "developer", 30 | "content": [ 31 | {"type": "text", "text": "You are a helpful assistant."} 32 | ], 33 | }, 34 | { 35 | "role": "user", 36 | "content": [{"type": "text", "text": "Hello, how are you?"}], 37 | }, 38 | { 39 | "role": "assistant", 40 | "content": [{"type": "text", "text": "I'm doing well, thank you!"}], 41 | }, 42 | ], 43 | }, 44 | "browser_agent_message_history_sessions": { 45 | "browser_agent_1": { 46 | "system_prompt": "You are a browsing agent.", 47 | "message_history": [ 48 | { 49 | "role": "developer", 50 | "content": [ 51 | {"type": "text", "text": "You are a browsing agent."} 52 | ], 53 | }, 54 | { 55 | "role": "user", 56 | "content": [{"type": "text", "text": "Search for something"}], 57 | }, 58 | { 59 | "role": "assistant", 60 | "content": [{"type": "text", "text": "I found it."}], 61 | }, 62 | ], 63 | } 64 | }, 65 | "env_info": {"llm_provider": "openai"}, 66 | } 67 | 68 | # Convert using OAI method 69 | chatml_data = extract_message_history_from_log(log_data) 70 | print( 71 | f"OAI conversion result: {len(chatml_data['main_agent'])} messages in main agent" 72 | ) 73 | print( 74 | f"OAI conversion result: {len(chatml_data['browser_agents']['browser_agent_1'])} messages in browser agent" 75 | ) 76 | 77 | # Convert using Non-OAI method 78 | with tempfile.TemporaryDirectory() as temp_dir: 79 | temp_path = Path(temp_dir) 80 | extract_and_save_chat_history(log_data, temp_path, "example") 81 | 82 | # Check generated files 83 | main_file = temp_path / "example_main_agent_chatml.json" 84 | browser_file = temp_path / "example_browser_agent_1_chatml.json" 85 | 86 | if main_file.exists(): 87 | with open(main_file, "r") as f: 88 | main_content = json.load(f) 89 | print( 90 | f"Non-OAI conversion result: {len(main_content)} messages in main agent" 91 | ) 92 | 93 | if browser_file.exists(): 94 | with open(browser_file, "r") as f: 95 | browser_content = json.load(f) 96 | print( 97 | f"Non-OAI conversion result: {len(browser_content)} messages in browser agent" 98 | ) 99 | 100 | 101 | if __name__ == "__main__": 102 | print("ChatML Conversion Utilities - Usage Examples") 103 | print("=" * 50) 104 | 105 | example_1_basic_conversion() 106 | 107 | print("\n" + "=" * 50) 108 | print("Examples completed successfully!") 109 | print("\nFor more information, see the README.md file.") 110 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_futurex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="futurex" 10 | 
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data_250924_250930.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/futurex \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | # echo "Calculating average scores..." 88 | # uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | echo "Extracting predictions and formatting for FutureX submission..." 90 | uv run python benchmarks/evaluators/extract_futurex_results.py "$RESULTS_DIR" 91 | 92 | # Check status and provide user-friendly message 93 | if [ $? -eq 0 ]; then 94 | echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl" 95 | echo "You can now upload this file to the FutureX test server." 96 | else 97 | echo "❌ Failed to generate submission file. Please check the logs for details." 98 | fi 99 | 100 | echo "==========================================" 101 | echo "Multiple runs evaluation completed!" 
102 | echo "Check results in: $RESULTS_DIR" 103 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 104 | echo "==========================================" 105 | -------------------------------------------------------------------------------- /assets/qwen3_nonthinking.jinja: -------------------------------------------------------------------------------- 1 | {%- if tools %} 2 | {{- '<|im_start|>system\n' }} 3 | {%- if messages[0].role == 'system' %} 4 | {{- messages[0].content + '\n\n' }} 5 | {%- endif %} 6 | {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} 7 | {%- for tool in tools %} 8 | {{- "\n" }} 9 | {{- tool | tojson }} 10 | {%- endfor %} 11 | {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} 12 | {%- else %} 13 | {%- if messages[0].role == 'system' %} 14 | {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} 15 | {%- endif %} 16 | {%- endif %} 17 | {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} 18 | {%- for message in messages[::-1] %} 19 | {%- set index = (messages|length - 1) - loop.index0 %} 20 | {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} 21 | {%- set ns.multi_step_tool = false %} 22 | {%- set ns.last_query_index = index %} 23 | {%- endif %} 24 | {%- endfor %} 25 | {%- for message in messages %} 26 | {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} 27 | {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} 28 | {%- elif message.role == "assistant" %} 29 | {%- set content = message.content %} 30 | {%- set reasoning_content = '' %} 31 | {%- if message.reasoning_content is defined and message.reasoning_content is not none %} 32 | {%- set reasoning_content = message.reasoning_content %} 33 | {%- else %} 34 | {%- if '' in message.content %} 35 | {%- set content = message.content.split('')[-1].lstrip('\n') %} 36 | {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} 37 | {%- endif %} 38 | {%- endif %} 39 | {%- if loop.index0 > ns.last_query_index %} 40 | {%- if loop.last or (not loop.last and reasoning_content) %} 41 | {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} 42 | {%- else %} 43 | {{- '<|im_start|>' + message.role + '\n' + content }} 44 | {%- endif %} 45 | {%- else %} 46 | {{- '<|im_start|>' + message.role + '\n' + content }} 47 | {%- endif %} 48 | {%- if message.tool_calls %} 49 | {%- for tool_call in message.tool_calls %} 50 | {%- if (loop.first and content) or (not loop.first) %} 51 | {{- '\n' }} 52 | {%- endif %} 53 | {%- if tool_call.function %} 54 | {%- set tool_call = tool_call.function %} 55 | {%- endif %} 56 | {{- '\n{"name": "' }} 57 | {{- tool_call.name }} 58 | {{- '", "arguments": ' }} 59 | {%- if tool_call.arguments is string %} 60 | {{- tool_call.arguments }} 61 | {%- else %} 62 | {{- tool_call.arguments | tojson }} 63 | {%- endif %} 64 | {{- '}\n' }} 65 | {%- endfor %} 66 | {%- endif %} 67 | {{- '<|im_end|>\n' }} 68 | {%- elif message.role == "tool" %} 69 | {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} 70 | {{- '<|im_start|>user' }} 71 | {%- endif %} 72 | {{- '\n\n' }} 73 | {{- message.content }} 74 | {{- '\n' }} 
75 | {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} 76 | {{- '<|im_end|>\n' }} 77 | {%- endif %} 78 | {%- endif %} 79 | {%- endfor %} 80 | {%- if add_generation_prompt %} 81 | {{- '<|im_start|>assistant\n<think>\n\n</think>\n\n' }} 82 | {%- endif %} -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/evaluators/calculate_average_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2025 MiroMind 3 | # This source code is licensed under the MIT License. 4 | 5 | import glob 6 | import os 7 | import re 8 | import statistics 9 | import sys 10 | 11 | 12 | def detect_pass_at_k(results_dir: str) -> tuple: 13 | """Detect the pass_at_k value used in the results directory""" 14 | 15 | # Find all possible pass_at_k files 16 | pattern = os.path.join( 17 | results_dir, "run_*", "benchmark_results_pass_at_*_accuracy.txt" 18 | ) 19 | all_files = glob.glob(pattern) 20 | 21 | if not all_files: 22 | print(f"No accuracy files found in {results_dir}") 23 | print(f"Expected pattern: {pattern}") 24 | return None, [] 25 | 26 | # Extract pass_at_k value from the first file 27 | filename = os.path.basename(all_files[0]) 28 | match = re.search(r"pass_at_(\d+)_accuracy\.txt", filename) 29 | 30 | if not match: 31 | print(f"Cannot extract pass_at_k from filename: {filename}") 32 | return None, [] 33 | 34 | k = int(match.group(1)) 35 | 36 | # Get all files with this k value 37 | accuracy_files = glob.glob( 38 | os.path.join( 39 | results_dir, "run_*", f"benchmark_results_pass_at_{k}_accuracy.txt" 40 | ) 41 | ) 42 | 43 | return k, accuracy_files 44 | 45 | 46 | def calculate_average_scores(results_dir: str) -> dict: 47 | """Calculate average scores from multiple runs - automatically detect pass_at_k value""" 48 | 49 | # Detect pass_at_k value and corresponding files 50 | pass_at_k, accuracy_files = detect_pass_at_k(results_dir) 51 | 52 | if pass_at_k is None: 53 | return None 54 | 55 | print(f"Detected pass_at_{pass_at_k} files") 56 | print(f"Found {len(accuracy_files)} accuracy files") 57 | 58 | scores = [] 59 | 60 | # Read each accuracy file 61 | for i, file_path in enumerate(sorted(accuracy_files), 1): 62 | try: 63 | with open(file_path, "r") as f: 64 | content = f.read().strip() 65 | # Remove percentage sign and convert to float 66 | score = float(content.replace("%", "")) 67 | scores.append(score) 68 | print(f"Run {i}: {score:.2f}%") 69 | except Exception as e: 70 | print(f"Error reading {file_path}: {e}") 71 | continue 72 | 73 | if not scores: 74 | print("No valid scores found") 75 | return None 76 | 77 | # Calculate statistics 78 | stats = { 79 | "pass_at_k": pass_at_k, 80 | "num_runs": len(scores), 81 | "individual_scores": scores, 82 | "average_score": statistics.mean(scores), 83 | "std_dev": statistics.stdev(scores) if len(scores) > 1 else 0, 84 | "min_score": min(scores), 85 | "max_score": max(scores), 86 | } 87 | 88 | return stats 89 | 90 | 91 | def print_results(stats: dict): 92 | """Print results""" 93 | print("\n" + "=" * 50) 94 | print("EVALUATION RESULTS") 95 | print("=" * 50) 96 | 97 | print(f"Pass@{stats['pass_at_k']} Results:") 98 | print(f"Number of runs: {stats['num_runs']}") 99 | print(f"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}") 100 | print() 101 | print(f"Standard deviation: {stats['std_dev']:.2f}%") 102 | print(f"Min score: {stats['min_score']:.2f}%") 103 | print(f"Max score: {stats['max_score']:.2f}%") 104 |
print(f"Average score: {stats['average_score']:.2f}%") 105 | print("=" * 50) 106 | 107 | 108 | def main(): 109 | if len(sys.argv) < 2: 110 | print("Usage: python calculate_average_score.py ") 111 | print("Example: python calculate_average_score.py logs/gaia-validation/mytest") 112 | sys.exit(1) 113 | 114 | results_dir = sys.argv[1] 115 | 116 | if not os.path.exists(results_dir): 117 | print(f"Results directory does not exist: {results_dir}") 118 | sys.exit(1) 119 | 120 | print(f"Analyzing results from: {results_dir}") 121 | 122 | stats = calculate_average_scores(results_dir) 123 | 124 | if stats: 125 | print_results(stats) 126 | 127 | # Save simple statistics results 128 | output_file = os.path.join( 129 | results_dir, f"average_scores_pass_at_{stats['pass_at_k']}.txt" 130 | ) 131 | with open(output_file, "w") as f: 132 | f.write("EVALUATION RESULTS\n") 133 | f.write("=" * 50 + "\n") 134 | f.write(f"Pass@{stats['pass_at_k']} Results:\n") 135 | f.write(f"Number of runs: {stats['num_runs']}\n") 136 | f.write( 137 | f"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}\n" 138 | ) 139 | f.write(f"Standard deviation: {stats['std_dev']:.2f}%\n") 140 | f.write(f"Min score: {stats['min_score']:.2f}%\n") 141 | f.write(f"Max score: {stats['max_score']:.2f}%\n") 142 | f.write(f"Average score: {stats['average_score']:.2f}%\n") 143 | f.write("=" * 50 + "\n") 144 | 145 | print(f"\nResults saved to: {output_file}") 146 | else: 147 | print("Failed to calculate statistics") 148 | sys.exit(1) 149 | 150 | 151 | if __name__ == "__main__": 152 | main() 153 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/serper_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | """ 5 | adapted from 6 | https://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1 7 | """ 8 | 9 | import os 10 | from typing import Any, Dict 11 | 12 | import requests 13 | from mcp.server.fastmcp import FastMCP 14 | from tenacity import ( 15 | retry, 16 | retry_if_exception_type, 17 | stop_after_attempt, 18 | wait_exponential, 19 | ) 20 | 21 | from .utils import decode_http_urls_in_dict 22 | 23 | SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev") 24 | SERPER_API_KEY = os.getenv("SERPER_API_KEY", "") 25 | 26 | # Initialize FastMCP server 27 | mcp = FastMCP("serper-mcp-server") 28 | 29 | 30 | @retry( 31 | stop=stop_after_attempt(3), 32 | wait=wait_exponential(multiplier=1, min=4, max=10), 33 | retry=retry_if_exception_type( 34 | (requests.ConnectionError, requests.Timeout, requests.HTTPError) 35 | ), 36 | ) 37 | def make_serper_request( 38 | payload: Dict[str, Any], headers: Dict[str, str] 39 | ) -> requests.Response: 40 | """Make HTTP request to Serper API with retry logic.""" 41 | response = requests.post(f"{SERPER_BASE_URL}/search", json=payload, headers=headers) 42 | response.raise_for_status() 43 | return response 44 | 45 | 46 | def _is_huggingface_dataset_or_space_url(url): 47 | """ 48 | Check if the URL is a HuggingFace dataset or space URL. 
49 | :param url: The URL to check 50 | :return: True if it's a HuggingFace dataset or space URL, False otherwise 51 | """ 52 | if not url: 53 | return False 54 | return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url 55 | 56 | 57 | @mcp.tool() 58 | def google_search( 59 | q: str, 60 | gl: str = "us", 61 | hl: str = "en", 62 | location: str | None = None, 63 | num: int | None = None, 64 | tbs: str | None = None, 65 | page: int | None = None, 66 | autocorrect: bool | None = None, 67 | ) -> Dict[str, Any]: 68 | """ 69 | Tool to perform web searches via Serper API and retrieve rich results. 70 | 71 | It is able to retrieve organic search results, people also ask, 72 | related searches, and knowledge graph. 73 | 74 | Args: 75 | q: Search query string 76 | gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us') 77 | hl: Optional language code for search results in ISO 639-1 format (e.g., 'en') 78 | location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States') 79 | num: Number of results to return (default: 10) 80 | tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 81 | 'qdr:m' for past month, 'qdr:y' for past year) 82 | page: Page number of results to return (default: 1) 83 | autocorrect: Whether to autocorrect spelling in query 84 | 85 | Returns: 86 | Dictionary containing search results and metadata. 87 | """ 88 | # Check for API key 89 | if not SERPER_API_KEY: 90 | return { 91 | "success": False, 92 | "error": "SERPER_API_KEY environment variable not set", 93 | "results": [], 94 | } 95 | 96 | # Validate required parameter 97 | if not q or not q.strip(): 98 | return { 99 | "success": False, 100 | "error": "Search query 'q' is required and cannot be empty", 101 | "results": [], 102 | } 103 | 104 | try: 105 | # Build payload with all supported parameters 106 | payload: dict[str, Any] = { 107 | "q": q.strip(), 108 | "gl": gl, 109 | "hl": hl, 110 | } 111 | 112 | # Add optional parameters if provided 113 | if location: 114 | payload["location"] = location 115 | if num is not None: 116 | payload["num"] = num 117 | else: 118 | payload["num"] = 10 # Default 119 | if tbs: 120 | payload["tbs"] = tbs 121 | if page is not None: 122 | payload["page"] = page 123 | if autocorrect is not None: 124 | payload["autocorrect"] = autocorrect 125 | 126 | # Set up headers 127 | headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"} 128 | 129 | # Make the API request 130 | response = make_serper_request(payload, headers) 131 | data = response.json() 132 | 133 | # filter out HuggingFace dataset or space urls 134 | organic_results = [] 135 | if "organic" in data: 136 | for item in data["organic"]: 137 | if _is_huggingface_dataset_or_space_url(item.get("link", "")): 138 | continue 139 | organic_results.append(item) 140 | 141 | # Keep all original fields, but overwrite "organic" 142 | response_data = dict(data) 143 | response_data["organic"] = organic_results 144 | response_data = decode_http_urls_in_dict(response_data) 145 | 146 | return response_data 147 | 148 | except Exception as e: 149 | return {"success": False, "error": f"Unexpected error: {str(e)}", "results": []} 150 | 151 | 152 | if __name__ == "__main__": 153 | mcp.run() 154 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/logging/summary_time_cost.py: -------------------------------------------------------------------------------- 1 
| # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import json 5 | from collections import defaultdict 6 | from pathlib import Path 7 | 8 | from .task_logger import logger 9 | 10 | 11 | def _get_summary_template(): 12 | """Returns a template for the summary data structure.""" 13 | return { 14 | "total_tasks": 0, 15 | "total_wall_time": 0.0, 16 | "primary_breakdown": { 17 | "main_agent": defaultdict(float), 18 | "browsing_agent": defaultdict(float), 19 | }, 20 | "cross_cutting_breakdown": defaultdict(float), 21 | "tool_workload_breakdown": defaultdict(float), 22 | } 23 | 24 | 25 | def _update_summary_data(summary_block, perf_summary, tool_workload): 26 | """Updates a summary block with data from a single result.""" 27 | summary_block["total_tasks"] += 1 28 | summary_block["total_wall_time"] += perf_summary.get("total_wall_time", 0.0) 29 | 30 | # Update primary breakdown 31 | primary_breakdown = perf_summary.get("primary_breakdown", {}) 32 | for agent, data in primary_breakdown.items(): 33 | if agent in summary_block["primary_breakdown"]: 34 | for key, value in data.items(): 35 | summary_block["primary_breakdown"][agent][key] += value 36 | 37 | # Update cross-cutting breakdown 38 | cross_cutting_breakdown = perf_summary.get("cross_cutting_breakdown", {}) 39 | for key, value in cross_cutting_breakdown.items(): 40 | summary_block["cross_cutting_breakdown"][key] += value 41 | 42 | # Update tool workload breakdown 43 | for key, value in tool_workload.items(): 44 | summary_block["tool_workload_breakdown"][key] += value 45 | 46 | 47 | def _calculate_averages(summary_block): 48 | """Calculates and adds average values to a summary block.""" 49 | num_tasks = summary_block["total_tasks"] 50 | if num_tasks == 0: 51 | return 52 | 53 | summary_block["average_wall_time"] = summary_block["total_wall_time"] / num_tasks 54 | 55 | # Calculate averages for primary breakdown 56 | for agent, data in summary_block["primary_breakdown"].items(): 57 | summary_block["primary_breakdown"][agent] = dict(data) # Convert back to dict 58 | avg_data = {f"avg_{k}": v / num_tasks for k, v in data.items()} 59 | summary_block["primary_breakdown"][agent].update(avg_data) 60 | 61 | # Calculate averages for cross-cutting breakdown 62 | summary_block["cross_cutting_breakdown"] = dict( 63 | summary_block["cross_cutting_breakdown"] 64 | ) 65 | avg_cross_cutting = { 66 | f"avg_{k}": v / num_tasks 67 | for k, v in summary_block["cross_cutting_breakdown"].items() 68 | } 69 | summary_block["cross_cutting_breakdown"].update(avg_cross_cutting) 70 | 71 | # Calculate averages for tool workload breakdown 72 | summary_block["tool_workload_breakdown"] = dict( 73 | summary_block["tool_workload_breakdown"] 74 | ) 75 | avg_tool_workload = { 76 | f"avg_{k}": v / num_tasks 77 | for k, v in summary_block["tool_workload_breakdown"].items() 78 | } 79 | summary_block["tool_workload_breakdown"].update(avg_tool_workload) 80 | 81 | 82 | def generate_summary(log_dir: Path): 83 | """ 84 | Generates a summary of benchmark results by reading log files from a directory, 85 | calculating total and average trace data, both overall and grouped by 86 | final_judge_result. 87 | 88 | Args: 89 | log_dir: The directory where the individual result log files are and where 90 | the summary file will be saved. 
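        The aggregated summary is written to log_dir / "summary_time_cost.json" with two
        top-level keys: "overall_summary" and "summary_by_final_judge_result", the latter
        grouping the same statistics by each task's final_judge_result value.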
91 | """ 92 | results = [] 93 | for log_file in log_dir.glob("*.json"): 94 | if log_file.name == "summary.json": 95 | continue 96 | try: 97 | with open(log_file, "r", encoding="utf-8") as f: 98 | results.append(json.load(f)) 99 | except json.JSONDecodeError: 100 | logger.info(f"Warning: Could not decode JSON from {log_file}. Skipping.") 101 | except Exception as e: 102 | logger.info(f"Warning: Could not read file {log_file}: {e}. Skipping.") 103 | 104 | overall_summary = _get_summary_template() 105 | summary_by_judge = defaultdict(_get_summary_template) 106 | 107 | for result in results: 108 | trace_data = result.get("trace_data") 109 | if not trace_data or "performance_summary" not in trace_data: 110 | continue 111 | 112 | perf_summary = trace_data["performance_summary"] 113 | tool_workload = trace_data.get("tool_workload_breakdown", {}) 114 | 115 | # Update overall summary 116 | _update_summary_data(overall_summary, perf_summary, tool_workload) 117 | 118 | # Update summary by judge result 119 | judge_result = result.get("final_judge_result", "unknown") 120 | _update_summary_data( 121 | summary_by_judge[judge_result], perf_summary, tool_workload 122 | ) 123 | 124 | # Calculate averages for all summary blocks 125 | _calculate_averages(overall_summary) 126 | for judge_result in summary_by_judge: 127 | _calculate_averages(summary_by_judge[judge_result]) 128 | 129 | summary_data = { 130 | "overall_summary": overall_summary, 131 | "summary_by_final_judge_result": dict(summary_by_judge), 132 | } 133 | 134 | summary_file = log_dir / "summary_time_cost.json" 135 | with open(summary_file, "w", encoding="utf-8") as f: 136 | json.dump(summary_data, f, indent=4, ensure_ascii=False) 137 | -------------------------------------------------------------------------------- /assets/LOCAL-TOOL-DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # Local Tool Deployment Guide 2 | 3 | This guide explains how to deploy open-source tools locally for use with MiroThinker. These tools are optional enhancements that can replace commercial alternatives in your agent configuration. 4 | 5 | ## Overview 6 | 7 | MiroThinker supports several optional open-source tools that you can deploy locally: 8 | 9 | - **Audio Transcription**: Whisper-Large-v3-Turbo for transcribing audio files 10 | - **Visual Question Answering**: Qwen2.5-VL-72B-Instruct for answering questions about images 11 | - **Reasoning Engine**: Qwen3-235B-A22B-Thinking-2507 for complex reasoning tasks 12 | 13 | These tools are used when you configure your agent with `tool-transcribe-os`, `tool-vqa-os`, or `tool-reasoning-os` in your agent configuration file. 14 | 15 | ## Prerequisites 16 | 17 | - **GPU**: NVIDIA GPU with sufficient VRAM 18 | - **Python 3.10+** 19 | - **CUDA**: Compatible CUDA toolkit installed 20 | - **Model Storage**: Sufficient disk space to download model checkpoints 21 | 22 | ## Tool Deployment 23 | 24 | ### 1. Audio Transcription Tool (`tool-transcribe-os`) 25 | 26 | **Model**: [Whisper-Large-v3-Turbo](https://huggingface.co/openai/whisper-large-v3-turbo) 27 | 28 | **Description**: Transcribes audio files (MP3, WAV, M4A, AAC, OGG, FLAC, WMA) to text. Supports both local files and remote URLs. 
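**Example client call** (illustrative sketch, not part of the repository): once the vLLM server described below is running, it should accept requests from any OpenAI-compatible client via the `/v1/audio/transcriptions` route. The file name `sample.mp3` and the `EMPTY` key are placeholders.

```python
# Minimal sketch of querying the locally deployed Whisper server.
# Assumes the default endpoint from the configuration below (WHISPER_BASE_URL).
from openai import OpenAI

client = OpenAI(
    base_url="http://0.0.0.0:8000/v1",  # WHISPER_BASE_URL
    api_key="EMPTY",                    # or WHISPER_API_KEY if your server enforces auth
)

with open("sample.mp3", "rb") as audio_file:  # placeholder audio file
    transcript = client.audio.transcriptions.create(
        model="whisper-large-v3-turbo",  # must match --served-model-name below
        file=audio_file,
    )

print(transcript.text)
```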
29 | 30 | **Deployment with vLLM**: 31 | 32 | ```bash 33 | # Install vLLM with audio support 34 | pip install vllm==0.10.0 35 | pip install vllm[audio] 36 | 37 | # Start the server 38 | vllm serve openai/whisper-large-v3-turbo \ 39 | --served-model-name whisper-large-v3-turbo \ 40 | --task transcription \ 41 | --host 0.0.0.0 \ 42 | --port 8000 43 | ``` 44 | 45 | **Configuration in `.env`**: 46 | 47 | ```bash 48 | WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" 49 | WHISPER_API_KEY=your_api_key # Optional, if your server requires authentication 50 | WHISPER_BASE_URL="http://0.0.0.0:8000/v1" 51 | ``` 52 | 53 | ### 2. Visual Question Answering Tool (`tool-vqa-os`) 54 | 55 | **Model**: [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) 56 | 57 | **Description**: Answers questions about images. Supports local image files and URLs. Automatically encodes local images to Base64 for API requests. Compatible with JPEG, PNG, GIF formats. 58 | 59 | **Deployment with SGLang**: 60 | 61 | ```bash 62 | # Install SGLang 63 | pip install sglang[all] 64 | 65 | # Start the server 66 | python3 -m sglang.launch_server \ 67 | --model-path Qwen/Qwen2.5-VL-72B-Instruct \ 68 | --tp 8 \ 69 | --host 0.0.0.0 \ 70 | --port 8001 \ 71 | --trust-remote-code \ 72 | --enable-metrics 73 | ``` 74 | 75 | **Configuration in `.env`**: 76 | 77 | ```bash 78 | VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" 79 | VISION_API_KEY=your_api_key # Optional, if your server requires authentication 80 | VISION_BASE_URL="http://0.0.0.0:8001/v1/chat/completions" 81 | ``` 82 | 83 | ### 3. Reasoning Engine Tool (`tool-reasoning-os`) 84 | 85 | **Model**: [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) 86 | 87 | **Description**: A reasoning service for solving complex analytical problems, such as advanced mathematics, puzzles, and riddles. Supports long-context reasoning tasks (up to 131K tokens). 88 | 89 | **Deployment with SGLang**: 90 | 91 | ```bash 92 | # Install SGLang 93 | pip install sglang[all] 94 | 95 | # Start the server 96 | python3 -m sglang.launch_server \ 97 | --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 \ 98 | --tp 8 \ 99 | --host 0.0.0.0 \ 100 | --port 8002 \ 101 | --trust-remote-code \ 102 | --context-length 131072 \ 103 | --enable-metrics 104 | ``` 105 | 106 | **Configuration in `.env`**: 107 | 108 | ```bash 109 | REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" 110 | REASONING_API_KEY=your_api_key # Optional, if your server requires authentication 111 | REASONING_BASE_URL="http://0.0.0.0:8002/v1/chat/completions" 112 | ``` 113 | 114 | ## Using Deployed Tools 115 | 116 | Once you have deployed the tools, configure your agent to use them: 117 | 118 | 1. **Edit your agent configuration** (e.g., `apps/miroflow-agent/conf/agent/my_custom_config.yaml`): 119 | 120 | ```yaml 121 | main_agent: 122 | tools: 123 | - tool-python 124 | - search_and_scrape_webpage 125 | - jina_scrape_llm_summary 126 | - tool-transcribe-os # Use local Whisper deployment 127 | - tool-vqa-os # Use local Qwen2.5-VL deployment 128 | - tool-reasoning-os # Use local Qwen3-235B deployment 129 | max_turns: 400 130 | ``` 131 | 132 | 2. **Configure environment variables** in `apps/miroflow-agent/.env` as shown in each tool's deployment section above. 133 | 134 | 1. 
**Run your agent**: 135 | 136 | ```bash 137 | cd apps/miroflow-agent 138 | uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1 139 | ``` 140 | 141 | ## Commercial Alternatives 142 | 143 | If you prefer not to deploy these tools locally, you can use commercial alternatives: 144 | 145 | - **`tool-transcribe`**: Uses OpenAI's GPT-4o mini Transcribe API 146 | - **`tool-vqa`**: Uses Claude Sonnet 3.7 API 147 | - **`tool-reasoning`**: Uses Claude Sonnet 3.7 API 148 | 149 | Simply replace `-os` versions with commercial versions in your agent configuration and configure the corresponding API keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`). 150 | 151 | ## Additional Resources 152 | 153 | - **SGLang Documentation**: [https://sglang.readthedocs.io/](https://sglang.readthedocs.io/) 154 | - **vLLM Documentation**: [https://docs.vllm.ai/](https://docs.vllm.ai/) 155 | - **Model Cards**: Check HuggingFace model pages for specific requirements and recommendations 156 | -------------------------------------------------------------------------------- /assets/MiromindAI_H.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 125 | .pixi 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | 209 | 210 | # -- ADDED -- 211 | # Log files 212 | logs/ 213 | 214 | # Data directory - exclude everything except README 215 | data/ 216 | 217 | 218 | .idea/ 219 | 220 | .DS_Store 221 | 222 | apps/collect-trace/scripts/*/*.sh -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/convert_non_oai_to_chatml.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import json 5 | import sys 6 | from pathlib import Path 7 | from typing import Any, Dict, List 8 | 9 | 10 | def convert_to_json_chatml(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]: 11 | """ 12 | Convert message list to OpenAI JSON format ChatML 13 | Filter out messages with role 'tool', convert content None to empty string 14 | """ 15 | chatml_list = [] 16 | for message in messages: 17 | role = message.get("role", "") 18 | if role == "tool": 19 | continue # Skip tool messages 20 | if role == "system": 21 | continue # Skip system messages 22 | content = message.get("content", "") 23 | if content is None: 24 | content = "" 25 | # Handle different content formats 26 | if isinstance(content, list): 27 | text_parts = [] 28 | for item in content: 29 | if isinstance(item, dict) and item.get("type") == "text": 30 | text_parts.append(item.get("text", "")) 31 | content = " ".join(text_parts) 32 | elif isinstance(content, str): 33 | pass 34 | else: 35 | content = str(content) 36 | chatml_list.append({"role": role, "content": content}) 37 | return chatml_list 38 | 39 | 40 | def extract_and_save_chat_history( 41 | log_data: Dict[str, Any], output_dir: Path, input_filename: str 42 | ): 43 | """ 44 | Extract message history from log data and save as ChatML format 45 | 46 | Args: 47 | log_data: Log data dictionary 48 | output_dir: Output directory 49 | input_filename: Input filename (without extension) 50 | """ 51 | # Ensure output directory exists 52 | output_dir.mkdir(parents=True, exist_ok=True) 53 | 54 | # 1. Extract main_agent_message_history 55 | main_agent_history = log_data.get("main_agent_message_history", {}) 56 | if main_agent_history and "message_history" in main_agent_history: 57 | main_messages = main_agent_history["message_history"] 58 | if main_messages: 59 | chatml_list = convert_to_json_chatml(main_messages) 60 | chatml_list.insert( 61 | 0, 62 | { 63 | "role": "system", 64 | "content": main_agent_history.get("system_prompt", ""), 65 | }, 66 | ) 67 | # Save main agent chat records 68 | main_output_file = output_dir / f"{input_filename}_main_agent_chatml.json" 69 | with open(main_output_file, "w", encoding="utf-8") as f: 70 | json.dump(chatml_list, f, ensure_ascii=False, indent=2) 71 | 72 | print(f"✓ Saved main agent chat records: {main_output_file}") 73 | 74 | # 2. 
Extract sub_agent_message_history_sessions 75 | sub_agent_sessions = log_data.get("sub_agent_message_history_sessions", {}) 76 | if sub_agent_sessions: 77 | for session_name, session_data in sub_agent_sessions.items(): 78 | if "message_history" in session_data: 79 | sub_agent_messages = session_data["message_history"] 80 | if sub_agent_messages: 81 | chatml_list = convert_to_json_chatml(sub_agent_messages) 82 | chatml_list.insert( 83 | 0, 84 | { 85 | "role": "system", 86 | "content": session_data.get("system_prompt", ""), 87 | }, 88 | ) 89 | 90 | # Save browser agent chat records 91 | sub_agent_output_file = ( 92 | output_dir / f"{input_filename}_{session_name}_chatml.json" 93 | ) 94 | with open(sub_agent_output_file, "w", encoding="utf-8") as f: 95 | json.dump(chatml_list, f, ensure_ascii=False, indent=2) 96 | 97 | print(f"✓ Saved sub agent chat records: {sub_agent_output_file}") 98 | 99 | 100 | def main(): 101 | """Main function""" 102 | if len(sys.argv) < 2: 103 | print("Usage: python convert_non_oai_to_chatml.py [output_dir]") 104 | print( 105 | "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json" 106 | ) 107 | print( 108 | "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chats" 109 | ) 110 | sys.exit(1) 111 | 112 | log_file_path = Path(sys.argv[1]) 113 | output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("extracted_chats") 114 | 115 | # Check if input file exists 116 | if not log_file_path.exists(): 117 | print(f"Error: Log file does not exist: {log_file_path}") 118 | sys.exit(1) 119 | 120 | try: 121 | # Read log file 122 | print(f"Reading log file: {log_file_path}") 123 | with open(log_file_path, "r", encoding="utf-8") as f: 124 | log_data = json.load(f) 125 | 126 | # Extract input filename (without extension) 127 | input_filename = log_file_path.stem 128 | 129 | # Extract and save chat history 130 | print(f"Extracting chat history to: {output_dir}") 131 | extract_and_save_chat_history(log_data, output_dir, input_filename) 132 | 133 | print("\n✓ Chat history extraction completed!") 134 | print(f"Output directory: {output_dir.absolute()}") 135 | 136 | except json.JSONDecodeError as e: 137 | print(f"Error: Cannot parse JSON file: {e}") 138 | sys.exit(1) 139 | except Exception as e: 140 | print(f"Error: {e}") 141 | sys.exit(1) 142 | 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/io/output_formatter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import re 5 | 6 | 7 | class OutputFormatter: 8 | def _extract_boxed_content(self, text: str) -> str: 9 | r""" 10 | Extract the content of the last \boxed{...} occurrence in the given text. 11 | Supports: 12 | - Arbitrary levels of nested braces 13 | - Escaped braces (\{ and \}) 14 | - Whitespace between \boxed and the opening brace 15 | - Empty content inside braces 16 | - Incomplete boxed expressions (extracts to end of string as fallback) 17 | Returns an empty string if no match is found. 
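        Examples:
            _extract_boxed_content(r"\boxed{42}") -> "42"
            _extract_boxed_content(r"\boxed{\frac{1}{2}} or \boxed{0.5}") -> "0.5"  (last occurrence wins)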
18 | """ 19 | if not text: 20 | return "" 21 | 22 | _BOXED_RE = re.compile(r"\\boxed\b", re.DOTALL) 23 | 24 | last_result = None # Track the last boxed content (complete or incomplete) 25 | i = 0 26 | n = len(text) 27 | 28 | while True: 29 | # Find the next \boxed occurrence 30 | m = _BOXED_RE.search(text, i) 31 | if not m: 32 | break 33 | j = m.end() 34 | 35 | # Skip any whitespace after \boxed 36 | while j < n and text[j].isspace(): 37 | j += 1 38 | 39 | # Require that the next character is '{' 40 | if j >= n or text[j] != "{": 41 | i = j 42 | continue 43 | 44 | # Parse the brace content manually to handle nesting and escapes 45 | depth = 0 46 | k = j 47 | escaped = False 48 | found_closing = False 49 | while k < n: 50 | ch = text[k] 51 | if escaped: 52 | escaped = False 53 | elif ch == "\\": 54 | escaped = True 55 | elif ch == "{": 56 | depth += 1 57 | elif ch == "}": 58 | depth -= 1 59 | # When depth returns to zero, the boxed content ends 60 | if depth == 0: 61 | last_result = text[j + 1 : k] 62 | i = k + 1 63 | found_closing = True 64 | break 65 | k += 1 66 | 67 | # If we didn't find a closing brace, this is an incomplete boxed 68 | # Store it as the last result (will be overwritten if we find more boxed later) 69 | if not found_closing and depth > 0: 70 | last_result = text[j + 1 : n] 71 | i = k # Continue from where we stopped 72 | elif not found_closing: 73 | i = j + 1 # Move past this invalid boxed 74 | 75 | # Return the last boxed content found (complete or incomplete) 76 | return last_result.strip() if last_result else "" 77 | 78 | def format_tool_result_for_user(self, tool_call_execution_result): 79 | """ 80 | Format tool execution results to be fed back to LLM as user messages. 81 | Only includes necessary information (results or errors). 82 | """ 83 | server_name = tool_call_execution_result["server_name"] 84 | tool_name = tool_call_execution_result["tool_name"] 85 | 86 | if "error" in tool_call_execution_result: 87 | # Provide concise error information to LLM 88 | content = f"Tool call to {tool_name} on {server_name} failed. Error: {tool_call_execution_result['error']}" 89 | elif "result" in tool_call_execution_result: 90 | # Provide the original output result of the tool 91 | content = tool_call_execution_result["result"] 92 | # Consider truncating overly long results 93 | max_len = 100_000 # 100k chars = 25k tokens 94 | if len(content) > max_len: 95 | content = content[:max_len] + "\n... [Result truncated]" 96 | else: 97 | content = f"Tool call to {tool_name} on {server_name} completed, but produced no specific output or result." 
98 | 99 | # Return format suitable as user message content 100 | # return [{"type": "text", "text": content}] 101 | return {"type": "text", "text": content} 102 | 103 | def format_final_summary_and_log(self, final_answer_text, client=None): 104 | """Format final summary information, including answers and token statistics""" 105 | summary_lines = [] 106 | summary_lines.append("\n" + "=" * 30 + " Final Answer " + "=" * 30) 107 | summary_lines.append(final_answer_text) 108 | 109 | # Extract boxed result - find the last match using safer regex patterns 110 | boxed_result = self._extract_boxed_content(final_answer_text) 111 | 112 | # Add extracted result section 113 | summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20) 114 | 115 | if boxed_result: 116 | summary_lines.append(boxed_result) 117 | elif final_answer_text: 118 | summary_lines.append("No \\boxed{} content found.") 119 | boxed_result = "No \\boxed{} content found in the final answer." 120 | 121 | # Token usage statistics and cost estimation - use client method 122 | if client and hasattr(client, "format_token_usage_summary"): 123 | token_summary_lines, log_string = client.format_token_usage_summary() 124 | summary_lines.extend(token_summary_lines) 125 | else: 126 | # If no client or client doesn't support it, use default format 127 | summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20) 128 | summary_lines.append("Token usage information not available.") 129 | summary_lines.append("-" * (40 + len(" Token Usage & Cost "))) 130 | log_string = "Token usage information not available." 131 | 132 | return "\n".join(summary_lines), boxed_result, log_string 133 | --------------------------------------------------------------------------------
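For reference, a minimal usage sketch of `OutputFormatter` from `apps/miroflow-agent/src/io/output_formatter.py` (illustrative only; the import path below is hypothetical and depends on how the `src` package is installed in your environment):

```python
# Hypothetical import path; adjust to your package layout.
from src.io.output_formatter import OutputFormatter

formatter = OutputFormatter()

# A successful tool result is passed back to the LLM as a single text block.
msg = formatter.format_tool_result_for_user(
    {"server_name": "serper-mcp-server", "tool_name": "google_search", "result": "top hits ..."}
)
assert msg == {"type": "text", "text": "top hits ..."}

# The final answer is summarized and the last \boxed{...} content is extracted.
summary, boxed, _ = formatter.format_final_summary_and_log(r"The answer is \boxed{42}.")
assert boxed == "42"
```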