├── apps ├── miroflow-agent │ ├── src │ │ ├── __init__.py │ │ ├── io │ │ │ ├── __init__.py │ │ │ └── output_formatter.py │ │ ├── config │ │ │ └── __init__.py │ │ ├── core │ │ │ └── __init__.py │ │ ├── logging │ │ │ ├── __init__.py │ │ │ └── summary_time_cost.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── wrapper_utils.py │ │ └── llm │ │ │ ├── providers │ │ │ └── __init__.py │ │ │ ├── __init__.py │ │ │ ├── util.py │ │ │ └── factory.py │ ├── benchmarks │ │ ├── __init__.py │ │ ├── evaluators │ │ │ ├── __init__.py │ │ │ └── calculate_average_score.py │ │ └── check_progress │ │ │ ├── check_progress_hle.py │ │ │ ├── check_progress_frames.py │ │ │ ├── check_progress_seal-0.py │ │ │ ├── check_progress_aime2025.py │ │ │ ├── check_progress_browsecomp.py │ │ │ ├── check_progress_hle-text-500.py │ │ │ ├── check_progress_webwalkerqa.py │ │ │ ├── check_progress_browsecomp_zh.py │ │ │ ├── check_progress_hle-text-2158.py │ │ │ ├── check_progress_xbench_deepsearch.py │ │ │ ├── check_progress_gaia-validation.py │ │ │ └── check_progress_gaia-validation-text-103.py │ ├── conf │ │ ├── __init__.py │ │ ├── llm │ │ │ ├── gpt-5.yaml │ │ │ ├── claude-3-7.yaml │ │ │ ├── qwen-3.yaml │ │ │ └── default.yaml │ │ ├── benchmark │ │ │ ├── hle.yaml │ │ │ ├── debug.yaml │ │ │ ├── seal-0.yaml │ │ │ ├── frames.yaml │ │ │ ├── aime2025.yaml │ │ │ ├── futurex.yaml │ │ │ ├── browsecomp.yaml │ │ │ ├── collect_trace.yaml │ │ │ ├── hle-text-500.yaml │ │ │ ├── webwalkerqa.yaml │ │ │ ├── hle-text-2158.yaml │ │ │ ├── browsecomp_zh.yaml │ │ │ ├── gaia-validation.yaml │ │ │ ├── xbench_deepsearch.yaml │ │ │ ├── gaia-validation-text-103.yaml │ │ │ └── default.yaml │ │ ├── config.yaml │ │ └── agent │ │ │ ├── single_agent.yaml │ │ │ ├── single_agent_keep5.yaml │ │ │ ├── default.yaml │ │ │ ├── multi_agent.yaml │ │ │ └── multi_agent_os.yaml │ ├── README.md │ ├── .env.example │ ├── main.py │ ├── pyproject.toml │ └── scripts │ │ ├── run_evaluate_multiple_runs_hle.sh │ │ ├── run_evaluate_multiple_runs_debug.sh │ │ ├── run_evaluate_multiple_runs_frames.sh │ │ ├── run_evaluate_multiple_runs_seal-0.sh │ │ ├── run_evaluate_multiple_runs_aime2025.sh │ │ ├── run_evaluate_multiple_runs_browsecomp.sh │ │ ├── run_evaluate_multiple_runs_webwalkerqa.sh │ │ ├── run_evaluate_multiple_runs_browsecomp_zh.sh │ │ ├── run_evaluate_multiple_runs_hle-text-2158.sh │ │ ├── run_evaluate_multiple_runs_hle-text-500.sh │ │ ├── run_evaluate_multiple_runs_gaia-validation.sh │ │ ├── run_evaluate_multiple_runs_xbench_deepsearch.sh │ │ ├── run_evaluate_multiple_runs_gaia-validation-text-103.sh │ │ └── run_evaluate_multiple_runs_futurex.sh ├── visualize-trace │ ├── .python-version │ ├── requirements.txt │ ├── pyproject.toml │ ├── README.md │ └── run.py ├── collect-trace │ ├── pyproject.toml │ ├── utils │ │ ├── converters │ │ │ ├── __init__.py │ │ │ ├── system_prompts.py │ │ │ ├── example_usage.py │ │ │ └── convert_non_oai_to_chatml.py │ │ ├── merge_chatml_msgs_to_one_json.py │ │ └── process_logs.py │ ├── scripts │ │ ├── collect_trace_qwen3.sh │ │ ├── collect_trace_gpt41.sh │ │ ├── collect_trace_gpt5.sh │ │ └── collect_trace_claude37.sh │ └── README.md └── gradio-demo │ ├── pyproject.toml │ ├── .env.example │ └── utils.py ├── libs └── miroflow-tools │ ├── src │ ├── __init__.py │ └── miroflow_tools │ │ ├── mcp_servers │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── url_unquote.py │ │ ├── reasoning_mcp_server.py │ │ ├── vision_mcp_server.py │ │ ├── reasoning_mcp_server_os.py │ │ ├── browser_session.py │ │ ├── reading_mcp_server.py │ │ ├── vision_mcp_server_os.py 
│ │ └── serper_mcp_server.py │ │ ├── __init__.py │ │ └── dev_mcp_servers │ │ └── stateless_python_server.py │ └── pyproject.toml ├── assets ├── futurex-09-12.png ├── gaia_text_103.png ├── miro_thinker.png ├── miroflow_logo.png ├── miromind_wechat.png ├── MiroThinker_v1.0_Technical_Report.pdf ├── QA.md ├── qwen3_nonthinking.jinja ├── LOCAL-TOOL-DEPLOYMENT.md └── MiromindAI_H.svg ├── justfile ├── LICENSE ├── .github └── workflows │ └── run-ruff.yml └── .gitignore /apps/miroflow-agent/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/io/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/config/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/logging/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/visualize-trace/.python-version: -------------------------------------------------------------------------------- 1 | 3.11 -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/evaluators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /apps/visualize-trace/requirements.txt: -------------------------------------------------------------------------------- 1 | flask==2.3.3 2 | werkzeug==2.3.7 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/__init__.py: -------------------------------------------------------------------------------- 1 | # This file makes the conf directory a Python package 2 | -------------------------------------------------------------------------------- /assets/futurex-09-12.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/futurex-09-12.png 
-------------------------------------------------------------------------------- /assets/gaia_text_103.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/gaia_text_103.png -------------------------------------------------------------------------------- /assets/miro_thinker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/miro_thinker.png -------------------------------------------------------------------------------- /assets/miroflow_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/miroflow_logo.png -------------------------------------------------------------------------------- /assets/miromind_wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/miromind_wechat.png -------------------------------------------------------------------------------- /assets/MiroThinker_v1.0_Technical_Report.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/MiroMindAI/MiroThinker/HEAD/assets/MiroThinker_v1.0_Technical_Report.pdf -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from .manager import ToolManager 5 | 6 | __all__ = ["ToolManager"] 7 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/gpt-5.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/gpt-5.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | provider: "openai" 7 | model_name: "gpt-5-2025-08-07" 8 | base_url: https://api.openai.com/v1 9 | max_context_length: 65536 10 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/claude-3-7.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/claude-3-7.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | provider: "anthropic" 7 | model_name: "claude-3-7-sonnet-20250219" 8 | base_url: https://api.anthropic.com 9 | max_context_length: 65536 10 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .url_unquote import decode_http_urls_in_dict, safe_unquote, strip_markdown_links 2 | 3 | __all__ = [ 4 | "safe_unquote", 5 | "decode_http_urls_in_dict", 6 | "strip_markdown_links", 7 | ] 8 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/qwen-3.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/qwen-3.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | provider: "qwen" 7 | model_name: "qwen-3" 8 | base_url: "https://your-api.com/v1" 9 | max_context_length: 262144 10 | max_tokens: 16384 11 | top_p: 0.95 12 | repetition_penalty: 1.05 13 | 
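Each file under `conf/llm/` (such as `gpt-5.yaml`, `claude-3-7.yaml`, and `qwen-3.yaml` above) is a Hydra config group layered on top of `conf/llm/default.yaml`, and any field can also be overridden on the command line. A minimal sketch of selecting the Qwen config and overriding its endpoint at run time, following the pattern in the README and scripts elsewhere in this tree (the URL and key are placeholders, not working values):

```bash
# Run from apps/miroflow-agent: pick the qwen-3 LLM group and override fields inline.
uv run python main.py \
  llm=qwen-3 \
  llm.base_url=https://your-api.com/v1 \
  llm.api_key=YOUR_API_KEY \
  agent=single_agent_keep5 \
  benchmark=debug
```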
-------------------------------------------------------------------------------- /apps/visualize-trace/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "trace-dashboard" 3 | version = "1.0.0" 4 | description = "A web dashboard for analyzing trace JSON files" 5 | requires-python = ">=3.8" 6 | dependencies = [ 7 | "flask>=2.3.3", 8 | "werkzeug>=2.3.7", 9 | ] 10 | 11 | [tool.uv] 12 | dev-dependencies = [] -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/providers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from .anthropic_client import AnthropicClient 5 | from .openai_client import OpenAIClient 6 | 7 | __all__ = [ 8 | "AnthropicClient", 9 | "OpenAIClient", 10 | ] 11 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/hle.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/hle.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "hle" 7 | 8 | data: 9 | data_dir: "../../data/hle" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/debug.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/debug.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "debug" 7 | 8 | data: 9 | data_dir: "../../data/debug" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/seal-0.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/seal-0.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "seal-0" 7 | 8 | data: 9 | data_dir: "../../data/seal-0" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/frames.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/frames.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "frames" 7 | 8 | data: 9 | data_dir: "../../data/frames" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/aime2025.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/aime2025.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "aime2025" 7 | 8 | data: 9 | data_dir: "../../data/aime2025" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- 
/apps/miroflow-agent/conf/benchmark/futurex.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/futurex.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "futurex" 7 | 8 | data: 9 | data_dir: "../../data/futurex" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/browsecomp.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/browsecomp.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "browsecomp" 7 | 8 | data: 9 | data_dir: "../../data/browsecomp" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/collect_trace.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/collect_trace.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "collect_trace" 7 | 8 | data: 9 | data_dir: "../../data/debug" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/hle-text-500.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/hle-text-500.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "hle-text-500" 7 | 8 | data: 9 | data_dir: "../../data/hle-text-500" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/webwalkerqa.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/webwalkerqa.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "webwalkerqa" 7 | 8 | data: 9 | data_dir: "../../data/webwalkerqa" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/hle-text-2158.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/hle-text-2158.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "hle-text-2158" 7 | 8 | data: 9 | data_dir: "../../data/hle-text-2158" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/browsecomp_zh.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/browsecomp_zh.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "browsecomp_zh" 7 | 8 | data: 9 | data_dir: "../../data/browsecomp_zh" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | 
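The benchmark configs above all follow the same shape: a `name`, a `data.data_dir`, and an `execution` block inherited from `conf/benchmark/default.yaml`. As the collect-trace scripts later in this listing show, a benchmark run is selected and tuned entirely through Hydra overrides; a sketch with illustrative values (the concurrency and model choices here are examples, not required settings):

```bash
# Run from apps/miroflow-agent: evaluate browsecomp_zh with custom execution settings.
uv run python benchmarks/common_benchmark.py \
  benchmark=browsecomp_zh \
  benchmark.execution.max_concurrent=10 \
  benchmark.execution.pass_at_k=1 \
  llm=claude-3-7 \
  agent=multi_agent
```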
-------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/gaia-validation.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/gaia-validation.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "gaia-validation" 7 | 8 | data: 9 | data_dir: "../../data/gaia-2023-validation" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/xbench_deepsearch.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/xbench_deepsearch.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "xbench_deepsearch" 7 | 8 | data: 9 | data_dir: "../../data/xbench_deepsearch" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/config.yaml: -------------------------------------------------------------------------------- 1 | # conf/config.yaml 2 | defaults: 3 | - llm: default 4 | - agent: default 5 | - benchmark: default 6 | - _self_ # Allows variables to be defined at the top of this file 7 | 8 | hydra: 9 | run: 10 | dir: ../../logs/debug 11 | 12 | # You can define some top-level or default parameters here 13 | project_name: "miroflow-agent" 14 | debug_dir: "../../logs/debug" 15 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/llm/default.yaml: -------------------------------------------------------------------------------- 1 | # conf/llm/default.yaml - Default LLM configuration 2 | provider: "anthropic" # openai, anthropic, qwen 3 | model_name: "claude-3-7-sonnet-20250219" 4 | async_client: false 5 | temperature: 0.3 6 | top_p: 1.0 7 | min_p: 0.0 8 | top_k: -1 9 | max_tokens: 4096 10 | api_key: "" 11 | base_url: https://api.anthropic.com 12 | keep_tool_result: -1 13 | repetition_penalty: 1.0 14 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/gaia-validation-text-103.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/gaia-validation-text-103.yaml 2 | defaults: 3 | - default 4 | - _self_ 5 | 6 | name: "gaia-validation-text-103" 7 | 8 | data: 9 | data_dir: "../../data/gaia-2023-validation-text-103" 10 | 11 | execution: 12 | max_tasks: null # null means no limit 13 | max_concurrent: 5 14 | pass_at_k: 1 15 | format_error_retry_limit: 0 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | from .base_client import BaseClient 5 | from .factory import ClientFactory 6 | from .providers import ( 7 | AnthropicClient, 8 | OpenAIClient, 9 | ) 10 | 11 | __all__ = [ 12 | "BaseClient", 13 | "ClientFactory", 14 | "AnthropicClient", 15 | "OpenAIClient", 16 | ] 17 | -------------------------------------------------------------------------------- /apps/collect-trace/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "collect-trace" 3 | version = "0.1.0" 4 | description = "Executes a user-defined agent loop for capturing multi-turn interaction traces" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | authors = [{ name = "MiroMind Team", email = "service@miromind.ai" }] 8 | dependencies = [ 9 | "miroflow-tools>=0.1.0", 10 | "dotenv>=0.9.9", 11 | "openai>=1.90.0", 12 | ] 13 | 14 | [tool.uv.sources] 15 | miroflow-tools = { path = "../../libs/miroflow-tools", editable = true } 16 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/single_agent.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/single_agent.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - search_and_scrape_webpage 11 | - jina_scrape_llm_summary 12 | - tool-python 13 | max_turns: 600 # Maximum number of turns for main agent execution 14 | 15 | sub_agents: 16 | 17 | keep_tool_result: -1 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/single_agent_keep5.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/single_agent_keep5.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - search_and_scrape_webpage 11 | - jina_scrape_llm_summary 12 | - tool-python 13 | max_turns: 600 # Maximum number of turns for main agent execution 14 | 15 | sub_agents: 16 | 17 | keep_tool_result: 5 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/benchmark/default.yaml: -------------------------------------------------------------------------------- 1 | # conf/benchmark/default.yaml - Default benchmark configuration 2 | # This is a base configuration for benchmarks. Specific benchmarks can override these defaults. 
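# The field_mapping block below appears to map keys in standardized_data.jsonl
# (task id, question, ground truth, optional attached file) onto the fields the
# benchmark loader expects; none of the benchmark-specific configs in this
# directory override it.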
3 | name: "default" 4 | 5 | data: 6 | metadata_file: "standardized_data.jsonl" 7 | field_mapping: 8 | task_id_field: "task_id" 9 | task_question_field: "task_question" 10 | ground_truth_field: "ground_truth" 11 | file_name_field: "file_name" 12 | 13 | execution: 14 | max_tasks: null # null means no limit 15 | max_concurrent: 5 16 | pass_at_k: 1 17 | format_error_retry_limit: 0 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/default.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/default.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | main_agent: 5 | tools: 6 | - tool-python 7 | - tool-vqa 8 | - tool-transcribe 9 | - tool-reasoning 10 | - tool-reader 11 | max_turns: 20 # Maximum number of turns for main agent execution 12 | 13 | sub_agents: 14 | agent-browsing: 15 | tools: 16 | - tool-google-search 17 | - tool-vqa 18 | - tool-reader 19 | - tool-python 20 | max_turns: 20 21 | 22 | keep_tool_result: -1 -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/multi_agent.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/multi_agent.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - tool-python 11 | - tool-vqa 12 | - tool-transcribe 13 | - tool-reasoning 14 | - tool-reader 15 | max_turns: 50 # Maximum number of turns for main agent execution 16 | 17 | sub_agents: 18 | agent-browsing: 19 | tools: 20 | - tool-google-search 21 | - tool-vqa 22 | - tool-reader 23 | - tool-python 24 | max_turns: 50 25 | 26 | keep_tool_result: -1 27 | 28 | -------------------------------------------------------------------------------- /apps/gradio-demo/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "gradio-demo" 3 | version = "0.1.0" 4 | description = "Gradio Demo" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "pydantic>=2.10.0", 9 | "python-dotenv>=1.0.0", 10 | "hydra-core>=1.3.0", 11 | "miroflow-agent", 12 | "aiohttp>=3.12.15", 13 | "gradio>=5.42.0", 14 | ] 15 | 16 | [build-system] 17 | requires = ["hatchling"] 18 | build-backend = "hatchling.build" 19 | 20 | [tool.hatch.build.targets.wheel] 21 | packages = ["./"] 22 | 23 | [tool.uv.sources] 24 | miroflow-agent = { path = "../miroflow-agent", editable = true } 25 | 26 | [dependency-groups] 27 | dev = [ 28 | "pytest>=8.4.1", 29 | "pytest-asyncio>=1.0.0", 30 | "httpx>=0.28.1", 31 | ] 32 | -------------------------------------------------------------------------------- /apps/miroflow-agent/conf/agent/multi_agent_os.yaml: -------------------------------------------------------------------------------- 1 | # conf/agent/multi_agent_os.yaml 2 | # The name of tools and sub-agents defined in: apps/miroflow-agent/src/config/settings.py 3 | # Each sub-agent prompt is written in: apps/miroflow-agent/src/utils/prompt_utils.py 4 | defaults: 5 | - default 6 | - _self_ 7 | 8 | main_agent: 9 | tools: 10 | - tool-python 11 | - tool-vqa-os 12 | - tool-transcribe-os 13 | - tool-reasoning-os 14 | - 
tool-reader 15 | max_turns: 50 # Maximum number of turns for main agent execution 16 | 17 | sub_agents: 18 | agent-browsing: 19 | tools: 20 | - tool-google-search 21 | - tool-vqa-os 22 | - tool-reader 23 | - tool-python 24 | max_turns: 50 25 | 26 | keep_tool_result: -1 27 | 28 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/util.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import asyncio 5 | import functools 6 | from typing import Awaitable, Callable, TypeVar 7 | 8 | T = TypeVar("T") 9 | 10 | 11 | def with_timeout( 12 | timeout_s: float = 300.0, 13 | ) -> Callable[[Callable[..., Awaitable[T]]], Callable[..., Awaitable[T]]]: 14 | """ 15 | Decorator: wraps any *async* function in asyncio.wait_for(). 16 | Usage: 17 | @with_timeout(20) 18 | async def create_message_foo(...): ... 19 | """ 20 | 21 | def decorator(func: Callable[..., Awaitable[T]]) -> Callable[..., Awaitable[T]]: 22 | @functools.wraps(func) 23 | async def wrapper(*args, **kwargs) -> T: 24 | return await asyncio.wait_for(func(*args, **kwargs), timeout=timeout_s) 25 | 26 | return wrapper 27 | 28 | return decorator 29 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/utils/wrapper_utils.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | 4 | class ErrorBox: 5 | def __init__(self, error_msg: str): 6 | self.error_msg = error_msg 7 | 8 | def __str__(self): 9 | return self.error_msg 10 | 11 | @staticmethod 12 | def is_error_box(something): 13 | return isinstance(something, ErrorBox) 14 | 15 | 16 | class ResponseBox: 17 | def __init__(self, response: Any, extra_info: dict = None): 18 | self.response = response 19 | self.extra_info = extra_info 20 | 21 | def __str__(self): 22 | return self.response 23 | 24 | @staticmethod 25 | def is_response_box(something): 26 | return isinstance(something, ResponseBox) 27 | 28 | def has_extra_info(self): 29 | return self.extra_info is not None 30 | 31 | def get_extra_info(self): 32 | return self.extra_info 33 | 34 | def get_response(self): 35 | return self.response 36 | -------------------------------------------------------------------------------- /justfile: -------------------------------------------------------------------------------- 1 | default: 2 | just --list 3 | 4 | # lint monorepo 5 | [group('precommit')] 6 | lint: 7 | uv tool run ruff@0.8.0 check --fix . 8 | 9 | # sort imports 10 | [group('precommit')] 11 | sort-imports: 12 | uv tool run ruff@0.8.0 check --select I --fix . 13 | 14 | # format monorepo 15 | [group('precommit')] 16 | format: 17 | uv tool run ruff@0.8.0 format . 18 | 19 | # check license 20 | [group('precommit')] 21 | check-license: 22 | uv run reuse lint 23 | 24 | # insert license for contributor 25 | insert-license: 26 | # https://reuse.readthedocs.io/en/stable/scripts.html#add-headers-to-staged-files-based-on-git-settings 27 | git diff --name-only --cached | xargs -I {} reuse annotate -c "$(git config --get user.name) <$(git config --get user.email)>" "{}" 28 | 29 | # format markdown files 30 | [group('precommit')] 31 | format-md: 32 | find . 
-name "*.md" -type f | xargs uv tool run mdformat@0.7.17 33 | 34 | # run precommit before PR 35 | [group('precommit')] 36 | precommit: lint sort-imports format-md format 37 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from .convert_non_oai_to_chatml import ( 5 | convert_to_json_chatml, 6 | extract_and_save_chat_history, 7 | ) 8 | from .convert_oai_to_chatml import ( 9 | extract_message_history_from_log, 10 | oai_tool_message_to_chat_message, 11 | process_log_file, 12 | save_chatml_to_files, 13 | ) 14 | from .convert_to_chatml_auto_batch import ( 15 | batch_process_files, 16 | determine_conversion_method, 17 | get_llm_provider, 18 | process_single_file, 19 | ) 20 | 21 | __all__ = [ 22 | # OAI conversion functions 23 | "oai_tool_message_to_chat_message", 24 | "extract_message_history_from_log", 25 | "save_chatml_to_files", 26 | "process_log_file", 27 | # Non-OAI conversion functions 28 | "convert_to_json_chatml", 29 | "extract_and_save_chat_history", 30 | # Auto batch conversion functions 31 | "get_llm_provider", 32 | "determine_conversion_method", 33 | "process_single_file", 34 | "batch_process_files", 35 | ] 36 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 MiroMind 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /.github/workflows/run-ruff.yml: -------------------------------------------------------------------------------- 1 | name: lint 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | 7 | jobs: 8 | lint: 9 | if: github.repository_owner == 'MiroMindAI' 10 | name: lint pull request 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: checkout code 14 | uses: actions/checkout@v4 15 | 16 | - name: Install uv 17 | uses: astral-sh/setup-uv@v5 18 | 19 | - name: Check static error 20 | run: | 21 | uv tool run ruff@0.8.0 check --show-fixes --output-format=github 22 | 23 | - name: Reformat code style 24 | run: | 25 | echo '## Reformat summary' >> $GITHUB_STEP_SUMMARY 26 | if diff_output="$(uv tool run ruff@0.8.0 format --diff 2>&1)"; then 27 | echo "$diff_output" 28 | echo '✅ Format check passed.' >> "$GITHUB_STEP_SUMMARY" 29 | else 30 | echo "$diff_output" 31 | echo '❌ Format issues detected.' >> "$GITHUB_STEP_SUMMARY" 32 | { 33 | echo '```diff' 34 | echo "$diff_output" 35 | echo '```' 36 | } >> "$GITHUB_STEP_SUMMARY" 37 | exit 1 38 | fi -------------------------------------------------------------------------------- /apps/miroflow-agent/src/llm/factory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | from typing import Optional 5 | 6 | from omegaconf import DictConfig, OmegaConf 7 | 8 | from ..logging.task_logger import TaskLog 9 | from .providers.anthropic_client import AnthropicClient 10 | from .providers.openai_client import OpenAIClient 11 | 12 | 13 | def ClientFactory( 14 | task_id: str, cfg: DictConfig, task_log: Optional[TaskLog] = None, **kwargs 15 | ): 16 | """ 17 | Automatically select provider and create LLM client based on configuration 18 | """ 19 | provider = cfg.llm.provider 20 | config = OmegaConf.merge(cfg, kwargs) 21 | 22 | client_creators = { 23 | "anthropic": lambda: AnthropicClient( 24 | task_id=task_id, task_log=task_log, cfg=config 25 | ), 26 | "qwen": lambda: OpenAIClient(task_id=task_id, task_log=task_log, cfg=config), 27 | "openai": lambda: OpenAIClient(task_id=task_id, task_log=task_log, cfg=config), 28 | } 29 | 30 | factory = client_creators.get(provider) 31 | if not factory: 32 | raise ValueError(f"Unsupported provider: {provider}") 33 | 34 | return factory() 35 | -------------------------------------------------------------------------------- /apps/visualize-trace/README.md: -------------------------------------------------------------------------------- 1 | # Trace Analysis Web Demo 2 | 3 | An interactive web interface for analyzing and visualizing trace JSON files. 4 | 5 | ## Installation and Running 6 | 7 | ### Method 1: Using Python (Recommended) 8 | 9 | ```bash 10 | pip install -r requirements.txt 11 | python run.py 12 | ``` 13 | 14 | The startup script will automatically check and install dependencies, then start the web application. Visit `http://127.0.0.1:5000` 15 | 16 | ### Method 2: Using uv 17 | 18 | ```bash 19 | uv run run.py 20 | ``` 21 | 22 | ## Usage 23 | 24 | 1. **Start the application**: After running, visit `http://127.0.0.1:5000` in your browser 25 | 26 | 1. **Load files**: 27 | 28 | - Select the trace JSON file to analyze from the dropdown menu in the top navigation bar 29 | - Click the "Load" button to load the file 30 | 31 | 1. 
**View analysis results**: 32 | 33 | - **Left panel**: Shows basic information, execution summary, and performance statistics 34 | - **Right panel**: Displays detailed execution flow 35 | - **Bottom panel**: Shows spans statistics and step logs statistics 36 | 37 | 1. **Interactive operations**: 38 | 39 | - Click on execution steps to expand/collapse detailed information 40 | - Use "Expand All"/"Collapse All" buttons to control all steps 41 | - Click "View Details" button to see complete message content 42 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_qwen3.sh: -------------------------------------------------------------------------------- 1 | # Get the directory where the current script is located 2 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 3 | echo "Current script directory: $SCRIPT_DIR" 4 | 5 | 6 | # Enter the apps/miroflow-agent directory 7 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 8 | echo "Target directory: $TARGET_DIR" 9 | cd $TARGET_DIR 10 | 11 | mkdir -p ../../logs 12 | LOG_DIR="../../logs/collect_trace_qwen3" 13 | echo "Log directory: $LOG_DIR" 14 | mkdir -p $LOG_DIR 15 | 16 | # Collect traces 17 | uv run python benchmarks/common_benchmark.py \ 18 | benchmark=collect_trace \ 19 | benchmark.data.data_dir="../../data/debug" \ 20 | benchmark.data.metadata_file="standardized_data.jsonl" \ 21 | llm=qwen-3 \ 22 | llm.provider=qwen \ 23 | llm.model_name=qwen-3-32b \ 24 | llm.api_key="" \ 25 | llm.base_url=https://your-api.com/v1 \ 26 | llm.async_client=true \ 27 | benchmark.execution.max_tasks=null \ 28 | benchmark.execution.max_concurrent=10 \ 29 | benchmark.execution.pass_at_k=1 \ 30 | agent=single_agent \ 31 | hydra.run.dir=$LOG_DIR \ 32 | 2>&1 | tee "$LOG_DIR/output.log" 33 | 34 | # Enter the apps/collect-trace directory 35 | TARGET_DIR="$SCRIPT_DIR/../" 36 | echo "Target directory: $TARGET_DIR" 37 | cd $TARGET_DIR 38 | 39 | # Process traces 40 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 41 | 42 | 43 | -------------------------------------------------------------------------------- /apps/miroflow-agent/README.md: -------------------------------------------------------------------------------- 1 | # MiroFlow Agent 2 | 3 | ## Quick Start 4 | 5 | The simplest way to run a case is using the default command: 6 | 7 | ```bash 8 | # Run Claude-3.7-Sonnet with single-agent configuration 9 | uv run python main.py llm=claude-3-7 agent=single_agent_keep5 benchmark=debug 10 | 11 | # Run GPT-5 with single-agent configuration 12 | uv run python main.py llm=gpt-5 agent=single_agent_keep5 benchmark=debug 13 | 14 | # Use a different benchmark configuration 15 | uv run python main.py llm=qwen-3 agent=single_agent_keep5 benchmark=debug llm.base_url= 16 | ``` 17 | 18 | This will execute the default task: "What is the title of today's arxiv paper in computer science?" 19 | 20 | ## Available Configurations 21 | 22 | - **LLM Models**: `claude-3-7`, `gpt-5`, `qwen-3` 23 | - **Agent Configs**: `single_agent`, `single_agent_keep5`, `multi_agent`, `multi_agent_os` 24 | - **Benchmark Configs**: `debug`, `browsecomp`, `frames`, etc. 25 | 26 | ### Customizing the Task 27 | 28 | To change the task description, you need to modify the `main.py` file directly: 29 | 30 | ```python 31 | # In main.py, change line 43: 32 | task_description = "Your custom task here" 33 | ``` 34 | 35 | ### Output 36 | 37 | The agent will: 38 | 39 | 1. Execute the task using available tools 40 | 1. 
Generate a final summary and boxed answer 41 | 1. Save logs to `../../logs/debug/` directory 42 | 1. Display the results in the terminal 43 | 44 | ### Troubleshooting 45 | 46 | - Make sure your API keys are set correctly 47 | - Check the logs in the `logs/debug/` directory for detailed execution information 48 | - Ensure all dependencies are installed with `uv sync` 49 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_gpt41.sh: -------------------------------------------------------------------------------- 1 | # Check if OPENAI_API_KEY is set 2 | if [ -z "$OPENAI_API_KEY" ]; then 3 | echo "Error: OPENAI_API_KEY is not set." 4 | exit 1 5 | else 6 | echo "OPENAI_API_KEY detected." 7 | fi 8 | 9 | # Get the directory where the current script is located 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | echo "Current script directory: $SCRIPT_DIR" 12 | 13 | 14 | # Enter the apps/miroflow-agent directory 15 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 16 | echo "Target directory: $TARGET_DIR" 17 | cd $TARGET_DIR 18 | 19 | mkdir -p ../../logs 20 | LOG_DIR="../../logs/collect_trace_gpt41" 21 | echo "Log directory: $LOG_DIR" 22 | mkdir -p $LOG_DIR 23 | 24 | # Collect traces 25 | uv run python benchmarks/common_benchmark.py \ 26 | benchmark=collect_trace \ 27 | benchmark.data.data_dir="../../data/debug" \ 28 | benchmark.data.metadata_file="standardized_data.jsonl" \ 29 | llm=gpt-5 \ 30 | llm.provider=openai \ 31 | llm.model_name=gpt-4.1-mini \ 32 | llm.api_key="$OPENAI_API_KEY" \ 33 | llm.base_url=https://api.openai.com/v1 \ 34 | llm.async_client=true \ 35 | benchmark.execution.max_tasks=null \ 36 | benchmark.execution.max_concurrent=10 \ 37 | benchmark.execution.pass_at_k=1 \ 38 | agent=single_agent \ 39 | hydra.run.dir=$LOG_DIR \ 40 | 2>&1 | tee "$LOG_DIR/output.log" 41 | 42 | # Enter the apps/collect-trace directory 43 | TARGET_DIR="$SCRIPT_DIR/../" 44 | echo "Target directory: $TARGET_DIR" 45 | cd $TARGET_DIR 46 | 47 | # Process traces 48 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 49 | 50 | 51 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_gpt5.sh: -------------------------------------------------------------------------------- 1 | # Check if OPENAI_API_KEY is set 2 | if [ -z "$OPENAI_API_KEY" ]; then 3 | echo "Error: OPENAI_API_KEY is not set." 4 | exit 1 5 | else 6 | echo "OPENAI_API_KEY detected." 
7 | fi 8 | 9 | # Get the directory where the current script is located 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | echo "Current script directory: $SCRIPT_DIR" 12 | 13 | 14 | # Enter the apps/miroflow-agent directory 15 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 16 | echo "Target directory: $TARGET_DIR" 17 | cd $TARGET_DIR 18 | 19 | mkdir -p ../../logs 20 | LOG_DIR="../../logs/collect_trace_gpt5" 21 | echo "Log directory: $LOG_DIR" 22 | mkdir -p $LOG_DIR 23 | 24 | # Collect traces 25 | uv run python benchmarks/common_benchmark.py \ 26 | benchmark=collect_trace \ 27 | benchmark.data.data_dir="../../data/debug" \ 28 | benchmark.data.metadata_file="standardized_data.jsonl" \ 29 | llm=gpt-5 \ 30 | llm.provider=openai \ 31 | llm.model_name=gpt-5-2025-08-07 \ 32 | llm.api_key="$OPENAI_API_KEY" \ 33 | llm.base_url=https://api.openai.com/v1 \ 34 | llm.async_client=true \ 35 | benchmark.execution.max_tasks=null \ 36 | benchmark.execution.max_concurrent=10 \ 37 | benchmark.execution.pass_at_k=1 \ 38 | agent=single_agent \ 39 | hydra.run.dir=$LOG_DIR \ 40 | 2>&1 | tee "$LOG_DIR/output.log" 41 | 42 | # Enter the apps/collect-trace directory 43 | TARGET_DIR="$SCRIPT_DIR/../" 44 | echo "Target directory: $TARGET_DIR" 45 | cd $TARGET_DIR 46 | 47 | # Process traces 48 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 49 | 50 | 51 | -------------------------------------------------------------------------------- /apps/gradio-demo/.env.example: -------------------------------------------------------------------------------- 1 | # API for Google Search (recommend) 2 | SERPER_API_KEY=your_serper_key 3 | SERPER_BASE_URL="https://google.serper.dev" 4 | 5 | # API for Web Scraping (recommend) 6 | JINA_API_KEY=your_jina_key 7 | JINA_BASE_URL="https://r.jina.ai" 8 | 9 | # API for Linux Sandbox (recommend) 10 | E2B_API_KEY=your_e2b_key 11 | 12 | # API for LLM-as-Judge (for benchmark testing) 13 | OPENAI_API_KEY=your_openai_key 14 | OPENAI_BASE_URL=https://api.openai.com/v1 15 | 16 | # API for Open-Source Audio Transcription Tool (for benchmark testing) 17 | WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" 18 | WHISPER_API_KEY=your_whisper_key 19 | WHISPER_BASE_URL="https://your_whisper_base_url/v1" 20 | 21 | # API for Open-Source VQA Tool (for benchmark testing) 22 | VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" 23 | VISION_API_KEY=your_vision_key 24 | VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions" 25 | 26 | # API for Open-Source Reasoning Tool (for benchmark testing) 27 | REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" 28 | REASONING_API_KEY=your_reasoning_key 29 | REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions" 30 | 31 | # API for Claude Sonnet 3.7 as Commercial Tools (optional) 32 | ANTHROPIC_API_KEY=your_anthropic_key 33 | ANTHROPIC_BASE_URL=https://api.anthropic.com 34 | 35 | # API for Sougou Search (optional) 36 | TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id 37 | TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key 38 | 39 | # API for Summary LLM (optional) 40 | SUMMARY_LLM_BASE_URL=your_summary_llm_base_url 41 | SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name 42 | SUMMARY_LLM_API_KEY=your_summary_llm_api_key -------------------------------------------------------------------------------- /apps/gradio-demo/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def contains_chinese(text): 5 | """ 6 | Detect if a string 
contains Chinese characters or Chinese punctuation 7 | 8 | Args: 9 | text (str): The string to detect 10 | 11 | Returns: 12 | bool: True if contains Chinese characters or punctuation, False otherwise 13 | """ 14 | # Chinese character Unicode ranges: 15 | # \u4e00-\u9fff: CJK Unified Ideographs 16 | # \u3400-\u4dbf: CJK Extension A 17 | # \uf900-\ufaff: CJK Compatibility Ideographs 18 | # \u3000-\u303f: CJK Symbols and Punctuation 19 | # \uff00-\uffef: Fullwidth ASCII, Fullwidth punctuation 20 | chinese_pattern = re.compile( 21 | r"[\u4e00-\u9fff\u3400-\u4dbf\uf900-\ufaff\u3000-\u303f\uff00-\uffef]" 22 | ) 23 | return bool(chinese_pattern.search(text)) 24 | 25 | 26 | def replace_chinese_punctuation(text): 27 | # Handle single-character replacements with translate 28 | punctuation_map = str.maketrans( 29 | { 30 | ",": ",", 31 | "。": ".", 32 | "!": "!", 33 | "?": "?", 34 | ";": ";", 35 | ":": ":", 36 | "“": '"', 37 | "”": '"', 38 | "‘": "'", 39 | "’": "'", 40 | "(": "(", 41 | ")": ")", 42 | "【": "[", 43 | "】": "]", 44 | "《": "<", 45 | "》": ">", 46 | "、": ",", 47 | "—": "-", 48 | } 49 | ) 50 | # First, replace multi-character punctuation 51 | text = text.replace("……", "...") 52 | # Then apply single-character replacements 53 | return text.translate(punctuation_map) 54 | -------------------------------------------------------------------------------- /apps/collect-trace/scripts/collect_trace_claude37.sh: -------------------------------------------------------------------------------- 1 | # Check if ANTHROPIC_API_KEY is set 2 | if [ -z "$ANTHROPIC_API_KEY" ]; then 3 | echo "Error: ANTHROPIC_API_KEY is not set." 4 | exit 1 5 | else 6 | echo "ANTHROPIC_API_KEY detected." 7 | fi 8 | 9 | # Get the directory where the current script is located 10 | SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 11 | echo "Current script directory: $SCRIPT_DIR" 12 | 13 | 14 | # Enter the apps/miroflow-agent directory 15 | TARGET_DIR="$SCRIPT_DIR/../../miroflow-agent" 16 | echo "Target directory: $TARGET_DIR" 17 | cd $TARGET_DIR 18 | 19 | mkdir -p ../../logs 20 | LOG_DIR="../../logs/collect_trace_claude37" 21 | echo "Log directory: $LOG_DIR" 22 | mkdir -p $LOG_DIR 23 | 24 | # Collect traces 25 | uv run python benchmarks/common_benchmark.py \ 26 | benchmark=collect_trace \ 27 | benchmark.data.data_dir="../../data/debug" \ 28 | benchmark.data.metadata_file="standardized_data.jsonl" \ 29 | llm=claude-3-7 \ 30 | llm.provider=anthropic \ 31 | llm.model_name=claude-3-7-sonnet-20250219 \ 32 | llm.api_key="$ANTHROPIC_API_KEY" \ 33 | llm.base_url=https://api.anthropic.com \ 34 | llm.async_client=true \ 35 | benchmark.execution.max_tasks=null \ 36 | benchmark.execution.max_concurrent=10 \ 37 | benchmark.execution.pass_at_k=1 \ 38 | agent=single_agent \ 39 | hydra.run.dir=$LOG_DIR \ 40 | 2>&1 | tee "$LOG_DIR/output.log" 41 | 42 | # Enter the apps/collect-trace directory 43 | TARGET_DIR="$SCRIPT_DIR/../" 44 | echo "Target directory: $TARGET_DIR" 45 | cd $TARGET_DIR 46 | 47 | # Process traces 48 | uv run python $TARGET_DIR/utils/process_logs.py $LOG_DIR/benchmark_results.jsonl 49 | 50 | 51 | -------------------------------------------------------------------------------- /apps/miroflow-agent/.env.example: -------------------------------------------------------------------------------- 1 | # API for Google Search (recommend) 2 | SERPER_API_KEY=your_serper_key 3 | SERPER_BASE_URL="https://google.serper.dev" 4 | 5 | # API for Web Scraping (recommend) 6 | JINA_API_KEY=your_jina_key 7 | JINA_BASE_URL="https://r.jina.ai" 8 | 
9 | # API for Linux Sandbox (recommend) 10 | E2B_API_KEY=your_e2b_key 11 | 12 | # API for LLM-as-Judge (for benchmark testing) 13 | OPENAI_API_KEY=your_openai_key 14 | OPENAI_BASE_URL=https://api.openai.com/v1 15 | 16 | # API for Open-Source Audio Transcription Tool (for benchmark testing) 17 | WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" 18 | WHISPER_API_KEY=your_whisper_key 19 | WHISPER_BASE_URL="https://your_whisper_base_url/v1" 20 | 21 | # API for Open-Source VQA Tool (for benchmark testing) 22 | VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" 23 | VISION_API_KEY=your_vision_key 24 | VISION_BASE_URL="https://your_vision_base_url/v1/chat/completions" 25 | 26 | # API for Open-Source Reasoning Tool (for benchmark testing) 27 | REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" 28 | REASONING_API_KEY=your_reasoning_key 29 | REASONING_BASE_URL="https://your_reasoning_base_url/v1/chat/completions" 30 | 31 | # API for Claude Sonnet 3.7 as Commercial Tools (optional) 32 | ANTHROPIC_API_KEY=your_anthropic_key 33 | ANTHROPIC_BASE_URL=https://api.anthropic.com 34 | 35 | # API for Sougou Search (optional) 36 | TENCENTCLOUD_SECRET_ID=your_tencent_cloud_secret_id 37 | TENCENTCLOUD_SECRET_KEY=your_tencent_cloud_secret_key 38 | 39 | # API for Summary LLM (optional) 40 | SUMMARY_LLM_BASE_URL="https://your_summary_llm_base_url/v1/chat/completions" 41 | SUMMARY_LLM_MODEL_NAME=your_summary_llm_model_name 42 | SUMMARY_LLM_API_KEY=your_summary_llm_api_key -------------------------------------------------------------------------------- /apps/miroflow-agent/main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import asyncio 5 | 6 | import hydra 7 | from omegaconf import DictConfig, OmegaConf 8 | 9 | # Import from the new modular structure 10 | from src.core.pipeline import ( 11 | create_pipeline_components, 12 | execute_task_pipeline, 13 | ) 14 | from src.logging.task_logger import bootstrap_logger 15 | 16 | # Configure logger and get the configured instance 17 | logger = bootstrap_logger() 18 | 19 | 20 | async def amain(cfg: DictConfig) -> None: 21 | """Asynchronous main function.""" 22 | 23 | logger.info(OmegaConf.to_yaml(cfg)) 24 | 25 | # Create pipeline components using the factory function 26 | main_agent_tool_manager, sub_agent_tool_managers, output_formatter = ( 27 | create_pipeline_components(cfg) 28 | ) 29 | 30 | # Define task parameters 31 | task_id = "task_example" 32 | task_description = "What is the title of today's arxiv paper in computer science?" 
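    # As the README's "Customizing the Task" section notes, edit this string to
    # change the task the agent runs, e.g.:
    # task_description = "Your custom task here"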
33 | task_file_name = "" 34 | 35 | # Execute task using the pipeline 36 | final_summary, final_boxed_answer, log_file_path = await execute_task_pipeline( 37 | cfg=cfg, 38 | task_id=task_id, 39 | task_file_name=task_file_name, 40 | task_description=task_description, 41 | main_agent_tool_manager=main_agent_tool_manager, 42 | sub_agent_tool_managers=sub_agent_tool_managers, 43 | output_formatter=output_formatter, 44 | log_dir=cfg.debug_dir, 45 | ) 46 | 47 | 48 | @hydra.main(config_path="conf", config_name="config", version_base=None) 49 | def main(cfg: DictConfig) -> None: 50 | asyncio.run(amain(cfg)) 51 | 52 | 53 | if __name__ == "__main__": 54 | main() 55 | -------------------------------------------------------------------------------- /libs/miroflow-tools/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "miroflow-tools" 3 | version = "0.1.0" 4 | description = "Tool management and MCP server utilities for MiroFlow" 5 | readme = "README.md" 6 | authors = [ 7 | { name = "MiroMind Team", email = "service@miromind.ai" } 8 | ] 9 | requires-python = ">=3.12" 10 | dependencies = [ 11 | "mcp>=1.0.0", 12 | "fastmcp>=0.1.0", 13 | "playwright>=1.40.0", 14 | "requests>=2.32.0", 15 | "e2b-code-interpreter==1.2.1", 16 | "wikipedia", 17 | "mutagen", 18 | "markitdown-mcp>=0.0.1a3", 19 | "google-genai", 20 | "aiohttp", 21 | "redis" 22 | ] 23 | 24 | [build-system] 25 | requires = ["hatchling"] 26 | build-backend = "hatchling.build" 27 | 28 | [tool.hatch.build.targets.wheel] 29 | packages = ["src/miroflow_tools"] 30 | 31 | [dependency-groups] 32 | dev = [ 33 | "pytest>=8.4.1", 34 | "pytest-asyncio>=1.0.0", 35 | "pytest-cov>=6.2.1", 36 | "pytest-html>=4.1.1", 37 | "pytest-xdist>=3.7.0", 38 | "pytest-mock>=3.10.0", 39 | "pytest-timeout>=2.1.0", 40 | "inline-snapshot>=0.23.2", 41 | ] 42 | 43 | [tool.pytest.ini_options] 44 | minversion = "8.3.5" 45 | testpaths = ["src/test"] 46 | asyncio_default_fixture_loop_scope = "function" 47 | addopts = [ 48 | "-rA", 49 | "--show-capture=stderr", 50 | "-n=auto", 51 | "--html=report.html", 52 | "--self-contained-html", 53 | "--cov=miroflow_tools", 54 | "--cov-report=html", 55 | "--strict-markers", 56 | "-v", 57 | ] 58 | markers = [ 59 | "integration: marks tests as integration tests (may be slow)", 60 | "unit: marks tests as unit tests", 61 | "slow: marks tests as slow (deselect with '-m \"not slow\"')", 62 | "requires_api_key: marks tests that require real API credentials", 63 | ] -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_hle.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "hle" 12 | BENCHMARK_NAME_STD = "HLE-2500" 13 | TASKS_PER_RUN = 2500 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_frames.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "frames" 12 | BENCHMARK_NAME_STD = "Frames" 13 | TASKS_PER_RUN = 824 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_seal-0.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "seal-0" 12 | BENCHMARK_NAME_STD = "SEAL-0" 13 | TASKS_PER_RUN = 111 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_aime2025.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "aime2025" 12 | BENCHMARK_NAME_STD = "AIME2025" 13 | TASKS_PER_RUN = 30 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "browsecomp" 12 | BENCHMARK_NAME_STD = "BrowseComp-EN" 13 | TASKS_PER_RUN = 1265 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-500.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "hle-text-500" 12 | BENCHMARK_NAME_STD = "HLE-Text-500" 13 | TASKS_PER_RUN = 500 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_webwalkerqa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "webwalkerqa" 12 | BENCHMARK_NAME_STD = "WebWalkerQA" 13 | TASKS_PER_RUN = 680 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_task_id_(\d+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_browsecomp_zh.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "browsecomp_zh" 12 | BENCHMARK_NAME_STD = "BrowseComp-ZH" 13 | TASKS_PER_RUN = 289 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_hle-text-2158.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "hle-text-2158" 12 | BENCHMARK_NAME_STD = "HLE-Text-2158" 13 | TASKS_PER_RUN = 2158 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_xbench_deepsearch.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "xbench_deepsearch" 12 | BENCHMARK_NAME_STD = "XBench-DeepSearch" 13 | TASKS_PER_RUN = 100 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([a-f0-9]+)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 
21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import os 6 | 7 | from common import GAIAProgressChecker as ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "gaia-2023-validation" 12 | BENCHMARK_NAME_STD = "GAIA-Val-165" 13 | TASKS_PER_RUN = 165 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/check_progress/check_progress_gaia-validation-text-103.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
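# Usage (run from apps/miroflow-agent): uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py <path-to-gaia-text-103-log-or-extraction-dir>
# See assets/QA.md (Q1) for the extraction and re-grading workflow that produces such a directory; the placeholder path above is illustrative.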
3 | 4 | import argparse 5 | import os 6 | 7 | from common import GAIAProgressChecker as ProgressChecker 8 | 9 | # Benchmark configuration 10 | FILENAME = os.path.basename(__file__) 11 | BENCHMARK_NAME = "gaia-2023-validation-text-103" 12 | BENCHMARK_NAME_STD = "GAIA-Text-103" 13 | TASKS_PER_RUN = 103 14 | DATA_PATH = f"../../data/{BENCHMARK_NAME}/standardized_data.jsonl" 15 | TASK_ID_PATTERN = r"task_([^_]+(?:-[^_]+)*)" 16 | 17 | 18 | def parse_args(): 19 | parser = argparse.ArgumentParser( 20 | description=f"Check progress of {BENCHMARK_NAME_STD} benchmark runs." 21 | ) 22 | parser.add_argument( 23 | "path", help=f"Path to {BENCHMARK_NAME_STD} benchmark directory" 24 | ) 25 | return parser.parse_args() 26 | 27 | 28 | if __name__ == "__main__": 29 | args = parse_args() 30 | 31 | try: 32 | # Create progress checker and run analysis 33 | checker = ProgressChecker( 34 | args.path, task_per_run=TASKS_PER_RUN, data_path=DATA_PATH 35 | ) 36 | summary = checker.run_analysis( 37 | benchmark_name_std=BENCHMARK_NAME_STD, task_id_pattern=TASK_ID_PATTERN 38 | ) 39 | # Exit with appropriate code 40 | if summary.total_tasks == 0: 41 | print("No task files found in any run directories") 42 | elif summary.total_completed == 0: 43 | print("No tasks completed yet") 44 | 45 | except FileNotFoundError as e: 46 | print(f"Error: {e}") 47 | except PermissionError as e: 48 | print(f"Error: {e}") 49 | except ValueError as e: 50 | print(f"Error: {e}") 51 | except Exception as e: 52 | print(f"Unexpected error: {e}") 53 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import logging 5 | import os 6 | 7 | from anthropic import Anthropic 8 | from fastmcp import FastMCP 9 | 10 | logger = logging.getLogger("miroflow") 11 | 12 | ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") 13 | ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") 14 | 15 | # Initialize FastMCP server 16 | mcp = FastMCP("reasoning-mcp-server") 17 | 18 | 19 | @mcp.tool() 20 | async def reasoning(question: str) -> str: 21 | """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts. 22 | DO NOT use this tool for simple and obvious question. 23 | 24 | Args: 25 | question: The hard question. 26 | 27 | Returns: 28 | The answer to the question. 
29 | """ 30 | messages_for_llm = [ 31 | { 32 | "role": "user", 33 | "content": [ 34 | { 35 | "type": "text", 36 | "text": question, 37 | } 38 | ], 39 | } 40 | ] 41 | 42 | client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL) 43 | response = client.messages.create( 44 | model="claude-3-7-sonnet-20250219", 45 | max_tokens=21000, 46 | thinking={ 47 | "type": "enabled", 48 | "budget_tokens": 19000, 49 | }, 50 | messages=messages_for_llm, 51 | stream=False, 52 | ) 53 | 54 | try: 55 | return response.content[-1].text 56 | except Exception: 57 | logger.info("Reasoning Error: only thinking content is returned") 58 | return response.content[-1].thinking 59 | 60 | 61 | if __name__ == "__main__": 62 | mcp.run(transport="stdio") 63 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/dev_mcp_servers/stateless_python_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import os 5 | 6 | from e2b_code_interpreter import Sandbox 7 | from mcp.server.fastmcp import FastMCP 8 | 9 | # Initialize FastMCP server 10 | mcp = FastMCP("stateless-python-server") 11 | 12 | # API keys 13 | E2B_API_KEY = os.environ.get("E2B_API_KEY") 14 | 15 | # DEFAULT CONFS 16 | DEFAULT_TIMEOUT = 300 # seconds 17 | 18 | 19 | @mcp.tool() 20 | async def python(code: str) -> str: 21 | """Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files). 22 | When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output. 23 | IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time. 24 | 25 | Args: 26 | code: The python code to run. 27 | 28 | Returns: 29 | A string containing the execution result including stdout and stderr. 30 | """ 31 | sandbox = Sandbox.create( 32 | timeout=DEFAULT_TIMEOUT, api_key=E2B_API_KEY, template="1av7fdjfvcparqo8efq6" 33 | ) 34 | 35 | max_attempts = 2 36 | for attempt in range(1, max_attempts + 1): 37 | try: 38 | execution = sandbox.run_code(code) 39 | break 40 | except Exception as e: 41 | if attempt == max_attempts: 42 | raise e 43 | execution = sandbox.run_code(code) 44 | 45 | sandbox.kill() 46 | 47 | return str(execution) 48 | 49 | 50 | if __name__ == "__main__": 51 | mcp.run(transport="stdio") 52 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/merge_chatml_msgs_to_one_json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
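# Usage (run from apps/collect-trace): uv run utils/merge_chatml_msgs_to_one_json.py --input_dir <dir-with-per-task-chatml-json-files>
# Merges all *main_agent*.json and *agent-browsing*.json files found in that directory into main_agent_merged.json and agent-browsing_merged.json; the placeholder directory is illustrative.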
3 | 4 | import argparse 5 | import glob 6 | import json 7 | import os 8 | 9 | 10 | def merge_json_files(input_dir, type="main"): 11 | # List to store all messages 12 | all_conversations = [] 13 | 14 | # Get all JSON files matching the pattern 15 | json_files = glob.glob(os.path.join(input_dir, f"*{type}*.json")) 16 | 17 | # Read each JSON file and merge its content 18 | for json_file in json_files: 19 | try: 20 | with open(json_file, "r", encoding="utf-8") as f: 21 | data = json.load(f) 22 | conversation = { 23 | "messages": data, 24 | } 25 | all_conversations.append(conversation) 26 | print(f"Successfully processed: {json_file}") 27 | except Exception as e: 28 | print(f"Error processing {json_file}: {str(e)}") 29 | 30 | output_file = os.path.join(input_dir, f"{type}_merged.json") 31 | # Write the merged data to a new JSON file 32 | with open(output_file, "w", encoding="utf-8") as f: 33 | json.dump(all_conversations, f, ensure_ascii=False, indent=2) 34 | 35 | print( 36 | f"\nMerging complete! All {type} JSON files have been merged into {output_file}" 37 | ) 38 | print(f"Total number of files processed: {len(json_files)}") 39 | print(f"Total number of messages: {len(all_conversations)}") 40 | 41 | 42 | def main(): 43 | parser = argparse.ArgumentParser( 44 | description="Merge multiple JSON files which contain chat messages into a single file" 45 | ) 46 | parser.add_argument( 47 | "--input_dir", 48 | type=str, 49 | required=True, 50 | help="File pattern with wildcards to match JSON files (e.g., '*.json' or 'data/*main*.json')", 51 | ) 52 | 53 | args = parser.parse_args() 54 | 55 | merge_json_files(args.input_dir, type="main_agent") 56 | merge_json_files(args.input_dir, type="agent-browsing") 57 | 58 | 59 | if __name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /assets/QA.md: -------------------------------------------------------------------------------- 1 | # MiroFlow QA Documentation 2 | 3 | ## Q1: Can I extract GAIA-Text-103 results from existing GAIA-Validation evaluations? 4 | 5 | **Answer:** Yes! If you have completed GAIA-Validation evaluations, you can extract and re-grade the GAIA-Text-103 subset using our specialized tools. 6 | 7 | ### Step-by-Step Process 8 | 9 | 1. **Extract GAIA-Text-103 Tasks** 10 | 11 | ```bash 12 | # Extract text-103 tasks to a separate directory 13 | uv run benchmarks/subset_extraction/gaia-to-text-103-mover.py ../../logs/gaia-validation/0806/qwen_MiroThinker-32B-SFT_evaluation 14 | ``` 15 | 16 | This creates a new directory: `gaia-text-103-extraction/qwen_MiroThinker-32B-SFT_evaluation` 17 | 18 | 1. **Re-grade with GAIA-Text-103 Evaluator** 19 | 20 | ```bash 21 | # Apply GAIA-Text-103 specific grading 22 | uv run benchmarks/subset_extraction/gaia-text-103-grader.py ../../logs/gaia-validation/0806/gaia-text-103-extraction 23 | ``` 24 | 25 | 1. **Verify Results** 26 | 27 | ```bash 28 | # Check accuracy and generate statistics 29 | uv run benchmarks/check_progress/check_progress_gaia-validation-text-103.py ../../logs/gaia-validation/0806/gaia-text-103-extraction 30 | ``` 31 | 32 | ## Q2: Does the choice of judgment model affect evaluation performance? 33 | 34 | **Answer:** Yes, there is a measurable difference in evaluation outcomes between the two judgment models. 
35 | 36 | We have standardized on GPT-4.1-2025-04-14 as our primary judgment model for several practical reasons: 37 | 38 | - **Ease of deployment:** No need to host additional GPU-intensive models 39 | - **Consistency:** Aligns with evaluation standards used in other benchmarks (SimpleQA, BrowseComp) 40 | - **Reproducibility:** Provides a consistent baseline for cross-evaluation comparisons 41 | 42 | ## Code Quality Checks 43 | 44 | Before submitting a pull request, ensure your code meets our quality standards: 45 | 46 | ```bash 47 | # Fix linting issues automatically 48 | uv tool run ruff@0.8.0 check --fix . 49 | 50 | # Format code according to our style guidelines 51 | uv tool run ruff@0.8.0 format . 52 | ``` 53 | 54 | ## Known Issues 55 | 56 | - The context management component that runs before summarization requires further refinement to improve accuracy and reliability; the most likely cause is inaccurate context-length estimation. 57 | -------------------------------------------------------------------------------- /apps/collect-trace/README.md: -------------------------------------------------------------------------------- 1 | # Collect Trace 2 | 3 | > TL;DR: Treat an RLVR-format dataset (Question + verifiable answer) as a benchmark. Run the evaluation pipeline; use LLM-as-judge to verify correctness; then harvest the correct interaction traces as training data (for SFT / DPO). 4 | 5 | ## 📝 Overview 6 | 7 | Collect Trace is a key component in the MiroThinker training pipeline. Instead of hand-curating training samples, it reuses RLVR datasets as test sets, and collects multi-turn interaction traces only from items judged correct. 8 | 9 | Workflow: 10 | 11 | 1. Load each RLVR item’s question and verifiable answer. 12 | 13 | 1. Run the agent in the evaluation pipeline (with tool use / browsing as needed). 14 | 15 | 1. Verify the model’s answer with an LLM-as-judge against the RLVR reference answer. 16 | 17 | 1. Only for items judged correct, collect the full multi-turn trace and convert it into SFT / DPO-ready samples. 18 | 19 | ## 🚀 Quick Start 20 | 21 | ### Prerequisites 22 | 23 | - Python 3.12+ 24 | - [uv](https://github.com/astral-sh/uv) package manager 25 | - OpenAI API key (for LLM-based validation) 26 | - RLVR dataset (JSONL; contains question and a verifiable answer) 27 | 28 | ### Installation 29 | 30 | 1. **Navigate to the collect-trace directory**: 31 | 32 | ```bash 33 | cd apps/collect-trace 34 | ``` 35 | 36 | 1. **Install dependencies**: 37 | 38 | ```bash 39 | uv sync 40 | ``` 41 | 42 | 1.
**Set up environment variables**: 43 | 44 | ```bash 45 | # Create .env if missing (safe; won't overwrite existing file) 46 | [ -f ../../apps/miroflow-agent/.env ] || cp ../../apps/miroflow-agent/.env.example ../../apps/miroflow-agent/.env 47 | # (Alternative on macOS/Linux) cp -n ../../apps/miroflow-agent/.env.example ../../apps/miroflow-agent/.env || true 48 | 49 | # Edit .env and fill in your keys 50 | # Required: OPENAI_API_KEY (for LLM as judging) 51 | # Optional: other keys for specific tools 52 | ``` 53 | 54 | ### Basic Usage 55 | 56 | Run a benchmark evaluation to collect traces: 57 | 58 | ```bash 59 | # Using Claude-3.6 for trace collection 60 | bash scripts/collect_trace_claude37.sh 61 | 62 | # Using GPT-5 for trace collection 63 | bash scripts/collect_trace_gpt5.sh 64 | 65 | # Using Qwen-3 for trace collection 66 | bash scripts/collect_trace_qwen3.sh 67 | ``` 68 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/system_prompts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | main_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.""" 5 | 6 | sub_agent_system_prompt_foreword = """In this environment you have access to a set of tools you can use to answer the user's question. \n \nYou only have access to the tools provided below. You can only use one tool per message, and will receive the result of that tool in the user's next response. You use tools step-by-step to accomplish a given task, with each tool-use informed by the result of the previous tool-use.""" 7 | 8 | system_prompt_tool_instrcutions = """# Tool-Use Formatting Instructions \n\nTool-use is formatted using XML-style tags. The tool-use is enclosed in and each parameter is similarly enclosed within its own set of tags.\n\nThe Model Context Protocol (MCP) connects to servers that provide additional tools and resources to extend your capabilities. You can use the server's tools via the `use_mcp_tool`.\n\nDescription: \nRequest to use a tool provided by a MCP server. Each MCP server can provide multiple tools with different capabilities. Tools have defined input schemas that specify required and optional parameters.\n\nParameters:\n- server_name: (required) The name of the MCP server providing the tool\n- tool_name: (required) The name of the tool to execute\n- arguments: (required) A JSON object containing the tool's input parameters, following the tool's input schema, quotes within string must be properly escaped, ensure it's valid JSON\n\nUsage:\n\nserver name here\ntool name here\n\n{\n\"param1\": \"value1\",\n\"param2\": \"value2 \\\"escaped string\\\"\"\n}\n\n\n\nImportant Notes:\n- Tool-use must be placed **at the end** of your response, **top-level**, and not nested within other tags.\n- Always adhere to this format for the tool use to ensure proper parsing and execution.\n\nString and scalar parameters should be specified as is, while lists and objects should use JSON format. Note that spaces for string values are not stripped. 
The output is not expected to be valid XML and is parsed with regular expressions.\nHere are the functions available in JSONSchema format:\n\n""" 9 | -------------------------------------------------------------------------------- /apps/miroflow-agent/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "miroflow-agent" 3 | version = "0.1.0" 4 | description = "Add your description here" 5 | readme = "README.md" 6 | requires-python = ">=3.12" 7 | dependencies = [ 8 | "miroflow-tools>=0.1.0", 9 | "huggingface-hub>=0.28.0", 10 | "requests>=2.32.3", 11 | "rich>=13.9.4", 12 | "jinja2>=3.1.4", 13 | "pillow>=11.0.0", 14 | "markdownify>=0.14.1", 15 | "duckduckgo-search>=6.3.7", 16 | "python-dotenv", 17 | "pdfminer-six", 18 | "python-pptx", 19 | "puremagic", 20 | "pydub", 21 | "SpeechRecognition", 22 | "youtube_transcript_api", 23 | "mcp", 24 | "fastmcp", 25 | "anthropic", 26 | "e2b-code-interpreter==1.2.1", 27 | "jsonlines>=4.0.0", 28 | "mammoth>=1.9.0", 29 | "numpy>=2.2.5", 30 | "ipdb>=0.13.13", 31 | "datasets>=3.5.0", 32 | "openpyxl>=3.1.5", 33 | "markitdown-mcp>=0.0.1a3", 34 | "markitdown>=0.1.1", 35 | "regex>=2024.11.6", 36 | "openai>=1.78.1", 37 | "tenacity>=9.1.2", 38 | "transformers>=4.51.3", 39 | "omegaconf>=2.3.0", 40 | "wikipedia", 41 | "mutagen", 42 | "hydra-core", 43 | "google-genai", 44 | "tiktoken>=0.9.0", 45 | "aiohttp", 46 | "colorama>=0.4.6", 47 | "json-repair>=0.49.0", 48 | "tencentcloud-sdk-python>=3.0.1451" 49 | ] 50 | 51 | [build-system] 52 | requires = ["hatchling"] 53 | build-backend = "hatchling.build" 54 | 55 | [tool.hatch.build.targets.wheel] 56 | packages = ["src"] 57 | 58 | 59 | [tool.uv.sources] 60 | miroflow-tools = { path = "../../libs/miroflow-tools", editable = true } 61 | 62 | [dependency-groups] 63 | dev = [ 64 | "inline-snapshot>=0.23.2", 65 | "pyright>=1.1.403", 66 | "pytest>=8.4.1", 67 | "pytest-asyncio>=1.0.0", 68 | "pytest-cov>=6.2.1", 69 | "pytest-html>=4.1.1", 70 | "pytest-xdist>=3.7.0", 71 | "ty>=0.0.1a14", 72 | ] 73 | 74 | [tool.pytest.ini_options] 75 | # see https://docs.pytest.org/en/stable/reference/customize.html#pyproject-toml 76 | minversion = "8.3.5" 77 | testpaths = ["tests"] 78 | # make warning go away 79 | # https://github.com/pytest-dev/pytest-asyncio/issues/924#issuecomment-2321921915 80 | asyncio_default_fixture_loop_scope = "function" 81 | addopts = [ 82 | # summary for failed AND passed tests 83 | "-rA", 84 | # only show stderr for test. stdlog can contain sensitive information 85 | "--show-capture=stderr", 86 | # use `pytest-xdist` to run tests in parallel 87 | "-n=auto", 88 | # use `pytest-html` to generate test report in html format 89 | "--html=report.html", 90 | "--self-contained-html", 91 | # use `pytest-testmon` to run tests on changed files only 92 | # "--testmon", 93 | # use `pytest-cov` to generate test coverage report 94 | "--cov=miroflow_agent", 95 | "--cov-report=html", 96 | ] 97 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
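# Exposes a visual_question_answering tool over FastMCP, backed by an Anthropic vision model; requires ANTHROPIC_API_KEY (and optionally ANTHROPIC_BASE_URL) to be set in the environment.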
3 | 4 | import base64 5 | import os 6 | 7 | from anthropic import Anthropic 8 | from fastmcp import FastMCP 9 | 10 | ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") 11 | ANTHROPIC_BASE_URL = os.environ.get("ANTHROPIC_BASE_URL", "https://api.anthropic.com") 12 | 13 | # Initialize FastMCP server 14 | mcp = FastMCP("vision-mcp-server") 15 | 16 | 17 | def guess_mime_media_type_from_extension(file_path: str) -> str: 18 | """Guess the MIME type based on the file extension.""" 19 | _, ext = os.path.splitext(file_path) 20 | ext = ext.lower() 21 | if ext in [".jpg", ".jpeg"]: 22 | return "image/jpeg" 23 | elif ext == ".png": 24 | return "image/png" 25 | elif ext == ".gif": 26 | return "image/gif" 27 | else: 28 | return "image/jpeg" # Default to JPEG if unknown 29 | 30 | 31 | @mcp.tool() 32 | async def visual_question_answering(image_path_or_url: str, question: str) -> str: 33 | """Ask question about an image or a video and get the answer with a vision language model. 34 | 35 | Args: 36 | image_path_or_url: The path of the image file locally or its URL. 37 | question: The question to ask about the image. 38 | 39 | Returns: 40 | The answer to the image-related question. 41 | """ 42 | messages_for_llm = [ 43 | { 44 | "role": "user", 45 | "content": [ 46 | { 47 | "type": "image", 48 | "source": None, 49 | }, 50 | { 51 | "type": "text", 52 | "text": question, 53 | }, 54 | ], 55 | } 56 | ] 57 | 58 | try: 59 | if os.path.exists(image_path_or_url): # Check if the file exists locally 60 | with open(image_path_or_url, "rb") as image_file: 61 | image_data = base64.b64encode(image_file.read()).decode("utf-8") 62 | messages_for_llm[0]["content"][0]["source"] = dict( 63 | type="base64", 64 | media_type=guess_mime_media_type_from_extension(image_path_or_url), 65 | data=image_data, 66 | ) 67 | else: # Otherwise, assume it's a URL 68 | messages_for_llm[0]["content"][0]["source"] = dict( 69 | type="url", url=image_path_or_url 70 | ) 71 | 72 | client = Anthropic(api_key=ANTHROPIC_API_KEY, base_url=ANTHROPIC_BASE_URL) 73 | response = client.messages.create( 74 | model="claude-3-7-sonnet-20250219", 75 | max_tokens=1024, 76 | messages=messages_for_llm, 77 | ) 78 | except Exception as e: 79 | return f"Error: {e}" 80 | 81 | try: 82 | return response.content[0].text 83 | except (AttributeError, IndexError): 84 | return str(response) 85 | 86 | 87 | if __name__ == "__main__": 88 | mcp.run(transport="stdio") 89 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/reasoning_mcp_server_os.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import logging 5 | import os 6 | import random 7 | import time 8 | 9 | import requests 10 | from fastmcp import FastMCP 11 | 12 | logger = logging.getLogger("miroflow") 13 | 14 | REASONING_API_KEY = os.environ.get("REASONING_API_KEY") 15 | REASONING_BASE_URL = os.environ.get("REASONING_BASE_URL") 16 | REASONING_MODEL_NAME = os.environ.get("REASONING_MODEL_NAME") 17 | 18 | # Initialize FastMCP server 19 | mcp = FastMCP("reasoning-mcp-server-os") 20 | 21 | # Retry configuration 22 | MAX_RETRIES = 10 23 | BACKOFF_BASE = 1.0 # initial backoff in seconds 24 | BACKOFF_MAX = 30.0 # maximum backoff in seconds 25 | 26 | 27 | def post_with_retry(url, json, headers): 28 | """Send POST request with retry and exponential backoff. 
29 | Returns response object if success, otherwise None.""" 30 |     for attempt in range(1, MAX_RETRIES + 1): 31 |         try: 32 |             resp = requests.post(url, json=json, headers=headers, timeout=600) 33 |             if resp.status_code == 200: 34 |                 return resp 35 |             else: 36 |                 logger.warning( 37 |                     f"HTTP {resp.status_code} on attempt {attempt}: {resp.text[:200]}" 38 |                 ) 39 |         except requests.exceptions.RequestException as e: 40 |             logger.warning(f"Request failed on attempt {attempt}: {e}") 41 | 42 |         # Backoff before next retry 43 |         if attempt < MAX_RETRIES: 44 |             sleep_time = min(BACKOFF_BASE * (2 ** (attempt - 1)), BACKOFF_MAX) 45 |             # Add jitter to avoid thundering herd 46 |             sleep_time *= 0.8 + 0.4 * random.random() 47 |             logger.info(f"Retrying in {sleep_time:.1f}s...") 48 |             time.sleep(sleep_time) 49 | 50 |     logger.warning(f"All {MAX_RETRIES} retries failed for {url}") 51 |     return None 52 | 53 | 54 | @mcp.tool() 55 | async def reasoning(question: str) -> str: 56 |     """You can use this tool to solve hard math problem, puzzle, riddle and IQ test question that requires a lot of chain of thought efforts. 57 |     DO NOT use this tool for simple and obvious question. 58 | 59 |     Args: 60 |         question: The hard question. 61 | 62 |     Returns: 63 |         The answer to the question. 64 |     """ 65 |     payload = { 66 |         "model": REASONING_MODEL_NAME, 67 |         "messages": [{"role": "user", "content": question}], 68 |         "temperature": 0.6, 69 |         "top_p": 0.95, 70 |     } 71 |     headers = { 72 |         "Authorization": f"Bearer {REASONING_API_KEY}", 73 |         "Content-Type": "application/json", 74 |     } 75 | 76 |     response = post_with_retry(REASONING_BASE_URL, json=payload, headers=headers) 77 |     if response is None: 78 |         return "Reasoning service unavailable. Please try again later." 79 | 80 |     json_response = response.json() 81 |     try: 82 |         content = json_response["choices"][0]["message"]["content"] 83 |         if "</think>" in content: 84 |             content = content.split("</think>", 1)[1].strip() 85 |         return content 86 |     except Exception: 87 |         logger.info("Reasoning Error: only thinking content is returned") 88 |         return json_response["choices"][0]["message"]["reasoning_content"] 89 | 90 | 91 | if __name__ == "__main__": 92 |     mcp.run(transport="stdio") 93 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="hle" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..."
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/hle \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_debug.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-1} 9 | BENCHMARK_NAME="debug" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-1} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/debug \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_frames.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="frames" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/frames \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_seal-0.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="seal-0" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 
22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/seal-0 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/browser_session.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
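# Maintains a persistent Playwright MCP client session (over stdio or SSE) so that successive browser tool calls, such as browser_navigate followed by browser_snapshot, share the same browser state.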
3 | 4 | import asyncio 5 | import json 6 | import logging 7 | 8 | from mcp import StdioServerParameters 9 | from mcp.client.session import ClientSession 10 | from mcp.client.sse import sse_client 11 | from mcp.client.stdio import stdio_client 12 | 13 | logger = logging.getLogger("miroflow") 14 | 15 | 16 | class PlaywrightSession: 17 | """Class to maintain a persistent Playwright MCP session.""" 18 | 19 | def __init__(self, server_params): 20 | self.server_params = server_params 21 | self.read = None 22 | self.write = None 23 | self.session = None 24 | self._client = None 25 | 26 | async def connect(self): 27 | """Connect to the MCP server and initialize the session.""" 28 | if self.session is None: 29 | if isinstance(self.server_params, StdioServerParameters): 30 | self._client = stdio_client(self.server_params) 31 | else: 32 | self._client = sse_client(self.server_params) 33 | self.read, self.write = await self._client.__aenter__() 34 | self.session = ClientSession(self.read, self.write, sampling_callback=None) 35 | await self.session.__aenter__() 36 | await self.session.initialize() 37 | logger.info("Connected to MCP server and initialized session") 38 | 39 | async def call_tool(self, tool_name, arguments=None): 40 | """Call a tool while maintaining the session.""" 41 | if self.session is None: 42 | await self.connect() 43 | 44 | logger.info(f"Calling tool '{tool_name}'") 45 | tool_result = await self.session.call_tool(tool_name, arguments=arguments) 46 | result_content = tool_result.content[0].text if tool_result.content else "" 47 | return result_content 48 | 49 | async def close(self): 50 | """Close the session and connection.""" 51 | if self.session: 52 | await self.session.__aexit__(None, None, None) 53 | self.session = None 54 | 55 | if self._client: 56 | await self._client.__aexit__(None, None, None) 57 | self._client = None 58 | self.read = None 59 | self.write = None 60 | logger.info("Closed MCP session") 61 | 62 | 63 | # Example usage: 64 | async def test_persistent_session(): 65 | # Create a persistent session 66 | mcp_session = PlaywrightSession("http://localhost:8931") 67 | 68 | try: 69 | # First call: Navigate to a website 70 | await mcp_session.call_tool("browser_navigate", {"url": "https://example.com"}) 71 | logger.info("Navigation complete") 72 | 73 | # Wait a moment for the page to load 74 | await asyncio.sleep(2) 75 | 76 | # Second call: Take a snapshot of the current page 77 | snapshot_result = await mcp_session.call_tool("browser_snapshot", {}) 78 | 79 | # Process and save the snapshot 80 | snapshot_json = json.loads(snapshot_result) 81 | logger.info(f"Snapshot taken of page: {snapshot_json.get('url')}") 82 | logger.info(f"Page title: {snapshot_json.get('title')}") 83 | 84 | with open("snapshot.json", "w") as f: 85 | json.dump(snapshot_json, f, indent=2, ensure_ascii=False) 86 | 87 | logger.info("Snapshot saved to snapshot.json") 88 | 89 | finally: 90 | # Close the session when done with all tool calls 91 | await mcp_session.close() 92 | 93 | 94 | if __name__ == "__main__": 95 | asyncio.run(test_persistent_session()) 96 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_aime2025.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | 
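# These configuration values (and LLM_MODEL / BASE_URL above) can be overridden via environment variables; an illustrative invocation from apps/miroflow-agent:
#   NUM_RUNS=8 MAX_CONCURRENT=5 LLM_MODEL=<model-name> BASE_URL=<inference-endpoint> bash scripts/run_evaluate_multiple_runs_aime2025.sh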
NUM_RUNS=${NUM_RUNS:-32} 9 | BENCHMARK_NAME="aime2025" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/aime2025 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
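# Illustrative invocation (not part of the original script): every parameter at
# the top of this script is an environment override with a default, and the
# relative benchmarks/ and ../../data paths assume the apps/miroflow-agent
# directory as the working directory. A typical launch might therefore look
# like the sketch below; the model name, endpoint, and key are placeholders.
#
#   NUM_RUNS=4 LLM_MODEL="my-mirothinker-checkpoint" \
#   BASE_URL="https://my-endpoint.example.com/v1" API_KEY="sk-..." \
#   bash scripts/run_evaluate_multiple_runs_aime2025.sh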
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="browsecomp" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/browsecomp \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_webwalkerqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="webwalkerqa" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/webwalkerqa \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_browsecomp_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="browsecomp_zh" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/browsecomp_zh \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 
92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-2158.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="hle-text-2158" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data_original.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/hle-text-2158 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 
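# The averaging step below scans ${RESULTS_DIR}/run_*/ for files named
# benchmark_results_pass_at_<k>_accuracy.txt (see
# benchmarks/evaluators/calculate_average_score.py), prints the per-run scores,
# and writes a summary file; with the default PASS_AT_K=1 that summary would be
# ${RESULTS_DIR}/average_scores_pass_at_1.txt.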
88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_hle-text-500.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-3} 9 | BENCHMARK_NAME="hle-text-500" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data_original.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/hle-text-500 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 
84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="gaia-validation" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/gaia-2023-validation \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 
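# Note on the per-run status check inside the launch loop above: because the
# benchmark command is piped into tee, the `$?` that follows reflects tee's
# exit status rather than the Python process. A minimal sketch of one way to
# capture the real status (assuming bash) would be:
#
#   uv run python benchmarks/common_benchmark.py ... 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log"
#   status=${PIPESTATUS[0]}   # exit code of the python command, not tee
#   if [ "$status" -eq 0 ]; then echo "Run $i completed successfully"; fi
#
# Enabling `set -o pipefail` near the top of the script is an alternative.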
78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_xbench_deepsearch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="xbench_deepsearch" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/xbench_deepsearch \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 
69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_gaia-validation-text-103.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="gaia-validation-text-103" 10 | LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/gaia-2023-validation-text-103 \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? 
-eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | echo "Calculating average scores..." 88 | uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | 90 | echo "==========================================" 91 | echo "Multiple runs evaluation completed!" 92 | echo "Check results in: $RESULTS_DIR" 93 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 94 | echo "==========================================" 95 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/process_logs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import json 6 | import os 7 | import shutil 8 | 9 | 10 | def get_successful_log_paths(jsonl_file_path: str) -> list: 11 | """ 12 | Collects the paths of successful log files from a dataset. 13 | 14 | This function extracts log file paths of successful records based on 15 | the value of `final_judge_result`. If the dataset has been fully 16 | processed, it reads from a `benchmark_results.jsonl` file. Otherwise, 17 | if processing was interrupted, it falls back to scanning individual 18 | `.json` files in the given directory. 19 | 20 | Success is determined by: 21 | - `PASS_AT_K_SUCCESS` for records in JSONL files. 22 | - `CORRECT` for records in individual JSON files. 23 | 24 | Args: 25 | jsonl_file_path (str): Path to a JSONL file or a directory of JSON files. 26 | 27 | Returns: 28 | list: A list of log file paths for successful records. 
29 | """ 30 | log_paths = [] 31 | 32 | if jsonl_file_path.endswith(".jsonl"): 33 | with open(jsonl_file_path, "r", encoding="utf-8") as f: 34 | for line in f: 35 | line = line.strip() 36 | if line: 37 | try: 38 | data = json.loads(line) 39 | if data.get("final_judge_result") == "PASS_AT_K_SUCCESS": 40 | log_path = data.get("log_file_path") 41 | if log_path: 42 | log_paths.append(log_path) 43 | except json.JSONDecodeError: 44 | continue 45 | else: 46 | filenames = os.listdir(jsonl_file_path) 47 | filenames = [filename for filename in filenames if filename.endswith(".json")] 48 | for filename in filenames: 49 | filepath = os.path.join(jsonl_file_path, filename) 50 | try: 51 | data = json.load(open(filepath, "r")) 52 | except Exception: 53 | continue 54 | try: 55 | final_judge_result = data["final_judge_result"] 56 | except KeyError: 57 | print(data.keys()) 58 | continue 59 | if final_judge_result == "CORRECT": 60 | log_paths.append(filepath) 61 | 62 | return log_paths 63 | 64 | 65 | # Usage example 66 | if __name__ == "__main__": 67 | parser = argparse.ArgumentParser( 68 | description="Extract successful log paths from JSONL file" 69 | ) 70 | parser.add_argument( 71 | "file_path", help="Path to the JSONL file containing benchmark results" 72 | ) 73 | args = parser.parse_args() 74 | 75 | result = get_successful_log_paths(args.file_path) 76 | 77 | # Get the parent directory of args.file_path 78 | parent_dir = os.path.abspath(os.path.dirname(args.file_path)) 79 | 80 | # Create successful logs directory 81 | success_log_dir = parent_dir + "/successful_logs" 82 | success_chatml_log_dir = parent_dir + "/successful_chatml_logs" 83 | os.makedirs(success_log_dir, exist_ok=True) 84 | print(f"Successful logs directory: {success_log_dir}") 85 | 86 | for i, path in enumerate(result, 1): 87 | basename = os.path.basename(path) 88 | print(f"Copying file: {path} to {success_log_dir}/{basename}") 89 | shutil.copy(path, f"{success_log_dir}/{basename}") 90 | 91 | os.system( 92 | f"uv run utils/converters/convert_to_chatml_auto_batch.py {success_log_dir}/*.json -o {success_chatml_log_dir}" 93 | ) 94 | os.system( 95 | f"uv run utils/merge_chatml_msgs_to_one_json.py --input_dir {success_chatml_log_dir}" 96 | ) 97 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/reading_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import argparse 5 | import logging 6 | import sys 7 | 8 | from fastmcp import FastMCP 9 | from mcp import ClientSession, StdioServerParameters 10 | from mcp.client.stdio import stdio_client 11 | 12 | logger = logging.getLogger("miroflow") 13 | 14 | # Initialize FastMCP server 15 | mcp = FastMCP("reading-mcp-server") 16 | 17 | 18 | @mcp.tool() 19 | async def convert_to_markdown(uri: str) -> str: 20 | """Convert various types of resources (doc, ppt, pdf, excel, csv, zip file etc.) 21 | described by an file: or data: URI to markdown. 22 | 23 | Args: 24 | uri: Required. The URI of the resource to convert. Need to start with 'file:' or 'data:' schemes. 25 | 26 | Returns: 27 | str: The converted markdown content, or an error message if conversion fails. 28 | """ 29 | if not uri or not uri.strip(): 30 | return "Error: URI parameter is required and cannot be empty." 
31 | 32 | # Validate URI scheme 33 | valid_schemes = ["http:", "https:", "file:", "data:"] 34 | if not any(uri.lower().startswith(scheme) for scheme in valid_schemes): 35 | return f"Error: Invalid URI scheme. Supported schemes are: {', '.join(valid_schemes)}" 36 | 37 | tool_name = "convert_to_markdown" 38 | arguments = {"uri": uri} 39 | 40 | server_params = StdioServerParameters( 41 | command=sys.executable, 42 | args=["-m", "markitdown_mcp"], 43 | ) 44 | 45 | result_content = "" 46 | try: 47 | async with stdio_client(server_params) as (read, write): 48 | async with ClientSession(read, write, sampling_callback=None) as session: 49 | await session.initialize() 50 | try: 51 | tool_result = await session.call_tool( 52 | tool_name, arguments=arguments 53 | ) 54 | result_content = ( 55 | tool_result.content[-1].text if tool_result.content else "" 56 | ) 57 | except Exception as tool_error: 58 | logger.info(f"Tool execution error: {tool_error}") 59 | return f"Error: Tool execution failed: {str(tool_error)}" 60 | except Exception as session_error: 61 | logger.info(f"Session error: {session_error}") 62 | return ( 63 | f"Error: Failed to connect to markitdown-mcp server: {str(session_error)}" 64 | ) 65 | 66 | return result_content 67 | 68 | 69 | if __name__ == "__main__": 70 | # Set up argument parser 71 | parser = argparse.ArgumentParser(description="Reading MCP Server") 72 | parser.add_argument( 73 | "--transport", 74 | choices=["stdio", "http"], 75 | default="stdio", 76 | help="Transport method: 'stdio' or 'http' (default: stdio)", 77 | ) 78 | parser.add_argument( 79 | "--port", 80 | type=int, 81 | default=8080, 82 | help="Port to use when running with HTTP transport (default: 8080)", 83 | ) 84 | parser.add_argument( 85 | "--path", 86 | type=str, 87 | default="/mcp", 88 | help="URL path to use when running with HTTP transport (default: /mcp)", 89 | ) 90 | 91 | # Parse command line arguments 92 | args = parser.parse_args() 93 | 94 | # Run the server with the specified transport method 95 | if args.transport == "stdio": 96 | mcp.run(transport="stdio") 97 | else: 98 | # For HTTP transport, include port and path options 99 | mcp.run(transport="streamable-http", port=args.port, path=args.path) 100 | -------------------------------------------------------------------------------- /apps/visualize-trace/run.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2025 MiroMind 3 | # This source code is licensed under the MIT License. 
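# Illustrative usage (not part of the original file): the entry point below
# checks that Flask is available, offers to install dependencies via `uv sync`
# (falling back to pip), and then serves the trace viewer. Assuming the
# visualize-trace directory as the working directory, a typical launch would be:
#
#   uv run python run.py              # default port 5000
#   uv run python run.py --port 8000  # custom port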
4 | 5 | import os 6 | import subprocess 7 | import sys 8 | 9 | 10 | def check_dependencies(): 11 | """Check if dependencies are installed""" 12 | try: 13 | import importlib.util 14 | 15 | if importlib.util.find_spec("flask") is not None: 16 | print("✓ Flask is installed") 17 | return True 18 | else: 19 | raise ImportError("Flask not found") 20 | except ImportError: 21 | print("✗ Flask is not installed") 22 | print("Please use the following commands to install dependencies:") 23 | print(" uv sync") 24 | print("or:") 25 | print(" uv pip install -r requirements.txt") 26 | return False 27 | 28 | 29 | def install_dependencies(): 30 | """Install dependencies (recommended to use uv)""" 31 | print("Installing dependencies...") 32 | try: 33 | # Try using uv first 34 | try: 35 | subprocess.check_call(["uv", "sync"]) 36 | print("✓ Dependencies installed successfully using uv") 37 | return True 38 | except (subprocess.CalledProcessError, FileNotFoundError): 39 | # Fallback to pip 40 | subprocess.check_call( 41 | [sys.executable, "-m", "pip", "install", "-r", "requirements.txt"] 42 | ) 43 | print("✓ Dependencies installed successfully using pip") 44 | return True 45 | except subprocess.CalledProcessError: 46 | print("✗ Failed to install dependencies") 47 | print("Please manually run: uv sync or pip install -r requirements.txt") 48 | return False 49 | 50 | 51 | def main(): 52 | """Main function""" 53 | import argparse 54 | 55 | # Parse command line arguments 56 | parser = argparse.ArgumentParser(description="Trace Analysis Web Demo") 57 | parser.add_argument( 58 | "-p", 59 | "--port", 60 | type=int, 61 | default=5000, 62 | help="Specify port number (default: 5000)", 63 | ) 64 | args = parser.parse_args() 65 | 66 | print("=" * 50) 67 | print("Trace Analysis Web Demo") 68 | print("=" * 50) 69 | 70 | # Check dependencies 71 | if not check_dependencies(): 72 | print("\nInstalling dependencies...") 73 | if not install_dependencies(): 74 | print( 75 | "Please manually install dependencies: pip install -r requirements.txt" 76 | ) 77 | return 78 | 79 | # Check JSON files 80 | parent_dir = os.path.dirname(os.path.abspath(__file__)) 81 | json_files = [ 82 | f for f in os.listdir(os.path.join(parent_dir, "..")) if f.endswith(".json") 83 | ] 84 | 85 | if not json_files: 86 | print("\nWarning: No JSON files found in parent directory") 87 | print("Please ensure trace JSON files are in the trace_analyze/ directory") 88 | else: 89 | print(f"\nFound {len(json_files)} JSON files:") 90 | for file in json_files[:5]: # Only show first 5 91 | print(f" - {file}") 92 | if len(json_files) > 5: 93 | print(f" ... and {len(json_files) - 5} other files") 94 | 95 | # Start application 96 | print("\nStarting web application...") 97 | print(f"Application will run at http://localhost:{args.port}") 98 | print("Press Ctrl+C to stop the application") 99 | print("=" * 50) 100 | 101 | try: 102 | from app import app 103 | 104 | app.run(debug=True, host="0.0.0.0", port=args.port) 105 | except KeyboardInterrupt: 106 | print("\nApplication stopped") 107 | except Exception as e: 108 | print(f"\nFailed to start application: {e}") 109 | 110 | 111 | if __name__ == "__main__": 112 | main() 113 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/vision_mcp_server_os.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
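# Configuration note (illustrative, not part of the original file): the module
# reads its endpoint settings from the environment at import time, and the
# request below posts an OpenAI-style chat payload directly to VISION_BASE_URL,
# so that variable should hold the full completions endpoint. The values shown
# are placeholders, not defaults shipped with the repository.
#
#   export VISION_API_KEY="sk-..."
#   export VISION_BASE_URL="https://your-endpoint.example.com/v1/chat/completions"
#   export VISION_MODEL_NAME="your-vision-model"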
3 | 4 | import base64 5 | import os 6 | 7 | import aiohttp 8 | import requests 9 | from fastmcp import FastMCP 10 | 11 | VISION_API_KEY = os.environ.get("VISION_API_KEY") 12 | VISION_BASE_URL = os.environ.get("VISION_BASE_URL") 13 | VISION_MODEL_NAME = os.environ.get("VISION_MODEL_NAME") 14 | 15 | # Initialize FastMCP server 16 | mcp = FastMCP("vision-mcp-server-os") 17 | 18 | 19 | def guess_mime_media_type_from_extension(file_path: str) -> str: 20 | """Guess the MIME type based on the file extension.""" 21 | _, ext = os.path.splitext(file_path) 22 | ext = ext.lower() 23 | if ext in [".jpg", ".jpeg"]: 24 | return "image/jpeg" 25 | elif ext == ".png": 26 | return "image/png" 27 | elif ext == ".gif": 28 | return "image/gif" 29 | else: 30 | return "image/jpeg" # Default to JPEG if unknown 31 | 32 | 33 | @mcp.tool() 34 | async def visual_question_answering(image_path_or_url: str, question: str) -> str: 35 | """Ask question about an image or a video and get the answer with a vision language model. 36 | 37 | Args: 38 | image_path_or_url: The path of the image file locally or its URL. 39 | question: The question to ask about the image. 40 | 41 | Returns: 42 | The answer to the image-related question. 43 | """ 44 | messages_for_llm = [ 45 | { 46 | "role": "user", 47 | "content": [ 48 | {"type": "image_url", "image_url": {"url": None}}, 49 | { 50 | "type": "text", 51 | "text": question, 52 | }, 53 | ], 54 | } 55 | ] 56 | 57 | headers = { 58 | "Authorization": f"Bearer {VISION_API_KEY}", 59 | "Content-Type": "application/json", 60 | } 61 | 62 | try: 63 | if os.path.exists(image_path_or_url): # Check if the file exists locally 64 | with open(image_path_or_url, "rb") as image_file: 65 | image_data = base64.b64encode(image_file.read()).decode("utf-8") 66 | mime_type = guess_mime_media_type_from_extension(image_path_or_url) 67 | messages_for_llm[0]["content"][0]["image_url"]["url"] = ( 68 | f"data:{mime_type};base64,{image_data}" 69 | ) 70 | elif image_path_or_url.startswith(("http://", "https://")): 71 | async with aiohttp.ClientSession() as session: 72 | async with session.get(image_path_or_url) as resp: 73 | if resp.status == 200: 74 | image_bytes = await resp.read() 75 | mime_type = resp.headers.get( 76 | "Content-Type", "image/png" 77 | ) # fallback MIME type 78 | image_data = base64.b64encode(image_bytes).decode("utf-8") 79 | messages_for_llm[0]["content"][0]["image_url"]["url"] = ( 80 | f"data:{mime_type};base64,{image_data}" 81 | ) 82 | else: 83 | return f"Failed to fetch image from URL: {image_path_or_url}" 84 | else: 85 | messages_for_llm[0]["content"][0]["image_url"]["url"] = image_path_or_url 86 | 87 | payload = {"model": VISION_MODEL_NAME, "messages": messages_for_llm} 88 | 89 | response = requests.post(VISION_BASE_URL, json=payload, headers=headers) 90 | 91 | except Exception as e: 92 | return f"Error: {e}" 93 | 94 | try: 95 | return response.json()["choices"][0]["message"]["content"] 96 | except (AttributeError, IndexError): 97 | return response.json() 98 | 99 | 100 | if __name__ == "__main__": 101 | mcp.run(transport="stdio") 102 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/utils/url_unquote.py: -------------------------------------------------------------------------------- 1 | import re 2 | from urllib.parse import unquote 3 | 4 | from markdown_it import MarkdownIt 5 | 6 | # Reserved character encodings to be protected -> temporary placeholders 7 | PROTECT = { 8 | "%2F": "__SLASH__", 9 | "%2f": 
"__SLASH__", 10 | "%3F": "__QMARK__", 11 | "%3f": "__QMARK__", 12 | "%23": "__HASH__", 13 | "%26": "__AMP__", 14 | "%3D": "__EQUAL__", 15 | "%20": "__SPACE__", 16 | "%2B": "__PLUS__", 17 | "%25": "__PERCENT__", 18 | } 19 | 20 | # Reverse mapping: placeholder -> original %xx (use uppercase for uniform output) 21 | RESTORE = {v: k.upper() for k, v in PROTECT.items()} 22 | 23 | 24 | def safe_unquote(s: str, encoding="utf-8", errors="ignore") -> str: 25 | # 1. Replace with placeholders 26 | for k, v in PROTECT.items(): 27 | s = s.replace(k, v) 28 | # 2. Decode (only affects unprotected parts, e.g., Chinese characters) 29 | s = unquote(s, encoding=encoding, errors=errors) 30 | # 3. Replace placeholders back to original %xx 31 | for v, k in RESTORE.items(): 32 | s = s.replace(v, k) 33 | return s 34 | 35 | 36 | def decode_http_urls_in_dict(data): 37 | """ 38 | Traverse all values in the data structure: 39 | - If it's a string starting with http, apply urllib.parse.unquote 40 | - If it's a list, recursively process each element 41 | - If it's a dict, recursively process each value 42 | - Other types remain unchanged 43 | """ 44 | if isinstance(data, str): 45 | if "%" in data: 46 | return safe_unquote(data) 47 | else: 48 | return data 49 | elif isinstance(data, list): 50 | return [decode_http_urls_in_dict(item) for item in data] 51 | elif isinstance(data, dict): 52 | return {key: decode_http_urls_in_dict(value) for key, value in data.items()} 53 | else: 54 | return data 55 | 56 | 57 | md = MarkdownIt("commonmark") 58 | 59 | 60 | def strip_markdown_links(markdown: str) -> str: 61 | tokens = md.parse(markdown) 62 | 63 | def render(ts): 64 | out = [] 65 | for tok in ts: 66 | t = tok.type 67 | 68 | # 1) Links: drop the wrapper, keep inner text (children will be rendered) 69 | if t == "link_open" or t == "link_close": 70 | continue 71 | 72 | # 2) Images: skip the entire image block 73 | if t == "image": 74 | continue 75 | 76 | # 3) Line breaks and block closings 77 | if t == "softbreak": # inline single line break 78 | out.append("\n") 79 | continue 80 | if ( 81 | t == "hardbreak" 82 | ): # explicit line break (two spaces + newline in Markdown) 83 | out.append("\n") 84 | continue 85 | if t in ("paragraph_close", "heading_close", "blockquote_close"): 86 | out.append("\n\n") 87 | continue 88 | if t in ("list_item_close", "bullet_list_close", "ordered_list_close"): 89 | out.append("\n") 90 | continue 91 | if t == "hr": 92 | out.append("\n\n") 93 | continue 94 | 95 | # 4) Inline or nested tokens 96 | if tok.children: 97 | out.append(render(tok.children)) 98 | continue 99 | 100 | # Preserve inline code style 101 | if t == "code_inline": 102 | out.append(f"`{tok.content}`") 103 | else: 104 | out.append(tok.content or "") 105 | 106 | return "".join(out) 107 | 108 | text = render(tokens) 109 | 110 | # normalize excessive blank lines (avoid more than 2 consecutive newlines) 111 | text = re.sub(r"\n{3,}", "\n\n", text).rstrip() + "\n" 112 | 113 | return text.strip() 114 | -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/example_usage.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 
3 | 4 | import json 5 | import os 6 | import sys 7 | import tempfile 8 | from pathlib import Path 9 | 10 | # Add parent directory to Python path 11 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..", "..")) 12 | 13 | from utils.converters import ( 14 | extract_and_save_chat_history, 15 | extract_message_history_from_log, 16 | ) 17 | 18 | 19 | def example_1_basic_conversion(): 20 | """Example 1: Basic conversion using Python API""" 21 | print("=== Example 1: Basic Conversion ===") 22 | 23 | # Sample log data 24 | log_data = { 25 | "main_agent_message_history": { 26 | "system_prompt": "You are a helpful assistant.", 27 | "message_history": [ 28 | { 29 | "role": "developer", 30 | "content": [ 31 | {"type": "text", "text": "You are a helpful assistant."} 32 | ], 33 | }, 34 | { 35 | "role": "user", 36 | "content": [{"type": "text", "text": "Hello, how are you?"}], 37 | }, 38 | { 39 | "role": "assistant", 40 | "content": [{"type": "text", "text": "I'm doing well, thank you!"}], 41 | }, 42 | ], 43 | }, 44 | "browser_agent_message_history_sessions": { 45 | "browser_agent_1": { 46 | "system_prompt": "You are a browsing agent.", 47 | "message_history": [ 48 | { 49 | "role": "developer", 50 | "content": [ 51 | {"type": "text", "text": "You are a browsing agent."} 52 | ], 53 | }, 54 | { 55 | "role": "user", 56 | "content": [{"type": "text", "text": "Search for something"}], 57 | }, 58 | { 59 | "role": "assistant", 60 | "content": [{"type": "text", "text": "I found it."}], 61 | }, 62 | ], 63 | } 64 | }, 65 | "env_info": {"llm_provider": "openai"}, 66 | } 67 | 68 | # Convert using OAI method 69 | chatml_data = extract_message_history_from_log(log_data) 70 | print( 71 | f"OAI conversion result: {len(chatml_data['main_agent'])} messages in main agent" 72 | ) 73 | print( 74 | f"OAI conversion result: {len(chatml_data['browser_agents']['browser_agent_1'])} messages in browser agent" 75 | ) 76 | 77 | # Convert using Non-OAI method 78 | with tempfile.TemporaryDirectory() as temp_dir: 79 | temp_path = Path(temp_dir) 80 | extract_and_save_chat_history(log_data, temp_path, "example") 81 | 82 | # Check generated files 83 | main_file = temp_path / "example_main_agent_chatml.json" 84 | browser_file = temp_path / "example_browser_agent_1_chatml.json" 85 | 86 | if main_file.exists(): 87 | with open(main_file, "r") as f: 88 | main_content = json.load(f) 89 | print( 90 | f"Non-OAI conversion result: {len(main_content)} messages in main agent" 91 | ) 92 | 93 | if browser_file.exists(): 94 | with open(browser_file, "r") as f: 95 | browser_content = json.load(f) 96 | print( 97 | f"Non-OAI conversion result: {len(browser_content)} messages in browser agent" 98 | ) 99 | 100 | 101 | if __name__ == "__main__": 102 | print("ChatML Conversion Utilities - Usage Examples") 103 | print("=" * 50) 104 | 105 | example_1_basic_conversion() 106 | 107 | print("\n" + "=" * 50) 108 | print("Examples completed successfully!") 109 | print("\nFor more information, see the README.md file.") 110 | -------------------------------------------------------------------------------- /apps/miroflow-agent/scripts/run_evaluate_multiple_runs_futurex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Parse environment variables, use defaults if not set 4 | LLM_MODEL=${LLM_MODEL:-"MiroThinker-Models"} 5 | BASE_URL=${BASE_URL:-"https://your-api.com/v1"} 6 | 7 | # Configuration parameters 8 | NUM_RUNS=${NUM_RUNS:-8} 9 | BENCHMARK_NAME="futurex" 10 | 
LLM_PROVIDER=${LLM_PROVIDER:-"qwen"} 11 | AGENT_SET=${AGENT_SET:-"single_agent_keep5"} 12 | MAX_CONTEXT_LENGTH=${MAX_CONTEXT_LENGTH:-262144} 13 | MAX_CONCURRENT=${MAX_CONCURRENT:-10} 14 | PASS_AT_K=${PASS_AT_K:-1} 15 | TEMPERATURE=${TEMPERATURE:-1.0} 16 | API_KEY=${API_KEY:-"xxx"} 17 | 18 | # Set results directory 19 | RESULTS_DIR="../../logs/${BENCHMARK_NAME}/$(date +%m%d)/${LLM_PROVIDER}_${LLM_MODEL}_${AGENT_SET}" 20 | 21 | echo "Starting $NUM_RUNS runs of the evaluation..." 22 | echo "Results will be saved in: $RESULTS_DIR" 23 | 24 | # Create results directory 25 | mkdir -p "$RESULTS_DIR" 26 | 27 | # Launch all parallel tasks 28 | for i in $(seq 1 $NUM_RUNS); do 29 | echo "==========================================" 30 | echo "Launching experiment $i/$NUM_RUNS" 31 | echo "Output log: please view $RESULTS_DIR/run_${i}_output.log" 32 | echo "==========================================" 33 | 34 | # Set specific identifier for this run 35 | RUN_ID="run_$i" 36 | 37 | # Run experiment (background execution) 38 | ( 39 | uv run python benchmarks/common_benchmark.py \ 40 | benchmark=$BENCHMARK_NAME \ 41 | benchmark.data.metadata_file="standardized_data_250924_250930.jsonl" \ 42 | llm=qwen-3 \ 43 | llm.provider=$LLM_PROVIDER \ 44 | llm.model_name=$LLM_MODEL \ 45 | llm.base_url=$BASE_URL \ 46 | llm.async_client=true \ 47 | llm.temperature=$TEMPERATURE \ 48 | llm.max_context_length=$MAX_CONTEXT_LENGTH \ 49 | llm.api_key=$API_KEY \ 50 | benchmark.execution.max_tasks=null \ 51 | benchmark.execution.max_concurrent=$MAX_CONCURRENT \ 52 | benchmark.execution.pass_at_k=$PASS_AT_K \ 53 | benchmark.data.data_dir=../../data/futurex \ 54 | agent=$AGENT_SET \ 55 | hydra.run.dir=${RESULTS_DIR}/$RUN_ID \ 56 | 2>&1 | tee "$RESULTS_DIR/${RUN_ID}_output.log" 57 | 58 | # Check if run was successful 59 | if [ $? -eq 0 ]; then 60 | echo "Run $i completed successfully" 61 | RESULT_FILE=$(find "${RESULTS_DIR}/$RUN_ID" -name "*accuracy.txt" 2>/dev/null | head -1) 62 | if [ -f "$RESULT_FILE" ]; then 63 | echo "Results saved to $RESULT_FILE" 64 | else 65 | echo "Warning: Result file not found for run $i" 66 | fi 67 | else 68 | echo "Run $i failed!" 69 | fi 70 | ) & 71 | 72 | # Small delay between launches to avoid simultaneous requests 73 | sleep 2 74 | done 75 | 76 | echo "All $NUM_RUNS runs have been launched in parallel" 77 | echo "Waiting for all runs to complete..." 78 | 79 | # Wait for all background tasks to complete 80 | wait 81 | 82 | echo "==========================================" 83 | echo "All $NUM_RUNS runs completed!" 84 | echo "==========================================" 85 | 86 | # Calculate average scores 87 | # echo "Calculating average scores..." 88 | # uv run python benchmarks/evaluators/calculate_average_score.py "$RESULTS_DIR" 89 | echo "Extracting predictions and formatting for FutureX submission..." 90 | uv run python benchmarks/evaluators/extract_futurex_results.py "$RESULTS_DIR" 91 | 92 | # Check status and provide user-friendly message 93 | if [ $? -eq 0 ]; then 94 | echo "✅ Submission file generated: $RESULTS_DIR/futurex_submission.jsonl" 95 | echo "You can now upload this file to the FutureX test server." 96 | else 97 | echo "❌ Failed to generate submission file. Please check the logs for details." 98 | fi 99 | 100 | echo "==========================================" 101 | echo "Multiple runs evaluation completed!" 
102 | echo "Check results in: $RESULTS_DIR" 103 | echo "Check individual run logs: $RESULTS_DIR/run_*_output.log" 104 | echo "==========================================" 105 | -------------------------------------------------------------------------------- /assets/qwen3_nonthinking.jinja: -------------------------------------------------------------------------------- 1 | {%- if tools %} 2 | {{- '<|im_start|>system\n' }} 3 | {%- if messages[0].role == 'system' %} 4 | {{- messages[0].content + '\n\n' }} 5 | {%- endif %} 6 | {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n" }} 7 | {%- for tool in tools %} 8 | {{- "\n" }} 9 | {{- tool | tojson }} 10 | {%- endfor %} 11 | {{- "\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\"name\": , \"arguments\": }\n<|im_end|>\n" }} 12 | {%- else %} 13 | {%- if messages[0].role == 'system' %} 14 | {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }} 15 | {%- endif %} 16 | {%- endif %} 17 | {%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} 18 | {%- for message in messages[::-1] %} 19 | {%- set index = (messages|length - 1) - loop.index0 %} 20 | {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('') and message.content.endswith('')) %} 21 | {%- set ns.multi_step_tool = false %} 22 | {%- set ns.last_query_index = index %} 23 | {%- endif %} 24 | {%- endfor %} 25 | {%- for message in messages %} 26 | {%- if (message.role == "user") or (message.role == "system" and not loop.first) %} 27 | {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} 28 | {%- elif message.role == "assistant" %} 29 | {%- set content = message.content %} 30 | {%- set reasoning_content = '' %} 31 | {%- if message.reasoning_content is defined and message.reasoning_content is not none %} 32 | {%- set reasoning_content = message.reasoning_content %} 33 | {%- else %} 34 | {%- if '' in message.content %} 35 | {%- set content = message.content.split('')[-1].lstrip('\n') %} 36 | {%- set reasoning_content = message.content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} 37 | {%- endif %} 38 | {%- endif %} 39 | {%- if loop.index0 > ns.last_query_index %} 40 | {%- if loop.last or (not loop.last and reasoning_content) %} 41 | {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} 42 | {%- else %} 43 | {{- '<|im_start|>' + message.role + '\n' + content }} 44 | {%- endif %} 45 | {%- else %} 46 | {{- '<|im_start|>' + message.role + '\n' + content }} 47 | {%- endif %} 48 | {%- if message.tool_calls %} 49 | {%- for tool_call in message.tool_calls %} 50 | {%- if (loop.first and content) or (not loop.first) %} 51 | {{- '\n' }} 52 | {%- endif %} 53 | {%- if tool_call.function %} 54 | {%- set tool_call = tool_call.function %} 55 | {%- endif %} 56 | {{- '\n{"name": "' }} 57 | {{- tool_call.name }} 58 | {{- '", "arguments": ' }} 59 | {%- if tool_call.arguments is string %} 60 | {{- tool_call.arguments }} 61 | {%- else %} 62 | {{- tool_call.arguments | tojson }} 63 | {%- endif %} 64 | {{- '}\n' }} 65 | {%- endfor %} 66 | {%- endif %} 67 | {{- '<|im_end|>\n' }} 68 | {%- elif message.role == "tool" %} 69 | {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %} 70 | {{- '<|im_start|>user' }} 71 | {%- endif %} 72 | {{- '\n\n' }} 73 | {{- message.content }} 74 | {{- '\n' }} 
75 | {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %} 76 | {{- '<|im_end|>\n' }} 77 | {%- endif %} 78 | {%- endif %} 79 | {%- endfor %} 80 | {%- if add_generation_prompt %} 81 | {{- '<|im_start|>assistant\n<think>\n\n</think>\n\n' }} 82 | {%- endif %} -------------------------------------------------------------------------------- /apps/miroflow-agent/benchmarks/evaluators/calculate_average_score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # Copyright (c) 2025 MiroMind 3 | # This source code is licensed under the MIT License. 4 | 5 | import glob 6 | import os 7 | import re 8 | import statistics 9 | import sys 10 | 11 | 12 | def detect_pass_at_k(results_dir: str) -> tuple: 13 | """Detect the pass_at_k value used in the results directory""" 14 | 15 | # Find all possible pass_at_k files 16 | pattern = os.path.join( 17 | results_dir, "run_*", "benchmark_results_pass_at_*_accuracy.txt" 18 | ) 19 | all_files = glob.glob(pattern) 20 | 21 | if not all_files: 22 | print(f"No accuracy files found in {results_dir}") 23 | print(f"Expected pattern: {pattern}") 24 | return None, [] 25 | 26 | # Extract pass_at_k value from the first file 27 | filename = os.path.basename(all_files[0]) 28 | match = re.search(r"pass_at_(\d+)_accuracy\.txt", filename) 29 | 30 | if not match: 31 | print(f"Cannot extract pass_at_k from filename: {filename}") 32 | return None, [] 33 | 34 | k = int(match.group(1)) 35 | 36 | # Get all files with this k value 37 | accuracy_files = glob.glob( 38 | os.path.join( 39 | results_dir, "run_*", f"benchmark_results_pass_at_{k}_accuracy.txt" 40 | ) 41 | ) 42 | 43 | return k, accuracy_files 44 | 45 | 46 | def calculate_average_scores(results_dir: str) -> dict: 47 | """Calculate average scores from multiple runs - automatically detect pass_at_k value""" 48 | 49 | # Detect pass_at_k value and corresponding files 50 | pass_at_k, accuracy_files = detect_pass_at_k(results_dir) 51 | 52 | if pass_at_k is None: 53 | return None 54 | 55 | print(f"Detected pass_at_{pass_at_k} files") 56 | print(f"Found {len(accuracy_files)} accuracy files") 57 | 58 | scores = [] 59 | 60 | # Read each accuracy file 61 | for i, file_path in enumerate(sorted(accuracy_files), 1): 62 | try: 63 | with open(file_path, "r") as f: 64 | content = f.read().strip() 65 | # Remove percentage sign and convert to float 66 | score = float(content.replace("%", "")) 67 | scores.append(score) 68 | print(f"Run {i}: {score:.2f}%") 69 | except Exception as e: 70 | print(f"Error reading {file_path}: {e}") 71 | continue 72 | 73 | if not scores: 74 | print("No valid scores found") 75 | return None 76 | 77 | # Calculate statistics 78 | stats = { 79 | "pass_at_k": pass_at_k, 80 | "num_runs": len(scores), 81 | "individual_scores": scores, 82 | "average_score": statistics.mean(scores), 83 | "std_dev": statistics.stdev(scores) if len(scores) > 1 else 0, 84 | "min_score": min(scores), 85 | "max_score": max(scores), 86 | } 87 | 88 | return stats 89 | 90 | 91 | def print_results(stats: dict): 92 | """Print results""" 93 | print("\n" + "=" * 50) 94 | print("EVALUATION RESULTS") 95 | print("=" * 50) 96 | 97 | print(f"Pass@{stats['pass_at_k']} Results:") 98 | print(f"Number of runs: {stats['num_runs']}") 99 | print(f"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}") 100 | print() 101 | print(f"Standard deviation: {stats['std_dev']:.2f}%") 102 | print(f"Min score: {stats['min_score']:.2f}%") 103 | print(f"Max score: {stats['max_score']:.2f}%") 104 |
print(f"Average score: {stats['average_score']:.2f}%") 105 | print("=" * 50) 106 | 107 | 108 | def main(): 109 | if len(sys.argv) < 2: 110 | print("Usage: python calculate_average_score.py ") 111 | print("Example: python calculate_average_score.py logs/gaia-validation/mytest") 112 | sys.exit(1) 113 | 114 | results_dir = sys.argv[1] 115 | 116 | if not os.path.exists(results_dir): 117 | print(f"Results directory does not exist: {results_dir}") 118 | sys.exit(1) 119 | 120 | print(f"Analyzing results from: {results_dir}") 121 | 122 | stats = calculate_average_scores(results_dir) 123 | 124 | if stats: 125 | print_results(stats) 126 | 127 | # Save simple statistics results 128 | output_file = os.path.join( 129 | results_dir, f"average_scores_pass_at_{stats['pass_at_k']}.txt" 130 | ) 131 | with open(output_file, "w") as f: 132 | f.write("EVALUATION RESULTS\n") 133 | f.write("=" * 50 + "\n") 134 | f.write(f"Pass@{stats['pass_at_k']} Results:\n") 135 | f.write(f"Number of runs: {stats['num_runs']}\n") 136 | f.write( 137 | f"Individual scores: {[f'{s:.2f}%' for s in stats['individual_scores']]}\n" 138 | ) 139 | f.write(f"Standard deviation: {stats['std_dev']:.2f}%\n") 140 | f.write(f"Min score: {stats['min_score']:.2f}%\n") 141 | f.write(f"Max score: {stats['max_score']:.2f}%\n") 142 | f.write(f"Average score: {stats['average_score']:.2f}%\n") 143 | f.write("=" * 50 + "\n") 144 | 145 | print(f"\nResults saved to: {output_file}") 146 | else: 147 | print("Failed to calculate statistics") 148 | sys.exit(1) 149 | 150 | 151 | if __name__ == "__main__": 152 | main() 153 | -------------------------------------------------------------------------------- /libs/miroflow-tools/src/miroflow_tools/mcp_servers/serper_mcp_server.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | """ 5 | adapted from 6 | https://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1 7 | """ 8 | 9 | import os 10 | from typing import Any, Dict 11 | 12 | import requests 13 | from mcp.server.fastmcp import FastMCP 14 | from tenacity import ( 15 | retry, 16 | retry_if_exception_type, 17 | stop_after_attempt, 18 | wait_exponential, 19 | ) 20 | 21 | from .utils import decode_http_urls_in_dict 22 | 23 | SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev") 24 | SERPER_API_KEY = os.getenv("SERPER_API_KEY", "") 25 | 26 | # Initialize FastMCP server 27 | mcp = FastMCP("serper-mcp-server") 28 | 29 | 30 | @retry( 31 | stop=stop_after_attempt(3), 32 | wait=wait_exponential(multiplier=1, min=4, max=10), 33 | retry=retry_if_exception_type( 34 | (requests.ConnectionError, requests.Timeout, requests.HTTPError) 35 | ), 36 | ) 37 | def make_serper_request( 38 | payload: Dict[str, Any], headers: Dict[str, str] 39 | ) -> requests.Response: 40 | """Make HTTP request to Serper API with retry logic.""" 41 | response = requests.post(f"{SERPER_BASE_URL}/search", json=payload, headers=headers) 42 | response.raise_for_status() 43 | return response 44 | 45 | 46 | def _is_huggingface_dataset_or_space_url(url): 47 | """ 48 | Check if the URL is a HuggingFace dataset or space URL. 
49 | :param url: The URL to check 50 | :return: True if it's a HuggingFace dataset or space URL, False otherwise 51 | """ 52 | if not url: 53 | return False 54 | return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url 55 | 56 | 57 | @mcp.tool() 58 | def google_search( 59 | q: str, 60 | gl: str = "us", 61 | hl: str = "en", 62 | location: str | None = None, 63 | num: int | None = None, 64 | tbs: str | None = None, 65 | page: int | None = None, 66 | autocorrect: bool | None = None, 67 | ) -> Dict[str, Any]: 68 | """ 69 | Tool to perform web searches via Serper API and retrieve rich results. 70 | 71 | It is able to retrieve organic search results, people also ask, 72 | related searches, and knowledge graph. 73 | 74 | Args: 75 | q: Search query string 76 | gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us') 77 | hl: Optional language code for search results in ISO 639-1 format (e.g., 'en') 78 | location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States') 79 | num: Number of results to return (default: 10) 80 | tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week, 81 | 'qdr:m' for past month, 'qdr:y' for past year) 82 | page: Page number of results to return (default: 1) 83 | autocorrect: Whether to autocorrect spelling in query 84 | 85 | Returns: 86 | Dictionary containing search results and metadata. 87 | """ 88 | # Check for API key 89 | if not SERPER_API_KEY: 90 | return { 91 | "success": False, 92 | "error": "SERPER_API_KEY environment variable not set", 93 | "results": [], 94 | } 95 | 96 | # Validate required parameter 97 | if not q or not q.strip(): 98 | return { 99 | "success": False, 100 | "error": "Search query 'q' is required and cannot be empty", 101 | "results": [], 102 | } 103 | 104 | try: 105 | # Build payload with all supported parameters 106 | payload: dict[str, Any] = { 107 | "q": q.strip(), 108 | "gl": gl, 109 | "hl": hl, 110 | } 111 | 112 | # Add optional parameters if provided 113 | if location: 114 | payload["location"] = location 115 | if num is not None: 116 | payload["num"] = num 117 | else: 118 | payload["num"] = 10 # Default 119 | if tbs: 120 | payload["tbs"] = tbs 121 | if page is not None: 122 | payload["page"] = page 123 | if autocorrect is not None: 124 | payload["autocorrect"] = autocorrect 125 | 126 | # Set up headers 127 | headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"} 128 | 129 | # Make the API request 130 | response = make_serper_request(payload, headers) 131 | data = response.json() 132 | 133 | # filter out HuggingFace dataset or space urls 134 | organic_results = [] 135 | if "organic" in data: 136 | for item in data["organic"]: 137 | if _is_huggingface_dataset_or_space_url(item.get("link", "")): 138 | continue 139 | organic_results.append(item) 140 | 141 | # Keep all original fields, but overwrite "organic" 142 | response_data = dict(data) 143 | response_data["organic"] = organic_results 144 | response_data = decode_http_urls_in_dict(response_data) 145 | 146 | return response_data 147 | 148 | except Exception as e: 149 | return {"success": False, "error": f"Unexpected error: {str(e)}", "results": []} 150 | 151 | 152 | if __name__ == "__main__": 153 | mcp.run() 154 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/logging/summary_time_cost.py: -------------------------------------------------------------------------------- 1 
| # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import json 5 | from collections import defaultdict 6 | from pathlib import Path 7 | 8 | from .task_logger import logger 9 | 10 | 11 | def _get_summary_template(): 12 | """Returns a template for the summary data structure.""" 13 | return { 14 | "total_tasks": 0, 15 | "total_wall_time": 0.0, 16 | "primary_breakdown": { 17 | "main_agent": defaultdict(float), 18 | "browsing_agent": defaultdict(float), 19 | }, 20 | "cross_cutting_breakdown": defaultdict(float), 21 | "tool_workload_breakdown": defaultdict(float), 22 | } 23 | 24 | 25 | def _update_summary_data(summary_block, perf_summary, tool_workload): 26 | """Updates a summary block with data from a single result.""" 27 | summary_block["total_tasks"] += 1 28 | summary_block["total_wall_time"] += perf_summary.get("total_wall_time", 0.0) 29 | 30 | # Update primary breakdown 31 | primary_breakdown = perf_summary.get("primary_breakdown", {}) 32 | for agent, data in primary_breakdown.items(): 33 | if agent in summary_block["primary_breakdown"]: 34 | for key, value in data.items(): 35 | summary_block["primary_breakdown"][agent][key] += value 36 | 37 | # Update cross-cutting breakdown 38 | cross_cutting_breakdown = perf_summary.get("cross_cutting_breakdown", {}) 39 | for key, value in cross_cutting_breakdown.items(): 40 | summary_block["cross_cutting_breakdown"][key] += value 41 | 42 | # Update tool workload breakdown 43 | for key, value in tool_workload.items(): 44 | summary_block["tool_workload_breakdown"][key] += value 45 | 46 | 47 | def _calculate_averages(summary_block): 48 | """Calculates and adds average values to a summary block.""" 49 | num_tasks = summary_block["total_tasks"] 50 | if num_tasks == 0: 51 | return 52 | 53 | summary_block["average_wall_time"] = summary_block["total_wall_time"] / num_tasks 54 | 55 | # Calculate averages for primary breakdown 56 | for agent, data in summary_block["primary_breakdown"].items(): 57 | summary_block["primary_breakdown"][agent] = dict(data) # Convert back to dict 58 | avg_data = {f"avg_{k}": v / num_tasks for k, v in data.items()} 59 | summary_block["primary_breakdown"][agent].update(avg_data) 60 | 61 | # Calculate averages for cross-cutting breakdown 62 | summary_block["cross_cutting_breakdown"] = dict( 63 | summary_block["cross_cutting_breakdown"] 64 | ) 65 | avg_cross_cutting = { 66 | f"avg_{k}": v / num_tasks 67 | for k, v in summary_block["cross_cutting_breakdown"].items() 68 | } 69 | summary_block["cross_cutting_breakdown"].update(avg_cross_cutting) 70 | 71 | # Calculate averages for tool workload breakdown 72 | summary_block["tool_workload_breakdown"] = dict( 73 | summary_block["tool_workload_breakdown"] 74 | ) 75 | avg_tool_workload = { 76 | f"avg_{k}": v / num_tasks 77 | for k, v in summary_block["tool_workload_breakdown"].items() 78 | } 79 | summary_block["tool_workload_breakdown"].update(avg_tool_workload) 80 | 81 | 82 | def generate_summary(log_dir: Path): 83 | """ 84 | Generates a summary of benchmark results by reading log files from a directory, 85 | calculating total and average trace data, both overall and grouped by 86 | final_judge_result. 87 | 88 | Args: 89 | log_dir: The directory where the individual result log files are and where 90 | the summary file will be saved. 
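        The aggregated summary is written to log_dir / "summary_time_cost.json" with two
        top-level keys: "overall_summary" and "summary_by_final_judge_result", the latter
        grouping the same statistics by each task's final_judge_result value.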
91 | """ 92 | results = [] 93 | for log_file in log_dir.glob("*.json"): 94 | if log_file.name == "summary.json": 95 | continue 96 | try: 97 | with open(log_file, "r", encoding="utf-8") as f: 98 | results.append(json.load(f)) 99 | except json.JSONDecodeError: 100 | logger.info(f"Warning: Could not decode JSON from {log_file}. Skipping.") 101 | except Exception as e: 102 | logger.info(f"Warning: Could not read file {log_file}: {e}. Skipping.") 103 | 104 | overall_summary = _get_summary_template() 105 | summary_by_judge = defaultdict(_get_summary_template) 106 | 107 | for result in results: 108 | trace_data = result.get("trace_data") 109 | if not trace_data or "performance_summary" not in trace_data: 110 | continue 111 | 112 | perf_summary = trace_data["performance_summary"] 113 | tool_workload = trace_data.get("tool_workload_breakdown", {}) 114 | 115 | # Update overall summary 116 | _update_summary_data(overall_summary, perf_summary, tool_workload) 117 | 118 | # Update summary by judge result 119 | judge_result = result.get("final_judge_result", "unknown") 120 | _update_summary_data( 121 | summary_by_judge[judge_result], perf_summary, tool_workload 122 | ) 123 | 124 | # Calculate averages for all summary blocks 125 | _calculate_averages(overall_summary) 126 | for judge_result in summary_by_judge: 127 | _calculate_averages(summary_by_judge[judge_result]) 128 | 129 | summary_data = { 130 | "overall_summary": overall_summary, 131 | "summary_by_final_judge_result": dict(summary_by_judge), 132 | } 133 | 134 | summary_file = log_dir / "summary_time_cost.json" 135 | with open(summary_file, "w", encoding="utf-8") as f: 136 | json.dump(summary_data, f, indent=4, ensure_ascii=False) 137 | -------------------------------------------------------------------------------- /assets/LOCAL-TOOL-DEPLOYMENT.md: -------------------------------------------------------------------------------- 1 | # Local Tool Deployment Guide 2 | 3 | This guide explains how to deploy open-source tools locally for use with MiroThinker. These tools are optional enhancements that can replace commercial alternatives in your agent configuration. 4 | 5 | ## Overview 6 | 7 | MiroThinker supports several optional open-source tools that you can deploy locally: 8 | 9 | - **Audio Transcription**: Whisper-Large-v3-Turbo for transcribing audio files 10 | - **Visual Question Answering**: Qwen2.5-VL-72B-Instruct for answering questions about images 11 | - **Reasoning Engine**: Qwen3-235B-A22B-Thinking-2507 for complex reasoning tasks 12 | 13 | These tools are used when you configure your agent with `tool-transcribe-os`, `tool-vqa-os`, or `tool-reasoning-os` in your agent configuration file. 14 | 15 | ## Prerequisites 16 | 17 | - **GPU**: NVIDIA GPU with sufficient VRAM 18 | - **Python 3.10+** 19 | - **CUDA**: Compatible CUDA toolkit installed 20 | - **Model Storage**: Sufficient disk space to download model checkpoints 21 | 22 | ## Tool Deployment 23 | 24 | ### 1. Audio Transcription Tool (`tool-transcribe-os`) 25 | 26 | **Model**: [Whisper-Large-v3-Turbo](https://huggingface.co/openai/whisper-large-v3-turbo) 27 | 28 | **Description**: Transcribes audio files (MP3, WAV, M4A, AAC, OGG, FLAC, WMA) to text. Supports both local files and remote URLs. 
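**Example client call** (illustrative sketch, not part of the repository): once the vLLM server described below is running, it should accept requests from any OpenAI-compatible client via the `/v1/audio/transcriptions` route. The file name `sample.mp3` and the `EMPTY` key are placeholders.

```python
# Minimal sketch of querying the locally deployed Whisper server.
# Assumes the default endpoint from the configuration below (WHISPER_BASE_URL).
from openai import OpenAI

client = OpenAI(
    base_url="http://0.0.0.0:8000/v1",  # WHISPER_BASE_URL
    api_key="EMPTY",                    # or WHISPER_API_KEY if your server enforces auth
)

with open("sample.mp3", "rb") as audio_file:  # placeholder audio file
    transcript = client.audio.transcriptions.create(
        model="whisper-large-v3-turbo",  # must match --served-model-name below
        file=audio_file,
    )

print(transcript.text)
```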
29 | 30 | **Deployment with vLLM**: 31 | 32 | ```bash 33 | # Install vLLM with audio support 34 | pip install vllm==0.10.0 35 | pip install vllm[audio] 36 | 37 | # Start the server 38 | vllm serve openai/whisper-large-v3-turbo \ 39 | --served-model-name whisper-large-v3-turbo \ 40 | --task transcription \ 41 | --host 0.0.0.0 \ 42 | --port 8000 43 | ``` 44 | 45 | **Configuration in `.env`**: 46 | 47 | ```bash 48 | WHISPER_MODEL_NAME="openai/whisper-large-v3-turbo" 49 | WHISPER_API_KEY=your_api_key # Optional, if your server requires authentication 50 | WHISPER_BASE_URL="http://0.0.0.0:8000/v1" 51 | ``` 52 | 53 | ### 2. Visual Question Answering Tool (`tool-vqa-os`) 54 | 55 | **Model**: [Qwen2.5-VL-72B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-72B-Instruct) 56 | 57 | **Description**: Answers questions about images. Supports local image files and URLs. Automatically encodes local images to Base64 for API requests. Compatible with JPEG, PNG, GIF formats. 58 | 59 | **Deployment with SGLang**: 60 | 61 | ```bash 62 | # Install SGLang 63 | pip install sglang[all] 64 | 65 | # Start the server 66 | python3 -m sglang.launch_server \ 67 | --model-path Qwen/Qwen2.5-VL-72B-Instruct \ 68 | --tp 8 \ 69 | --host 0.0.0.0 \ 70 | --port 8001 \ 71 | --trust-remote-code \ 72 | --enable-metrics 73 | ``` 74 | 75 | **Configuration in `.env`**: 76 | 77 | ```bash 78 | VISION_MODEL_NAME="Qwen/Qwen2.5-VL-72B-Instruct" 79 | VISION_API_KEY=your_api_key # Optional, if your server requires authentication 80 | VISION_BASE_URL="http://0.0.0.0:8001/v1/chat/completions" 81 | ``` 82 | 83 | ### 3. Reasoning Engine Tool (`tool-reasoning-os`) 84 | 85 | **Model**: [Qwen3-235B-A22B-Thinking-2507](https://huggingface.co/Qwen/Qwen3-235B-A22B-Thinking-2507) 86 | 87 | **Description**: A reasoning service for solving complex analytical problems, such as advanced mathematics, puzzles, and riddles. Supports long-context reasoning tasks (up to 131K tokens). 88 | 89 | **Deployment with SGLang**: 90 | 91 | ```bash 92 | # Install SGLang 93 | pip install sglang[all] 94 | 95 | # Start the server 96 | python3 -m sglang.launch_server \ 97 | --model-path Qwen/Qwen3-235B-A22B-Thinking-2507 \ 98 | --tp 8 \ 99 | --host 0.0.0.0 \ 100 | --port 8002 \ 101 | --trust-remote-code \ 102 | --context-length 131072 \ 103 | --enable-metrics 104 | ``` 105 | 106 | **Configuration in `.env`**: 107 | 108 | ```bash 109 | REASONING_MODEL_NAME="Qwen/Qwen3-235B-A22B-Thinking-2507" 110 | REASONING_API_KEY=your_api_key # Optional, if your server requires authentication 111 | REASONING_BASE_URL="http://0.0.0.0:8002/v1/chat/completions" 112 | ``` 113 | 114 | ## Using Deployed Tools 115 | 116 | Once you have deployed the tools, configure your agent to use them: 117 | 118 | 1. **Edit your agent configuration** (e.g., `apps/miroflow-agent/conf/agent/my_custom_config.yaml`): 119 | 120 | ```yaml 121 | main_agent: 122 | tools: 123 | - tool-python 124 | - search_and_scrape_webpage 125 | - jina_scrape_llm_summary 126 | - tool-transcribe-os # Use local Whisper deployment 127 | - tool-vqa-os # Use local Qwen2.5-VL deployment 128 | - tool-reasoning-os # Use local Qwen3-235B deployment 129 | max_turns: 400 130 | ``` 131 | 132 | 2. **Configure environment variables** in `apps/miroflow-agent/.env` as shown in each tool's deployment section above. 133 | 134 | 1. 
**Run your agent**: 135 | 136 | ```bash 137 | cd apps/miroflow-agent 138 | uv run main.py llm=qwen-3 agent=my_custom_config llm.base_url=https://your_base_url/v1 139 | ``` 140 | 141 | ## Commercial Alternatives 142 | 143 | If you prefer not to deploy these tools locally, you can use commercial alternatives: 144 | 145 | - **`tool-transcribe`**: Uses OpenAI's GPT-4o mini Transcribe API 146 | - **`tool-vqa`**: Uses Claude Sonnet 3.7 API 147 | - **`tool-reasoning`**: Uses Claude Sonnet 3.7 API 148 | 149 | Simply replace `-os` versions with commercial versions in your agent configuration and configure the corresponding API keys (`OPENAI_API_KEY`, `ANTHROPIC_API_KEY`). 150 | 151 | ## Additional Resources 152 | 153 | - **SGLang Documentation**: [https://sglang.readthedocs.io/](https://sglang.readthedocs.io/) 154 | - **vLLM Documentation**: [https://docs.vllm.ai/](https://docs.vllm.ai/) 155 | - **Model Cards**: Check HuggingFace model pages for specific requirements and recommendations 156 | -------------------------------------------------------------------------------- /assets/MiromindAI_H.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[codz] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | #poetry.toml 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. 114 | # https://pdm-project.org/en/latest/usage/project/#working-with-version-control 115 | #pdm.lock 116 | #pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # pixi 121 | # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. 122 | #pixi.lock 123 | # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one 124 | # in the .venv directory. It is recommended not to include this directory in version control. 125 | .pixi 126 | 127 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 128 | __pypackages__/ 129 | 130 | # Celery stuff 131 | celerybeat-schedule 132 | celerybeat.pid 133 | 134 | # SageMath parsed files 135 | *.sage.py 136 | 137 | # Environments 138 | .env 139 | .envrc 140 | .venv 141 | env/ 142 | venv/ 143 | ENV/ 144 | env.bak/ 145 | venv.bak/ 146 | 147 | # Spyder project settings 148 | .spyderproject 149 | .spyproject 150 | 151 | # Rope project settings 152 | .ropeproject 153 | 154 | # mkdocs documentation 155 | /site 156 | 157 | # mypy 158 | .mypy_cache/ 159 | .dmypy.json 160 | dmypy.json 161 | 162 | # Pyre type checker 163 | .pyre/ 164 | 165 | # pytype static type analyzer 166 | .pytype/ 167 | 168 | # Cython debug symbols 169 | cython_debug/ 170 | 171 | # PyCharm 172 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 173 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 174 | # and can be added to the global gitignore or merged into this file. For a more nuclear 175 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 176 | #.idea/ 177 | 178 | # Abstra 179 | # Abstra is an AI-powered process automation framework. 180 | # Ignore directories containing user credentials, local state, and settings. 181 | # Learn more at https://abstra.io/docs 182 | .abstra/ 183 | 184 | # Visual Studio Code 185 | # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 186 | # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore 187 | # and can be added to the global gitignore or merged into this file. However, if you prefer, 188 | # you could uncomment the following to ignore the entire vscode folder 189 | # .vscode/ 190 | 191 | # Ruff stuff: 192 | .ruff_cache/ 193 | 194 | # PyPI configuration file 195 | .pypirc 196 | 197 | # Cursor 198 | # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to 199 | # exclude from AI features like autocomplete and code analysis. 
Recommended for sensitive data 200 | # refer to https://docs.cursor.com/context/ignore-files 201 | .cursorignore 202 | .cursorindexingignore 203 | 204 | # Marimo 205 | marimo/_static/ 206 | marimo/_lsp/ 207 | __marimo__/ 208 | 209 | 210 | # -- ADDED -- 211 | # Log files 212 | logs/ 213 | 214 | # Data directory - exclude everything except README 215 | data/ 216 | 217 | 218 | .idea/ 219 | 220 | .DS_Store 221 | 222 | apps/collect-trace/scripts/*/*.sh -------------------------------------------------------------------------------- /apps/collect-trace/utils/converters/convert_non_oai_to_chatml.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import json 5 | import sys 6 | from pathlib import Path 7 | from typing import Any, Dict, List 8 | 9 | 10 | def convert_to_json_chatml(messages: List[Dict[str, Any]]) -> List[Dict[str, str]]: 11 | """ 12 | Convert message list to OpenAI JSON format ChatML 13 | Filter out messages with role 'tool', convert content None to empty string 14 | """ 15 | chatml_list = [] 16 | for message in messages: 17 | role = message.get("role", "") 18 | if role == "tool": 19 | continue # Skip tool messages 20 | if role == "system": 21 | continue # Skip system messages 22 | content = message.get("content", "") 23 | if content is None: 24 | content = "" 25 | # Handle different content formats 26 | if isinstance(content, list): 27 | text_parts = [] 28 | for item in content: 29 | if isinstance(item, dict) and item.get("type") == "text": 30 | text_parts.append(item.get("text", "")) 31 | content = " ".join(text_parts) 32 | elif isinstance(content, str): 33 | pass 34 | else: 35 | content = str(content) 36 | chatml_list.append({"role": role, "content": content}) 37 | return chatml_list 38 | 39 | 40 | def extract_and_save_chat_history( 41 | log_data: Dict[str, Any], output_dir: Path, input_filename: str 42 | ): 43 | """ 44 | Extract message history from log data and save as ChatML format 45 | 46 | Args: 47 | log_data: Log data dictionary 48 | output_dir: Output directory 49 | input_filename: Input filename (without extension) 50 | """ 51 | # Ensure output directory exists 52 | output_dir.mkdir(parents=True, exist_ok=True) 53 | 54 | # 1. Extract main_agent_message_history 55 | main_agent_history = log_data.get("main_agent_message_history", {}) 56 | if main_agent_history and "message_history" in main_agent_history: 57 | main_messages = main_agent_history["message_history"] 58 | if main_messages: 59 | chatml_list = convert_to_json_chatml(main_messages) 60 | chatml_list.insert( 61 | 0, 62 | { 63 | "role": "system", 64 | "content": main_agent_history.get("system_prompt", ""), 65 | }, 66 | ) 67 | # Save main agent chat records 68 | main_output_file = output_dir / f"{input_filename}_main_agent_chatml.json" 69 | with open(main_output_file, "w", encoding="utf-8") as f: 70 | json.dump(chatml_list, f, ensure_ascii=False, indent=2) 71 | 72 | print(f"✓ Saved main agent chat records: {main_output_file}") 73 | 74 | # 2. 
Extract sub_agent_message_history_sessions 75 | sub_agent_sessions = log_data.get("sub_agent_message_history_sessions", {}) 76 | if sub_agent_sessions: 77 | for session_name, session_data in sub_agent_sessions.items(): 78 | if "message_history" in session_data: 79 | sub_agent_messages = session_data["message_history"] 80 | if sub_agent_messages: 81 | chatml_list = convert_to_json_chatml(sub_agent_messages) 82 | chatml_list.insert( 83 | 0, 84 | { 85 | "role": "system", 86 | "content": session_data.get("system_prompt", ""), 87 | }, 88 | ) 89 | 90 | # Save browser agent chat records 91 | sub_agent_output_file = ( 92 | output_dir / f"{input_filename}_{session_name}_chatml.json" 93 | ) 94 | with open(sub_agent_output_file, "w", encoding="utf-8") as f: 95 | json.dump(chatml_list, f, ensure_ascii=False, indent=2) 96 | 97 | print(f"✓ Saved sub agent chat records: {sub_agent_output_file}") 98 | 99 | 100 | def main(): 101 | """Main function""" 102 | if len(sys.argv) < 2: 103 | print("Usage: python convert_non_oai_to_chatml.py [output_dir]") 104 | print( 105 | "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json" 106 | ) 107 | print( 108 | "Example: python convert_non_oai_to_chatml.py logs/debug_logs/task_1.json ./extracted_chats" 109 | ) 110 | sys.exit(1) 111 | 112 | log_file_path = Path(sys.argv[1]) 113 | output_dir = Path(sys.argv[2]) if len(sys.argv) > 2 else Path("extracted_chats") 114 | 115 | # Check if input file exists 116 | if not log_file_path.exists(): 117 | print(f"Error: Log file does not exist: {log_file_path}") 118 | sys.exit(1) 119 | 120 | try: 121 | # Read log file 122 | print(f"Reading log file: {log_file_path}") 123 | with open(log_file_path, "r", encoding="utf-8") as f: 124 | log_data = json.load(f) 125 | 126 | # Extract input filename (without extension) 127 | input_filename = log_file_path.stem 128 | 129 | # Extract and save chat history 130 | print(f"Extracting chat history to: {output_dir}") 131 | extract_and_save_chat_history(log_data, output_dir, input_filename) 132 | 133 | print("\n✓ Chat history extraction completed!") 134 | print(f"Output directory: {output_dir.absolute()}") 135 | 136 | except json.JSONDecodeError as e: 137 | print(f"Error: Cannot parse JSON file: {e}") 138 | sys.exit(1) 139 | except Exception as e: 140 | print(f"Error: {e}") 141 | sys.exit(1) 142 | 143 | 144 | if __name__ == "__main__": 145 | main() 146 | -------------------------------------------------------------------------------- /apps/miroflow-agent/src/io/output_formatter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025 MiroMind 2 | # This source code is licensed under the MIT License. 3 | 4 | import re 5 | 6 | 7 | class OutputFormatter: 8 | def _extract_boxed_content(self, text: str) -> str: 9 | r""" 10 | Extract the content of the last \boxed{...} occurrence in the given text. 11 | Supports: 12 | - Arbitrary levels of nested braces 13 | - Escaped braces (\{ and \}) 14 | - Whitespace between \boxed and the opening brace 15 | - Empty content inside braces 16 | - Incomplete boxed expressions (extracts to end of string as fallback) 17 | Returns an empty string if no match is found. 
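        Examples:
            _extract_boxed_content(r"\boxed{42}") -> "42"
            _extract_boxed_content(r"\boxed{\frac{1}{2}} or \boxed{0.5}") -> "0.5"  (last occurrence wins)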
18 | """ 19 | if not text: 20 | return "" 21 | 22 | _BOXED_RE = re.compile(r"\\boxed\b", re.DOTALL) 23 | 24 | last_result = None # Track the last boxed content (complete or incomplete) 25 | i = 0 26 | n = len(text) 27 | 28 | while True: 29 | # Find the next \boxed occurrence 30 | m = _BOXED_RE.search(text, i) 31 | if not m: 32 | break 33 | j = m.end() 34 | 35 | # Skip any whitespace after \boxed 36 | while j < n and text[j].isspace(): 37 | j += 1 38 | 39 | # Require that the next character is '{' 40 | if j >= n or text[j] != "{": 41 | i = j 42 | continue 43 | 44 | # Parse the brace content manually to handle nesting and escapes 45 | depth = 0 46 | k = j 47 | escaped = False 48 | found_closing = False 49 | while k < n: 50 | ch = text[k] 51 | if escaped: 52 | escaped = False 53 | elif ch == "\\": 54 | escaped = True 55 | elif ch == "{": 56 | depth += 1 57 | elif ch == "}": 58 | depth -= 1 59 | # When depth returns to zero, the boxed content ends 60 | if depth == 0: 61 | last_result = text[j + 1 : k] 62 | i = k + 1 63 | found_closing = True 64 | break 65 | k += 1 66 | 67 | # If we didn't find a closing brace, this is an incomplete boxed 68 | # Store it as the last result (will be overwritten if we find more boxed later) 69 | if not found_closing and depth > 0: 70 | last_result = text[j + 1 : n] 71 | i = k # Continue from where we stopped 72 | elif not found_closing: 73 | i = j + 1 # Move past this invalid boxed 74 | 75 | # Return the last boxed content found (complete or incomplete) 76 | return last_result.strip() if last_result else "" 77 | 78 | def format_tool_result_for_user(self, tool_call_execution_result): 79 | """ 80 | Format tool execution results to be fed back to LLM as user messages. 81 | Only includes necessary information (results or errors). 82 | """ 83 | server_name = tool_call_execution_result["server_name"] 84 | tool_name = tool_call_execution_result["tool_name"] 85 | 86 | if "error" in tool_call_execution_result: 87 | # Provide concise error information to LLM 88 | content = f"Tool call to {tool_name} on {server_name} failed. Error: {tool_call_execution_result['error']}" 89 | elif "result" in tool_call_execution_result: 90 | # Provide the original output result of the tool 91 | content = tool_call_execution_result["result"] 92 | # Consider truncating overly long results 93 | max_len = 100_000 # 100k chars = 25k tokens 94 | if len(content) > max_len: 95 | content = content[:max_len] + "\n... [Result truncated]" 96 | else: 97 | content = f"Tool call to {tool_name} on {server_name} completed, but produced no specific output or result." 
98 | 99 | # Return format suitable as user message content 100 | # return [{"type": "text", "text": content}] 101 | return {"type": "text", "text": content} 102 | 103 | def format_final_summary_and_log(self, final_answer_text, client=None): 104 | """Format final summary information, including answers and token statistics""" 105 | summary_lines = [] 106 | summary_lines.append("\n" + "=" * 30 + " Final Answer " + "=" * 30) 107 | summary_lines.append(final_answer_text) 108 | 109 | # Extract boxed result - find the last match using safer regex patterns 110 | boxed_result = self._extract_boxed_content(final_answer_text) 111 | 112 | # Add extracted result section 113 | summary_lines.append("\n" + "-" * 20 + " Extracted Result " + "-" * 20) 114 | 115 | if boxed_result: 116 | summary_lines.append(boxed_result) 117 | elif final_answer_text: 118 | summary_lines.append("No \\boxed{} content found.") 119 | boxed_result = "No \\boxed{} content found in the final answer." 120 | 121 | # Token usage statistics and cost estimation - use client method 122 | if client and hasattr(client, "format_token_usage_summary"): 123 | token_summary_lines, log_string = client.format_token_usage_summary() 124 | summary_lines.extend(token_summary_lines) 125 | else: 126 | # If no client or client doesn't support it, use default format 127 | summary_lines.append("\n" + "-" * 20 + " Token Usage & Cost " + "-" * 20) 128 | summary_lines.append("Token usage information not available.") 129 | summary_lines.append("-" * (40 + len(" Token Usage & Cost "))) 130 | log_string = "Token usage information not available." 131 | 132 | return "\n".join(summary_lines), boxed_result, log_string 133 | --------------------------------------------------------------------------------
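For reference, a minimal usage sketch of `OutputFormatter` from `apps/miroflow-agent/src/io/output_formatter.py` (illustrative only; the import path below is hypothetical and depends on how the `src` package is installed in your environment):

```python
# Hypothetical import path; adjust to your package layout.
from src.io.output_formatter import OutputFormatter

formatter = OutputFormatter()

# A successful tool result is passed back to the LLM as a single text block.
msg = formatter.format_tool_result_for_user(
    {"server_name": "serper-mcp-server", "tool_name": "google_search", "result": "top hits ..."}
)
assert msg == {"type": "text", "text": "top hits ..."}

# The final answer is summarized and the last \boxed{...} content is extracted.
summary, boxed, _ = formatter.format_final_summary_and_log(r"The answer is \boxed{42}.")
assert boxed == "42"
```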