├── .gitignore
├── LICENSE
├── README.md
├── answers
│   └── example
│       ├── lol_sylas
│       │   ├── answer_1.md
│       │   ├── answer_2.md
│       │   └── answer_3.md
│       ├── overleaf_template
│       │   ├── answer_1.md
│       │   ├── answer_2.md
│       │   └── answer_3.md
│       └── yu_lineage
│           ├── answer_1.md
│           ├── answer_2.md
│           └── answer_3.md
├── assets
│   └── mind2web2_overview.jpg
├── batch_answer_cache.py
├── cache_all_answers.sh
├── eval_scripts
│   └── README.md
├── mind2web2
│   ├── __init__.py
│   ├── api_tools
│   │   ├── __init__.py
│   │   ├── tool_arxiv.py
│   │   ├── tool_googlemap.py
│   │   └── tool_pdf.py
│   ├── eval_runner.py
│   ├── eval_toolkit.py
│   ├── evaluator.py
│   ├── llm_client
│   │   ├── __init__.py
│   │   ├── api_cost.py
│   │   ├── azure_openai_client.py
│   │   ├── base_client.py
│   │   ├── bedrock_anthropic_client.py
│   │   └── openai_client.py
│   ├── prompts
│   │   └── cache_prompts.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── cache_filesys.py
│   │   ├── load_eval_script.py
│   │   ├── logging_setup.py
│   │   ├── misc.py
│   │   ├── page_info_retrieval.py
│   │   ├── path_config.py
│   │   └── url_tools.py
│   └── verification_tree.py
├── pyproject.toml
├── run_cache_manager.py
└── run_eval.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # UV
98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | #uv.lock
102 |
103 | # poetry
104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | # This is especially recommended for binary packages to ensure reproducibility, and is more
106 | # commonly ignored for libraries.
107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 |
110 | # pdm
111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | # in version control.
115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 |
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 |
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 |
127 | # SageMath parsed files
128 | *.sage.py
129 |
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 |
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 |
143 | # Rope project settings
144 | .ropeproject
145 |
146 | # mkdocs documentation
147 | /site
148 |
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 |
154 | # Pyre type checker
155 | .pyre/
156 |
157 | # pytype static type analyzer
158 | .pytype/
159 |
160 | # Cython debug symbols
161 | cython_debug/
162 |
163 | # PyCharm
164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | # and can be added to the global gitignore or merged into this file. For a more nuclear
167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 |
170 | # Ruff stuff:
171 | .ruff_cache/
172 |
173 | # PyPI configuration file
174 | .pypirc
175 |
176 |
177 |
178 | # Mind2Web2 Specific
179 |
180 | *.pkl
181 | *.jsonl
182 | .DS_Store
183 |
184 | /workspace/
185 | /workflow_scripts/
186 | /dataset/answers/
187 | /answers/
188 | /eval_results/
189 | /cache/
190 | /uv.lock
191 | /.claude/
192 | /osunlp/
193 | /tmp_logs/
194 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 OSU Natural Language Processing
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Mind2Web 2 [NeurIPS'25 D&B]
2 |
3 | Mind2Web 2 is a benchmark for agentic search systems, featuring an Agent-as-a-Judge methodology for comprehensive, rigorous, and reliable assessment of **long-horizon** tasks that involve **complex and real-time information synthesis**.
4 |
5 |
6 |

7 |
Mind2Web 2 features realistic and diverse long-horizon web search tasks and a novel Agent-as-a-Judge framework to evaluate complex, time-varying, and citation-backed answers.
8 |
9 |
10 | ## 🔗 Links
11 |
12 | - [🏠 Homepage](https://osu-nlp-group.github.io/Mind2Web-2)
13 | - [🏆 Leaderboard](https://osu-nlp-group.github.io/Mind2Web-2/#leaderboard)
14 | - [📖 Paper](https://arxiv.org/abs/2506.21506)
15 | - [😊 Dataset (Tasks) and Evaluation Scripts (Judge Agents)](https://huggingface.co/datasets/osunlp/Mind2Web-2)
16 |
17 | ## 🆕 Updates
18 | - **2025/10/23**: To improve accessibility and adoption of Mind2Web 2, we have released all the evaluation scripts for both the public dev set and the test set. Check out the [Run Evaluation Locally Yourself](#-run-evaluation-locally-yourself) section for instructions.
19 | - **2025/07/17**: Check out our [submission guideline](#-submission-guideline). We welcome all submissions and look forward to your participation!
20 | - **2025/07/14**: The scripts of the public development set are released. Give them a try!
21 | - **2025/06/26**: The GitHub repo is live. The manuscript is now on arXiv.
22 |
23 |
24 | ## 📥 Submission Guideline
25 |
26 | To get answers for Mind2Web 2 tasks:
27 | - If you are developing and testing a base model and have no agent framework at hand, you may start from a go-to framework such as [Hugging Face's Open Deep Research](). Some zero-shot or few-shot prompting can help the agent understand how to provide citations, which is needed to pass the attribution verification in our task evaluations.
28 | - If you have your own agent, note that we expect it to provide **URL sources** for the critical facts included in its answers. You may also refer to the evaluation scripts to understand how the evaluation is conducted.
29 |
30 | To evaluate answers from an agent system, there are three main steps (plus an optional fourth):
31 | 1. Collect answers from your agent on our [test set](https://huggingface.co/datasets/osunlp/Mind2Web-2/viewer/default/private_test_set).
32 | 2. Cache the webpages mentioned in the answers (to ensure consistency and reproducibility); we provide a script for this in [Precache Webpages](#3-precache-webpages-optional-but-recommended).
33 | 3. Run the evaluation.
34 | 4. (Optional) We also encourage submitting the average time and answer lengths to help us better understand how the agent works.
35 |
36 | For the submission, you can choose one of the following:
37 | - (Recommended) Submit your agent's answers along with the webpage cache. This ensures the best consistency between inference and evaluation, and we will cover the evaluation cost for you.
38 | - (Recommended) Run the whole evaluation yourself by following the instructions in the next section and submit the evaluation results to us.
39 | - Only provide your agent's answers and let us handle the webpage caching and evaluation for you.
40 |
41 | If you choose to submit your agent's answers, please arrange them in the following directory structure (see [answers/example](https://github.com/OSU-NLP-Group/Mind2Web-2/tree/main/answers/example) for reference):
42 |
43 | ```
44 | <agent_name>/
45 | ├── <task_id>/
46 | │   ├── answer_1.md
47 | │   ├── answer_2.md
48 | │   └── ...
49 | └── ...
50 | ```
51 |
52 | Similarly, the corresponding cache structure should be `cache/<agent_name>/<task_id>/`.
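
For instance, mirroring the `answers/example` layout above, the cache for the `example` agent would sit under folders like the following (an illustrative sketch only; the contents of each task folder are produced by the caching script):

```
cache/
└── example/
    ├── lol_sylas/
    ├── overleaf_template/
    └── yu_lineage/
```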
53 |
54 | Compress the directories and send them to us via email: m2w2-leaderboard@googlegroups.com.
55 |
56 | > **Note:**
57 | >
58 | > If you would like to **explore our tasks and run the evaluation locally**, please refer to the sections below for environment setup and evaluation instructions.
59 |
60 |
61 | ## 🚀 Run Evaluation Locally Yourself
62 |
63 | ### 0. Environment Setup
64 |
65 | #### Option 1: Using uv (Recommended)
66 |
67 | If you have [uv](https://docs.astral.sh/uv/) installed, it provides faster dependency resolution and installation:
68 |
69 | ```bash
70 | # Automatically create virtual environment and install all dependencies
71 | uv sync
72 |
73 | # Activate the virtual environment
74 | source .venv/bin/activate # On Windows: .venv\Scripts\activate
75 |
76 | # Install browsers for Playwright (we use rebrowser playwright for better webpage fetching)
77 | rebrowser_playwright install
78 | ```
79 |
80 | #### Option 2: Using conda + pip
81 |
82 | ```bash
83 | # Create and activate conda environment
84 | conda create -n mind2web2 python=3.11
85 | conda activate mind2web2
86 |
87 | # Install the package in development mode
88 | pip install -e .
89 |
90 | # Install browsers for Playwright
91 | #playwright install
92 | rebrowser_playwright install
93 | ```
94 |
95 | ### 1. Prepare Your Data
96 |
97 | Organize your agent's responses in the following directory structure:
98 |
99 | ```
100 | answers/
101 | └── <agent_name>/
102 |     └── <task_id>/
103 |         ├── answer_1.md
104 |         ├── answer_2.md
105 |         └── ...
106 | ```
107 |
108 | Each answer file should contain your agent's response in markdown format.
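A minimal sketch of such a file, modeled on the provided examples under `answers/example/` (the title, prose, and URLs below are placeholders):

```
# <Answer title>

Your agent's findings in markdown, with citation markers attached to the critical facts. [1] [2]

## Sources
[1]: https://example.com/source-one
[2]: https://example.com/source-two
```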
109 |
110 | ### 2. Set up API Keys
111 |
112 | Configure the necessary API keys for evaluation:
113 |
114 | ```bash
115 | # Set up environment variables for OpenAI API
116 | export OPENAI_API_KEY="YOUR_OPENAI_KEY"
117 |
118 | # (Optional) Environment variables for Azure OpenAI
119 | export AZURE_OPENAI_API_KEY="YOUR_AZURE_OPENAI_API_KEY"
120 | export AZURE_OPENAI_ENDPOINT_URL="YOUR_AZURE_OPENAI_ENDPOINT_URL"
121 | export AZURE_OPENAI_API_VERSION="2025-03-01-preview"
122 |
123 | # (Optional, but required for several tasks) Tool API key for tasks that use the Google Maps API
124 | export GOOGLE_MAPS_API_KEY="YOUR_GOOGLE_MAPS_API_KEY"
125 | ```
126 |
127 | ### 3. Precache Webpages (Optional but Recommended)
128 |
129 | *Note: This step is not strictly required, but it is highly recommended for reducing evaluation latency, as fetching webpages on the fly during evaluation can be very slow.*
130 |
131 | Before running evaluation, you may want to precache the webpages to improve performance:
132 |
133 | ```bash
134 | ./cache_all_answers.sh
135 | ```
136 |
137 | We also provide a lightweight app to fix errors in precached webpages (e.g., pages blocked by human verification):
138 |
139 | ```bash
140 | # Start the Cache Manager GUI
141 | python run_cache_manager.py
142 |
143 | # Optionally load a cache folder on startup (recommended)
144 | python run_cache_manager.py cache/
145 |
146 | # Debug:
147 | python run_cache_manager.py --log-level DEBUG
148 |
149 | ```
150 |
151 | Notes:
152 | - The Cache Manager is a PySide6 (Qt) desktop app located under `cache_manager/`.
153 | - It helps you inspect, fix, and update cached URLs for each task:
154 | - Open a cache folder via File → “Open Cache Folder…” and select `cache/`.
155 | - Select a task (left), then a URL to preview its cached text/screenshot.
156 | - Use "Live" view to reload the page, and click “Update Cache” to capture fresh content and overwrite the cache.
157 | - Use "Upload MHTML" to manually upload a saved MHTML file for the selected URL.
158 |
159 | ### 4. Run Evaluation
160 |
161 | Download the evaluation scripts from the [Hugging Face dataset](https://huggingface.co/datasets/osunlp/Mind2Web-2), then execute the evaluation using the `run_eval.py` script:
162 |
163 | #### Basic Usage
164 |
165 | ```bash
166 | # Evaluate all tasks for a specific agent
167 | python run_eval.py --agent_name <agent_name>
168 |
169 | # Evaluate a specific task
170 | python run_eval.py --agent_name <agent_name> --task_id <task_id>
171 | ```
172 |
173 | For example:
174 |
175 | ```bash
176 | python run_eval.py --agent_name example
177 |
178 | python run_eval.py --agent_name example --task_id yu_lineage
179 | ```
180 |
181 | #### Advanced Configuration
182 |
183 | - `--agent_name`: Name of your agent (required)
184 | - `--answer_folder`: Path to directory containing answer files (default: `answers/`)
185 | - `--eval_scripts_root`: Root directory for evaluation scripts (default: `eval_scripts/`)
186 | - `--eval_results_root`: Root directory to save evaluation results (default: `eval_results/`)
187 | - `--cache_root`: Root directory for caching webpages (default: `cache/`)
188 | - `--eval_version`: Version of evaluation scripts to use (default: `2025_07_14`)
189 | - `--task_id`: Specific task to evaluate (optional, evaluates all tasks if not provided)
190 | - `--llm_provider`: LLM provider (`openai` or `azure_openai`, default: `openai`)
191 | - `--max_concurrent_tasks`: Maximum concurrent task evaluations (default: 2)
192 | - `--max_concurrent_answers`: Maximum concurrent answer evaluations per task (default: 3)
193 | - `--max_webpage_retrieval`: Maximum concurrent webpage retrievals (default: 5)
194 | - `--max_llm_requests`: Maximum concurrent LLM API requests (default: 30)
195 | - `--dump_cache`: Persist cache to disk (default: True)
196 | - `--overwrite`: Overwrite existing results
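
Putting the options together, a fuller invocation might look like the following sketch (the agent name and concurrency values are illustrative; every flag comes from the list above):

```bash
python run_eval.py \
  --agent_name my_agent \
  --answer_folder answers/ \
  --cache_root cache/ \
  --eval_results_root eval_results/ \
  --eval_version 2025_07_14 \
  --llm_provider openai \
  --max_concurrent_tasks 2 \
  --max_llm_requests 30 \
  --overwrite
```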
197 |
198 | ## 📝 Citation
199 |
200 | If you find this work useful, please consider starring our repo and citing our paper:
201 |
202 | ```bibtex
203 | @inproceedings{
204 | gou2025mindweb,
205 | title={Mind2Web 2: Evaluating Agentic Search with Agent-as-a-Judge},
206 | author={Boyu Gou and Zanming Huang and Yuting Ning and Yu Gu and Michael Lin and Botao Yu and Andrei Kopanev and Weijian Qi and Yiheng Shu and Jiaman Wu and Chan Hee Song and Bernal Jimenez Gutierrez and Yifei Li and Zeyi Liao and Hanane Nour Moussa and TIANSHU ZHANG and Jian Xie and Tianci Xue and Shijie Chen and Boyuan Zheng and Kai Zhang and Zhaowei Cai and Viktor Rozgic and Morteza Ziyadi and Huan Sun and Yu Su},
207 | booktitle={The Thirty-ninth Annual Conference on Neural Information Processing Systems Datasets and Benchmarks Track},
208 | year={2025},
209 | url={https://openreview.net/forum?id=AUaW6DS9si}
210 | }
211 | ```
212 |
--------------------------------------------------------------------------------
/answers/example/lol_sylas/answer_1.md:
--------------------------------------------------------------------------------
1 | # The Iconic Sylas Play: Faker's World Championship Legacy
2 |
3 | Based on extensive research across esports databases, match highlights, and tournament records, **no documented instances exist of the specific scenario described occurring in World Championship finals during the S9-S13 timeframe** . However, the research revealed a remarkably similar and historically significant play that likely matches your query.
4 |
5 | ## The definitive match: 2024 World Championship Finals Game 4
6 |
7 | The most prominent Sylas ultimate steal involving Rakan that secured a World Championship victory occurred at **2024 Worlds Finals Game 4** , where **Lee "Faker" Sang-hyeok** of T1 executed one of the most iconic plays in League of Legends history.
8 |
9 | **The game-changing moment occurred at the 20-minute mark** when Faker, playing Sylas on blue side mid lane, positioned strategically near mid-lane and stole Rakan's ultimate "The Quickness" from BLG's support ON. [1] Using the stolen ultimate's movement speed buff combined with Flash and Hextech Rocketbelt, Faker perfectly engaged onto both of BLG's carries (AP and AD), enabling his teammates to follow up with coordinated linear spells that turned the teamfight decisively in T1's favor. [1]
10 |
11 | This play was crucial for T1's comeback from facing elimination down 2-1 in the series, ultimately leading to their 3-2 championship victory and Faker's fifth World Championship title. [1]
12 |
13 | ## Faker's complete 2024 Worlds Sylas performance
14 |
15 | **Tournament Statistics:**
16 |
17 | - **Games Played with Sylas:** 6 games (most played champion) [2]
18 | - **Win-Loss Record:** 4-2
19 | - **Win Rate:** 66.7%
20 | - **KDA:** 2.56 (16 kills, 16 deaths, 25 assists)
21 | - **Kill Participation:** 61.2%
22 | - **Average CS:** 245.17 per game
23 | - **Damage Per Minute:** 384.1 [3]
24 |
25 | **Historical Context:** During this tournament, Faker achieved multiple historic milestones, including becoming the first player to reach 500 kills at Worlds (accomplished in the same Game 4) and winning his record-breaking fifth World Championship at age 28. [4]
26 |
27 | ## Why this specific play stands out
28 |
29 | **Strategic Impact:** The stolen Rakan ultimate provided the perfect initiation tool, combining movement speed, charm effects, and positioning advantages [5] that locked down BLG's double carries exactly as described in your query. [6]
30 |
31 | **Tournament Significance:** This play occurred in the Finals Game 4 of the most-viewed esports match in history (6.9 million peak viewers), [7] with Faker earning Finals MVP honors. [8]
32 |
33 | **Technical Execution:** The play demonstrated perfect game sense, positioning near mid-lane, timing the ultimate theft, and utilizing the full kit synergy between Sylas's abilities and the stolen Rakan ultimate. [9] [6]
34 |
35 | ## Research findings on S9-S13 period
36 |
37 | Despite comprehensive analysis of tournament databases, esports journalism coverage, and match highlights from 2019-2023, no documented instances of similar game-defining Sylas plays involving Rakan ultimate steals were found in World Championship finals during that specific timeframe. While Sylas maintained significant presence in professional play [10] (96% presence rate in 2019 Worlds), [11] the particular scenario you described appears to be most accurately represented by Faker's legendary 2024 performance. [12]
38 |
39 | The 2024 Finals performance represents the pinnacle of competitive Sylas play and ultimate theft execution, [9] making it the most significant documented instance of a Sylas player stealing Rakan's ultimate to secure a World Championship victory through precise double carry lockdown near mid-lane. [13]
40 |
41 | ## Sources
42 | [1]: https://www.oneesports.gg/league-of-legends/faker-worlds-2024-final-mvp/
43 | [2]: https://www.strafe.com/news/read/how-one-taunt-from-faker-gave-t1-their-fifth-worlds-title-against-all-odds/
44 | [3]: https://lol.fandom.com/wiki/Faker/Statistics/2024
45 | [4]: https://lol.fandom.com/wiki/Faker
46 | [5]: https://dotesports.com/league-of-legends/news/the-most-unexpected-and-busted-sylas-ultimate-interactions
47 | [6]: https://leagueoflegends.fandom.com/wiki/Sylas/LoL
48 | [7]: https://www.redbull.com/us-en/10-moments-from-league-of-legends-world-championship
49 | [8]: https://www.oneesports.gg/league-of-legends/t1-worlds-2024-champion/
50 | [9]: https://wiki.leagueoflegends.com/en-us/Sylas
51 | [10]: https://www.invenglobal.com/articles/18665/faker-and-t1s-triumph-at-worlds-2023-by-numbers
52 | [11]: https://lol.fandom.com/wiki/Sylas
53 | [12]: https://www.invenglobal.com/articles/9431/worlds-2019-the-pantheonyuumi-problem-that-riot-needs-to-fix
54 | [13]: https://en.wikipedia.org/wiki/League_of_Legends_World_Championship
--------------------------------------------------------------------------------
/answers/example/lol_sylas/answer_2.md:
--------------------------------------------------------------------------------
1 | # The Game-Saving Sylas Play That Defined Worlds 2024
2 |
3 | The legendary League of Legends play you're referring to occurred during the **2024 World Championship Finals (Season 14)** at The O2 Arena in London on November 2, 2024. **Lee "Faker" Sang-hyeok** of T1 executed what many consider the most impactful individual play in recent World Championship history, using Sylas to steal Rakan's ultimate and secure T1's fifth world title when they were on the brink of elimination. [1] [2]
4 |
5 | ## The championship-defining moment
6 |
7 | **Match Context** : T1 faced Bilibili Gaming (BLG) in Game 4 of the finals, down 2-1 in the series and facing elimination. [3] Faker had blind-picked Sylas mid-lane for the crucial must-win game, while BLG's support ON (Luo Wen-Jun) was playing Rakan. [4]
8 |
9 | At approximately the 20-minute mark, with T1 struggling to find their footing, Faker positioned his Sylas strategically to the side of mid-lane during a crucial teamfight. **He used Sylas' Hijack ability to steal Rakan's ultimate "The Quickness"** from ON, then immediately utilized the stolen ability's movement speed and innate charm effect, combined with Flash and Hextech Rocketbelt, to lock down both of BLG's primary damage dealers - their AP and AD carries. [1]
10 |
11 | This play allowed the rest of T1 to follow up with their full combination of abilities, eliminating BLG's key damage sources and completely turning the tide of the game. [4] The official LoL Esports broadcast famously declared **"FAKER IS NOT OF THIS EARTH"** as the play unfolded. [1] [4] T1 leveraged this momentum to secure the Chemtech Dragon Soul and ultimately win Game 4, forcing the decisive Game 5, which they won to claim the championship with a 3-2 series victory. [4]
12 |
13 | ## Faker's tournament-long Sylas mastery
14 |
15 | Throughout the entire 2024 World Championship, **Faker played Sylas in 6 games with a 66.7% win rate** (4 wins, 2 losses), making it his most-played champion of the tournament. His Sylas statistics across all games were:
16 |
17 | - **KDA: 2.56** (16 kills, 16 deaths, 25 assists)
18 | - **CS per Game: 245.17** (8.15 CS/min average)
19 | - **Damage per Game: 11,600** (384.1 DPM)
20 | - **Kill Participation: 61.2%**
21 | - **Kill Share: 23.9%**
22 |
23 | Sylas represented **6 out of Faker's 17 total games** (35% of his champion picks), demonstrating both his confidence in the champion and its strategic importance to T1's championship run. [5] Despite not leading statistical categories among mid-laners, Faker's impact through clutch playmaking was immeasurable. [6]
24 |
25 | ## Tournament significance and legacy
26 |
27 | This play occurred during T1's remarkable championship run where they entered as the 4th seed from LCK - the lowest possible seed. Having nearly missed Worlds qualification entirely, the team completed one of the most impressive underdog stories in esports history. [4] **Faker was named Finals MVP** (his second Finals MVP award, first since 2016) [4] and secured his historic **500th kill at Worlds** during this same Game 4. [7]
28 |
29 | The Sylas pick became increasingly prioritized throughout the tournament, with opponents frequently banning it against T1 in later stages. The champion perfectly suited Faker's playmaking style and became instrumental in T1 becoming the **first roster to win back-to-back World Championships** . [8]
30 |
31 | ## Verified sources and documentation
32 |
33 | This information is documented across multiple official sources:
34 |
35 | - **Leaguepedia Official Statistics Database** : Comprehensive tournament statistics and match details
36 | - **LoL Esports Official Coverage** : Game 4 highlights and tournament broadcasts
37 | - **Oracle's Elixir** : Professional esports analytics with advanced metrics [9]
38 | - **Games of Legends (gol.gg)** : Detailed champion-specific tournament data [10] [11]
39 | - **Official Worlds 2024 Broadcast VODs** : Complete match footage and commentary
40 |
41 | The play has been extensively covered by major esports outlets including ONE Esports, Strafe Esports, and Sheep Esports, with video documentation available through LoL Esports' official channels and broadcast archives.
42 |
43 | ## Conclusion
44 |
45 | Faker's Game 4 Sylas play against BLG represents a perfect convergence of mechanical skill, strategic positioning, and championship mentality. [6] While his overall Sylas statistics were solid but not spectacular, the contextual impact of this single play - executed when elimination loomed - directly enabled T1's fifth World Championship [7] and cemented Faker's legacy as the greatest League of Legends player of all time. [12]
46 |
47 | ## Sources
48 | [1]: https://www.oneesports.gg/league-of-legends/faker-worlds-2024-final-mvp/
49 | [2]: https://en.wikipedia.org/wiki/2024_League_of_Legends_World_Championship_final
50 | [3]: https://www.tacter.com/lol/guides/t1-wins-the-2024-league-of-legends-world-championship-faker-proves-himself-as-the-goat-27bcee11
51 | [4]: https://www.oneesports.gg/league-of-legends/t1-worlds-2024-champion/
52 | [5]: https://lol.fandom.com/wiki/Faker/Statistics/2024
53 | [6]: https://www.strafe.com/news/read/how-one-taunt-from-faker-gave-t1-their-fifth-worlds-title-against-all-odds/
54 | [7]: https://lol.fandom.com/wiki/Faker
55 | [8]: https://www.mobafire.com/league-of-legends/champion/sylas-148
56 | [9]: https://lol.timsevenhuysen.com/
57 | [10]: https://gol.gg/esports/home/
58 | [11]: https://gol.gg/stats/tools/
59 | [12]: https://www.si.com/esports/league-of-legends/worlds-2024-finals-faker-breaks-viewership-record
--------------------------------------------------------------------------------
/answers/example/lol_sylas/answer_3.md:
--------------------------------------------------------------------------------
1 | # The Search for the Legendary Sylas-Rakan Ultimate Steal
2 |
3 | After extensive research across multiple World Championship tournaments from S9-S14, **no specific documented play matches exactly the described scenario** of Sylas stealing Rakan's ultimate in World Championship finals to lock down double carries and secure victory. However, several remarkable Sylas ultimate steal moments were discovered that provide valuable context about memorable championship plays.
4 |
5 | ## Key findings from World Championship research
6 |
7 | **Most notable Sylas performances in World Championship finals occurred in 2023 and 2024** , when Faker dominated with the champion. In the 2024 World Championship Finals between T1 and Bilibili Gaming, Faker's Sylas appeared in multiple games of the 3-2 series victory, [1] with Game 4 featuring particularly impactful stolen ultimate usage that helped save T1 from elimination. [1] However, **this involved stolen abilities other than Rakan's ultimate** , and the specific double-carry lockdown scenario was not documented.
8 |
9 | The 2023 World Championship Finals saw Faker's Sylas in Game 2 against Weibo Gaming, where he effectively used stolen Maokai ultimates in T1's dominant 3-0 sweep. [2] Again, no Rakan ultimate steals were recorded in this series.
10 |
11 | ## Alternative legendary Sylas moments discovered
12 |
13 | The research revealed **Caps' extraordinary Sylas performance during the 2020 World Championship Quarterfinals** against Gen.G, which featured one of the most clutch ultimate steal sequences in championship history. Playing at less than 2% health, Caps used a stolen Ashe ultimate to engage onto Gen.G's carries, ultimately achieving an 11/0/11 perfect game performance that led to G2's 3-0 sweep. [3]
14 |
15 | **Earlier championship tournaments (2019-2022) showed limited Sylas presence in finals matches** , with no confirmed picks in 2019, 2020, 2021, or 2022 finals. This is significant because Sylas was only introduced in early 2019, [4] making his championship meta development a gradual process. [5] [6]
16 |
17 | ## Analysis of Sylas-Rakan interaction potential
18 |
19 | Research into champion mechanics confirms that **Rakan's ultimate "The Quickness" ranks as A-tier for Sylas to steal** , providing both mobility through movement speed and crowd control through charm effects. This combination would theoretically be highly effective for locking down multiple carries, exactly as described in your query. The stolen ability maintains both the movement speed boost and charm effects when used by Sylas, [7] [8] making it particularly potent for team fight initiation. [9] [8]
20 |
21 | ## Possible explanations for the missing play
22 |
23 | Several factors could explain why this specific play wasn't found in the research:
24 |
25 | **Documentation gaps** may exist in older tournament coverage, particularly for plays that occurred during intense team fights where multiple abilities were used simultaneously. **Regional broadcast differences** might mean certain highlights were emphasized more in specific regions' coverage. Additionally, the play might have occurred in **semifinals or other elimination matches** rather than the grand finals specifically.
26 |
27 | **Memory conflation** is another possibility - the described play might combine elements from multiple memorable Sylas moments across different matches or tournaments, creating a composite memory of an idealized ultimate steal scenario.
28 |
29 | ## Conclusion
30 |
31 | While the exact Sylas-Rakan ultimate steal play you described remains undocumented in available World Championship records, the research reveals rich context about Sylas's evolution as a championship-level pick [4] and the theoretical effectiveness of stealing Rakan's abilities. The closest documented moments involve Faker's clutch Sylas performances in recent finals [10] and Caps' legendary quarterfinals display, both showcasing why stolen ultimates can be game-changing in high-stakes matches. [3]
32 |
33 | The specific combination of Sylas stealing Rakan's ultimate to lock down double carries represents exactly the type of play that would become legendary if it occurred, [7] but current documentation suggests this particular sequence may be either undocumented, misremembered, or potentially occurred in a different context than World Championship finals.
34 |
35 | ## Sources
36 | [1]: https://en.wikipedia.org/wiki/2024_League_of_Legends_World_Championship_final
37 | [2]: https://www.si.com/esports/league-of-legends/worlds-2023-finals-recap-t1-faker
38 | [3]: https://dotesports.com/league-of-legends/news/lck-production-team-surprises-fans-with-highlight-reel-of-caps-on-sylas-at-worlds-2020
39 | [4]: https://www.newsweek.com/sylas-league-legends-champion-design-ultimate-bugs-interview-1304190
40 | [5]: https://www.newsweek.com/league-legends-sylas-new-champion-unshackled-1283886
41 | [6]: https://www.leagueoflegends.com/en-us/champions/sylas/
42 | [7]: https://liquipedia.net/leagueoflegends/Sylas
43 | [8]: https://wiki.leagueoflegends.com/en-us/Sylas
44 | [9]: https://leagueoflegends.fandom.com/wiki/Sylas/LoL
45 | [10]: https://www.oneesports.gg/league-of-legends/faker-worlds-2024-final-mvp/
--------------------------------------------------------------------------------
/answers/example/overleaf_template/answer_1.md:
--------------------------------------------------------------------------------
1 | # Three Overleaf CV templates with top-left photo placement
2 |
3 | Based on my comprehensive research of Overleaf's official template gallery, here are three CV templates that feature built-in photo placement functionality in the top left corner without requiring customization:
4 |
5 | ## 1. ModernCV Classic Template
6 |
7 | **Template Name:** ModernCV and Cover Letter Template **Direct Link:** [1]
8 |
9 | **Built-in Photo Feature:** The ModernCV template includes a built-in `\photo[64pt][0.4pt]{picture}` command that automatically places the photo in the top left corner of the CV header when using the "classic" style. [1] The photo appears to the left of the personal information and name section.
10 |
11 | **No Customization Required:** Simply uncomment the photo command and specify your image file name. [2] The template automatically handles the positioning in the top left area of the document header.
12 |
13 | ## 2. AltaCV Template (Bare Bones Version)
14 |
15 | **Template Name:** AltaCV Template **Direct Link:** [3]
16 |
17 | **Built-in Photo Feature:** AltaCV provides both `\photoL{2.8cm}{image_name}` and `\photoR{2.8cm}{image_name}` commands for left and right photo placement. [3] The `\photoL` command specifically places photos in the top left corner of the CV header area. [3] The template uses a two-column layout where the photo is positioned in the left column header. [3]
18 |
19 | **No Customization Required:** The template includes pre-configured photo commands - simply use `\photoL{2.8cm}{your_image}` instead of `\photoR{2.8cm}{Globe_High}` to place your photo in the top left corner. [3]
20 |
21 | ## 3. AltaCV Marissa Mayer Style Template
22 |
23 | **Template Name:** Recreating Business Insider's CV of Marissa Mayer using AltaCV **Direct Link:** [4]
24 |
25 | **Built-in Photo Feature:** This template is specifically designed to recreate the Marissa Mayer CV layout, which prominently features a photo in the top left corner area. [5] It uses the same AltaCV class with `\photoL` functionality but is pre-configured for the specific layout style that places the photo prominently in the top left. [5]
26 |
27 | **No Customization Required:** The template comes with the photo placement already configured for the top left corner. Simply replace the sample image with your own photo using the `\photoL` command.
28 |
29 | ## Key Features Common to All Templates:
30 |
31 | - **Official Overleaf Templates:** All three are from Overleaf's verified template gallery [6]
32 | - **Built-in Functionality:** No LaTeX coding modifications needed [7]
33 | - **Top Left Placement:** Photos are automatically positioned in the top left corner area
34 | - **Easy Implementation:** Simply specify your image filename in the photo command
35 | - **Professional Layouts:** All templates provide polished, professional CV designs [8]
36 |
37 | The **ModernCV template** is recommended for traditional academic or corporate CVs, while the **AltaCV templates** work well for modern, creative, or tech industry positions. [8] All three templates have been verified to work with standard image formats (JPG, PNG) and require no additional customization beyond specifying your image file. [5] [3]
38 |
39 | ## Sources
40 | [1]: https://www.overleaf.com/latex/templates/moderncv-and-cover-letter-template/sttkgjcysttn
41 | [2]: https://stackoverflow.com/questions/42182363/how-to-include-a-photo-in-moderncv-casual
42 | [3]: https://www.overleaf.com/latex/templates/altacv-template/trgqjpwnmtgv
43 | [4]: https://www.overleaf.com/latex/templates/recreating-business-insiders-cv-of-marissa-mayer-using-altacv/gtqfpbwncfvp
44 | [5]: https://www.overleaf.com/latex/templates/recreating-business-insiders-cv-of-marissa-mayer/gtqfpbwncfvp
45 | [6]: https://www.overleaf.com/latex/templates/altacv-nicolasomar-fork/htfpmrwhbwpw
46 | [7]: https://resumegenius.com/resume-templates/latex-resume-template
47 | [8]: https://www.latextemplates.com/cat/curricula-vitae
--------------------------------------------------------------------------------
/answers/example/overleaf_template/answer_2.md:
--------------------------------------------------------------------------------
1 | # Overleaf CV Templates with Top-Left Photo Support
2 |
3 | I found information about CV/resume templates from Overleaf's official template gallery that can display a photo in the top left corner without requiring further customization. However, I need to report some limitations in obtaining the exact direct URLs you requested.
4 |
5 | ## Successfully Located Template with Direct URL:
6 |
7 | ### 1. ModernCV and Cover Letter Template
8 |
9 | **Direct URL** : [1]
10 |
11 | **Photo Support** : ✅ **Confirmed** - This template includes explicit photo support using the `\photo[64pt][0.4pt]{picture}` command. [2] The photo appears in the top-left area of the CV with customizable dimensions and frame styling. This is an official Overleaf template [2] [1] that requires only replacing the image filename to add your photo. [1]
12 |
13 | ## Templates Confirmed to Exist with Photo Support (URLs Not Directly Accessible):
14 |
15 | ### 2. AltaCV Template
16 |
17 | **Template Type** : Multiple AltaCV template variations available in Overleaf's official gallery [3] **Photo Support** : ✅ **Confirmed** - The AltaCV class supports photos with options like `\documentclass[10pt,a4paper,normalphoto]{altacv}` for square photos and default circular photo cropping. [4] [5] Photos can be placed in the left sidebar/header area using commands like `\photoL{2.5cm}{photo_name}` . [3]
18 |
19 | ### 3. Simple Academic CV Template
20 |
21 | **Template Type** : Academic CV template with image support [6] **Photo Support** : ✅ **Confirmed** - Templates in this category include photo/image capabilities using commands like `\headerbox{2.6cm}{blackgray}{white}{Captain}{photo.jpg}` for placing photos in the header area. [7] [8]
22 |
23 | ## Research Limitations:
24 |
25 | While I confirmed these templates exist in Overleaf's official collection with the requested photo functionality, I encountered limitations in extracting the exact direct URLs for the AltaCV and Simple Academic CV templates. The search results showed template content and descriptions but didn't always display the complete URL patterns in the required `overleaf.com/latex/templates/[template-name]` format. [9]
26 |
27 | ## Recommendation:
28 |
29 | For the two templates without direct URLs, I recommend:
30 |
31 | 1. Visit Overleaf's template gallery at overleaf.com/gallery/tagged/cv [10]
32 | 2. Search for "AltaCV" and "Simple Academic CV"
33 | 3. Look for templates that specifically mention photo support in their descriptions
34 |
35 | All three template types are confirmed to be from Overleaf's official collection and include built-in photo functionality that works without additional LaTeX customization - requiring only image filename replacement. [10]
36 |
37 | ## Sources
38 | [1]: https://www.overleaf.com/latex/templates/moderncv-and-cover-letter-template/sttkgjcysttn
39 | [2]: https://www.overleaf.com/latex/examples/curriculum-vitae-for-researchers/jmrscnymyfps
40 | [3]: https://www.overleaf.com/gallery/tagged/cv
41 | [4]: https://www.overleaf.com/latex/templates/altacv-template/trgqjpwnmtgv
42 | [5]: https://www.overleaf.com/latex/templates/recreating-business-insiders-cv-of-marissa-mayer/gtqfpbwncfvp
43 | [6]: https://www.overleaf.com/latex/examples/tagged/cv
44 | [7]: https://www.overleaf.com/latex/templates/simple-academic-cv-with-photo/gxkrgqfhrwfp
45 | [8]: https://www.overleaf.com/latex/templates/simple-academic-cv/dncpvjfmfjqx
46 | [9]: https://www.overleaf.com/latex/templates/%5Btemplate-name
47 | [10]: https://www.overleaf.com/latex/templates/modern-latex-cv/qmdwjvcrcrph
--------------------------------------------------------------------------------
/answers/example/overleaf_template/answer_3.md:
--------------------------------------------------------------------------------
1 | # Three Overleaf CV Templates with Photo Support
2 |
3 | After searching Overleaf's official template gallery, I found several CV templates with built-in photo functionality. However, **most professional CV templates position photos on the left sidebar rather than specifically in the "top left corner"** as requested. Here are the three best templates that support photo display with left-side positioning:
4 |
5 | ## Template findings and important clarification
6 |
7 | **Critical Note** : The specific requirement for photos in the "top left corner" is rarely implemented in professional CV templates, as most use left sidebar layouts where photos are positioned vertically along the left side. The templates below offer the closest match to your requirements with left-side photo placement.
8 |
9 | ## 1. AltaCV Template
10 |
11 | **Status**: Available on Overleaf's template gallery. **Photo Functionality**: Built-in left-side photo support with the `\photoL{2cm}{photo}` command. [1] **Direct Link**: Search "AltaCV" in Overleaf's template gallery or access it through the CV/Resume section. **Photo Position**: Left side of the document; can be positioned at the top of the left column. **Key Features**:
12 |
13 | - Supports both circular and rectangular photos with `normalphoto` option [2]
14 | - Multiple photo placement options ( `\photoL` for left, `\photoR` for right) [2]
15 | - Based on Marissa Mayer's CV design [1]
16 | - No customization required for photo display
17 |
18 | ## 2. Twenty Seconds CV Template
19 |
20 | **Status**: Available on Overleaf's template gallery. **Photo Functionality**: Built-in circular photo display in the left sidebar. **Direct Link**: Search "Twenty Seconds CV" or "TwentySecondsCurriculumVitae" in Overleaf's gallery. **Photo Position**: Left sidebar, positioned at the top of the sidebar section. **Key Features**:
21 |
22 | - Prominent circular photo placement
23 | - Left sidebar design with photo at top
24 | - Color customizable
25 | - Timeline and progress bar features
26 |
27 | ## 3. Forty Seconds CV Template
28 |
29 | **Status**: Available on Overleaf's template gallery. **Photo Functionality**: Advanced photo customization options. **Direct Link**: Search "Forty Seconds CV" in Overleaf's template gallery. **Photo Position**: Left sidebar with customizable photo positioning. [3] **Key Features**:
30 |
31 | - `profilepicsize` , `profilepicstyle` , and positioning options [4]
32 | - Multiple photo styles (circular, square)
33 | - Highly customizable layout
34 | - Based on Twenty Seconds CV but with enhanced features [4]
35 |
36 | ## How to access these templates
37 |
38 | Since Overleaf's template gallery uses dynamic URLs, the most reliable way to access these templates is:
39 |
40 | 1. Go to **overleaf.com** [5] [6]
41 | 2. Click on **"Templates"** in the top menu
42 | 3. Filter by **"CV/Resume"** or search for the specific template names: [5] [7] - "AltaCV" - "Twenty Seconds CV" - "Forty Seconds CV"
43 |
44 | ## Important considerations
45 |
46 | **Layout Reality** : Professional CV templates typically use **left sidebar layouts** rather than placing photos in the absolute "top left corner" of the page. [8] The templates above position photos in the **top portion of the left sidebar** , which is the closest professional equivalent to your specification.
47 |
48 | **Recommendation** : The **AltaCV template** is your best option as it offers the most flexible photo positioning with the `\photoL` command and explicitly supports left-side photo placement without requiring modifications.
49 |
50 | All three templates support photo display without customization and are available in Overleaf's official template gallery, [5] [7] though none place photos in the literal "top left corner" as this isn't a standard professional CV layout pattern.
51 |
52 | ## Sources
53 | [1]: https://www.overleaf.com/latex/templates/tagged/cv
54 | [2]: https://www.overleaf.com/latex/templates/altacv-template/trgqjpwnmtgv
55 | [3]: https://www.overleaf.com/latex/examples/tagged/cv
56 | [4]: https://www.overleaf.com/latex/templates/forty-seconds-cv/pztcktmyngsk
57 | [5]: https://www.overleaf.com/gallery/tagged/cv
58 | [6]: https://www.overleaf.com/latex/templates/jakes-resume/syzfjbzwjncs
59 | [7]: https://www.overleaf.com/latex/templates/tagged/cv/page/2
60 | [8]: https://www.latextemplates.com/cat/curricula-vitae
--------------------------------------------------------------------------------
/answers/example/yu_lineage/answer_1.md:
--------------------------------------------------------------------------------
1 | # Academic Lineage of Prof. Yu Su
2 |
3 | Tracing the doctoral advisor relationships through five generations reveals a fascinating intellectual heritage spanning computer science, data mining, and philosophy. The first three generations are definitively confirmed through multiple authoritative sources, while generations four and five require additional archival research for complete verification.
4 |
5 | ## Generation 1: Xifeng Yan (Confirmed)
6 |
7 | **Prof. Yu Su's doctoral advisor** is **Dr. Xifeng Yan** , Professor of Computer Science at University of California, Santa Barbara.
8 |
9 | **Verification Sources:**
10 |
11 | - **UCSB Official Announcement (2018)** : "UCSB Computer Science Ph.D. candidate Yu Su recently accepted a faculty position with Ohio State University as Assistant Professor in the Computer Science and Engineering Department beginning Fall 2018. Currently, he is finishing up his Ph.D. with Dr. Xifeng Yan." [1]
12 | - **Academic Database** : Direct citation "PhD Advisor · Xifeng Yan · 2012 – 2018" [2]
13 | - **Publication Record** : Multiple co-authored papers during PhD years (2012-2018) in EMNLP, NAACL, SDM
14 | - **Google Scholar** : Xifeng Yan prominently listed in Yu Su's collaborator network [3]
15 |
16 | **Academic Details:**
17 |
18 | - PhD completed at UCSB (2012-2018) [1]
19 | - Research focus: Natural language processing, data mining, question answering [1]
20 | - Yu Su received Outstanding Dissertation Award from UCSB Computer Science (2019) [4] [5]
21 |
22 | ## Generation 2: Jiawei Han (Confirmed)
23 |
24 | **Dr. Xifeng Yan's doctoral advisor** is **Dr. Jiawei Han** , Michael Aiken Chair Professor at University of Illinois at Urbana-Champaign.
25 |
26 | **Verification Sources:**
27 |
28 | - **Mathematics Genealogy Project** : Direct citation "Advisor 1: Jiawei Han" for Xifeng Yan (MGP ID: 279264) [6]
29 | - **Academic Profile Database** : "PhD Advisor · Jiawei Han · 2001 – 2006" [7]
30 | - **UIUC Data Mining Group Website** : Consistently refers to Yan as "our group's alumnus" [8]
31 | - **Dissertation Record** : "Mining, Indexing and Similarity Search in Large Graph Data Sets" (2006) [6]
32 |
33 | **Academic Details:**
34 |
35 | - PhD completed at UIUC (2001-2006) [7] [9]
36 | - **ACM SIGMOD Dissertation Award Honorable Mention** (2007) [10]
37 | - Research conducted within Han's renowned Data Mining Group (DMG) [8]
38 |
39 | ## Generation 3: Larry E. Travis (Confirmed)
40 |
41 | **Dr. Jiawei Han's doctoral advisor** is **Dr. Larry E. Travis** (1929-2017), Professor Emeritus at University of Wisconsin-Madison.
42 |
43 | **Verification Sources:**
44 |
45 | - **Mathematics Genealogy Project** : "Advisor: Larry Travis" for Jiawei Han (MGP ID: 72247) [11]
46 | - **OpenReview Database** : "PhD Advisor · Larry Travis · 1983 – 1985" [12]
47 | - **Joint Publication (1985)** : Han and Travis co-authored "Using Expert Knowledge in Database-Oriented Problem Solving" [13]
48 | - **Multiple Biographical Sources** : Consistent documentation across academic profiles
49 |
50 | **Academic Details:**
51 |
52 | - PhD completed at University of Wisconsin-Madison (1985) [14]
53 | - Dissertation: "Pattern-Based and Knowledge-Directed Query Compilation for Recursive Data Bases" [11]
54 | - Travis served as Computer Sciences Department Chair (1978-1980) and Director of Madison Academic Computing Center [15] [16]
55 | - Research focus: Artificial intelligence, expert systems, Prolog [15]
56 |
57 | ## Generation 4: Richard Montague (Probable - Requires Verification)
58 |
59 | **Dr. Larry E. Travis's doctoral advisor** is most likely **Dr. Richard Montague** , but this requires additional archival research for definitive confirmation.
60 |
61 | **Current Evidence:**
62 |
63 | - **Larry E. Travis PhD Details** : University of California, Los Angeles, Philosophy Department (1966) [16] [17]
64 | - **Dissertation** : "A Logical Analysis of the Concept of Stored Program: A Step Toward a Possible Theory of Rational Learning" [16] [17]
65 | - **Mathematics Genealogy Project** : Lists Travis (MGP ID: 82279) [17] but **does not specify his advisor** [17]
66 |
67 | **Most Likely Candidate - Richard Montague:**
68 |
69 | - UCLA Philosophy Professor specializing in mathematical logic (1955-1971) [18]
70 | - **Confirmed dissertation supervisor** for other UCLA logic students (Nino Cocchiarella, Hans Kamp) [18]
71 | - Co-founder of UCLA Logic Colloquium with C.C. Chang [19] [20]
72 | - Research specialization aligns with Travis's dissertation on formal logical systems
73 | - Active during Travis's doctoral period (1960s)
74 |
75 | **Research Limitations:**
76 |
77 | - No online sources definitively name Travis's advisor
78 | - UCLA archival records from 1966 not digitally accessible
79 | - ProQuest dissertation database access needed for full dissertation acknowledgments
80 |
81 | ## Generation 5: Cannot Be Determined
82 |
83 | Without definitive confirmation of Generation 4, Generation 5 cannot be reliably identified. If Richard Montague is confirmed as Travis's advisor, his doctoral advisor would need to be researched (likely at UC Berkeley under Alfred Tarski, based on typical academic patterns of the era).
84 |
85 | ## Research Quality Assessment
86 |
87 | **High Confidence (Generations 1-3):**
88 |
89 | - Multiple independent, authoritative sources
90 | - Mathematics Genealogy Project confirmation [21]
91 | - Official university announcements
92 | - Consistent publication and collaboration records
93 |
94 | **Medium Confidence (Generation 4):**
95 |
96 | - Strong circumstantial evidence pointing to Richard Montague
97 | - Timeline and specialization alignment
98 | - No contradictory evidence found
99 |
100 | **Requires Additional Research:**
101 |
102 | - UCLA Special Collections and University Archives [22]
103 | - Access to original 1966 dissertation with acknowledgments
104 | - UCLA Philosophy Department historical records
105 | - Contact with Mathematics Genealogy Project to add missing advisor information
106 |
107 | ## Academic Lineage Summary
108 |
109 | 1. **Yu Su** → Xifeng Yan (UCSB, 2012-2018) [1] ✓ **Confirmed**
110 | 2. **Xifeng Yan** → Jiawei Han (UIUC, 2001-2006) ✓ **Confirmed**
111 | 3. **Jiawei Han** → Larry E. Travis (UW-Madison, 1983-1985) ✓ **Confirmed**
112 | 4. **Larry E. Travis** → Richard Montague (UCLA, ~1966) ? **Probable**
113 | 5. **Richard Montague** → [Unknown] ? **Requires Generation 4 confirmation**
114 |
115 | This lineage represents a remarkable intellectual trajectory from early computer science and AI foundations through data mining innovations to modern natural language processing and large language models, spanning over five decades of computational research evolution.
116 |
117 | ## Sources
118 | [1]: https://cs.ucsb.edu/index.php/happenings/news/phd-candidate-yu-su-set-join-ohio-state-assistant-professor
119 | [2]: https://openreview.net/profile?id=~Yu_Su2
120 | [3]: https://scholar.google.com/citations?user=rIh5OqoAAAAJ&hl=en
121 | [4]: https://cse.osu.edu/news/2025/02/cse-assistant-professor-yu-su-honored-2025-sloan-research-fellowship
122 | [5]: https://ysu1989.github.io/
123 | [6]: https://mathgenealogy.org/id.php?id=279264
124 | [7]: https://openreview.net/profile?id=~Xifeng_Yan1
125 | [8]: http://dm1.cs.uiuc.edu/
126 | [9]: https://iee.ucsb.edu/people/faculty/xifeng-yan
127 | [10]: https://dais.cs.illinois.edu/dais-news/dais-news-archive/
128 | [11]: https://mathgenealogy.org/id.php?id=72247
129 | [12]: https://openreview.net/profile?id=~Jiawei_Han1
130 | [13]: https://aisel.aisnet.org/icis1985/2/
131 | [14]: https://en.wikipedia.org/wiki/Jiawei_Han
132 | [15]: https://www.cs.wisc.edu/2017/09/25/in-memoriam-professor-emeritus-larry-travis/
133 | [16]: https://en.wikipedia.org/wiki/Larry_Travis
134 | [17]: https://www.genealogy.math.ndsu.nodak.edu/id.php?id=82279
135 | [18]: https://en.wikipedia.org/wiki/Richard_Montague
136 | [19]: http://www.logic.ucla.edu/history.html
137 | [20]: https://nap.nationalacademies.org/read/10683/chapter/15
138 | [21]: https://en.wikipedia.org/wiki/Mathematics_Genealogy_Project
139 | [22]: https://guides.library.ucla.edu/c.php?g=180289&p=1185474
--------------------------------------------------------------------------------
/answers/example/yu_lineage/answer_2.md:
--------------------------------------------------------------------------------
1 | # Prof. Yu Su's Academic Lineage: Five Generations Traced
2 |
3 | The academic genealogy of Prof. Yu Su at The Ohio State University reveals a remarkable intellectual heritage spanning from modern AI and natural language processing back to the foundational logic schools of early 20th-century Europe. Through comprehensive research across academic databases, university archives, and genealogy projects, I have successfully traced five complete generations of doctoral advisor-advisee relationships.
4 |
5 | ## Generation 1: Xifeng Yan (Yu Su's Doctoral Advisor)
6 |
7 | **Full Name:** Xifeng Yan. **Institution:** University of California, Santa Barbara (PhD completed 2006). **Year of PhD Completion:** 2006. **Doctoral Advisor:** Jiawei Han (University of Illinois at Urbana-Champaign) [1]
8 |
9 | **Academic Information:** Xifeng Yan currently holds the Venkatesh Narayanamurti Chair in Computer Science at UC Santa Barbara. [2] His research focuses on data mining, graph mining, knowledge graphs, and machine learning. **He received the SIGMOD Distinguished Dissertation Award Honorable Mention in 2007** . Yan supervised Yu Su's PhD from 2012-2018, [3] [4] during which Su won the Outstanding Dissertation Award from UCSB in 2019. [5] The Mathematics Genealogy Project confirms Yan as having supervised 3 students with 3 total academic descendants. [1]
10 |
11 | ## Generation 2: Jiawei Han (Xifeng Yan's Doctoral Advisor)
12 |
13 | **Full Name:** Jiawei Han (韩家炜). [6] **Institution:** University of Wisconsin-Madison (PhD completed 1985). [7] **Year of PhD Completion:** 1985. **Doctoral Advisor:** Larry E. Travis (University of Wisconsin-Madison)
14 |
15 | **Academic Information:** Currently the Michael Aiken Chair Professor of Computer Science at University of Illinois at Urbana-Champaign, Jiawei Han is one of the most cited computer scientists globally [8] with over 268,000 citations. [8] **He is an ACM Fellow, IEEE Fellow, and recipient of the ACM SIGKDD Innovation Award (2004)** . [6] His dissertation was titled "Pattern-Based and Knowledge-Directed Query Compilation for Recursive Data Bases." [7] The Mathematics Genealogy Project (ID: 72247) shows he has supervised 13 students with 45 total academic descendants, [7] making him a highly influential figure in data mining and database systems. [7]
16 |
17 | ## Generation 3: Larry E. Travis (Jiawei Han's Doctoral Advisor)
18 |
19 | **Full Name:** Larry E. Travis **Institution:** University of California, Los Angeles (PhD completed 1966) [9] **Year of PhD Completion:** 1966 **Doctoral Advisor:** Richard Montague (UCLA Philosophy Department)
20 |
21 | **Academic Information:** Larry Travis (1929-2017) served as Professor of Computer Science at University of Wisconsin-Madison from 1964-1994, [10] including tenure as Department Chair from 1978-1980. [11] **He was among the founding faculty of UW-Madison's Computer Science Department** . His dissertation was titled "A Logical Analysis of the Concept of Stored Program: A Step Toward a Possible Theory of Rational Learning." [9] [10] Travis specialized in artificial intelligence, expert systems, and Prolog, [11] representing the intersection of logic, philosophy, and early computer science. He also served as Director of the Madison Academic Computing Center. [11]
22 |
23 | ## Generation 4: Richard Montague (Larry Travis's Doctoral Advisor)
24 |
25 | **Full Name:** Richard Merett Montague **Institution:** University of California, Berkeley (PhD completed 1957) **Year of PhD Completion:** 1957 **Doctoral Advisor:** Alfred Tarski (UC Berkeley Philosophy Department)
26 |
27 | **Academic Information:** Richard Montague (1930-1971) was a prominent logician and philosopher at UCLA from 1955 until his death in 1971. [12] **He is best known for Montague Grammar, a revolutionary approach to natural language semantics that treated natural language as a formal mathematical system** . [12] [13] Montague's work became foundational for computational linguistics and natural language processing, [14] creating a direct intellectual connection to Yu Su's current research. [14] He was a central figure in establishing UCLA as a world center for mathematical logic alongside Rudolf Carnap. [15] Montague supervised several notable logicians including David Kaplan and Hans Kamp. [16]
28 |
29 | ## Generation 5: Alfred Tarski (Richard Montague's Doctoral Advisor)
30 |
31 | **Full Name:** Alfred Tarski (born Alfred Teitelbaum) **Institution:** University of Warsaw (PhD completed 1924) **Year of PhD Completion:** 1924 **Doctoral Advisor:** Stanisław Leśniewski (University of Warsaw)
32 |
33 | **Academic Information:** Alfred Tarski (1901-1983) was one of the most influential logicians and mathematicians of the 20th century. [17] [18] **He is best known for his semantic theory of truth and his work on model theory, which became fundamental to computer science and artificial intelligence** . [17] After emigrating to the United States in 1939, Tarski joined UC Berkeley where he remained until his death, establishing Berkeley as a major center for logic and mathematics. [17] [19] He supervised numerous students who became leaders in logic, mathematics, and early computer science, creating an intellectual lineage that spans from pure mathematics to modern AI. [17]
34 |
35 | ## Extended Heritage: The Warsaw School Connection
36 |
37 | Beyond the required five generations, the lineage continues to **Stanisław Leśniewski** (1886-1939), a founder of modern mathematical logic at the University of Warsaw, [20] [21] and ultimately connects to **Kazimierz Twardowski** (1866-1938), founder of the influential Lwów-Warsaw School of Logic [21] that revolutionized 20th-century philosophy and mathematics.
38 |
39 | ## Intellectual Evolution Analysis
40 |
41 | This academic lineage represents a fascinating evolution of ideas: from the foundational logical systems of the Warsaw School (Tarski, Leśniewski) through semantic theory and formal language analysis (Montague), to artificial intelligence and expert systems (Travis), database systems and data mining (Jiawei Han), graph mining and knowledge discovery (Yan), and finally to modern natural language processing and AI agents (Yu Su). **The thread connecting mathematical logic to contemporary AI demonstrates how foundational theoretical work eventually enables practical applications decades later** .
42 |
43 | ## Research Methodology and Source Verification
44 |
45 | This genealogy was verified through multiple authoritative sources including the Mathematics Genealogy Project (primary source for academic lineage verification), [22] university historical records, dissertation databases, academic CVs and biographical materials, and cross-referencing across multiple academic databases. Each generation was confirmed through at least two independent sources, with particular attention paid to resolving conflicting information through primary academic records.
46 |
47 | The research revealed that **Prof. Yu Su's academic heritage connects modern artificial intelligence research to the foundational mathematical logic tradition of early 20th-century Europe** , representing over a century of intellectual development from pure mathematical theory to applied computational intelligence.
48 |
49 | ## Sources
50 | [1]: https://mathgenealogy.org/id.php?id=279264
51 | [2]: https://ysu1989.github.io/
52 | [3]: https://openreview.net/profile?id=~Yu_Su2
53 | [4]: https://cs.ucsb.edu/index.php/happenings/news/phd-candidate-yu-su-set-join-ohio-state-assistant-professor
54 | [5]: https://cse.osu.edu/people/su.809
55 | [6]: https://en.wikipedia.org/wiki/Jiawei_Han
56 | [7]: https://mathgenealogy.org/id.php?id=72247
57 | [8]: https://siebelschool.illinois.edu/news/han_research_ranking
58 | [9]: https://www.genealogy.math.ndsu.nodak.edu/id.php?id=82279
59 | [10]: https://en.wikipedia.org/wiki/Larry_Travis
60 | [11]: https://www.cs.wisc.edu/2017/09/25/in-memoriam-professor-emeritus-larry-travis/
61 | [12]: https://en.wikipedia.org/wiki/Richard_Montague
62 | [13]: https://ncatlab.org/nlab/show/Richard+Montague
63 | [14]: https://linguistics.stanford.edu/events/semantics-and-pragmatics/richard-montague-simplicity-language-complexity-life-toward
64 | [15]: https://tigerneil.wordpress.com/2012/07/13/a-brief-history-of-logic-at-ucla/
65 | [16]: https://www.richardmontague.com/
66 | [17]: https://en.wikipedia.org/wiki/Alfred_Tarski
67 | [18]: https://fr.wikipedia.org/wiki/Abraham_Adolf_Fraenkel
68 | [19]: http://www.logic.ucla.edu/history.html
69 | [20]: https://plato.sydney.edu.au/archives/sum2018/entries/lesniewski/
70 | [21]: https://www.scientificlib.com/en/Mathematics/Biographies/StanislawLesniewski.html
71 | [22]: https://www.mathgenealogy.org/id.php?id=106535&lang=en
--------------------------------------------------------------------------------
/answers/example/yu_lineage/answer_3.md:
--------------------------------------------------------------------------------
1 | # Prof. Yu Su's Academic Lineage: Five Generations
2 |
3 | This research traces the complete doctoral advisor lineage of Prof. Yu Su at The Ohio State University through five generations, revealing a remarkable intellectual heritage spanning computer science, mathematics, and mathematical logic from the early 20th century to today.
4 |
5 | **Yu Su's doctoral advisor is Dr. Xifeng Yan (UC Santa Barbara), whose lineage traces back through distinguished figures including data mining pioneer Jiawei Han, AI researcher Larry Travis, famous mathematician Abraham Robinson, and Hungarian function theorist Paul Dienes.** This lineage spans multiple countries, institutions, and mathematical disciplines, representing nearly a century of academic scholarship from Europe to North America.
6 |
7 | ## Generation 1: Xifeng Yan (UC Santa Barbara)
8 |
9 | **Full Name:** Xifeng Yan **PhD Institution:** University of Illinois at Urbana-Champaign **Year Completed:** 2006 **Doctoral Advisor:** Jiawei Han **Primary Research Area:** Data mining, graph mining, knowledge bases, and conversational AI
10 |
11 | Xifeng Yan serves as Professor and holds the Venkatesh Narayanamurti Chair in Computer Science at UC Santa Barbara. [1] His 2006 dissertation "Mining, Indexing and Similarity Search in Large Graph Data Sets" [2] earned him the **ACM-SIGMOD Dissertation Runner-Up Award** in 2007. [3] [4] He has received numerous accolades including the NSF CAREER Award and IBM Invention Achievement Award, with over 23,000 citations demonstrating his significant impact on data mining research. [3]
12 |
13 | **Sources:** Mathematics Genealogy Project (MGP ID 279264), [5] [2] UCSB Computer Science Department announcements, Google Scholar profiles, academic publication records
14 |
15 | ## Generation 2: Jiawei Han (University of Illinois at Urbana-Champaign)
16 |
17 | **Full Name:** Jiawei Han **PhD Institution:** University of Wisconsin-Madison **Year Completed:** 1985 [6] **Doctoral Advisor:** Larry E. Travis **Primary Research Area:** Data mining, text mining, database systems, and information networks
18 |
19 | Jiawei Han is the Michael Aiken Chair Professor at UIUC and founder of the influential Data Mining Group (DMG). [7] Born in Shanghai in 1949, he completed his bachelor's degree at University of Science and Technology of China in 1979 before pursuing doctoral studies at UW-Madison. [8] His dissertation "Pattern-Based and Knowledge-Directed Query Compilation for Recursive Data Bases" established foundations for intelligent database systems. [9] Han is an **ACM Fellow and IEEE Fellow** , recipient of multiple prestigious awards including the ACM SIGKDD Innovations Award (2004) and IEEE Computer Society W. Wallace McDowell Award (2009). [8] He authored the widely-used textbook "Data Mining: Concepts and Techniques." [8]
20 |
21 | **Sources:** Mathematics Genealogy Project, UIUC faculty records, ACM/IEEE award announcements, DAIS Laboratory documentation
22 |
23 | ## Generation 3: Larry E. Travis (University of Wisconsin-Madison)
24 |
25 | **Full Name:** Larry E. Travis **PhD Institution:** University of California, Los Angeles (UCLA) **Year Completed:** 1966 **Doctoral Advisor:** Abraham Robinson **Primary Research Area:** Artificial intelligence, expert systems, Prolog programming, and AI applications
26 |
27 | Larry Travis (1929-2017) served as Professor Emeritus in the Computer Sciences Department at UW-Madison for 30 years (1964-1994). [10] His 1966 dissertation "A Logical Analysis of the Concept of Stored Program: A Step Toward a Possible Theory of Rational Learning" reflected his philosophical approach to computing. [11] Travis was **Chair of the Computer Sciences Department (1978-1980)** and Director of Madison Academic Computing Center, playing a crucial role in expanding computing across the university system. [12] He pioneered work in expert systems and Prolog programming at UW-Madison. [12]
28 |
29 | **Sources:** Mathematics Genealogy Project, Wikidata, UW-Madison memorial records, department historical archives
30 |
31 | ## Generation 4: Abraham Robinson (UCLA)
32 |
33 | **Full Name:** Abraham Robinson (born Abraham Robinsohn) **PhD Institution:** University of London (Birkbeck College) **Year Completed:** 1949 **Doctoral Advisor:** Paul Dienes **Primary Research Area:** Mathematical logic, model theory, and nonstandard analysis
34 |
35 | Abraham Robinson (1918-1974) was one of the most influential mathematicians of the 20th century. [13] [14] Born in Germany and later studying at Hebrew University of Jerusalem under Abraham Fraenkel, he completed his PhD in 1949 with dissertation "The Metamathematics of Algebraic Systems." [14] [13] Robinson revolutionized mathematics with his development of **nonstandard analysis** , providing a rigorous foundation for infinitesimals and validating Leibniz's original approach to calculus. [15] He published his groundbreaking book "Non-standard Analysis" in 1966 while at UCLA (1962-1967). [14] [16] **Robinson was elected to the National Academy of Sciences** shortly before his death and received the Brouwer Medal from the Dutch Mathematical Society (1973). His work spans 130 papers and 9 books, [15] with the Mathematics Genealogy Project documenting 19 direct students and 537 academic descendants. [17] [18]
36 |
37 | **Sources:** Mathematics Genealogy Project, National Academy of Sciences Biographical Memoirs, MacTutor History of Mathematics, Yale University Archives, Dictionary of Scientific Biography
38 |
39 | ## Generation 5: Paul Dienes (University of London)
40 |
41 | **Full Name:** Paul Alexander Dienes (Hungarian: Dienes Pál) **PhD Institution:** Eötvös Loránd University and Université Paris IV-Sorbonne **Year Completed:** 1905 (Budapest), 1907 (Paris) **Doctoral Advisor:** Not definitively documented in available sources **Primary Research Area:** Function theory, mathematical analysis, and Taylor series
42 |
43 | Paul Dienes (1882-1952) was a Hungarian mathematician who fled political persecution in 1920 and became Professor of Mathematics at Birkbeck College, London (1923-1948). [19] Born in Tokaj, Austria-Hungary, he earned dual PhDs with his dissertation "Essai sur les Singularités des Fonctions Analytiques." [20] [21] Dienes is best known for his work "The Taylor Series" (1931) [19] and for supervising several distinguished mathematicians including Abraham Robinson and Ralph Henstock. [19] [22] His academic career bridged the classical European mathematical tradition with the emerging modern mathematical landscape of the mid-20th century.
44 |
45 | **Sources:** Mathematics Genealogy Project, Birkbeck College records, [23] Hungarian mathematical biography sources, academic genealogy databases
46 |
47 | ## Research Methodology and Source Verification
48 |
49 | This lineage was traced using multiple authoritative sources including the Mathematics Genealogy Project, university archives, official faculty records, academic databases, and biographical sources. [18] Each advisor-student relationship was verified through multiple independent sources, including official university announcements, [24] dissertation records, academic profiles, and peer-reviewed publications. The research revealed some gaps in computer science coverage in traditional academic genealogy databases, highlighting the importance of interdisciplinary verification across mathematical and computer science sources. [25] [26]
50 |
51 | The traced lineage spans remarkable intellectual diversity: from modern AI and data mining (Yu Su, Yan, Han) through early computer science and AI (Travis) to foundational mathematical logic (Robinson) and classical analysis (Dienes), representing nearly a century of mathematical and computational advancement across multiple continents and institutions.
52 |
53 | ## Sources
54 | [1]: https://scholar.google.com/citations?user=XZV2eogAAAAJ&hl=en
55 | [2]: https://mathgenealogy.org/id.php?id=279264
56 | [3]: https://iee.ucsb.edu/people/faculty/xifeng-yan
57 | [4]: https://ml.ucsb.edu/people/faculty/xifeng-yan
58 | [5]: https://www.genealogy.math.ndsu.nodak.edu/id.php?id=279264
59 | [6]: https://www.coursera.org/instructor/jiaweihan
60 | [7]: https://dais.cs.illinois.edu/
61 | [8]: https://en.wikipedia.org/wiki/Jiawei_Han
62 | [9]: https://mathgenealogy.org/id.php?id=72247
63 | [10]: https://en.wikipedia.org/wiki/Larry_Travis
64 | [11]: https://www.genealogy.math.ndsu.nodak.edu/id.php?id=82279
65 | [12]: https://www.cs.wisc.edu/2017/09/25/in-memoriam-professor-emeritus-larry-travis/
66 | [13]: https://mathshistory.st-andrews.ac.uk/Biographies/Robinson/
67 | [14]: https://en.wikipedia.org/wiki/Abraham_Robinson
68 | [15]: https://www.encyclopedia.com/science/dictionaries-thesauruses-pictures-and-press-releases/robinson-abraham
69 | [16]: https://en.wikipedia.org/wiki/Nonstandard_analysis
70 | [17]: https://www.genealogy.math.ndsu.nodak.edu/id.php?id=15886
71 | [18]: https://en.wikipedia.org/wiki/Mathematics_Genealogy_Project
72 | [19]: https://mathshistory.st-andrews.ac.uk/Biographies/Dienes_Paul/
73 | [20]: https://www.genealogy.ams.org/id.php?id=24557
74 | [21]: https://en.wikipedia.org/wiki/Paul_Dienes
75 | [22]: https://bookofproofs.github.io/history/19th-century/dienes-paul.html
76 | [23]: https://en.wikipedia.org/wiki/Ralph_Henstock
77 | [24]: https://cs.ucsb.edu/index.php/happenings/news/phd-candidate-yu-su-set-join-ohio-state-assistant-professor
78 | [25]: https://database.cs.wisc.edu/
79 | [26]: https://www.cs.wisc.edu/research/research-groups/
--------------------------------------------------------------------------------
/assets/mind2web2_overview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OSU-NLP-Group/Mind2Web-2/7f0da8401acb5502e1268cae55123e001685c7bb/assets/mind2web2_overview.jpg
--------------------------------------------------------------------------------
/batch_answer_cache.py:
--------------------------------------------------------------------------------
1 | """
2 | Batch crawler using CacheFileSys (v2) - file-based cache with single-task design.
3 |
4 | Key changes from the old batch_cache.py:
5 | - Uses CacheFileSys instead of CacheClass (one cache instance per task)
6 | - Stores content in task directories instead of PKL files
7 | - Uses put_web(url, text, screenshot) instead of separate put_text/put_screenshot
8 | - Removes MHTML storage (not supported in CacheFileSys)
9 | - Memory efficient: only indexes in memory, content loaded on-demand
10 |
11 | Depends on unified path management (`PathConfig`), which auto-detects the
12 | project root and subdirectories like dataset/workspace. No manual path
13 | concatenation needed.
14 | """
15 |
16 | from __future__ import annotations
17 |
18 | import argparse
19 | import asyncio
20 | import json
21 | import os
22 | import random
23 | import re
24 | from pathlib import Path
25 | from typing import Any, Dict, List, Tuple, Optional
26 | from urllib.parse import urlparse
27 |
28 | from pydantic import BaseModel
29 | from tqdm import tqdm
30 | import validators
31 | from urllib.parse import urldefrag, unquote, urlparse, parse_qs, urlencode, urlunparse
32 |
33 | # -------------------------------------------------------------------- #
34 | # Mind2Web2 imports
35 | # -------------------------------------------------------------------- #
36 | from mind2web2.llm_client.azure_openai_client import AsyncAzureOpenAIClient
37 | from mind2web2.llm_client.openai_client import AsyncOpenAIClient
38 | from mind2web2.utils.page_info_retrieval import (
39 | BatchBrowserManager,
40 | )
41 | from mind2web2.api_tools.tool_pdf import is_pdf
42 | from mind2web2.utils.cache_filesys import CacheFileSys # 🔄 Changed import
43 | from mind2web2.utils.logging_setup import create_logger
44 | from mind2web2.api_tools.tool_pdf import PDFParser
45 | from mind2web2.utils.path_config import PathConfig
46 | from mind2web2.prompts.cache_prompts import llm_extraction_prompts
47 | from mind2web2.utils.url_tools import remove_utm_parameters, normalize_url_simple, regex_find_urls, URLs
48 |
49 | # -------------------------------------------------------------------- #
50 | # Global configuration
51 | # -------------------------------------------------------------------- #
52 |
53 | # LLM concurrency control (kept for URL extraction stage)
54 | MAX_LLM_CONCURRENCY = 30 # Concurrent LLM calls for URL extraction
55 | llm_semaphore = asyncio.Semaphore(MAX_LLM_CONCURRENCY)
56 |
57 | # Centralized paths
58 | paths = PathConfig(Path(__file__).resolve().parent) # Project root (script at top level)
59 | ANSWERS_ROOT = paths.answers_root # /dataset/answers
60 | CACHE_ROOT = paths.cache_root # /workspace/cache
61 |
62 | # Override if needed (e.g., write to dataset/cache instead of workspace/cache)
63 | # CACHE_ROOT = paths.dataset_root / "cache"
64 |
65 | # Logging
66 | logger, _ = create_logger(__name__, "tmp_logs")
67 |
68 | # -------------------------------------------------------------------- #
69 | # Helpers for URL extraction
70 | # -------------------------------------------------------------------- #
71 |
72 | #
73 | # def _is_valid_url(u: str) -> bool:
74 | # p = urlparse(u)
75 | # return p.scheme in {"http", "https"} and "." in p.netloc and len(p.netloc) > 2
76 |
77 | async def llm_extract_urls_with_model(
78 | client: AsyncAzureOpenAIClient | AsyncOpenAIClient,
79 | answer_text: str,
80 | model: str
81 | ) -> List[str]:
82 | """Extract URLs using specified LLM model with enhanced prompt."""
83 | try:
84 | async with llm_semaphore:
85 | result: URLs = await client.response(
86 | model=model,
87 | messages=[{"role": "system", "content": llm_extraction_prompts},{"role": "user", "content": answer_text}],
88 | response_format=URLs,
89 | )
90 | return result.urls or []
91 | except Exception as e:
92 | logger.warning(f"LLM extraction failed with model {model}: {e}")
93 | return []
94 |
95 | async def llm_extract_urls_multi_model(
96 | client: AsyncAzureOpenAIClient | AsyncOpenAIClient,
97 | answer_text: str,
98 |     models: Optional[List[str]] = None
99 | ) -> List[str]:
100 | """Extract URLs using multiple LLM models concurrently and merge results."""
101 | if models is None:
102 | models = ["o4-mini", "gpt-4.1"]
103 |
104 | # Run all models concurrently
105 | tasks = [
106 | llm_extract_urls_with_model(client, answer_text, model)
107 | for model in models
108 | ]
109 |
110 | results = await asyncio.gather(*tasks, return_exceptions=True)
111 |
112 | # Merge all results
113 | all_urls = set()
114 | for result in results:
115 | if isinstance(result, list):
116 | all_urls.update(result)
117 | elif isinstance(result, Exception):
118 | logger.warning(f"Model extraction failed: {result}")
119 |
120 | return list(all_urls)
121 |
122 |
123 |
124 | def filter_url_variants(urls: List[str], priorities: Dict[str, int] | None = None) -> List[str]:
125 | """Filter out URL variants to keep only unique URLs.
126 |
127 | Args:
128 | urls: URL candidates (duplicates allowed).
129 | priorities: Optional map assigning lower scores to preferred originals.
130 | """
131 | if not urls:
132 | return []
133 |
134 | # Group URLs by normalized form
135 | url_groups = {}
136 | for url in urls:
137 | normalized = normalize_url_simple(url)
138 | if normalized not in url_groups:
139 | url_groups[normalized] = []
140 | url_groups[normalized].append(url)
141 |
142 | # Select representative URL from each group
143 | unique_urls = []
144 | priority_lookup = priorities or {}
145 | default_priority = 1 if priorities else 0
146 | for group in url_groups.values():
147 | # Prefer https over http, then prefer shorter URLs
148 | group.sort(key=lambda u: (
149 | priority_lookup.get(u, default_priority),
150 | 0 if u.startswith('https://') else 1, # https first
151 | len(u), # shorter first
152 | u.lower() # alphabetical
153 | ))
154 | unique_urls.append(group[0])
155 |
156 | return unique_urls
157 |
158 | async def extract_from_file(
159 | client: AsyncAzureOpenAIClient | AsyncOpenAIClient | None,
160 | ans_path: Path,
161 | rel_source: str,
162 |     llm_models: Optional[List[str]] = None,
163 | ) -> Tuple[Dict[str, List[str]], int]:
164 | """Enhanced URL extraction with multi-model LLM and comprehensive regex + variant filtering."""
165 | text = ans_path.read_text(encoding="utf-8")
166 |
167 | # --- Enhanced regex extraction ---
168 | urls_regex = regex_find_urls(text)
169 |
170 | # --- Multi-model LLM extraction ---
171 | urls_llm: List[str] = []
172 | if client is not None:
173 | urls_llm = await llm_extract_urls_multi_model(client, text, llm_models)
174 |
175 | # --- Merge all results ---
176 | priorities: Dict[str, int] = {}
177 | for url in urls_regex:
178 | priorities[url] = 0
179 | for url in urls_llm:
180 | priorities.setdefault(url, 1)
181 |
182 | all_urls = urls_regex + urls_llm
183 |
184 | # --- Filter variants to avoid duplicates ---
185 | unique_urls = filter_url_variants(all_urls, priorities if priorities else None)
186 |
187 | mapping = {u: [rel_source] for u in unique_urls}
188 | return mapping, len(unique_urls)
189 |
190 | # -------------------------------------------------------------------- #
191 | # Crawling helpers
192 | # -------------------------------------------------------------------- #
193 |
194 | async def crawl_one_page(url: str, cache: CacheFileSys, pdf_parser: PDFParser, browser_manager: BatchBrowserManager) -> None:
195 | """Crawl a single page using a shared browser instance."""
196 | try:
197 | # Already cached? Skip
198 | if cache.has(url):
199 | return
200 |         url = remove_utm_parameters(url)
201 | logger.info(f"Crawling {url}")
202 | # ---------- PDF ----------
203 | is_pdf_or_not = await is_pdf(url)
204 | if is_pdf_or_not:
205 | try:
206 | await asyncio.sleep(0.2 * random.random())
207 | buf = await pdf_parser._fetch_pdf_bytes(url)
208 | if buf is not None:
209 | cache.put_pdf(url, buf)
210 | return
211 | except Exception as e:
212 |                 logger.info(f"Failed to extract PDF from {url}: {e}")
213 |
214 | # ---------- Web page capture (using shared browser) ----------
215 | if is_pdf_or_not:
216 |             logger.info(f"⚠ PDF fetch failed; loading the seemingly-PDF URL in the browser instead: {url}")
217 |
218 | shot, text = await browser_manager.capture_page(url, logger)
219 |
220 | # ---------- Persist ----------
221 | if shot and text:
222 | cache.put_web(url, text, shot)
223 |
224 | except Exception:
225 | logger.error(f"Error crawling {url}", exc_info=True)
226 |
227 | # -------------------------------------------------------------------- #
228 | # Safe wrapper with timeout
229 | # -------------------------------------------------------------------- #
230 | async def crawl_one_page_safe(
231 | url: str,
232 | cache: CacheFileSys,
233 | pdf_parser: PDFParser,
234 | browser_manager: BatchBrowserManager,
235 | overall_timeout: int = 300, # Overall 5-minute timeout to avoid hanging
236 | ) -> None:
237 | """
238 | Wrap `crawl_one_page()` with an overall timeout to prevent hanging.
239 |
240 | Args:
241 | overall_timeout: Maximum time in seconds for the entire page capture process.
242 | This prevents a single page from hanging the entire program.
243 | Playwright's internal timeouts (30s) handle navigation issues.
244 | """
245 | try:
246 | await asyncio.wait_for(
247 | crawl_one_page(url, cache, pdf_parser, browser_manager),
248 | timeout=overall_timeout,
249 | )
250 | except asyncio.TimeoutError:
251 | logger.warning(f"Overall timeout: abandoned {url} after {overall_timeout}s to prevent program hanging")
252 | except Exception:
253 | logger.error(f"Unexpected error crawling {url}", exc_info=True)
254 |
255 | # -------------------------------------------------------------------- #
256 | # Utilities
257 | # -------------------------------------------------------------------- #
258 |
259 | def sort_ci(iterable):
260 | """Case-insensitive sorting."""
261 | return sorted(iterable, key=lambda s: s.lower())
262 |
263 | # -------------------------------------------------------------------- #
264 | # Main pipeline per task
265 | # -------------------------------------------------------------------- #
266 |
267 | async def process_cache(
268 | agent_name: str,
269 | task_id: str,
270 | llm_provider: str = "openai",
271 | max_concurrent_pages: int = 30,
272 | max_retries: int = 1,
273 | overall_timeout: int = 300, # Overall timeout to prevent hanging
274 | headless: bool = False,
275 | ) -> None:
276 | """
277 |     1) Discover and aggregate all URLs in the answers; write them to CACHE_ROOT/<agent_name>/<task_id>.json
278 |     2) Crawl web/PDF content for each unique URL; write it to the CACHE_ROOT/<agent_name>/<task_id>/ directory
279 | """
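    # Illustrative layout of the <task_id>.json metadata written in step 1 and
    # enriched in step 2 (the URLs and counts below are made-up examples; the
    # "url_types" values are whatever content-type strings cache.has() reports):
    #   {
    #     "agent_name": "example",
    #     "task_id": "yu_lineage",
    #     "total_unique_urls": 2,
    #     "all_unique_urls": ["https://example.com/a", "https://example.com/b"],
    #     "urls": {"https://example.com/a": ["answer_1.md"], "https://example.com/b": ["answer_2.md"]},
    #     "url_types": {"https://example.com/a": "..."},
    #     "cached_url_count": 1
    #   }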
280 | answer_root = ANSWERS_ROOT / agent_name / task_id
281 | agent_cache_root = CACHE_ROOT / agent_name
282 | agent_cache_root.mkdir(parents=True, exist_ok=True)
283 |
284 | meta_json = agent_cache_root / f"{task_id}.json"
285 | cache_task_dir = agent_cache_root / task_id
286 |
287 | # ------------------------------------------------- #
288 | # Step 1️⃣ URL discovery
289 | # ------------------------------------------------- #
290 | meta_data: Dict[str, Any]
291 |
292 | if meta_json.exists():
293 | logger.info(f"[{agent_name}/{task_id}] Found existing {meta_json.name}, skipping extraction …")
294 | data = json.loads(meta_json.read_text("utf-8"))
295 | url_meta: Dict[str, List[str]] = data["urls"]
296 | all_unique_urls: List[str] = data["all_unique_urls"]
297 | meta_data = data
298 | else:
299 | # Initialize LLM client based on provider
300 | if llm_provider == "openai":
301 | client = AsyncOpenAIClient()
302 | elif llm_provider == "azure_openai":
303 | client = AsyncAzureOpenAIClient()
304 | else:
305 | raise ValueError(f"Unsupported LLM provider: {llm_provider}")
306 | url_meta: Dict[str, List[str]] = {}
307 |
308 | # All .md answer files
309 | answer_files = [p for p in answer_root.rglob("*.md") if p.is_file()]
310 | logger.info(f"[{agent_name}/{task_id}] Extracting URLs from {len(answer_files)} .md answer files …")
311 |
312 | async def handle_file(p: Path):
313 | # File path structure: answer_root/task_id/*.md or answer_root/task_id/subdir/*.md
314 | rel_path = p.relative_to(answer_root)
315 | rel_source = str(rel_path)
316 | mapping, _ = await extract_from_file(client, p, rel_source)
317 | return mapping
318 |
319 | # Progress bar: extraction
320 | with tqdm(total=len(answer_files), desc="Extracting", unit="file", ncols=80) as bar:
321 | coros = [handle_file(p) for p in answer_files]
322 | for coro in asyncio.as_completed(coros):
323 | mapping = await coro
324 | for u, srcs in mapping.items():
325 | url_meta.setdefault(u, []).extend(srcs)
326 | bar.update(1)
327 |
328 | # Deduplicate + sort
329 | url_meta = {u: sort_ci(list(set(srcs))) for u, srcs in url_meta.items()}
330 | ordered_items = sorted(url_meta.items(), key=lambda kv: (-len(kv[1]), kv[0].lower()))
331 | url_meta_ordered = {u: srcs for u, srcs in ordered_items}
332 | all_unique_urls = sort_ci(url_meta_ordered.keys())
333 |
334 | payload = {
335 | "agent_name": agent_name,
336 | "task_id": task_id,
337 | "total_unique_urls": len(all_unique_urls),
338 | "all_unique_urls": all_unique_urls,
339 | "urls": url_meta_ordered,
340 | "url_types": {},
341 | }
342 | meta_json.write_text(json.dumps(payload, ensure_ascii=False, indent=2), "utf-8")
343 | logger.info(f"[{agent_name}/{task_id}] Wrote URL metadata → {meta_json.relative_to(paths.project_root)}")
344 | url_meta = url_meta_ordered
345 | meta_data = payload
346 |
347 | # ------------------------------------------------- #
348 | # Step 2️⃣ Crawl & cache (using shared browser instance)
349 | # ------------------------------------------------- #
350 | logger.info(f"[{agent_name}/{task_id}] Total unique URLs to crawl: {len(all_unique_urls)}")
351 |
352 | pdf_parser = PDFParser()
353 | cache = CacheFileSys(str(cache_task_dir))
354 |
355 | # Use BatchBrowserManager to share browser instance; supports high concurrency
356 | logger.info(f"[{agent_name}/{task_id}] Headless mode: {headless}")
357 |
358 | async with BatchBrowserManager(
359 | headless=headless,
360 | max_concurrent_pages=max_concurrent_pages,
361 | max_retries=max_retries
362 | ) as browser_manager:
363 | logger.info(f"[{agent_name}/{task_id}] Browser manager initialized")
364 |
365 | tasks = [crawl_one_page_safe(u, cache, pdf_parser, browser_manager, overall_timeout=overall_timeout) for u in all_unique_urls]
366 | with tqdm(total=len(tasks), desc="Crawling", unit="url", ncols=80) as bar:
367 | for coro in asyncio.as_completed(tasks):
368 | await coro
369 | bar.update(1)
370 |
371 | logger.info(f"[{agent_name}/{task_id}] Browser manager will be cleaned up automatically")
372 |
373 | cache.save()
374 |
375 | # Update metadata with cached content types
376 | try:
377 | url_types: Dict[str, str] = {}
378 | for url in all_unique_urls:
379 | content_type = cache.has(url)
380 | if content_type:
381 | url_types[url] = content_type
382 |
383 | meta_data.update({
384 | "agent_name": agent_name,
385 | "task_id": task_id,
386 | "total_unique_urls": len(all_unique_urls),
387 | "all_unique_urls": all_unique_urls,
388 | "urls": url_meta,
389 | "url_types": url_types,
390 | "cached_url_count": len(url_types),
391 | })
392 | meta_json.write_text(json.dumps(meta_data, ensure_ascii=False, indent=2), "utf-8")
393 | logger.info(f"[{agent_name}/{task_id}] Updated metadata with cache types → {meta_json.relative_to(paths.project_root)}")
394 | except Exception as e:
395 | import traceback
396 | traceback.print_exc()
397 | logger.error(f"[{agent_name}/{task_id}] Failed to update metadata with cache types", exc_info=True)
398 |
399 | logger.info(f"[{agent_name}/{task_id}] Saved updated cache → {cache_task_dir.relative_to(paths.project_root)}")
400 |
401 | # -------------------------------------------------------------------- #
402 | # Entry point
403 | # -------------------------------------------------------------------- #
404 |
405 | def _strip_suffixes(task_id: str) -> str:
406 | """If CLI argument mistakenly includes .json/.pkl, strip it automatically."""
407 | suffixes = (".json", ".pkl")
408 | for s in suffixes:
409 | if task_id.endswith(s):
410 | return task_id[: -len(s)]
411 | return task_id
412 |
413 | if __name__ == "__main__":
414 | parser = argparse.ArgumentParser(description="Batch crawl pages and cache results using CacheFileSys (v2)")
415 | parser.add_argument("agent_name", help="Agent name (e.g., chatgpt_agent)")
416 | parser.add_argument("task_id", help="Task ID")
417 | parser.add_argument(
418 | "--llm_provider",
419 | choices=["openai", "azure_openai"],
420 | default="openai",
421 | help="LLM provider (openai or azure_openai, default: openai)"
422 | )
423 | parser.add_argument(
424 | "--max_concurrent_pages",
425 | type=int,
426 | default=5,
427 |         help="Maximum number of concurrent pages to process (default: 5)"
428 | )
429 | parser.add_argument(
430 | "--max_retries",
431 | type=int,
432 | default=1,
433 | help="Maximum number of retries per page (default: 1)"
434 | )
435 | parser.add_argument(
436 | "--overall_timeout",
437 | type=int,
438 | default=120,
439 |         help="Overall timeout in seconds for each page capture to prevent hanging (default: 120s)"
440 | )
441 | parser.add_argument(
442 | "--headless",
443 | action="store_true",
444 | help="Run browser in headless mode (default: headful)"
445 | )
446 |
447 | args = parser.parse_args()
448 |
449 | task_id = _strip_suffixes(args.task_id)
450 | asyncio.run(process_cache(
451 | agent_name=args.agent_name,
452 | task_id=task_id,
453 | llm_provider=args.llm_provider,
454 | max_concurrent_pages=args.max_concurrent_pages,
455 | max_retries=args.max_retries,
456 | overall_timeout=args.overall_timeout,
457 | headless=args.headless
458 | ))
459 |
--------------------------------------------------------------------------------
/cache_all_answers.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | set -euo pipefail
3 |
4 | # Avoid iterating a literal "*" when no match
5 | shopt -s nullglob
6 |
7 | # Agent can be passed as the first arg; default to example
8 | AGENT="${1:-example}"
9 |
10 | # Resolve repo-relative answers root (works on any machine)
11 | SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
12 | ANS_ROOT="${SCRIPT_DIR}/answers/${AGENT}"
13 |
14 | # Collect task directories
15 | dirs=( "${ANS_ROOT}"/*/ )
16 | if [ ${#dirs[@]} -eq 0 ]; then
17 | echo "No tasks found under ${ANS_ROOT}. Check agent name and path." >&2
18 | exit 1
19 | fi
20 |
21 | for d in "${dirs[@]}"; do
22 | task_id="$(basename "$d")"
23 | echo ">>> Running: ${AGENT}/${task_id}"
24 | python batch_answer_cache.py "${AGENT}" "${task_id}"
25 | done
26 |
--------------------------------------------------------------------------------
/eval_scripts/README.md:
--------------------------------------------------------------------------------
1 | Please download the latest full evaluation script from [😊 Dataset (Tasks) and Evaluation Scripts (Judge Agents)](https://huggingface.co/datasets/osunlp/Mind2Web-2)
--------------------------------------------------------------------------------
/mind2web2/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluator import Evaluator
2 | from .verification_tree import VerificationNode, AggregationStrategy
3 | from .utils.cache_filesys import CacheFileSys
4 | from .eval_toolkit import create_evaluator, Extractor, Verifier, EvaluatorConfig
5 | from .llm_client.base_client import LLMClient
6 |
7 | # Import from subpackages for convenience
8 | from .api_tools import ArxivTool, GoogleMapsTool, PDFParser
9 | from .llm_client import (
10 | OpenAIClient, AsyncOpenAIClient,
11 | AzureOpenAIClient, AsyncAzureOpenAIClient,
12 | calculate_api_cost
13 | )
14 | from .utils import (
15 | create_logger, cleanup_logger, create_sub_logger,
16 | PathConfig, PageManager,
17 | load_eval_script,
18 | normalize_url_markdown, text_dedent, strip_extension,
19 | encode_image, encode_image_buffer,
20 | extract_doc_description, extract_doc_description_from_frame,
21 | )
22 |
23 | __all__ = [
24 | # Core evaluation components
25 | "Evaluator",
26 | "VerificationNode",
27 | "AggregationStrategy",
28 |     "CacheFileSys",
29 | "create_evaluator",
30 | "Extractor",
31 | "Verifier",
32 | "EvaluatorConfig",
33 | "LLMClient",
34 |
35 | # API tools
36 | "ArxivTool",
37 | "GoogleMapsTool",
38 | "PDFParser",
39 |
40 | # LLM clients
41 | "OpenAIClient",
42 | "AsyncOpenAIClient",
43 | "AzureOpenAIClient",
44 | "AsyncAzureOpenAIClient",
45 | "calculate_api_cost",
46 |
47 | # Utilities
48 | "create_logger",
49 | "cleanup_logger",
50 | "create_sub_logger",
51 | "PathConfig",
52 | "PageManager",
53 | "load_eval_script",
54 | "normalize_url_markdown",
55 | "text_dedent",
56 | "strip_extension",
57 | "encode_image",
58 | "encode_image_buffer",
59 | "extract_doc_description",
60 | "extract_doc_description_from_frame",
61 | ]
--------------------------------------------------------------------------------
/mind2web2/api_tools/__init__.py:
--------------------------------------------------------------------------------
1 | from .tool_arxiv import ArxivTool
2 | from .tool_googlemap import GoogleMapsTool
3 | from .tool_pdf import PDFParser
4 |
5 | __all__ = [
6 | "ArxivTool",
7 | "GoogleMapsTool",
8 | "PDFParser"
9 | ]
10 |
--------------------------------------------------------------------------------
/mind2web2/api_tools/tool_arxiv.py:
--------------------------------------------------------------------------------
1 | import arxiv
2 | import asyncio
3 | from typing import Optional
4 |
5 | class ArxivTool:
6 | def __init__(self, page_size: int = 100):
7 | self.client = arxiv.Client(page_size=page_size)
8 |
9 | @staticmethod
10 | def is_arxiv_pdf_link(link: str) -> bool:
11 | """Simple check to see if the link is an arXiv PDF link."""
12 | return "arxiv.org/pdf" in link
13 |
14 | @staticmethod
15 | def get_arxiv_id_from_pdf_link(link: str) -> str:
16 | """Extract the arXiv ID from the PDF link."""
17 | if link.endswith(".pdf"):
18 | return link.split("/")[-1][:-4]
19 | return link.split("/")[-1]
20 |
21 | async def search_arxiv_by_id(self, arxiv_id: str) -> Optional[dict]:
22 | """Search arXiv by ID and return the result as a dictionary."""
23 | try:
24 | search = await asyncio.to_thread(arxiv.Search, id_list=[arxiv_id], max_results=1)
25 | result_generator = self.client.results(search)
26 | return vars(next(result_generator))
27 | except StopIteration:
28 | print(f"No results found for arXiv ID: {arxiv_id}")
29 | return None
30 |
31 | async def search_arxiv_by_title(self, title: str) -> Optional[dict]:
32 | """Search arXiv by title and return the result as a dictionary."""
33 | try:
34 | search = await asyncio.to_thread(arxiv.Search, query=title, max_results=1)
35 | result_generator = self.client.results(search)
36 | return vars(next(result_generator))
37 | except StopIteration:
38 | print(f"No results found for title: {title}")
39 | return None
40 |
41 | # Example usage
42 | if __name__ == "__main__":
43 | async def main():
44 | # Initialize tool
45 | arxiv_tool = ArxivTool(page_size=100)
46 |
47 | # Example PDF link and title
48 | pdf_url = "https://arxiv.org/pdf/2306.06070"
49 | arxiv_id = arxiv_tool.get_arxiv_id_from_pdf_link(pdf_url)
50 |
51 | # Search by ID
52 | id_result = await arxiv_tool.search_arxiv_by_id(arxiv_id)
53 | if id_result:
54 | print("Search by ID result:", id_result['title'])
55 |
56 | # Search by title
57 | title_result = await arxiv_tool.search_arxiv_by_title("Mind2Web")
58 | if title_result:
59 | print("Search by Title result:", title_result['title'])
60 | print("Published date timezone:", title_result['published'].tzinfo)
61 |
62 | asyncio.run(main())
--------------------------------------------------------------------------------
/mind2web2/api_tools/tool_googlemap.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import asyncio
3 | import googlemaps
4 | import os
5 |
6 | class GoogleMapsTool:
7 | def __init__(self):
8 | """
9 | Initialize the Google Maps client with the provided API key.
10 | """
11 | api_key = os.getenv('GOOGLE_MAPS_API_KEY')
12 | self.client = googlemaps.Client(key=api_key)
13 |
14 | async def get_city_name(self, address, level='locality'):
15 | """
16 | Take an address string as input and return the city name or sub-city name.
17 | :param address: str - The address to look up.
18 | :param level: str - The level of granularity ('locality' or 'sublocality').
19 | :return: str - The city or sub-city name.
20 | """
21 | geocode_result = await asyncio.to_thread(self.client.geocode, address)
22 | assert level in ['locality', 'sublocality'], "Invalid level. Must be 'locality' or 'sublocality'."
23 | if geocode_result:
24 | for component in geocode_result[0]['address_components']:
25 | print(component)
26 | if level in component['types']:
27 | return component['long_name']
28 | return "City/Sub-city name not found"
29 |
30 |
31 | async def get_address_information(self, address):
32 | """
33 |         Take an address string as input and return the full geocoding result
34 |         (not just the city name), as returned by the Google Maps geocoding API.
35 |         :param address: str - The address to look up.
36 |         :return: list - The raw geocode result.
37 | """
38 | geocode_result = await asyncio.to_thread(self.client.geocode, address)
39 | print(geocode_result)
40 |
41 | return geocode_result
42 |
43 | async def calculate_distance(self, address1, address2, mode="driving"):
44 | """
45 | Calculate the driving or walking distance between two addresses in meters.
46 | :param address1: str - The starting address.
47 | :param address2: str - The destination address.
48 | :param mode: str - The mode of transportation ('driving', 'walking', 'transit').
49 | :return: int - The distance in meters.
50 | """
51 | assert mode in ['driving', 'walking', 'transit'], "Invalid mode. Must be within ['driving', 'walking', 'transit']"
52 | directions_result = await asyncio.to_thread(
53 | self.client.directions, origin=address1, destination=address2, mode=mode
54 | )
55 | if directions_result:
56 | return directions_result[0]['legs'][0]['distance']['value']
57 | return "Distance not found"
58 |
59 | async def calculate_travel_time(self, address1, address2, mode="driving"):
60 | """
61 | Calculate the travel time between two addresses in seconds.
62 | :param address1: str - The starting address.
63 | :param address2: str - The destination address.
64 | :param mode: str - The mode of transportation ('driving', 'walking', 'transit').
65 | :return: int - The travel time in seconds.
66 | """
67 | assert mode in ['driving', 'walking', 'transit'], "Invalid mode. Must be within ['driving', 'walking', 'transit']"
68 | directions_result = await asyncio.to_thread(
69 | self.client.directions, origin=address1, destination=address2, mode=mode
70 | )
71 | if directions_result:
72 | return directions_result[0]['legs'][0]['duration']['value']
73 | return "Travel time not found"
74 |
75 | # Example usage
76 | if __name__ == "__main__":
77 | parser = argparse.ArgumentParser()
78 | parser.add_argument("--address1", type=str, help="The starting address.")
79 | parser.add_argument("--address2", type=str, help="The destination address.")
80 | args = parser.parse_args()
81 |
82 |     address1 = args.address1
83 |     address2 = args.address2
84 |     # Fall back to a sample address when none is given on the command line
85 |     address1 = address1 or '318 E 6th St, New York, NY 10003'
86 |
87 | gmaps_tool = GoogleMapsTool()
88 |
89 | async def main():
90 |
91 | if address1:
92 | city_name = await gmaps_tool.get_city_name(address1)
93 | print("City Name:", city_name)
94 |
95 | city_information= await gmaps_tool.get_address_information(address1)
96 | print("City Information:", city_information)
97 |
98 | if address2:
99 | distance = await gmaps_tool.calculate_distance(address1, address2)
100 | print("Distance (meters):", distance)
101 |
102 | travel_time = await gmaps_tool.calculate_travel_time(address1, address2)
103 | print("Travel Time (seconds):", travel_time)
104 | else:
105 | print("No destination address provided for distance and travel time calculation.")
106 | else:
107 | print("No starting address provided for city name lookup.")
108 | asyncio.run(main())
--------------------------------------------------------------------------------
/mind2web2/api_tools/tool_pdf.py:
--------------------------------------------------------------------------------
1 | # pdf_parser.py ---------------------------------------------------------
2 | """
3 | Lightweight PDF parser:
4 | * extract() - Pass URL / local path / bytes, asynchronously returns (imgs, text)
5 |     * On download or parsing failure, returns ([blank 1x1 PNG], "PDF extraction failed: ...") instead of raising
6 | * imgs: screenshot of each page (JPEG, base64), up to 50 pages
7 | * text: all plain text, up to 100 pages
8 | Dependencies:
9 |     pip install aiohttp httpx requests certifi pymupdf pillow
10 | """
11 |
12 | import asyncio
13 | import base64
14 | import random
15 | import ssl
16 | from io import BytesIO
17 | from logging import Logger
18 | from typing import List, Tuple, Union, Optional
19 | from urllib.parse import urlparse, unquote
20 |
21 |
22 | import aiohttp
23 | import certifi
24 | import fitz # PyMuPDF
25 | import httpx
26 | import requests
27 | from PIL import Image
28 | from ..utils.url_tools import remove_utm_parameters, normalize_url_for_browser
29 |
30 | def make_blank_png_b64() -> str:
31 | # Create 1×1 RGBA fully transparent pixel
32 | img = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
33 | buf = BytesIO()
34 | img.save(buf, format="PNG")
35 | # Convert to base64 and remove line breaks
36 | return base64.b64encode(buf.getvalue()).decode()
37 |
38 |
39 | # ------------------ Constants ------------------
40 | PDF_MAGIC = b"%PDF-" # PDF file header
41 | UA_CHROME = (
42 | "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
43 | "AppleWebKit/537.36 (KHTML, like Gecko) "
44 | "Chrome/124.0.0.0 Safari/537.36"
45 | )
46 |
47 | # User-agent strings for PDF detection
48 | USER_AGENT_STRINGS = [
49 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36',
50 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36',
51 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 14_4_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36 OPR/109.0.0.0',
52 | ]
53 |
54 |
55 | # ================================ PDF Detection Functions ================================
56 |
57 |
58 | def is_pdf_by_suffix(url: str) -> bool:
59 | """Check if URL likely points to PDF based on path/query patterns."""
60 | parsed = urlparse(url.lower())
61 | path = unquote(parsed.path)
62 |
63 | # Direct .pdf extension
64 | if path.endswith('.pdf'):
65 | return True
66 |
67 | # Common PDF URL patterns
68 | pdf_patterns = [
69 | 'arxiv.org/pdf/',
70 | '/download/pdf',
71 | '/fulltext.pdf',
72 | '/article/pdf',
73 | '/content/pdf',
74 | 'type=pdf',
75 | 'format=pdf',
76 | 'download=pdf',
77 | '.pdf?',
78 | '/pdf/',
79 | 'pdfviewer',
80 | ]
81 |
82 | url_lower = url.lower()
83 | return any(pattern in url_lower for pattern in pdf_patterns)
84 |
85 |
86 |
87 | def is_pdf_by_requests_head(url: str) -> bool:
88 | """Check via HEAD request whether URL is a PDF, with strict certificate verification."""
89 | try:
90 | r = requests.head(
91 | url,
92 | allow_redirects=True,
93 | timeout=10,
94 | verify=certifi.where() # Use certifi's root CA bundle
95 | )
96 | ct = r.headers.get("content-type", "").lower()
97 | return "pdf" in ct
98 | except requests.RequestException as e:
99 | # If some sites have certificate issues, you can log it
100 | # print(f"HEAD request failed for {url}: {e}")
101 | return False
102 |
103 | async def is_pdf_by_httpx_get_range(url: str, timeout: int = 10) -> bool:
104 | """Check PDF via partial GET request to read file header."""
105 | try:
106 | # Configure httpx with custom SSL context
107 | ssl_context = ssl.create_default_context()
108 | ssl_context.check_hostname = False
109 | ssl_context.verify_mode = ssl.CERT_NONE
110 |
111 | async with httpx.AsyncClient(
112 | follow_redirects=True,
113 | timeout=timeout,
114 |             verify=ssl_context  # reuse the relaxed SSL context configured above
115 | ) as client:
116 |
117 | headers = {
118 | "User-Agent": random.choice(USER_AGENT_STRINGS),
119 | "Range": "bytes=0-1023", # Get first 1KB to check magic number
120 | "Accept": "*/*",
121 | }
122 |
123 | r = await client.get(url, headers=headers)
124 |
125 | # First check Content-Type
126 | ctype = r.headers.get("content-type", "").split(";")[0].strip().lower()
127 | if "pdf" in ctype:
128 | return True
129 |
130 | # If we got content, check PDF magic number
131 | if r.content:
132 | # PDF files start with %PDF-
133 | return r.content.startswith(b'%PDF-')
134 |
135 | except httpx.TimeoutException:
136 | print(f"[is_pdf_httpx_get_range] Timeout for {url}")
137 | return False
138 | except httpx.ConnectError:
139 | print(f"[is_pdf_httpx_get_range] Connection error for {url}")
140 | return False
141 | except Exception as e:
142 | print(f"[is_pdf_httpx_get_range] Error for {url}: {type(e).__name__}: {e}")
143 | return False
144 |
145 |
146 | async def is_pdf_by_full_get(url: str, timeout: int = 15) -> bool:
147 | """Last resort: download beginning of file to check magic number."""
148 | try:
149 | async with httpx.AsyncClient(
150 | follow_redirects=True,
151 | timeout=timeout,
152 | verify=False
153 | ) as client:
154 |
155 | headers = {
156 | "User-Agent": random.choice(USER_AGENT_STRINGS),
157 | "Accept": "*/*",
158 | }
159 |
160 | # Stream the response to avoid downloading large files
161 | async with client.stream('GET', url, headers=headers) as response:
162 | # Read first chunk to check PDF magic number
163 | chunk_data = b""
164 | async for chunk in response.aiter_bytes(chunk_size=5):
165 | chunk_data += chunk
166 | if len(chunk_data) >= 5:
167 | break
168 |
169 | if chunk_data and chunk_data.startswith(b'%PDF-'):
170 | return True
171 |
172 | # Also check Content-Type from response
173 | ctype = response.headers.get("content-type", "").split(";")[0].strip().lower()
174 | return "pdf" in ctype
175 |
176 | except Exception as e:
177 | print(f"[is_pdf_by_full_get] Error for {url}: {type(e).__name__}: {e}")
178 | return False
179 |
180 |
181 | async def is_pdf(url: str, logger: Logger = None) -> bool:
182 | """
183 | Robustly detect if a URL points to a PDF file using multiple strategies.
184 |
185 | Args:
186 | url: The URL to check
187 | logger: Optional logger instance
188 |
189 | Returns:
190 | bool: True if URL points to a PDF, False otherwise
191 | """
192 | url = normalize_url_for_browser(url)
193 |
194 | if logger:
195 | logger.debug(f"Checking if URL is PDF: {url}")
196 |
197 | # 1. Fast URL pattern check
198 | if is_pdf_by_suffix(url):
199 | if logger:
200 | logger.info(f"URL pattern indicates PDF: {url}")
201 | else:
202 | print(f"{url} IS a PDF (by URL pattern)")
203 | return True
204 |
205 | # 2. Try HEAD request first (fastest network check)
206 | if is_pdf_by_requests_head(url):
207 | if logger:
208 | logger.info(f"HEAD request confirms PDF: {url}")
209 | else:
210 | print(f"{url} IS a PDF (by HEAD request)")
211 | return True
212 |
213 | # 3. Try partial GET with magic number check
214 | if await is_pdf_by_httpx_get_range(url):
215 | if logger:
216 | logger.info(f"Partial GET confirms PDF: {url}")
217 | else:
218 | print(f"{url} IS a PDF (by partial GET)")
219 | return True
220 |
221 | # 4. Last resort: stream beginning of file
222 | if await is_pdf_by_full_get(url):
223 | if logger:
224 | logger.info(f"Full GET confirms PDF: {url}")
225 | else:
226 | print(f"{url} IS a PDF (by full GET)")
227 | return True
228 |
229 | # # Not a PDF
230 | # if logger:
231 | # logger.debug(f"URL is not a PDF: {url}")
232 | # else:
233 | # print(f"{url} IS NOT a PDF")
234 | return False
235 |
236 |
237 | class PDFParser:
238 | """
239 |     Download and parse a PDF. On failure, returns a blank 1x1 PNG plus a "PDF extraction failed: ..." message.
240 | """
241 |
242 | # Default limits
243 | MAX_PAGES: int = 100
244 | MAX_IMAGE_PAGES: int = 50
245 | RENDER_DPI: int = 144
246 | JPEG_QUALITY: int = 70
247 |
248 | # ------------------ Public API ------------------
249 | async def extract(
250 | self,
251 | source: Union[str, bytes, BytesIO],
252 | ) -> Tuple[Optional[List[str]], Optional[str]]:
253 | """
254 | Parameters
255 | ----------
256 | source : str | bytes | BytesIO
257 | URL / local file path / PDF byte stream
258 |
259 | Returns
260 | -------
261 | imgs : list[str] | None
262 | text : str | None
263 | """
264 | try:
265 | # 1) Obtain PDF bytes
266 | if isinstance(source, (bytes, BytesIO)):
267 | data = source.getvalue() if isinstance(source, BytesIO) else source
268 | elif isinstance(source, str) and source.lower().startswith(("http://", "https://")):
269 | data = await self._fetch_pdf_bytes(source)
270 | else: # Local file
271 | data = await asyncio.to_thread(lambda p: open(p, "rb").read(), str(source))
272 |
273 | # 2) Magic number check
274 | if not data.lstrip().startswith(PDF_MAGIC):
275 | return [make_blank_png_b64()], "PDF extraction failed: Invalid PDF format"
276 |
277 | # 3) Parsing (CPU-intensive, synchronous), run in thread
278 | return await asyncio.to_thread(self._extract_from_bytes, data)
279 |
280 | except Exception as e:
281 | print(f"PDF extraction failed: {e}")
282 | return [make_blank_png_b64()], "PDF extraction failed: Download or parsing error"
283 |
284 | # ------------------ Internal Implementation ------------------
285 | async def _fetch_pdf_bytes(self, url: str) -> bytes:
286 | """
287 | Fetch PDF with browser User-Agent; if necessary, switch to export.arxiv.org as backup.
288 | """
289 | headers = {
290 | "User-Agent": UA_CHROME,
291 | "Accept": "application/pdf,application/octet-stream;q=0.9,*/*;q=0.8",
292 | }
293 |
294 | async def _download(u: str) -> bytes:
295 | async with aiohttp.ClientSession(headers=headers) as s:
296 | async with s.get(u, allow_redirects=True, timeout=30) as r:
297 | r.raise_for_status()
298 | return await r.read()
299 |
300 | data = await _download(url)
301 | # print(data)
302 |
303 | # If returned HTML, try backup domain for arxiv
304 | if not data.lstrip().startswith(PDF_MAGIC) and "arxiv.org" in url:
305 | backup = url.replace("://arxiv.org", "://export.arxiv.org")
306 | try:
307 | data = await _download(backup)
308 | except Exception as e:
309 | print(f"failed to download from {url} with backup export arxiv: {e}")
310 |
311 | # print(data)
312 |
313 | return data
314 |
315 | def _extract_from_bytes(
316 | self, data: bytes
317 | ) -> Tuple[Optional[List[str]], Optional[str]]:
318 | """
319 |         Actual parsing logic. Returns a blank PNG plus an error message on failure.
320 | """
321 | # Double-check magic number (in case called directly by other modules)
322 | if not data.lstrip().startswith(PDF_MAGIC):
323 | return [make_blank_png_b64()], "PDF extraction failed: Invalid PDF format"
324 |
325 | try:
326 | doc = fitz.open(stream=data, filetype="pdf")
327 | except (fitz.FileDataError, RuntimeError):
328 | return [make_blank_png_b64()], "PDF extraction failed: Unable to parse PDF file"
329 |
330 | imgs: List[str] = []
331 | texts: List[str] = []
332 | zoom = self.RENDER_DPI / 72
333 |
334 | max_pages = min(self.MAX_PAGES, doc.page_count)
335 | max_img_pages = min(self.MAX_IMAGE_PAGES, doc.page_count)
336 |
337 | for i in range(max_pages):
338 | page = doc.load_page(i)
339 | texts.append(page.get_text("text"))
340 |
341 | if i < max_img_pages:
342 | pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
343 | img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
344 |
345 | buf = BytesIO()
346 | img.save(buf, "JPEG", quality=self.JPEG_QUALITY,
347 | optimize=True, progressive=True)
348 | imgs.append(base64.b64encode(buf.getvalue()).decode())
349 | # print(texts)
350 | return imgs, "\n".join(texts)
351 |
352 |
353 | # ------------------ PDF Testing Functions ------------------
354 |
355 | async def test_pdf_detection():
356 | """Test PDF detection functionality."""
357 | # Test URLs
358 | test_urls = [
359 | "https://www.fhwa.dot.gov/policyinformation/statistics/2023/pdf/mv1.pdf", # Should be PDF
360 | "https://arxiv.org/pdf/2301.00001.pdf", # Should be PDF (arxiv)
361 | "https://www.google.com", # Should NOT be PDF
362 | "https://example.com/document.pdf", # Should be PDF by suffix
363 | ]
364 |
365 | print("🧪 Testing PDF detection functionality...")
366 | print("=" * 50)
367 |
368 | for url in test_urls:
369 | print(f"\n🔍 Testing: {url}")
370 | try:
371 | result = await is_pdf(url)
372 | status = "✅ IS PDF" if result else "❌ NOT PDF"
373 | print(f" Result: {status}")
374 | except Exception as e:
375 | print(f" Error: {e}")
376 |
377 | print("\n" + "=" * 50)
378 | print("✅ PDF detection test completed!")
379 |
380 |
381 | # ------------------ Local Quick Test ------------------
382 | if __name__ == "__main__":
383 | async def _demo() -> None:
384 | # Test PDF detection
385 | # await test_pdf_detection()
386 | #
387 | # print("\n" + "=" * 50)
388 | # print("🧪 Testing PDF parsing functionality...")
389 |
390 | parser = PDFParser()
391 |
392 | # # ✅ Normal PDF
393 | # ok_imgs, ok_txt = await parser.extract(
394 | # "https://arxiv.org/pdf/2505.07880.pdf"
395 | # )
396 | # print("Normal PDF:", "Success" if ok_txt else "Failed")
397 |
398 | # ❌ Fake PDF
399 | bad_imgs, bad_txt = await parser.extract(
400 | "https://arxiv.org/pdf/2408.XXXXXv1.pdf"
401 | )
402 | # print(bad_txt)
403 | # print("Fake PDF:", "Success" if bad_txt else "Failed")
404 |
405 |
406 | asyncio.run(_demo())
407 |
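408 | # Illustrative sketch (commented out, not executed): `PDFParser.extract` also
409 | # accepts a local file path, since non-URL string sources are read from disk
410 | # before parsing. The path below is a made-up placeholder.
411 | #
412 | #   async def _local_demo() -> None:
413 | #       parser = PDFParser()
414 | #       imgs, text = await parser.extract("./downloads/sample.pdf")
415 | #       print(len(imgs), "page images;", len(text or ""), "characters of text")
416 | #
417 | #   asyncio.run(_local_demo())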
--------------------------------------------------------------------------------
/mind2web2/llm_client/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_client import LLMClient
2 | from .openai_client import OpenAIClient, AsyncOpenAIClient
3 | from .azure_openai_client import AzureOpenAIClient, AsyncAzureOpenAIClient
4 | from .api_cost import calculate_api_cost
5 |
6 | __all__ = [
7 | "LLMClient",
8 | "OpenAIClient",
9 | "AsyncOpenAIClient",
10 | "AzureOpenAIClient",
11 | "AsyncAzureOpenAIClient",
12 | "calculate_api_cost"
13 | ]
14 |
--------------------------------------------------------------------------------
/mind2web2/llm_client/api_cost.py:
--------------------------------------------------------------------------------
1 | API_COST = {
2 |     # model_name: [input price, output price], USD per 1M tokens
3 | "gpt-4.1": [2.00, 8.00],
4 | "o4-mini": [1.10, 4.40],
5 | "gpt-4o": [2.50, 10.00],
6 | "us.anthropic.claude-3-7-sonnet-20250219-v1:0": [3.00, 15.00]
7 | }
8 |
9 | UNIT = 1000000  # prices are quoted per 1M tokens
10 |
11 |
12 | def calculate_api_cost(input_tokens, output_tokens, model_name):
13 | if model_name not in API_COST:
14 |         raise ValueError(f"No pricing entry for model {model_name}")
15 | return API_COST[model_name][0] * input_tokens / UNIT + API_COST[model_name][1] * output_tokens / UNIT
16 |
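17 | # Illustrative usage (a sketch, not part of the module): the token counts are
18 | # made-up numbers chosen only to show the arithmetic against the table above.
19 | #
20 | #   >>> calculate_api_cost(input_tokens=1000, output_tokens=500, model_name="gpt-4o")
21 | #   0.0075  # 1000 / 1e6 * 2.50 + 500 / 1e6 * 10.00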
--------------------------------------------------------------------------------
/mind2web2/llm_client/azure_openai_client.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | from openai import AzureOpenAI, AsyncAzureOpenAI
4 | import backoff
5 | from openai import OpenAIError, APIConnectionError, RateLimitError, InternalServerError, APITimeoutError
6 |
7 |
8 | logging.getLogger("httpx").setLevel(logging.WARNING)
9 | logger = logging.getLogger(__name__)
10 |
11 |
12 | def _log_backoff(details):
13 | exc = details.get("exception")
14 | tries = details.get("tries")
15 | wait = details.get("wait")
16 | kwargs = details.get("kwargs") or {}
17 | model = kwargs.get("model")
18 | target = details.get("target")
19 | target_name = getattr(target, "__name__", str(target))
20 | if exc is not None:
21 | logger.warning(
22 | "Azure OpenAI retry #%s after %.1fs in %s (model=%s) due to %s: %s",
23 | tries,
24 | wait or 0,
25 | target_name,
26 | model,
27 | type(exc).__name__,
28 | exc,
29 | )
30 | else:
31 | logger.warning(
32 | "Azure OpenAI retry #%s after %.1fs in %s (model=%s, no exception info)",
33 | tries,
34 | wait or 0,
35 | target_name,
36 | model,
37 | )
38 |
39 |
40 | def _log_giveup(details):
41 | exc = details.get("exception")
42 | kwargs = details.get("kwargs") or {}
43 | model = kwargs.get("model")
44 | target = details.get("target")
45 | target_name = getattr(target, "__name__", str(target))
46 | if exc is not None:
47 | logger.error(
48 | "Azure OpenAI retries exhausted in %s (model=%s) due to %s: %s",
49 | target_name,
50 | model,
51 | type(exc).__name__,
52 | exc,
53 | )
54 | else:
55 | logger.error(
56 | "Azure OpenAI retries exhausted in %s (model=%s, no exception info)",
57 | target_name,
58 | model,
59 | )
60 |
61 |
62 | @backoff.on_exception(
63 | backoff.expo,
64 |     (OpenAIError, APIConnectionError, RateLimitError, InternalServerError, APITimeoutError),
65 | on_backoff=_log_backoff,
66 | on_giveup=_log_giveup,
67 | )
68 | def completion_with_backoff(client, **kwargs):
69 | if "response_format" in kwargs:
70 | return client.beta.chat.completions.parse(**kwargs)
71 | return client.chat.completions.create(**kwargs)
72 |
73 |
74 | @backoff.on_exception(
75 | backoff.expo,
76 |     (OpenAIError, APIConnectionError, RateLimitError, InternalServerError, APITimeoutError),
77 | on_backoff=_log_backoff,
78 | on_giveup=_log_giveup,
79 | )
80 | async def acompletion_with_backoff(client, **kwargs):
81 | if "response_format" in kwargs:
82 | return await client.beta.chat.completions.parse(**kwargs)
83 | return await client.chat.completions.create(**kwargs)
84 |
85 |
86 | class AzureOpenAIClient():
87 | def __init__(self):
88 | self.client = AzureOpenAI(
89 | api_key=os.getenv("AZURE_OPENAI_API_KEY"),
90 | azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT_URL"),
91 | api_version=os.getenv("AZURE_OPENAI_API_VERSION")
92 | )
93 |
94 | def response(self, count_token=False, **kwargs):
95 | response = completion_with_backoff(self.client, **kwargs)
96 | tokens = {
97 | "input_tokens": response.usage.prompt_tokens,
98 | "output_tokens": response.usage.completion_tokens
99 | }
100 | if "response_format" in kwargs:
101 | if count_token:
102 | return response.choices[0].message.parsed, tokens
103 | else:
104 | return response.choices[0].message.parsed
105 | if count_token:
106 | return response.choices[0].message.content, tokens
107 | else:
108 | return response.choices[0].message.content
109 |
110 |
111 | class AsyncAzureOpenAIClient():
112 | def __init__(self):
113 | self.client = AsyncAzureOpenAI(
114 | api_key=os.getenv("AZURE_OPENAI_API_KEY"),
115 | azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT_URL"),
116 | api_version=os.getenv("AZURE_OPENAI_API_VERSION")
117 | )
118 |
119 | async def response(self, count_token=False, **kwargs):
120 | response = await acompletion_with_backoff(self.client, **kwargs)
121 | tokens = {
122 | "input_tokens": response.usage.prompt_tokens,
123 | "output_tokens": response.usage.completion_tokens
124 | }
125 | if "response_format" in kwargs:
126 | if count_token:
127 | return response.choices[0].message.parsed, tokens
128 | else:
129 | return response.choices[0].message.parsed
130 | if count_token:
131 | return response.choices[0].message.content, tokens
132 | else:
133 | return response.choices[0].message.content
134 |
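135 | # Illustrative usage (a sketch; the deployment name "gpt-4o" and the prompt are
136 | # assumptions). Requires AZURE_OPENAI_API_KEY, AZURE_OPENAI_ENDPOINT_URL and
137 | # AZURE_OPENAI_API_VERSION to be set in the environment:
138 | #
139 | #   client = AzureOpenAIClient()
140 | #   text, tokens = client.response(
141 | #       count_token=True,
142 | #       model="gpt-4o",
143 | #       messages=[{"role": "user", "content": "Say hi"}],
144 | #   )
145 | #   print(text, tokens["input_tokens"], tokens["output_tokens"])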
--------------------------------------------------------------------------------
/mind2web2/llm_client/base_client.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | class LLMClient():
4 | def __init__(self, provider, is_async=False):
5 | self.provider = provider
6 | self.is_async = is_async
7 | if provider == 'azure_openai':
8 | if is_async:
9 | from mind2web2.llm_client.azure_openai_client import AsyncAzureOpenAIClient
10 | self.client = AsyncAzureOpenAIClient()
11 | else:
12 | from mind2web2.llm_client.azure_openai_client import AzureOpenAIClient
13 | self.client = AzureOpenAIClient()
14 | elif provider == 'openai':
15 | if is_async:
16 | from mind2web2.llm_client.openai_client import AsyncOpenAIClient
17 | self.client = AsyncOpenAIClient()
18 | else:
19 | from mind2web2.llm_client.openai_client import OpenAIClient
20 | self.client = OpenAIClient()
21 | elif provider == 'bedrock_anthropic':
22 | if is_async:
23 | from mind2web2.llm_client.bedrock_anthropic_client import AsyncBedrockAntrhopicClient
24 | self.client = AsyncBedrockAntrhopicClient()
25 | else:
26 | from mind2web2.llm_client.bedrock_anthropic_client import BedrockAntrhopicClient
27 | self.client = BedrockAntrhopicClient()
28 | else:
29 | raise ValueError(f'Provider {provider} not supported')
30 |
31 | def response(self, **kwargs):
32 |         # ensure the client was constructed for synchronous use
33 | if self.is_async:
34 | raise ValueError(f'Provider {self.provider} is async and does not support synchronous response')
35 | return self.client.response(**kwargs)
36 |
37 | async def async_response(self, **kwargs):
38 |         # ensure the client was constructed for asynchronous use
39 | if not self.is_async:
40 | raise ValueError(f'Provider {self.provider} is not async and does not support asynchronous response')
41 | return await self.client.response(**kwargs)
42 |
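43 | # Illustrative usage (a sketch; the model name and message are assumptions):
44 | #
45 | #   sync_client = LLMClient(provider="openai", is_async=False)
46 | #   text = sync_client.response(
47 | #       model="gpt-4o",
48 | #       messages=[{"role": "user", "content": "Hello"}],
49 | #   )
50 | #
51 | #   # inside an async function:
52 | #   async_client = LLMClient(provider="openai", is_async=True)
53 | #   text = await async_client.async_response(
54 | #       model="gpt-4o",
55 | #       messages=[{"role": "user", "content": "Hello"}],
56 | #   )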
--------------------------------------------------------------------------------
/mind2web2/llm_client/bedrock_anthropic_client.py:
--------------------------------------------------------------------------------
1 | import os
2 | from anthropic import AnthropicBedrock, AsyncAnthropicBedrock
3 |
4 |
5 | def completion_with_backoff(client, **kwargs):
6 |     return client.messages.create(**kwargs)  # NOTE: no backoff/retry decorator is applied here, unlike the OpenAI clients
7 |
8 |
9 | async def acompletion_with_backoff(client, **kwargs):
10 | return await client.messages.create(**kwargs)
11 |
12 |
13 | class BedrockAntrhopicClient():
14 | def __init__(self):
15 | self.client = AnthropicBedrock(
16 | aws_access_key=os.getenv("AWS_ACCESS_KEY"),
17 | aws_secret_key=os.getenv("AWS_SECRET_KEY"),
18 | aws_region=os.getenv("AWS_REGION")
19 | )
20 |
21 | def response(self, count_token=False, **kwargs):
22 | response = completion_with_backoff(self.client, **kwargs)
23 | if count_token:
24 | tokens = {
25 | "input_tokens": response.usage.input_tokens,
26 | "output_tokens": response.usage.output_tokens
27 | }
28 | return response.content[0].text, tokens
29 | else:
30 | return response.content[0].text
31 |
32 |
33 | class AsyncBedrockAntrhopicClient():
34 | def __init__(self):
35 | self.client = AsyncAnthropicBedrock(
36 | aws_access_key=os.getenv("AWS_ACCESS_KEY"),
37 | aws_secret_key=os.getenv("AWS_SECRET_KEY"),
38 | aws_region=os.getenv("AWS_REGION")
39 | )
40 |
41 | async def response(self, count_token=False, **kwargs):
42 | response = await acompletion_with_backoff(self.client, **kwargs)
43 | if count_token:
44 | tokens = {
45 | "input_tokens": response.usage.input_tokens,
46 | "output_tokens": response.usage.output_tokens
47 | }
48 | return response.content[0].text, tokens
49 | else:
50 | return response.content[0].text
51 |
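52 | # Illustrative usage (a sketch; the model id matches the entry in api_cost.py,
53 | # while the prompt and max_tokens are assumptions). Requires AWS_ACCESS_KEY,
54 | # AWS_SECRET_KEY and AWS_REGION to be set in the environment:
55 | #
56 | #   client = BedrockAntrhopicClient()
57 | #   text = client.response(
58 | #       model="us.anthropic.claude-3-7-sonnet-20250219-v1:0",
59 | #       max_tokens=256,
60 | #       messages=[{"role": "user", "content": "Hello"}],
61 | #   )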
--------------------------------------------------------------------------------
/mind2web2/llm_client/openai_client.py:
--------------------------------------------------------------------------------
1 | """
2 | mind2web2/llm_client/openai_client.py
3 |
4 | A thin wrapper around the OpenAI Python SDK (v1+) that
5 | adds exponential-backoff retry logic, unified synchronous
6 | and asynchronous interfaces, and optional token usage stats.
7 | """
8 |
9 | import os
10 | import backoff
11 | from openai import OpenAI, AsyncOpenAI
12 | from openai import (
13 | OpenAIError,
14 | APIConnectionError,
15 | RateLimitError,
16 | InternalServerError,
17 | APITimeoutError,
18 | )
19 | import logging
20 |
21 | logging.getLogger("httpx").setLevel(logging.WARNING)
22 |
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | def _log_backoff(details):
27 | """Log retry attempts triggered by backoff."""
28 | exc = details.get("exception")
29 | tries = details.get("tries")
30 | wait = details.get("wait")
31 | target = details.get("target")
32 | target_name = getattr(target, "__name__", str(target))
33 | kwargs = details.get("kwargs") or {}
34 | model = kwargs.get("model")
35 | if exc is not None:
36 | logger.warning(
37 | "OpenAI retry #%s after %.1fs in %s (model=%s) due to %s: %s",
38 | tries,
39 | wait or 0,
40 | target_name,
41 | model,
42 | type(exc).__name__,
43 | exc,
44 | )
45 | else:
46 | logger.warning(
47 | "OpenAI retry #%s after %.1fs in %s (model=%s, no exception info)",
48 | tries,
49 | wait or 0,
50 | target_name,
51 | model,
52 | )
53 |
54 |
55 | def _log_giveup(details):
56 | exc = details.get("exception")
57 | target = details.get("target")
58 | target_name = getattr(target, "__name__", str(target))
59 | kwargs = details.get("kwargs") or {}
60 | model = kwargs.get("model")
61 | if exc is not None:
62 | logger.error(
63 | "OpenAI retries exhausted in %s (model=%s) due to %s: %s",
64 | target_name,
65 | model,
66 | type(exc).__name__,
67 | exc,
68 | )
69 | else:
70 | logger.error(
71 | "OpenAI retries exhausted in %s (model=%s, no exception info)",
72 | target_name,
73 | model,
74 | )
75 |
76 |
77 | # --------------------------------------------------------------------------- #
78 | # Retry helpers #
79 | # --------------------------------------------------------------------------- #
80 |
81 |
82 | @backoff.on_exception(
83 | backoff.expo,
84 | (OpenAIError, APIConnectionError, RateLimitError, InternalServerError, APITimeoutError),
85 | on_backoff=_log_backoff,
86 | on_giveup=_log_giveup,
87 | )
88 | def completion_with_backoff(client: OpenAI, **kwargs):
89 | """
90 | Synchronous completion request with exponential-backoff retry.
91 |
92 | If `response_format` is supplied the call is routed to the
93 | structured-output beta endpoint; otherwise the regular endpoint is used.
94 | """
95 | if "response_format" in kwargs:
96 | return client.beta.chat.completions.parse(**kwargs) # structured JSON
97 | return client.chat.completions.create(**kwargs)
98 |
99 |
100 | @backoff.on_exception(
101 | backoff.expo,
102 | (OpenAIError, APIConnectionError, RateLimitError, InternalServerError, APITimeoutError),
103 | on_backoff=_log_backoff,
104 | on_giveup=_log_giveup,
105 | )
106 | async def acompletion_with_backoff(client: AsyncOpenAI, **kwargs):
107 | """
108 | Asynchronous completion request with exponential-backoff retry.
109 | """
110 | if "response_format" in kwargs:
111 | return await client.beta.chat.completions.parse(**kwargs)
112 | return await client.chat.completions.create(**kwargs)
113 |
114 |
115 | # --------------------------------------------------------------------------- #
116 | # Synchronous client #
117 | # --------------------------------------------------------------------------- #
118 |
119 |
120 | class OpenAIClient:
121 | """
122 | Synchronous OpenAI client.
123 |
124 | Example:
125 | client = OpenAIClient()
126 | result = client.response(
127 | model="gpt-4o",
128 | messages=[{"role": "user", "content": "Hello!"}],
129 | temperature=0.2,
130 | # response_format={"type": "json_object"} # optional
131 | )
132 | """
133 |
134 | def __init__(self) -> None:
135 | self.client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
136 |
137 | def response(self, count_token: bool = False, **kwargs):
138 | """
139 | Wrapper around `chat.completions.create`.
140 |
141 | Args:
142 | count_token: If True, also return a dict with token usage.
143 | **kwargs: Arguments accepted by the OpenAI `/chat/completions` API.
144 |
145 | Returns:
146 | Either the content/parsed JSON, or a tuple
147 | (content_or_parsed_json, token_dict) when `count_token=True`.
148 | """
149 | response = completion_with_backoff(self.client, **kwargs)
150 |
151 | tokens = {
152 | "input_tokens": response.usage.prompt_tokens,
153 | "output_tokens": response.usage.completion_tokens,
154 | }
155 |
156 | if "response_format" in kwargs: # structured-output mode
157 | return (response.choices[0].message.parsed, tokens) if count_token else response.choices[0].message.parsed
158 |
159 | # plain-text mode
160 | return (response.choices[0].message.content, tokens) if count_token else response.choices[0].message.content
161 |
162 |
163 | # --------------------------------------------------------------------------- #
164 | # Asynchronous client #
165 | # --------------------------------------------------------------------------- #
166 |
167 |
168 | class AsyncOpenAIClient:
169 | """
170 | Asynchronous OpenAI client.
171 |
172 | Example:
173 | client = AsyncOpenAIClient()
174 | result = await client.response(
175 | model="gpt-3.5-turbo",
176 | messages=[{"role": "user", "content": "Ping"}],
177 | )
178 | """
179 |
180 | def __init__(self) -> None:
181 | self.client = AsyncOpenAI(api_key=os.getenv("OPENAI_API_KEY"))
182 |
183 | async def response(self, count_token: bool = False, **kwargs):
184 | """
185 | Async wrapper around `chat.completions.create`.
186 |
187 | Behavior mirrors `OpenAIClient.response`.
188 | """
189 | response = await acompletion_with_backoff(self.client, **kwargs)
190 |
191 | tokens = {
192 | "input_tokens": response.usage.prompt_tokens,
193 | "output_tokens": response.usage.completion_tokens,
194 | }
195 |
196 | if "response_format" in kwargs:
197 | return (response.choices[0].message.parsed, tokens) if count_token else response.choices[0].message.parsed
198 |
199 | return (response.choices[0].message.content, tokens) if count_token else response.choices[0].message.content
200 |
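201 | # Illustrative token-accounting usage (a sketch; the model and prompt are
202 | # assumptions):
203 | #
204 | #   client = OpenAIClient()
205 | #   text, tokens = client.response(
206 | #       count_token=True,
207 | #       model="gpt-4o",
208 | #       messages=[{"role": "user", "content": "Ping"}],
209 | #   )
210 | #   # tokens -> {"input_tokens": <int>, "output_tokens": <int>}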
--------------------------------------------------------------------------------
/mind2web2/prompts/cache_prompts.py:
--------------------------------------------------------------------------------
1 | llm_extraction_prompts = """You are responsible for extracting all unique website URLs appearing in the text provided by users.
2 |
3 | GENERAL RULES:
4 | 1. **Do not** create, omit, or invent any URL. Extract only unique URLs mentioned in the provided text.
5 | 2. If no URL exists, return `null` (JSON value).
6 | 3. Always include full URLs with protocol. If protocol is missing, prepend `http://`.
7 | 4. Ignore obviously invalid or malformed URLs.
8 |
9 | SPECIAL ATTENTION - Look for these hard-to-find URLs:
10 | - Domain names without http/https protocol (e.g., "example.com", "www.site.org")
11 | - URLs embedded in prose text without clear formatting
12 | - Partial URLs that need protocol completion
13 | - URLs in quotes, parentheses, or other punctuation
14 | - URLs that may be split across lines or have unusual formatting
15 | """
--------------------------------------------------------------------------------
/mind2web2/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .cache_filesys import CacheFileSys
2 | from .logging_setup import create_logger, cleanup_logger, create_sub_logger
3 | from .path_config import PathConfig
4 | from .page_info_retrieval import PageManager
5 | from .load_eval_script import load_eval_script
6 | from .misc import (
7 | normalize_url_markdown,
8 | text_dedent,
9 | strip_extension,
10 | encode_image,
11 | encode_image_buffer,
12 | extract_doc_description,
13 | extract_doc_description_from_frame,
14 | )
15 |
16 | __all__ = [
17 | "CacheFileSys",
18 | "create_logger",
19 | "cleanup_logger",
20 | "create_sub_logger",
21 | "PathConfig",
22 | "PageManager",
23 | "load_eval_script",
24 | "normalize_url_markdown",
25 | "text_dedent",
26 | "strip_extension",
27 | "encode_image",
28 | "encode_image_buffer",
29 | "extract_doc_description",
30 | "extract_doc_description_from_frame",
31 | ]
32 |
--------------------------------------------------------------------------------
/mind2web2/utils/cache_filesys.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import os
4 | import json
5 | import hashlib
6 | import base64
7 | from typing import Literal, List, Dict, Any, Optional, Tuple
8 | from urllib.parse import urldefrag, quote, unquote, quote_plus
9 | from functools import lru_cache
10 | from PIL import Image
11 | import io
12 | from .url_tools import normalize_url_simple, remove_utm_parameters
13 |
14 | ContentType = Literal["web", "pdf"]
15 |
16 |
17 | class CacheFileSys:
18 | """Single-task file system cache with lazy loading.
19 |
20 | Each instance handles one task's cached content. URLs are stored as either
21 | 'web' (text + screenshot) or 'pdf'. Files are named using URL hashes.
22 |
23 | Directory structure:
24 | task_dir/
25 | ├── index.json # {"url1": "web", "url2": "pdf"}
26 |         ├── <url_hash>.txt   # text content (web)
27 |         ├── <url_hash>.jpg   # screenshot (web)
28 |         ├── <url_hash>.pdf   # pdf content
29 | └── ...
30 | """
31 |
32 | def __init__(self, task_dir: str):
33 | """Initialize cache for a single task.
34 |
35 | Args:
36 | task_dir: Directory path for this specific task's cache
37 | """
38 | self.task_dir = os.path.abspath(task_dir)
39 | self.index_file = os.path.join(self.task_dir, "index.json")
40 | self.urls: Dict[str, ContentType] = {} # url -> "web"/"pdf"
41 |
42 | # Create task directory if it doesn't exist
43 | os.makedirs(self.task_dir, exist_ok=True)
44 |
45 | # Load index immediately
46 | self._load_index()
47 |
48 | def _get_url_hash(self, url: str) -> str:
49 | """Generate consistent hash for URL to use as filename."""
50 | normalized_url = self._remove_frag_and_slash(url)
51 | return hashlib.md5(normalized_url.encode('utf-8')).hexdigest()
52 |
53 | def _remove_frag_and_slash(self, url: str) -> str:
54 | """Normalize URL to a consistent format for storage"""
55 | url_no_frag, _ = urldefrag(url)
56 | decoded = unquote(url_no_frag)
57 | if decoded.endswith('/') and len(decoded) > 1 and not decoded.endswith('://'):
58 | decoded = decoded[:-1]
59 | return decoded
60 |
61 | @lru_cache(maxsize=1000)
62 | def _get_url_variants(self, url: str) -> List[str]:
63 | """Generate all possible variants of URL for matching."""
64 | #TODO: remove UTM SOURCE, or add CHATGPT/OPENAI
65 | #TODO: probably want to
66 | def swap_scheme(u: str):
67 | if u.startswith("http://"):
68 | return "https://" + u[7:]
69 | if u.startswith("https://"):
70 | return "http://" + u[8:]
71 | return None
72 |
73 | url_no_frag, _ = urldefrag(url)
74 | base_urls: set[str] = {
75 | url, url_no_frag, remove_utm_parameters(url), remove_utm_parameters(url_no_frag),
76 | f"{url}?utm_source=chatgpt.com", f"{url_no_frag}?utm_source=chatgpt.com",
77 | f"{url}?utm_source=openai.com", f"{url_no_frag}?utm_source=openai.com",
78 | }
79 |
80 | if not url.endswith("/"):
81 | base_urls.add(f"{url}/?utm_source=chatgpt.com")
82 | if not url_no_frag.endswith("/"):
83 | base_urls.add(f"{url_no_frag}/?utm_source=chatgpt.com")
84 |
85 | if not url.endswith("/"):
86 | base_urls.add(f"{url}/?utm_source=openai.com")
87 | if not url_no_frag.endswith("/"):
88 | base_urls.add(f"{url_no_frag}/?utm_source=openai.com")
89 |
90 | if url.startswith("http://www."):
91 | base_urls.add("http://" + url[11:])
92 | elif url.startswith("https://www."):
93 | base_urls.add("https://" + url[12:])
94 | else: #TODO: how do we handle this?
95 | pass
96 |
97 | for u in list(base_urls):
98 | swapped = swap_scheme(u)
99 | if swapped:
100 | base_urls.add(swapped)
101 |
102 | variants = []
103 | for base_url in base_urls:
104 | try:
105 | original = base_url
106 | encoded_default = quote(base_url)
107 | encoded_basic = quote(base_url, safe=':/?#')
108 | encoded_common = quote(base_url, safe=':/?#@!$&\'*+,;=')
109 | encoded_brackets = quote(base_url, safe=':/?#[]@!$&\'*+,;=')
110 | encoded_rfc = quote(base_url, safe=':/?#[]@!$&\'()*+,;=')
111 | encoded_minimal = quote(base_url, safe=':/')
112 | encoded_plus = quote_plus(base_url, safe=':/?#[]@!$&\'()*+,;=')
113 | decoded_url = unquote(base_url)
114 |
115 | encoding_variants = [
116 | original, encoded_default, encoded_basic, encoded_common,
117 | encoded_brackets, encoded_rfc, encoded_minimal, encoded_plus, decoded_url
118 | ]
119 |
120 | for url_variant in encoding_variants:
121 | variants.append(url_variant)
122 | if url_variant.endswith("/") and len(url_variant) > 1 and not url_variant.endswith('://'):
123 | variants.append(url_variant[:-1])
124 | elif not url_variant.endswith('/'):
125 | variants.append(url_variant + "/")
126 | except Exception:
127 | variants.append(base_url)
128 | if base_url.endswith("/") and len(base_url) > 1 and not base_url.endswith('://'):
129 | variants.append(base_url[:-1])
130 | elif not base_url.endswith('/'):
131 | variants.append(base_url + "/")
132 |
133 | # Deduplicate while maintaining order
134 | seen = set()
135 | unique_variants = []
136 | for variant in variants:
137 | if variant not in seen:
138 | seen.add(variant)
139 | unique_variants.append(variant)
140 | return unique_variants
141 |
142 | def _load_index(self):
143 | """Load the index file and verify file integrity."""
144 | if os.path.exists(self.index_file):
145 | try:
146 | with open(self.index_file, 'r', encoding='utf-8') as f:
147 | loaded_urls = json.load(f) # Direct load: {url: type}
148 | except (IOError, json.JSONDecodeError) as e:
149 | print(f"Warning: Failed to load index: {e}. Starting with empty index.")
150 | loaded_urls = {}
151 | else:
152 | loaded_urls = {}
153 |
154 | # Verify file integrity and keep only URLs with existing files
155 | self.urls = {}
156 | for url, content_type in loaded_urls.items():
157 | url_hash = self._get_url_hash(url)
158 | files_exist = True
159 |
160 | if content_type == "web":
161 | text_file = os.path.join(self.task_dir, f"{url_hash}.txt")
162 | screenshot_file = os.path.join(self.task_dir, f"{url_hash}.jpg")
163 | if not (os.path.exists(text_file) and os.path.exists(screenshot_file)):
164 | files_exist = False
165 | elif content_type == "pdf":
166 | pdf_file = os.path.join(self.task_dir, f"{url_hash}.pdf")
167 | if not os.path.exists(pdf_file):
168 | files_exist = False
169 |
170 | if files_exist:
171 | self.urls[url] = content_type
172 | else:
173 | print(f"Warning: Missing files for URL {url}, removing from index")
174 |
175 | def _find_url(self, url: str) -> Optional[str]:
176 | """Find stored URL that matches input URL (handling variants)."""
177 |
178 | # Direct lookup
179 | if url in self.urls:
180 | return url
181 |
182 | # Try normalized
183 | normalized = normalize_url_simple(url)
184 | if normalized in self.urls:
185 | return normalized
186 |
187 |
188 | # Reverse search - check if any stored URL normalizes to same as input
189 | normalized_input = normalize_url_simple(url)
190 | for stored_url in self.urls:
191 | try:
192 | if normalize_url_simple(stored_url) == normalized_input:
193 | return stored_url
194 | except Exception:
195 | continue
196 |
197 | # Try all variants
198 | variants = self._get_url_variants(url)
199 | for variant in variants:
200 | if variant in self.urls:
201 | return variant
202 |
203 | return None
204 |
205 | def _convert_image_to_jpg(self, image_data: str | bytes, quality: int = 85) -> bytes:
206 | """Convert image data to JPG format for storage efficiency."""
207 | try:
208 | if isinstance(image_data, str):
209 | if image_data.startswith('data:image/'):
210 | image_data = image_data.split(',', 1)[1]
211 | image_bytes = base64.b64decode(image_data)
212 | else:
213 | image_bytes = image_data
214 |
215 | image = Image.open(io.BytesIO(image_bytes))
216 |
217 | # Convert to RGB if necessary
218 | if image.mode in ('RGBA', 'LA', 'P'):
219 | background = Image.new('RGB', image.size, (255, 255, 255))
220 | if image.mode == 'P':
221 | image = image.convert('RGBA')
222 | if image.mode in ('RGBA', 'LA'):
223 | background.paste(image, mask=image.split()[-1])
224 | image = background
225 | elif image.mode != 'RGB':
226 | image = image.convert('RGB')
227 |
228 | jpg_buffer = io.BytesIO()
229 | image.save(jpg_buffer, format='JPEG', quality=quality, optimize=True)
230 | return jpg_buffer.getvalue()
231 |
232 | except Exception as e:
233 | print(f"Error converting image to JPG: {e}")
234 | if isinstance(image_data, str):
235 | if image_data.startswith('data:image/'):
236 | image_data = image_data.split(',', 1)[1]
237 | return base64.b64decode(image_data)
238 | return image_data
239 |
240 | # Public API methods
241 | def put_web(self, url: str, text: str, screenshot: str | bytes):
242 | """Store web page content (text + screenshot)."""
243 | url_hash = self._get_url_hash(url)
244 |
245 | # Save text file
246 | text_file = os.path.join(self.task_dir, f"{url_hash}.txt")
247 | with open(text_file, 'w', encoding='utf-8') as f:
248 | f.write(text)
249 |
250 | # Convert and save screenshot as JPG
251 | jpg_data = self._convert_image_to_jpg(screenshot)
252 | screenshot_file = os.path.join(self.task_dir, f"{url_hash}.jpg")
253 | with open(screenshot_file, 'wb') as f:
254 | f.write(jpg_data)
255 |
256 |         # Update index (safe because each async task handles a different URL)
257 | self.urls[self._remove_frag_and_slash(url)] = "web"
258 |
259 | def put_pdf(self, url: str, pdf_bytes: bytes):
260 | """Store PDF content."""
261 | url_hash = self._get_url_hash(url)
262 |
263 | # Save PDF file
264 | pdf_file = os.path.join(self.task_dir, f"{url_hash}.pdf")
265 | with open(pdf_file, 'wb') as f:
266 | f.write(pdf_bytes)
267 |
268 |         # Update index (safe because each async task handles a different URL)
269 | self.urls[self._remove_frag_and_slash(url)] = "pdf"
270 |
271 |     def get_web(self, url: str, get_screenshot: bool = True) -> Tuple[str, Optional[bytes]]:
272 | """Get web page content (text, screenshot_bytes). Raises error if not found."""
273 | stored_url = self._find_url(url)
274 | if not stored_url or self.urls[stored_url] != "web":
275 | raise KeyError(f"No web content found for URL: {url}")
276 |
277 | url_hash = self._get_url_hash(stored_url)
278 |
279 | # Load text (files are guaranteed to exist due to integrity check)
280 | text_file = os.path.join(self.task_dir, f"{url_hash}.txt")
281 | with open(text_file, 'r', encoding='utf-8') as f:
282 | text = f.read()
283 |
284 | # Load screenshot
285 | if get_screenshot:
286 | screenshot_file = os.path.join(self.task_dir, f"{url_hash}.jpg")
287 | with open(screenshot_file, 'rb') as f:
288 | screenshot_bytes = f.read()
289 | else:
290 | screenshot_bytes = None
291 |
292 | return text, screenshot_bytes
293 |
294 | def get_pdf(self, url: str) -> bytes:
295 | """Get PDF content. Raises error if not found."""
296 | stored_url = self._find_url(url)
297 | if not stored_url or self.urls[stored_url] != "pdf":
298 | raise KeyError(f"No PDF content found for URL: {url}")
299 |
300 | url_hash = self._get_url_hash(stored_url)
301 |
302 | # Load PDF (file is guaranteed to exist due to integrity check)
303 | pdf_file = os.path.join(self.task_dir, f"{url_hash}.pdf")
304 | with open(pdf_file, 'rb') as f:
305 | return f.read()
306 |
307 | def has(self, url: str) -> ContentType | None:
308 | """Check what type of content exists for URL.
309 |
310 | Returns:
311 | "web" if web content exists
312 | "pdf" if PDF content exists
313 | None if no content exists
314 | """
315 | stored_url = self._find_url(url)
316 | if stored_url is not None:
317 | return self.urls[stored_url]
318 | return None
319 |
320 | def has_web(self, url: str) -> bool:
321 | """Check if web content exists for URL."""
322 | return self.has(url) == "web"
323 |
324 | def has_pdf(self, url: str) -> bool:
325 | """Check if PDF content exists for URL."""
326 | return self.has(url) == "pdf"
327 |
328 | def get_all_urls(self) -> List[str]:
329 | """Get all stored URLs."""
330 | return list(self.urls.keys())
331 |
332 | def summary(self) -> Dict[str, Any]:
333 | """Get cache summary."""
334 | web_count = sum(1 for content_type in self.urls.values() if content_type == "web")
335 | pdf_count = sum(1 for content_type in self.urls.values() if content_type == "pdf")
336 |
337 | return {
338 | "total_urls": len(self.urls),
339 | "web_pages": web_count,
340 | "pdf_pages": pdf_count,
341 | }
342 |
343 | def save(self):
344 | """Save the index to disk."""
345 | with open(self.index_file, 'w', encoding='utf-8') as f:
346 | json.dump(self.urls, f, indent=2, ensure_ascii=False) # Direct save: {url: type}
347 |
348 | def clear(self):
349 | """Clear all cached content."""
350 | if os.path.exists(self.task_dir):
351 | import shutil
352 | shutil.rmtree(self.task_dir)
353 | os.makedirs(self.task_dir, exist_ok=True)
354 |
355 | self.urls.clear()
356 | self._get_url_variants.cache_clear()
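357 |
358 |
359 | # Illustrative usage (a sketch; the directory, URL and payloads are made up;
360 | # `screenshot_bytes` is a hypothetical variable holding real JPEG/PNG bytes or a
361 | # base64 string):
362 | #
363 | #   cache = CacheFileSys("cache/some_task")
364 | #   cache.put_web("https://example.com", text="page text", screenshot=screenshot_bytes)
365 | #   cache.save()
366 | #   if cache.has_web("https://example.com/"):  # trailing-slash variant still matches
367 | #       text, screenshot = cache.get_web("https://example.com")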
--------------------------------------------------------------------------------
/mind2web2/utils/load_eval_script.py:
--------------------------------------------------------------------------------
1 | """
2 | Utilities for dynamically loading an evaluation script and returning its
3 | `evaluate_answer` coroutine function.
4 |
5 | Usage
6 | -----
7 | from mind2web2.utils.load_eval_script import load_eval_script
8 |
9 | eval_fn = load_eval_script("/path/to/my_eval_script.py")
10 | result = await eval_fn(...)
11 | """
12 |
13 | import importlib.util
14 | import sys
15 | import uuid
16 | import inspect
17 | import asyncio
18 | from pathlib import Path
19 | from types import ModuleType
20 |
21 |
22 | def load_eval_script(path: str):
23 | """
24 | Load an external evaluation script and return its `evaluate_answer`
25 | coroutine function.
26 |
27 | Parameters
28 | ----------
29 | path : str
30 | Filesystem path to the Python script that defines `async def evaluate_answer(...)`.
31 |
32 | Returns
33 | -------
34 | Callable
35 | A reference to the `evaluate_answer` coroutine function.
36 |
37 | Raises
38 | ------
39 | FileNotFoundError
40 | If the file does not exist.
41 | ImportError
42 | If the module spec cannot be created.
43 | AttributeError
44 | If `evaluate_answer` is missing.
45 | TypeError
46 | If `evaluate_answer` is not an async function or has an invalid signature.
47 | """
48 | path = Path(path).expanduser().resolve()
49 | if not path.exists():
50 | raise FileNotFoundError(path)
51 |
52 | # Generate a unique module name to avoid namespace collisions.
53 | module_name = f"mind2web2_dynamic_{uuid.uuid4().hex}"
54 | spec = importlib.util.spec_from_file_location(module_name, str(path))
55 | if spec is None or spec.loader is None:
56 | raise ImportError(f"Could not create module spec for {path}")
57 |
58 | module: ModuleType = importlib.util.module_from_spec(spec) # type: ignore[arg-type]
59 | # Register the module so that any relative imports inside the script work.
60 | sys.modules[module_name] = module
61 | spec.loader.exec_module(module) # type: ignore[union-attr]
62 |
63 | # --------------------------------------------------------------------- #
64 | # Validate the presence and signature of `evaluate_answer`. #
65 | # --------------------------------------------------------------------- #
66 | if not hasattr(module, "evaluate_answer"):
67 | raise AttributeError(f"{path} does not define `evaluate_answer`")
68 |
69 | evaluate_answer = module.evaluate_answer # type: ignore[attr-defined]
70 |
71 | if not asyncio.iscoroutinefunction(evaluate_answer):
72 | raise TypeError("`evaluate_answer` must be defined with `async def`")
73 |
74 | required_params = {
75 | "client",
76 | "answer",
77 | "agent_name",
78 | "answer_name",
79 | "cache",
80 | "semaphore",
81 | "logger",
82 | }
83 | sig = inspect.signature(evaluate_answer)
84 | missing = required_params - set(sig.parameters)
85 | if missing:
86 | raise TypeError(
87 | f"`evaluate_answer` is missing required parameters: {', '.join(sorted(missing))}"
88 | )
89 |
90 | return evaluate_answer
91 |
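92 | # Minimal sketch of a script that would satisfy the checks above (the body is an
93 | # assumption; only `async def` plus the parameter names are actually required):
94 | #
95 | #   # my_eval_script.py
96 | #   async def evaluate_answer(client, answer, agent_name, answer_name, cache,
97 | #                             semaphore, logger):
98 | #       logger.info("evaluating %s/%s", agent_name, answer_name)
99 | #       return {"score": 1.0}  # return shape is an assumption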
--------------------------------------------------------------------------------
/mind2web2/utils/logging_setup.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | import os
4 | import json
5 | import threading
6 | from logging import Logger, StreamHandler
7 | from logging.handlers import TimedRotatingFileHandler
8 | from datetime import datetime
9 | from pythonjsonlogger import jsonlogger
10 | from typing import Literal, Optional
11 |
12 | # Globally shared error handler, used by all answer loggers
13 | _shared_error_handler = None
14 | _handler_lock = threading.Lock()
15 |
16 |
17 | class ColoredStructuredFormatter(logging.Formatter):
18 | """Colored structured log formatter."""
19 |
20 | COLORS = {
21 | 'DEBUG': '\033[36m', # Cyan
22 | 'INFO': '\033[32m', # Green
23 | 'WARNING': '\033[33m', # Yellow
24 | 'ERROR': '\033[31m', # Red
25 | 'RESET': '\033[0m'
26 | }
27 |
28 | def format(self, record):
29 | # Use a special format for verification operations
30 | if hasattr(record, 'op_id'):
31 | op_id = record.op_id
32 | level_color = self.COLORS.get(record.levelname, '')
33 | reset = self.COLORS['RESET']
34 |
35 | # Build main message - remove duplicate levelname
36 | msg_parts = [
37 | f"{level_color}[{op_id}]{reset}"
38 | ]
39 |
40 | # Add node info
41 | if hasattr(record, 'node_id') and record.node_id:
42 | msg_parts.append(f"Node({record.node_id})")
43 |
44 | # Add verification type
45 | if hasattr(record, 'verify_type'):
46 | msg_parts.append(f"<{record.verify_type}>")
47 |
48 | # Add main message
49 | msg_parts.append(record.getMessage())
50 |
51 | # Build detailed info (indented display)
52 | details = []
53 |
54 | if hasattr(record, 'node_desc') and record.node_desc:
55 | details.append(f" 📋 Description: {record.node_desc}")
56 |
57 | if hasattr(record, 'url') and record.url:
58 | details.append(f" 🔗 URL: {record.url}")
59 |
60 | if hasattr(record, 'claim_preview'):
61 | details.append(f" 💬 Claim: {record.claim_preview}")
62 |
63 | if hasattr(record, 'reasoning') and record.reasoning:
64 | reasoning = record.reasoning
65 | # if len(reasoning) > 200:
66 | # reasoning = reasoning[:200] + "..."
67 | details.append(f" 💭 Reasoning: {reasoning}")
68 |
69 | if hasattr(record, 'result'):
70 | result_str = "✅ PASS" if record.result else "❌ FAIL"
71 | details.append(f" 📊 Result: {result_str}")
72 |
73 | # Combine all parts
74 | full_msg = " ".join(msg_parts)
75 | if details:
76 | full_msg += "\n" + "\n".join(details)
77 |
78 | return full_msg
79 |
80 | # For other logs, use standard format - show level only for ERROR/WARNING
81 | level_indicator = ""
82 | if record.levelname == 'ERROR':
83 | level_indicator = f"{self.COLORS['ERROR']}[ERROR]{self.COLORS['RESET']} "
84 | elif record.levelname == 'WARNING':
85 | level_indicator = f"{self.COLORS['WARNING']}[WARN]{self.COLORS['RESET']} "
86 |
87 | return f"{level_indicator}{record.getMessage()}"
88 |
89 |
90 | class ErrorWithContextFormatter(logging.Formatter):
91 | """Formatter specialized for errors, adding context information."""
92 |
93 | COLORS = {
94 | 'ERROR': '\033[31m', # Red
95 | 'WARNING': '\033[33m', # Yellow
96 | 'RESET': '\033[0m'
97 | }
98 |
99 | def format(self, record):
100 | level_color = self.COLORS.get(record.levelname, '')
101 | reset = self.COLORS['RESET']
102 |
103 | # Build context information
104 | context_parts = []
105 |
106 | # Add agent and answer information
107 | if hasattr(record, 'agent_name') and record.agent_name:
108 | context_parts.append(f"Agent:{record.agent_name}")
109 | if hasattr(record, 'answer_name') and record.answer_name:
110 | context_parts.append(f"Answer:{record.answer_name}")
111 | if hasattr(record, 'node_id') and record.node_id:
112 | context_parts.append(f"Node:{record.node_id}")
113 | if hasattr(record, 'op_id') and record.op_id:
114 | context_parts.append(f"Op:{record.op_id}")
115 |
116 | context_str = " | ".join(context_parts)
117 | context_prefix = f"[{context_str}] " if context_str else ""
118 |
119 | return f"{level_color}[{record.levelname}]{reset} {context_prefix}{record.getMessage()}"
120 |
121 |
122 | class HumanReadableFormatter(logging.Formatter):
123 | """Human-readable file log format, keep emojis."""
124 |
125 | def format(self, record):
126 | # Timestamp - second precision
127 | timestamp = self.formatTime(record, '%Y-%m-%d %H:%M:%S')
128 |
129 |         # Basic info - only show the level prefix for important severities
130 | level_prefix = ""
131 | if record.levelname in ['ERROR', 'WARNING']:
132 | level_prefix = f"[{record.levelname}] "
133 |
134 | base_info = f"[{timestamp}] {level_prefix}{record.getMessage()}"
135 |
136 | # Add structured fields
137 | extras = []
138 | skip_fields = {
139 | 'name', 'msg', 'args', 'levelname', 'levelno', 'pathname',
140 | 'filename', 'module', 'lineno', 'funcName', 'created',
141 | 'msecs', 'relativeCreated', 'thread', 'threadName',
142 | 'processName', 'process', 'getMessage', 'exc_info',
143 | 'exc_text', 'stack_info', 'message'
144 | }
145 |
146 | for key, value in record.__dict__.items():
147 | if key not in skip_fields and value is not None:
148 | # Special handling for some fields
149 | if key == 'final_score' and isinstance(value, (int, float)):
150 | extras.append(f"score={value}")
151 | elif key == 'agent_name':
152 | extras.append(f"agent={value}")
153 | elif key == 'node_id':
154 | extras.append(f"node={value}")
155 | elif key == 'op_id':
156 | extras.append(f"op={value}")
157 | else:
158 | extras.append(f"{key}={value}")
159 |
160 | if extras:
161 | base_info += f" | {' | '.join(extras)}"
162 |
163 | return base_info
164 |
165 |
166 | class CompactJsonFormatter(jsonlogger.JsonFormatter):
167 | """Compact JSON formatter that removes redundant fields."""
168 |
169 | def add_fields(self, log_record, record, message_dict):
170 | super().add_fields(log_record, record, message_dict)
171 |
172 | # Remove unnecessary fields
173 | fields_to_remove = ['name', 'levelname']
174 | for field in fields_to_remove:
175 | log_record.pop(field, None)
176 |
177 | # Simplify time format to seconds
178 | if 'asctime' in log_record:
179 | try:
180 | asctime = log_record['asctime']
181 | if ',' in asctime:
182 | log_record['asctime'] = asctime.split(',')[0]
183 |             except Exception:
184 | pass
185 |
186 |
187 | def _get_shared_error_handler() -> StreamHandler:
188 | """Get or create the globally shared error handler."""
189 | global _shared_error_handler
190 |
191 | with _handler_lock:
192 | if _shared_error_handler is None:
193 | _shared_error_handler = StreamHandler(sys.stderr) # Use stderr for errors
194 | _shared_error_handler.setFormatter(ErrorWithContextFormatter())
195 | _shared_error_handler.setLevel(logging.ERROR) # Show only ERROR level
196 |
197 | return _shared_error_handler
198 |
199 |
200 | def create_logger(
201 | lgr_nm: str,
202 | log_folder: str,
203 | enable_console: bool = True,
204 | file_format: Literal["jsonl", "readable", "both"] = "both",
205 | enable_shared_errors: bool = False # New parameter
206 | ) -> tuple[Logger, str]:
207 | """
208 | Create an independent logger instance, supporting multiple file formats.
209 |
210 | Args:
211 | lgr_nm: Logger name
212 | log_folder: Log folder
213 | enable_console: Whether to enable console output
214 | file_format: File log format
215 | enable_shared_errors: Whether to output ERROR-level logs to the shared terminal
216 |
217 | Returns:
218 | (logger instance, timestamp)
219 | """
220 | if not os.path.exists(log_folder):
221 | os.makedirs(log_folder)
222 |
223 | current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
224 |
225 | # Create a unique logger name to avoid duplication
226 | unique_logger_name = f"{lgr_nm}_{current_time}_{id(log_folder)}"
227 |
228 | # If a logger already exists, clean it up first
229 | existing_logger = logging.getLogger(unique_logger_name)
230 | if existing_logger.handlers:
231 | for handler in existing_logger.handlers[:]:
232 | existing_logger.removeHandler(handler)
233 | handler.close()
234 |
235 | # Create a new logger
236 | new_logger = logging.getLogger(unique_logger_name)
237 | new_logger.setLevel(logging.DEBUG)
238 | new_logger.propagate = False
239 |
240 | # File handlers
241 | if file_format in ["jsonl", "both"]:
242 | # JSON Lines format
243 | jsonl_file = os.path.join(log_folder, f"{current_time}_{lgr_nm}.jsonl")
244 | jsonl_handler = TimedRotatingFileHandler(
245 | jsonl_file,
246 | when="D",
247 | backupCount=14,
248 | encoding="utf-8"
249 | )
250 | jsonl_formatter = CompactJsonFormatter('%(asctime)s %(message)s')
251 | jsonl_handler.setFormatter(jsonl_formatter)
252 | jsonl_handler.setLevel(logging.DEBUG)
253 | new_logger.addHandler(jsonl_handler)
254 |
255 | if file_format in ["readable", "both"]:
256 | # Human-readable format
257 | readable_file = os.path.join(log_folder, f"{current_time}_{lgr_nm}.log")
258 | readable_handler = TimedRotatingFileHandler(
259 | readable_file,
260 | when="D",
261 | backupCount=14,
262 | encoding="utf-8"
263 | )
264 | readable_formatter = HumanReadableFormatter()
265 | readable_handler.setFormatter(readable_formatter)
266 | readable_handler.setLevel(logging.DEBUG)
267 | new_logger.addHandler(readable_handler)
268 |
269 | # Console handler - use colored structured format
270 | if enable_console:
271 | console_handler = StreamHandler(sys.stdout)
272 | console_handler.setFormatter(ColoredStructuredFormatter())
273 | console_handler.setLevel(logging.INFO)
274 | new_logger.addHandler(console_handler)
275 |
276 | # Shared error handler - for displaying errors during parallel execution
277 | if enable_shared_errors:
278 | shared_error_handler = _get_shared_error_handler()
279 | new_logger.addHandler(shared_error_handler)
280 |
281 | return new_logger, current_time
282 |
283 |
284 | def create_sub_logger(parent_logger: Logger, sub_name: str) -> Logger:
285 | """
286 |     Create a sub-logger based on the parent logger; records propagate to the parent's handlers.
287 |     Used to create hierarchical logs within the same evaluation.
288 | """
289 | parent_name = parent_logger.name
290 | sub_logger_name = f"{parent_name}.{sub_name}"
291 |
292 | sub_logger = logging.getLogger(sub_logger_name)
293 | sub_logger.setLevel(parent_logger.level)
294 | sub_logger.propagate = True # Allow propagation to parent logger
295 |
296 | return sub_logger
297 |
298 |
299 | def cleanup_logger(logger: Logger) -> None:
300 | """Clean up all handlers of the logger (but not the shared error handler)."""
301 | global _shared_error_handler
302 |
303 | for handler in logger.handlers[:]:
304 | # Do not clean up the shared error handler
305 | if handler is not _shared_error_handler:
306 | logger.removeHandler(handler)
307 | handler.close()
308 | else:
309 | logger.removeHandler(handler) # Remove only, do not close
310 |
311 |
312 | def cleanup_shared_error_handler():
313 | """Clean up the shared error handler at program end."""
314 | global _shared_error_handler
315 |
316 | with _handler_lock:
317 | if _shared_error_handler is not None:
318 | _shared_error_handler.close()
319 | _shared_error_handler = None
320 |
321 |
322 | # Usage examples and notes
323 | """
324 | How to use in the evaluation runner:
325 |
326 | 1. Main logger — normal console output:
327 | main_logger, timestamp = create_logger("main_task", log_folder, enable_console=True)
328 |
329 | 2. Per-answer loggers — errors are shown in the terminal:
330 | logger, timestamp = create_logger(
331 | log_tag,
332 | str(log_dir),
333 | enable_console=False, # Do not enable regular console output
334 | enable_shared_errors=True # Enable shared error output
335 | )
336 |
337 | This results in:
338 | - Primary progress information shown in the main terminal
339 | - Each answer's ERROR-level messages also shown in the terminal (with context)
340 | - All detailed logs still saved to their respective files
341 |
342 | Example terminal output:
343 | 🚀 Starting concurrent evaluation of 10 answers
344 | 👉 Processing human/answer_1.md
345 | [ERROR] [Agent:human | Answer:answer_1.md | Node:price_check] Failed to verify price claim
346 | 👉 Processing openai_deep_research/answer_1.md
347 | ✅ Successfully evaluated human/answer_1.md
348 | """
349 |
--------------------------------------------------------------------------------
/mind2web2/utils/misc.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import textwrap
3 | from os import PathLike
4 | import os
5 | import re
6 | import inspect
7 |
8 |
9 | def normalize_url_markdown(url: str) -> str:
10 | """Process URLs extracted from markdown, remove escape characters"""
11 |
12 | # Remove leading and trailing whitespace
13 | url = url.strip()
14 |
15 | # Remove escape backslashes before common markdown characters
16 | url = re.sub(r'\\([_()[\]*#!&?])', r'\1', url)
17 |
18 | return url
19 |
20 | def text_dedent(multi_line_str: str) -> str:
21 | """
22 | abbreviation for removing superfluous start-of-line indenting from multi-line strings
23 | :param multi_line_str: a string value from a multi-line string expression
24 | :return: the multi-line string with any start-of-line whitespace that all lines have removed,
25 | plus any starting and ending newlines removed
26 | """
27 | return textwrap.dedent(multi_line_str).strip()
28 |
29 |
30 | def strip_extension(filename):
31 | """
32 | Removes the file extension from a filename or file path.
33 |
34 | Args:
35 | filename (str): The file name or path.
36 |
37 | Returns:
38 | str: The file name or path without the extension.
39 | """
40 | return os.path.splitext(filename)[0]
41 |
42 |
43 | def encode_image(image_path: str|PathLike) -> str:
44 | """
45 | credit to OpenAI docs
46 | :param image_path: path of image file to convert to base-64-encoded string
47 | :return: a base-64-encoded string version of the image file
48 | """
49 | with open(image_path, "rb") as image_file:
50 | return base64.b64encode(image_file.read()).decode('utf-8')
51 |
52 | def encode_image_buffer(buffer: bytes) -> str:
53 | """
54 | credit to OpenAI docs
55 |     :param buffer: raw image bytes to convert to a base-64-encoded string
56 |     :return: a base-64-encoded string version of the image bytes
57 | """
58 | return base64.b64encode(buffer).decode('utf-8')
59 |
60 |
61 | def _get_doc_from_frame(frame):
62 | co = frame.f_code
63 | name = co.co_name
64 | func = frame.f_globals.get(name)
65 | if (inspect.isfunction(func) or inspect.ismethod(func)) and func.__doc__:
66 | return inspect.getdoc(func)
67 | self_obj = frame.f_locals.get("self")
68 | if self_obj:
69 | cls = type(self_obj)
70 | meth = getattr(cls, name, None)
71 | if (inspect.isfunction(meth) or inspect.ismethod(meth)) and meth.__doc__:
72 | return inspect.getdoc(meth)
73 | consts = co.co_consts
74 | if consts and isinstance(consts[0], str):
75 | return consts[0]
76 | return None
77 |
78 | def extract_doc_description(doc: str) -> str:
79 | """
80 | Given a full docstring, return only the desc part,
81 | i.e. all lines up until the first section header like
82 | 'Parameters:', 'Returns:', etc.
83 | """
84 | if not doc:
85 | return ""
86 | lines = doc.splitlines()
87 | desc_lines = []
88 | section_rx = re.compile(r'^(?:Args?|Parameters?|Returns?|Yields?|Raises?):')
89 | for line in lines:
90 | if section_rx.match(line):
91 | break
92 | desc_lines.append(line)
93 |     # strip leading/trailing blank lines, then re-join
94 | return "\n".join(desc_lines).strip()
95 |
96 | def extract_doc_description_from_frame(frame) -> str:
97 | """
98 | Given a frame object, return the desc part of the docstring
99 | of the function or method that the frame is in.
100 | """
101 | doc = _get_doc_from_frame(frame)
102 | return extract_doc_description(doc)
103 |
--------------------------------------------------------------------------------
/mind2web2/utils/page_info_retrieval.py:
--------------------------------------------------------------------------------
1 | # Standard library imports
2 | import asyncio
3 | import base64
4 | import hashlib
5 | import random
6 | import time
7 | from io import BytesIO
8 | from logging import Logger
9 | from pathlib import Path
10 | from typing import Optional, Tuple, Union
11 |
12 | # Third-party imports
13 | from PIL import Image
14 | from rebrowser_playwright.async_api import (
15 | Browser,
16 | BrowserContext,
17 | Page,
18 | async_playwright,
19 | )
20 |
21 |
22 | import html2text
23 |
24 | def html_to_markdown(html: str) -> str:
25 | """Convert HTML to Markdown."""
26 | h = html2text.HTML2Text()
27 | h.ignore_links = True # Ignore hyperlinks
28 | h.ignore_emphasis = True # Ignore bold/italic emphasis
29 | h.images_to_alt = True # Convert images to alt text
30 | h.body_width = 0
31 | return h.handle(html)
32 |
33 |
34 |
35 | # ================================ Constants ================================
36 |
37 | def make_blank_png_b64() -> str:
38 | # Create 1×1 RGBA fully transparent pixel
39 | img = Image.new("RGBA", (1, 1), (0, 0, 0, 0))
40 | buf = BytesIO()
41 | img.save(buf, format="PNG")
42 | # Convert to base64 and remove line breaks
43 | return base64.b64encode(buf.getvalue()).decode()
44 |
45 |
46 | # Error handling constants
47 | BLANK_IMG_B64 = make_blank_png_b64()
48 | ERROR_TEXT = "\u26A0\ufe0f This URL could not be loaded (navigation error)."
49 |
50 |
51 | # User-agent pools
52 | DEFAULT_USER_AGENTS = [
53 | 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
54 | '(KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36',
55 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 '
56 | '(KHTML, like Gecko) Version/14.1.2 Safari/605.1.15',
57 | 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
58 | '(KHTML, like Gecko) Chrome/100.0.4896.127 Safari/537.36',
59 | ]
60 |
61 |
62 | class PageManager:
63 | """
64 |     Manage the active Page within a BrowserContext, handling new pages, closures, crashes, and navigations.
65 | """
66 |
67 | def __init__(self, context: BrowserContext, logger: Logger):
68 | self.context = context
69 | self.logger = logger
70 | self.current: Optional[Page] = None
71 | self.closing = False
72 | self._handlers = []
73 | # Listen for new page events on context
74 | handler = lambda page: asyncio.create_task(self._on_new_page(page))
75 | context.on('page', handler)
76 | self._handlers.append((context, 'page', handler))
77 | for pg in context.pages:
78 | asyncio.create_task(self._on_new_page(pg))
79 |
80 | async def _on_new_page(self, page: Page):
81 | if self.closing:
82 | return
83 | self.logger.debug(f'New page opened: {page.url}')
84 | self.current = page
85 | self._attach_handlers(page)
86 |
87 | def _attach_handlers(self, page: Page):
88 | for event in ('close', 'crash', 'framenavigated'):
89 | if event == 'close':
90 | cb = lambda: asyncio.create_task(self._on_close(page))
91 | elif event == 'crash':
92 | cb = lambda: asyncio.create_task(self._on_crash(page))
93 | else:
94 | cb = lambda frame: asyncio.create_task(self._on_navigate(page, frame))
95 | page.on(event, cb)
96 | self._handlers.append((page, event, cb))
97 |
98 | async def _on_close(self, page: Page):
99 | if self.closing:
100 | return
101 | self.logger.warning(f'Page closed: {page.url}')
102 | pages = self.context.pages
103 | if pages:
104 | await self._on_new_page(pages[-1])
105 | else:
106 | try:
107 | new_pg = await self.context.new_page()
108 | await self._on_new_page(new_pg)
109 | except Exception as e:
110 | self.logger.error(f'Failed to reopen page after close: {e}')
111 |
112 | async def _on_crash(self, page: Page):
113 | if self.closing:
114 | return
115 | self.logger.error(f'Page crashed: {page.url}, refreshing...')
116 | try:
117 | await page.reload()
118 | except Exception as e:
119 | self.logger.error(f'Reload after crash failed: {e}')
120 |
121 | async def _on_navigate(self, page: Page, frame):
122 | if self.closing:
123 | return
124 | if frame == page.main_frame:
125 | self.logger.debug(f'Frame navigated: {page.url}')
126 | self.current = page
127 |
128 | async def get(self) -> Page:
129 | if self.closing:
130 | raise RuntimeError('Context is closing')
131 | if not self.current or self.current.is_closed():
132 | # self.logger.info('No active page, creating a new one')
133 | page = await self.context.new_page()
134 | await self._on_new_page(page)
135 | return self.current
136 |
137 | def dispose(self):
138 | """Stop listening and prevent new page opens."""
139 | self.closing = True
140 | for emitter, event, cb in self._handlers:
141 | try:
142 | emitter.off(event, cb)
143 | except Exception:
144 | pass
145 | self._handlers.clear()
146 |
147 |
148 | class BatchBrowserManager:
149 | """Robust browser manager for both batch and single web content extraction.
150 |
151 | Integrates PageManager's stability features while maintaining efficiency for batch processing.
152 | Can be used as a drop-in replacement for capture_page_content_async.
153 | """
154 |
155 | def __init__(self, headless: bool = True, max_retries: int = 3, max_concurrent_pages: int = 10):
156 | self.headless = headless
157 | self.max_retries = max_retries
158 | self.max_concurrent_pages = max_concurrent_pages
159 | self.playwright = None
160 | self.browser = None
161 | self._browser_lock = asyncio.Lock()
162 | self._page_semaphore = asyncio.Semaphore(max_concurrent_pages)
163 |
164 | async def __aenter__(self):
165 | """Async context manager entry."""
166 | await self.start()
167 | return self
168 |
169 | async def __aexit__(self, exc_type, exc_val, exc_tb):
170 | """Async context manager exit."""
171 | await self.stop()
172 |
173 | async def start(self):
174 | """Initialize the browser instance."""
175 | if self.browser is None:
176 | self.playwright = await async_playwright().start()
177 | self.browser = await self.playwright.chromium.launch(
178 | headless=self.headless,
179 | args=[
180 | "--disable-blink-features=AutomationControlled",
181 | "--disable-web-security",
182 | "--disable-site-isolation-trials",
183 | "--no-sandbox",
184 | "--disable-setuid-sandbox",
185 | "--disable-dev-shm-usage",
186 | "--disable-gpu",
187 | "--ignore-certificate-errors",
188 | "--safebrowsing-disable-auto-save",
189 | "--safebrowsing-disable-download-protection",
190 | "--password-store=basic",
191 | "--use-mock-keychain",
192 | ]
193 | )
194 |
195 | async def stop(self):
196 | """Clean up browser resources."""
197 | if self.browser:
198 | await self.browser.close()
199 | self.browser = None
200 | if self.playwright:
201 | await self.playwright.stop()
202 | self.playwright = None
203 |
204 | async def _restart_browser(self):
205 | """Restart browser if it crashes."""
206 | await self.stop()
207 | await self.start()
208 |
209 | async def capture_page(
210 | self,
211 | url: str,
212 | logger: Logger,
213 | wait_until: str = "load",
214 | timeout: int = 30000,
215 | grant_permissions: bool = True,
216 |         user_data_dir: Optional[Union[str, Path]] = None
217 | ) -> Tuple[Optional[str], Optional[str]]:
218 | """Robust page capture with PageManager integration for stability.
219 |
220 | Returns:
221 | Tuple of (screenshot_b64, text_content)
222 | """
223 |
224 |         logger.info(f"Start collecting page: {url}")
225 | # Use semaphore to limit concurrent pages
226 | async with self._page_semaphore:
227 | # Ensure browser is running
228 | if not self.browser:
229 | async with self._browser_lock:
230 | if not self.browser: # Double-check pattern
231 | await self.start()
232 |
233 | browser = self.browser # Cache reference
234 |
235 | for attempt in range(self.max_retries):
236 | context = None
237 | page_manager = None
238 | try:
239 | # Create context with enhanced settings (similar to original capture_page_content_async)
240 | user_agent = random.choice(DEFAULT_USER_AGENTS)
241 | headers = {"user-agent": user_agent}
242 |
243 | if user_data_dir:
244 | # Use persistent context if user_data_dir provided
245 | context = await self.playwright.chromium.launch_persistent_context(
246 | user_data_dir=user_data_dir,
247 | locale='en-US',
248 | headless=self.headless,
249 | ignore_https_errors=True,
250 | extra_http_headers=headers,
251 | viewport={
252 | "width": random.randint(1050, 1150),
253 | "height": random.randint(700, 800),
254 | },
255 | )
256 | else:
257 | # Regular context
258 | context = await browser.new_context(
259 | locale='en-US',
260 | ignore_https_errors=True,
261 | extra_http_headers=headers,
262 | viewport={
263 | "width": random.randint(1050, 1150),
264 | "height": random.randint(700, 800),
265 | }
266 | )
267 |
268 | # Grant permissions if requested
269 | if grant_permissions:
270 | try:
271 | await context.grant_permissions(
272 | [
273 | "geolocation",
274 | "notifications",
275 | "camera",
276 | "microphone",
277 | "clipboard-read",
278 | "clipboard-write",
279 | ],
280 | origin=url,
281 | )
282 | except Exception as e:
283 | logger.debug(f'Failed to grant permissions: {e}')
284 |
285 | # Use PageManager for robust page handling
286 | page_manager = PageManager(context, logger)
287 |
288 | # Navigate with robust error handling
289 | try:
290 | page = await page_manager.get()
291 | await page.goto(url, wait_until=wait_until, timeout=timeout)
292 | except Exception as e:
293 | logger.info(f"Navigation timeout/error (continuing): {e}")
294 |
295 | # Enhanced scrolling for content discovery (from original implementation)
296 | page = await page_manager.get()
297 | for _ in range(3):
298 | await page.keyboard.press("End")
299 | await asyncio.sleep(random.uniform(0.3, 0.8)) # Faster for batch
300 | await page.keyboard.press("Home")
301 | await asyncio.sleep(random.uniform(0.3, 0.8))
302 |
303 | # Use CDP for efficient and reliable capture
304 | page = await page_manager.get()
305 | cdp = await context.new_cdp_session(page)
306 | await cdp.send("Page.enable")
307 | await cdp.send("DOM.enable")
308 | await cdp.send("Runtime.enable")
309 |
310 | # Get proper page metrics
311 | metrics = await cdp.send("Page.getLayoutMetrics")
312 | css_vp = metrics["cssVisualViewport"]
313 | css_content = metrics["cssContentSize"]
314 | width = round(css_vp["clientWidth"])
315 | height = round(min(css_content["height"], 6000))
316 | scale = round(metrics.get("visualViewport", {}).get("scale", 1))
317 |
318 | # Set device metrics
319 | await cdp.send(
320 | "Emulation.setDeviceMetricsOverride",
321 | {
322 | "mobile": False,
323 | "width": width,
324 | "height": height,
325 | "deviceScaleFactor": scale,
326 | },
327 | )
328 |
329 | # Small delay for stability
330 | await asyncio.sleep(random.uniform(0.5, 1.0))
331 |
332 | # Capture screenshot and text using CDP
333 | screenshot_task = cdp.send(
334 | "Page.captureScreenshot",
335 | {"format": "png", "captureBeyondViewport": True},
336 | )
337 |
338 | html_task = cdp.send("Runtime.evaluate", {
339 | "expression": "document.documentElement.outerHTML",
340 | "returnByValue": True,
341 | })
342 |
343 |
344 |
345 | shot_result, html_result = await asyncio.gather(screenshot_task, html_task)
346 | screenshot_b64 = shot_result.get("data")
347 | page_html = html_result.get("result", {}).get("value", "")
348 |                     page_text = html_to_markdown(page_html)
349 |
350 |
351 |
352 | return screenshot_b64, page_text
353 |
354 | except Exception as e:
355 | logger.error(f"Attempt {attempt + 1} failed for {url}: {e}")
356 |
357 | # Check if browser crashed
358 | if ("Target page, context or browser has been closed" in str(e) or
359 | "Browser has been closed" in str(e) or
360 | "browser.newContext" in str(e)):
361 | # Browser crash - restart under lock
362 | async with self._browser_lock:
363 | if self.browser == browser:
364 | logger.warning("Browser crashed, restarting...")
365 | await self._restart_browser()
366 | browser = self.browser
367 |
368 | if attempt == self.max_retries - 1:
369 | # Last attempt failed
370 | return make_blank_png_b64(), ERROR_TEXT
371 |
372 | finally:
373 | # Cleanup resources
374 | if page_manager:
375 | page_manager.dispose()
376 | if context:
377 | try:
378 | await context.close()
379 |                         except Exception:
380 | pass
381 |
382 | return make_blank_png_b64(), ERROR_TEXT
383 |
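384 | 
385 | # --------------------------------------------------------------------------- #
386 | # Usage sketch (illustrative, not part of the original module): shows how
387 | # BatchBrowserManager is driven as an async context manager for a single
388 | # capture. The URL and logger name below are placeholders.
389 | # --------------------------------------------------------------------------- #
390 | if __name__ == "__main__":
391 |     import logging
392 | 
393 |     async def _demo() -> None:
394 |         logging.basicConfig(level=logging.INFO)
395 |         demo_logger = logging.getLogger("batch_browser_demo")
396 |         async with BatchBrowserManager(headless=True, max_concurrent_pages=2) as manager:
397 |             screenshot_b64, text = await manager.capture_page("https://example.com", demo_logger)
398 |             demo_logger.info("Captured %d characters of extracted text", len(text or ""))
399 | 
400 |     asyncio.run(_demo())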
--------------------------------------------------------------------------------
/mind2web2/utils/path_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Centralised project-relative path management for Mind2Web2.
3 |
4 | Typical usage
5 | -------------
6 | from pathlib import Path
7 | from mind2web2.utils.path_config import PathConfig
8 |
9 | project_root = Path(__file__).resolve().parents[2] # adapt as needed
10 | paths = PathConfig(project_root)
11 |
12 | # Override anything you like:
13 | paths.apply_overrides(cache_root=Path("/tmp/my_cache"))
14 |
15 | print(paths.eval_scripts_root)
16 | print(paths.default_script_for("task_001"))
17 | """
18 |
19 | from __future__ import annotations
20 |
21 | from dataclasses import dataclass, field
22 | from pathlib import Path
23 | from typing import Optional
24 |
25 |
26 | @dataclass
27 | class PathConfig:
28 | """
29 | Holds every project-relative directory in one place.
30 |
31 | All attributes are absolute `Path` objects and never contain `~`.
32 | """
33 | project_root: Path
34 |
35 | # Dataset subtree
36 | dataset_root: Path = field(init=False)
37 | answers_root: Path = field(init=False)
38 | eval_scripts_root: Path = field(init=False)
39 | tasks_root: Path = field(init=False)
40 | eval_results_root: Path = field(init=False)
41 | cache_root: Path = field(init=False)
42 |
43 | # Evaluation version
44 | eval_version: str = field(init=False)
45 |
46 | # Scripts
47 | run_eval_script: Path = field(init=False)
48 |
49 | # ------------------------------------------------------------------ #
50 | # Construction helpers
51 | # ------------------------------------------------------------------ #
52 | def __post_init__(self) -> None:
53 | self.project_root = self.project_root.expanduser().resolve()
54 |
55 | # Dataset
56 | self.dataset_root = self.project_root / "dataset"
57 | self.answers_root = self.project_root / "answers"
58 |
59 | self.eval_scripts_root = self.project_root / "eval_scripts"
60 | self.tasks_root = self.project_root / "tasks"
61 | self.eval_results_root = self.project_root / "eval_results"
62 |
63 | self.cache_root = self.project_root / "cache"
64 |
65 | # Default eval version
66 | self.eval_version = "2025_07_14"
67 |
68 | # Scripts
69 | self.run_eval_script = self.project_root / "run_eval.py"
70 |
71 |
72 | # ------------------------------------------------------------------ #
73 | # Public API
74 | # ------------------------------------------------------------------ #
75 | def default_script_for(self, task_id: str) -> Path:
76 |         """Return `<eval_scripts_root>/<eval_version>/<task_id>.py`."""
77 | return self.eval_scripts_root / self.eval_version / f"{task_id}.py"
78 |
79 | def apply_overrides(
80 | self,
81 | *,
82 | dataset_root: Optional[Path] = None,
83 | answers_root: Optional[Path] = None,
84 | eval_scripts_root: Optional[Path] = None,
85 | tasks_root: Optional[Path] = None,
86 | eval_results_root: Optional[Path] = None,
87 | cache_root: Optional[Path] = None,
88 | run_eval_script: Optional[Path] = None,
89 | eval_version: Optional[str] = None,
90 | ) -> None:
91 | """
92 | Overwrite selected directories in-place.
93 |         Path arguments are expanded (`~`) and resolved to absolute paths.
94 | """
95 | if dataset_root is not None:
96 | self.dataset_root = dataset_root.expanduser().resolve()
97 | if answers_root is not None:
98 | self.answers_root = answers_root.expanduser().resolve()
99 | if eval_scripts_root is not None:
100 | self.eval_scripts_root = eval_scripts_root.expanduser().resolve()
101 | if tasks_root is not None:
102 | self.tasks_root = tasks_root.expanduser().resolve()
103 | if eval_results_root is not None:
104 | self.eval_results_root = eval_results_root.expanduser().resolve()
105 | if cache_root is not None:
106 | self.cache_root = cache_root.expanduser().resolve()
107 | if run_eval_script is not None:
108 | self.run_eval_script = run_eval_script.expanduser().resolve()
109 | if eval_version is not None:
110 | self.eval_version = eval_version
111 |
112 | # ------------------------------------------------------------------ #
113 | # Debug helpers
114 | # ------------------------------------------------------------------ #
115 | def __repr__(self) -> str: # pragma: no cover
116 | fields = ", ".join(f"{k}={v}" for k, v in self.__dict__.items())
117 | return f"{self.__class__.__name__}({fields})"
118 |
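119 | 
120 | # Quick self-check sketch (illustrative; the task id and cache path are placeholders).
121 | # Mirrors the "Typical usage" example from the module docstring above.
122 | if __name__ == "__main__":
123 |     cfg = PathConfig(Path(__file__).resolve().parents[2])
124 |     cfg.apply_overrides(cache_root=Path("/tmp/mind2web2_cache"))
125 |     print(cfg.default_script_for("task_001"))
126 |     print(cfg.eval_results_root)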
--------------------------------------------------------------------------------
/mind2web2/utils/url_tools.py:
--------------------------------------------------------------------------------
1 | from urllib.parse import urldefrag, unquote, urlparse, parse_qs, urlencode, urlunparse
2 |
3 | import argparse
4 | import asyncio
5 | import json
6 | import os
7 | import random
8 | import re
9 | from pathlib import Path
10 | from typing import Any, Dict, List, Tuple, Optional
11 | from urllib.parse import urlparse
12 | from urllib.parse import urldefrag, unquote
13 | from pydantic import BaseModel
14 | from tqdm import tqdm
15 | import validators
16 | from urllib.parse import urldefrag, unquote, urlparse, parse_qs, urlencode, urlunparse
17 |
18 | class URLs(BaseModel):
19 | urls: List[str]
20 |
21 | def _is_valid_url(u: str) -> bool:
22 |     return validators.url(u) is True  # recent `validators` releases dropped the `public` kwarg
23 |
24 | def remove_utm_parameters(url: str) -> str:
25 | """Remove all UTM tracking parameters from URL."""
26 | from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
27 |
28 | parsed = urlparse(url)
29 |
30 | # If there are no query parameters, return original URL
31 | if not parsed.query:
32 | return url
33 |
34 | # Parse query parameters
35 | params = parse_qs(parsed.query, keep_blank_values=True)
36 |
37 | # Filter out all utm_* parameters
38 | filtered_params = {k: v for k, v in params.items() if not k.startswith('utm_')}
39 |
40 | # Reconstruct query string
41 | new_query = urlencode(filtered_params, doseq=True)
42 |
43 | # Reconstruct URL
44 | return urlunparse((
45 | parsed.scheme,
46 | parsed.netloc,
47 | parsed.path,
48 | parsed.params,
49 | new_query,
50 | parsed.fragment
51 | ))
52 |
53 |
54 |
55 | def normalize_url_simple(url: str) -> str:
56 | """Simple URL normalization for variant detection."""
57 |
58 |     url = remove_utm_parameters(url)
59 | # Remove fragment
60 | url_no_frag, _ = urldefrag(url)
61 |
62 | # Decode URL encoding
63 | decoded = unquote(url_no_frag)
64 |
65 | # Remove trailing slash (except for root)
66 | if decoded.endswith('/') and len(decoded) > 1 and not decoded.endswith('://'):
67 | decoded = decoded[:-1]
68 |
69 | # # Remove common tracking parameters
70 | # if decoded.endswith('?utm_source=chatgpt.com'):
71 | # decoded = decoded[:-len('?utm_source=chatgpt.com')]
72 |
73 |
74 | # Remove all UTM parameters
75 | parsed = urlparse(decoded)
76 | if parsed.query:
77 | params = parse_qs(parsed.query, keep_blank_values=True)
78 | # Filter out all utm_* parameters
79 | filtered_params = {k: v for k, v in params.items() if not k.startswith('utm_')}
80 | # Reconstruct query string
81 | new_query = urlencode(filtered_params, doseq=True)
82 | decoded = urlunparse((
83 | parsed.scheme,
84 | parsed.netloc,
85 | parsed.path,
86 | parsed.params,
87 | new_query,
88 | parsed.fragment
89 | ))
90 |
91 | # Normalize scheme
92 | if decoded.startswith('http://'):
93 | decoded = 'https://' + decoded[7:]
94 |
95 | # Remove www prefix for comparison
96 | if '://www.' in decoded:
97 | decoded = decoded.replace('://www.', '://')
98 |
99 | return decoded.lower()
100 |
101 |
102 |
103 | def normalize_url_for_browser(url: str) -> str:
104 |     """Normalize a URL for browser navigation: strip UTM parameters and ensure a scheme."""
105 | 
106 | 
107 |     url = remove_utm_parameters(url)
108 |     # Ensure the URL has a scheme so the browser can navigate to it
109 |     if not url.startswith(('http://', 'https://', 'ftp://')):
110 |         return f'https://{url}'
111 | return url
112 |
113 | def regex_find_urls(text: str) -> List[str]:
114 | """Enhanced regex extraction for comprehensive URL discovery."""
115 | urls = set()
116 |
117 | # 1. Standard markdown links: [text](url)
118 | urls.update(
119 | m for m in re.findall(r"\[.*?\]\((https?://[^\s)]+)\)", text)
120 | if _is_valid_url(m)
121 | )
122 |
123 | # 2. Standard full URLs with protocol
124 | urls.update(
125 | m for m in re.findall(
126 | r"\bhttps?://[A-Za-z0-9\-.]+\.[A-Za-z]{2,}(?:/[^\s<>\"'`{}|\\^\[\]]*)?\b",
127 | text
128 | )
129 | if _is_valid_url(m)
130 | )
131 |
132 | # 3. URLs without protocol (www.example.com)
133 | www_matches = re.findall(
134 | r"\bwww\.[A-Za-z0-9\-.]+\.[A-Za-z]{2,}(?:/[^\s<>\"'`{}|\\^\[\]]*)?\b",
135 | text
136 | )
137 | for match in www_matches:
138 | # Always prefer https for www domains
139 | urls.add(f"https://{match}")
140 |
141 |
142 | # 4. URLs in quotes or parentheses
143 | quote_patterns = [
144 | r'"(https?://[^"\s]+)"',
145 | r"'(https?://[^'\s]+)'",
146 | r"\((https?://[^)\s]+)\)",
147 | r"<(https?://[^>\s]+)>"
148 | ]
149 | for pattern in quote_patterns:
150 | urls.update(
151 | m for m in re.findall(pattern, text)
152 | if _is_valid_url(m)
153 | )
154 |
155 | # Clean URLs by removing trailing punctuation
156 | cleaned_urls = set()
157 | for url in urls:
158 | # Remove trailing punctuation that might be captured accidentally
159 | cleaned_url = re.sub(r'[.,;:!?\)\]}>"\'\u201d\u201c]*$', '', url)
160 | if cleaned_url and _is_valid_url(cleaned_url):
161 | cleaned_urls.add(cleaned_url)
162 |
163 | return list(cleaned_urls)
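164 | 
165 | 
166 | # Illustrative sanity checks (not part of the original module); the URLs below are
167 | # made-up examples of what each helper is intended to do.
168 | if __name__ == "__main__":
169 |     messy = "http://www.Example.com/Docs/?q=test&utm_source=chatgpt.com#intro"
170 |     print(normalize_url_simple(messy))                     # utm/fragment/scheme/www noise stripped, lower-cased
171 |     print(normalize_url_for_browser("example.com/page"))   # scheme added for navigation
172 |     print(regex_find_urls("See [docs](https://example.com/docs) and www.example.org."))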
--------------------------------------------------------------------------------
/mind2web2/verification_tree.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import sys
3 | from enum import Enum
4 | from typing import List, Literal, Optional
5 | from pydantic import BaseModel, Field, validator
6 | from pydantic import field_validator
7 | from .utils.misc import extract_doc_description_from_frame
8 | from pydantic import PrivateAttr
9 |
10 | class AggregationStrategy(str, Enum):
11 | """How a parent node combines its children."""
12 | PARALLEL = "parallel"
13 | SEQUENTIAL = "sequential"
14 |
15 |
16 | class VerificationNode(BaseModel):
17 | """One evaluation item in a rubric tree."""
18 |
19 | # Core data
20 | id: str
21 | desc: str
22 | critical: bool = False
23 | score: float = 0.0
24 |     status: Literal["passed", "failed", "partial", "skipped", "initialized"] = "initialized"
25 | strategy: AggregationStrategy = AggregationStrategy.PARALLEL
26 | children: List["VerificationNode"] = Field(default_factory=list)
27 |
28 |
29 | # Provenance (optional)
30 | # func: Optional[str] = None
31 | # line: Optional[int] = None
32 | # doc: Optional[str] = None
33 |
34 | _cached_score: Optional[float] = PrivateAttr(default=None)
35 |
36 | # Backward compatibility
37 | @property
38 | def claim(self) -> str:
39 | """Backward compatibility property."""
40 | return self.desc
41 |
42 | @claim.setter
43 | def claim(self, value: str) -> None:
44 | """Backward compatibility setter."""
45 | self.desc = value
46 |
47 | # Validators
48 | @validator("score")
49 | def _score_in_range(cls, v: float) -> float:
50 | assert 0.0 <= v <= 1.0, "Score must lie in [0.0, 1.0]"
51 | return v
52 |
53 | @validator("status")
54 | def _status_matches_score(cls, v: str, values):
55 | score = values.get("score")
56 | if score is None:
57 | return v
58 | if v == "passed":
59 | assert score == 1.0
60 | elif v == "partial":
61 | assert 0.0 < score < 1.0
62 | elif v in ("failed", "skipped"):
63 | assert score == 0.0
64 | return v
65 |
66 | def model_post_init(self, __context: Optional[dict] = None) -> None:
67 | """Capture caller frame for provenance."""
68 | try:
69 | frame = sys._getframe(2)
70 | # self.func = frame.f_code.co_name
71 | # self.line = frame.f_lineno
72 | # self.doc = extract_doc_description_from_frame(frame)
73 | except Exception:
74 | pass
75 |
76 | def _validate_critical_consistency(self, node: VerificationNode, parent: VerificationNode) -> None:
77 | """
78 | Validate the consistency constraint for critical nodes:
79 | If the parent node is critical, then all its child nodes must also be critical.
80 | """
81 | if parent.critical and not node.critical:
82 | raise ValueError(
83 | f"Critical node '{parent.id}' cannot have non-critical child '{node.id}'. "
84 | f"All children of critical nodes must also be critical."
85 | )
86 |
87 | # Public API
88 | def add_node(self, node: "VerificationNode") -> None:
89 | """Append node as a child."""
90 | assert isinstance(node, VerificationNode), "Child must be a VerificationNode"
91 | assert node is not self, "A node cannot be its own child"
92 |
93 | # Validate critical node consistency
94 | if self.critical:
95 | self._validate_critical_consistency(node, self)
96 |
97 | self.children.append(node)
98 |
99 | # Aggregation logic
100 | @property
101 | def aggregated_score(self) -> float:
102 | if self._cached_score is None:
103 | self.compute_score(mutate=True)
104 | return self._cached_score
105 |
106 | def compute_score(self, *, mutate: bool = False) -> float:
107 | """
108 |         Pure score calculation: always returns the computed score. With `mutate=False`,
109 |         no state is written; with `mutate=True`, score/status are written back as well.
110 | """
111 | # -------- 1. Leaf ----------
112 | if not self.children:
113 | raw_score = self.score # leaf.score is already 0/1
114 | final_status = self.status
115 | # Optional: validate leaf legality
116 | else:
117 | # -------- 2. Recursively compute each child (mutate is passed recursively) ----------
118 | child_scores = [c.compute_score(mutate=mutate) for c in self.children]
119 |
120 | # -------- 3. Sequential short-circuit (no longer directly modifies child) ----------
121 | if self.strategy is AggregationStrategy.SEQUENTIAL:
122 | valid_until = next(
123 | (idx for idx, s in enumerate(child_scores) if s < 1.0),
124 | len(child_scores)
125 | )
126 | if mutate and valid_until < len(child_scores):
127 | for c in self.children[valid_until + 1:]:
128 | c.score, c.status = 0.0, "skipped"
129 | c._cached_score = 0.0
130 | child_scores = child_scores[:valid_until + 1] + [0] * (len(child_scores) - valid_until - 1)
131 |
132 | # -------- 4. Gate-then-Average ----------
133 | crit = [s for s, c in zip(child_scores, self.children) if c.critical]
134 | soft = [s for s, c in zip(child_scores, self.children) if not c.critical]
135 |
136 | if crit and any(s < 1.0 for s in crit):
137 | raw_score = 0.0
138 | elif crit and not soft:
139 | raw_score = 1.0
140 | else:
141 | raw_score = sum(soft) / len(soft) if soft else 1.0
142 |
143 | # status deduction (no longer writes child)
144 | if raw_score == 1.0:
145 | final_status = "passed"
146 | elif raw_score == 0.0:
147 | final_status = "failed" if any(c.status == "failed" for c in self.children) else "skipped"
148 | else:
149 | final_status = "partial"
150 |
151 | # -------- 5. Side-effect write-back / cache ----------
152 | if mutate:
153 | self.score = raw_score
154 | self.status = final_status
155 | self._cached_score = raw_score
156 | return raw_score
157 |
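158 | 
159 | # Worked example (illustrative; ids and descriptions are placeholders): one critical
160 | # "gate" child plus two non-critical children. The gate passes, so the parent score
161 | # is the average of the non-critical children.
162 | if __name__ == "__main__":
163 |     root = VerificationNode(id="root", desc="Answer satisfies the task")
164 |     root.add_node(VerificationNode(id="url_ok", desc="Cited URL is reachable",
165 |                                    critical=True, score=1.0, status="passed"))
166 |     root.add_node(VerificationNode(id="fact_1", desc="Claim 1 is supported",
167 |                                    score=1.0, status="passed"))
168 |     root.add_node(VerificationNode(id="fact_2", desc="Claim 2 is supported",
169 |                                    score=0.0, status="failed"))
170 |     print(f"aggregated score: {root.compute_score(mutate=True):.2f}")  # 0.50 -> status 'partial'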
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | # pyproject.toml ── Located at the root of Mind2Web2/
2 | [build-system]
3 | requires = ["setuptools>=64", "wheel"]
4 | build-backend = "setuptools.build_meta"
5 |
6 | [project]
7 | name = "mind2web2"
8 | version = "0.1.0"
9 | description = "Mind2Web2: tools, agents and code generation utilities for web-based reasoning benchmarks"
10 | readme = "README.md"
11 | requires-python = ">=3.11"
12 |
13 | authors = [
14 | { name = "Boyu Gou", email = "gou.43@osu.edu" }
15 | ]
16 |
17 | # ── Runtime dependencies ───────────────────────────────────────────────
18 | dependencies = [
19 | "openai",
20 | "backoff",
21 | "pydantic>=1.10", # If already migrated to v2, change to pydantic>=2
22 | "tqdm",
23 | "pandas>=1.4",
24 | "playwright~=1.42",
25 | "arxiv",
26 | "googlemaps",
27 | "aiohttp",
28 | "httpx",
29 | "dill",
30 | "python-json-logger",
31 | # "pyside6",
32 | "beautifulsoup4",
33 | "PyMuPDF",
34 | "google-auth",
35 | "google-api-python-client",
36 | "gspread",
37 | "fastapi",
38 | "jinja2",
39 | "markdown",
40 | "uvicorn[standard]",
41 | "markdownify",
42 | "html2text>=2025.4.15",
43 | "pyside6>=6.9.2",
44 | "pillow>=11.3.0",
45 | "rebrowser-playwright>=1.52.0",
46 | "validators>=0.35.0",
47 | ]
48 |
49 | # ── Optional: Code generation dependencies ─────────────────────────────
50 | [project.optional-dependencies]
51 | code-gen = [
52 | "anthropic[bedrock]" # Only required for code generation users
53 | ]
54 |
55 | # ── setuptools settings ────────────────────────────────────────────────
56 | [tool.setuptools]
57 | include-package-data = true # Include non-.py files in the package
58 |
59 | [tool.setuptools.packages.find]
60 | where = ["."]
61 | include = ["mind2web2*"]
62 | exclude = ["code_gen*", "InfoVisualizer*", "dataset*", "eval_scripts*", "scripts*"]
63 |
--------------------------------------------------------------------------------
/run_cache_manager.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | Entry point script for Cache Manager
4 |
5 | Run this script to start the modern PySide6-based cache manager.
6 | """
7 |
8 | import sys
9 | from pathlib import Path
10 |
11 | # Add the project root to Python path
12 | project_root = Path(__file__).parent
13 | sys.path.insert(0, str(project_root))
14 |
15 | if __name__ == "__main__":
16 | from cache_manager.main import cli_main
17 | sys.exit(cli_main())
--------------------------------------------------------------------------------
/run_eval.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 |
3 | import argparse
4 | import asyncio
5 | import logging
6 | from pathlib import Path
7 | from typing import List, Dict, Any
8 |
9 | from tqdm import tqdm
10 |
11 | from mind2web2.eval_runner import evaluate_task, merge_all_results
12 | from mind2web2.llm_client.base_client import LLMClient
13 | from mind2web2.utils.path_config import PathConfig
14 |
15 |
16 | # --------------------------------------------------------------------------- #
17 | # CLI #
18 | # --------------------------------------------------------------------------- #
19 |
20 |
21 | def build_parser() -> argparse.ArgumentParser:
22 | p = argparse.ArgumentParser(description="Run Mind2Web2 task evaluation.")
23 |
24 | # Task specification
25 | p.add_argument("--task_id", help="Task folder name (if not provided, evaluates all tasks)")
26 | p.add_argument("--agent_name", required=True, help="Agent name for evaluation")
27 |
28 | # Required path
29 |     p.add_argument("--answer_folder", type=Path, required=True,
30 | help="Directory containing answer files (required)")
31 |
32 | # Optional path overrides
33 | p.add_argument("--eval_scripts_root", type=Path,
34 | help="Override evaluation scripts directory")
35 | p.add_argument("--eval_results_root", type=Path,
36 | help="Override output directory for results/logs")
37 | p.add_argument("--cache_root", type=Path,
38 | help="Override cache directory")
39 | p.add_argument("--eval_version", default="2025_07_14",
40 | help="Version of evaluation scripts to use (default: 2025_07_14)")
41 |
42 | # LLM configuration
43 | p.add_argument("--llm_provider", choices=["openai", "azure_openai"],
44 | default="openai", help="LLM provider to use")
45 |
46 | # Runtime options - Concurrency control
47 | p.add_argument("--max_concurrent_tasks", type=int, default=3,
48 |                    help="Maximum number of tasks to evaluate concurrently (default: 3)")
49 | p.add_argument("--max_concurrent_answers", type=int, default=3,
50 | help="Maximum number of answers to evaluate concurrently per task (default: 3)")
51 | p.add_argument("--max_webpage_retrieval", type=int, default=10,
52 |                    help="Maximum number of concurrent webpage retrieval operations (Playwright) (default: 10)")
53 | p.add_argument("--max_llm_requests", type=int, default=30,
54 | help="Maximum number of concurrent LLM API requests (default: 30)")
55 |
56 | # Other runtime options
57 |     p.add_argument("--dump_cache", action=argparse.BooleanOptionalAction, default=True,
58 |                    help="Persist cache to disk at the end (default: True; pass --no-dump_cache to disable)")
59 | p.add_argument("--self_debug", action="store_true",
60 | help="Add *_debug suffix to logs / result files")
61 | p.add_argument("--overwrite", action="store_true",
62 | help="Overwrite existing results")
63 | return p
64 |
65 |
66 | # --------------------------------------------------------------------------- #
67 | # Helpers #
68 | # --------------------------------------------------------------------------- #
69 |
70 |
71 | async def evaluate_single_task(
72 | task_id: str,
73 | agent_name: str,
74 | client: LLMClient,
75 | paths: PathConfig,
76 | args: argparse.Namespace,
77 | webpage_semaphore: asyncio.Semaphore,
78 | llm_semaphore: asyncio.Semaphore
79 | ) -> List[Dict[str, Any]]:
80 | """Evaluate a single task."""
81 | # Resolve evaluation script
82 | script_path = paths.default_script_for(task_id)
83 | if not script_path.exists():
84 | logging.error(f"Evaluation script not found: {script_path}")
85 | return []
86 |
87 | # Invoke evaluation with proper concurrency controls
88 | return await evaluate_task(
89 | client=client,
90 | task_id=task_id,
91 | agent_name=agent_name,
92 | answer_dir=paths.answers_root,
93 | cache_dir=paths.cache_root,
94 | output_dir=paths.eval_results_root,
95 | script_path=script_path,
96 | dump_cache=args.dump_cache,
97 | is_self_debug=args.self_debug,
98 | overwrite=args.overwrite,
99 | max_concurrent_answers=args.max_concurrent_answers,
100 | webpage_semaphore=webpage_semaphore,
101 | llm_semaphore=llm_semaphore,
102 | )
103 |
104 |
105 | async def evaluate_all_tasks(
106 | agent_name: str,
107 | client: LLMClient,
108 | paths: PathConfig,
109 | args: argparse.Namespace,
110 | webpage_semaphore: asyncio.Semaphore,
111 | llm_semaphore: asyncio.Semaphore
112 | ) -> Dict[str, List[Dict[str, Any]]]:
113 | """Evaluate all tasks based on available answers for the specified agent."""
114 | results = {}
115 |
116 | # Find all task directories in the agent's answers folder
117 | agent_dir = paths.answers_root / agent_name
118 | if not agent_dir.exists():
119 | logging.error(f"Agent directory not found: {agent_dir}")
120 | return results
121 |
122 | # Get all task directories (subdirectories in agent folder)
123 | task_dirs = [d for d in agent_dir.iterdir() if d.is_dir()]
124 | if not task_dirs:
125 | logging.warning(f"No task directories found in {agent_dir}")
126 | return results
127 |
128 | # Verify that corresponding eval scripts exist for each task
129 | available_tasks = []
130 | for task_dir in task_dirs:
131 | task_id = task_dir.name
132 | script_path = paths.default_script_for(task_id)
133 | if script_path.exists():
134 | available_tasks.append(task_id)
135 | else:
136 | logging.warning(f"No evaluation script found for task {task_id} at {script_path}")
137 |
138 | if not available_tasks:
139 |         logging.warning("No tasks with both answers and evaluation scripts found")
140 | return results
141 |
142 | logging.info(f"Found {len(available_tasks)} tasks with answers for agent '{agent_name}'")
143 | logging.info(
144 | f"Concurrency: {args.max_concurrent_tasks} tasks, {args.max_concurrent_answers} answers/task, {args.max_webpage_retrieval} webpage ops, {args.max_llm_requests} LLM requests")
145 |
146 | # Create a semaphore to limit concurrent task evaluations
147 | task_semaphore = asyncio.Semaphore(args.max_concurrent_tasks)
148 |
149 | async def evaluate_task_with_semaphore(current_task_id: str) -> tuple[str, List[Dict[str, Any]]]:
150 | """Evaluate a single task with semaphore control."""
151 | async with task_semaphore:
152 | try:
153 | logging.info(f"🚀 Starting evaluation for task: {current_task_id}")
154 | current_results = await evaluate_single_task(
155 | task_id=current_task_id,
156 | agent_name=agent_name,
157 | client=client,
158 | paths=paths,
159 | args=args,
160 | webpage_semaphore=webpage_semaphore,
161 | llm_semaphore=llm_semaphore
162 | )
163 | if current_results:
164 | logging.info(f"✅ Task {current_task_id}: {len(current_results)} results")
165 | else:
166 | logging.warning(f"⚠️ Task {current_task_id}: No results")
167 | return current_task_id, current_results
168 | except Exception as e:
169 | logging.error(f"❌ Failed to evaluate task {current_task_id}: {e}")
170 | return current_task_id, []
171 |
172 | # Create tasks for all evaluations
173 | tasks = []
174 | for task_id in available_tasks:
175 | tasks.append(evaluate_task_with_semaphore(task_id))
176 |
177 | # Run all tasks concurrently with progress bar
178 | logging.info(f"🏃 Starting concurrent evaluation of {len(tasks)} tasks")
179 |
180 | # Use tqdm to show progress
181 | completed = 0
182 | with tqdm(total=len(tasks), desc="Evaluating tasks", unit="task") as pbar:
183 | for coro in asyncio.as_completed(tasks):
184 | task_id, task_results = await coro
185 | results[task_id] = task_results
186 | completed += 1
187 | pbar.update(1)
188 | pbar.set_postfix({"completed": f"{completed}/{len(tasks)}"})
189 |
190 | return results
191 |
192 |
193 | async def run_evaluation(args: argparse.Namespace, paths: PathConfig):
194 | """Main evaluation runner."""
195 | # Build async client
196 | client = LLMClient(provider=args.llm_provider, is_async=True)
197 |
198 | # Create separate semaphores for webpage retrieval and LLM requests
199 | webpage_semaphore = asyncio.Semaphore(args.max_webpage_retrieval)
200 | llm_semaphore = asyncio.Semaphore(args.max_llm_requests)
201 |
202 | if args.task_id:
203 | # Evaluate single task
204 | logging.info(f"Evaluating single task: {args.task_id}")
205 | results = await evaluate_single_task(
206 | task_id=args.task_id,
207 | agent_name=args.agent_name,
208 | client=client,
209 | paths=paths,
210 | args=args,
211 | webpage_semaphore=webpage_semaphore,
212 | llm_semaphore=llm_semaphore
213 | )
214 | return {args.task_id: results}
215 | else:
216 | # Evaluate all tasks
217 | logging.info("Evaluating all tasks")
218 | return await evaluate_all_tasks(
219 | agent_name=args.agent_name,
220 | client=client,
221 | paths=paths,
222 | args=args,
223 | webpage_semaphore=webpage_semaphore,
224 | llm_semaphore=llm_semaphore
225 | )
226 |
227 |
228 | # --------------------------------------------------------------------------- #
229 | # Entrypoint #
230 | # --------------------------------------------------------------------------- #
231 |
232 |
233 | def main() -> None:
234 | logging.basicConfig(
235 | level=logging.INFO,
236 | format="%(asctime)s - %(levelname)s - %(message)s",
237 | datefmt="%Y-%m-%d %H:%M:%S"
238 | )
239 |
240 | # Initialize paths
241 | project_root = Path(__file__).resolve().parent
242 | paths = PathConfig(project_root)
243 |
244 | # Parse arguments
245 | args = build_parser().parse_args()
246 |
247 | # Apply path overrides
248 | paths.apply_overrides(
249 | answers_root=args.answer_folder,
250 | eval_scripts_root=args.eval_scripts_root,
251 | eval_results_root=args.eval_results_root,
252 | cache_root=args.cache_root,
253 | eval_version=args.eval_version,
254 | )
255 |
256 | # Validate answer folder structure
257 | agent_dir = paths.answers_root / args.agent_name
258 | if not agent_dir.exists():
259 | logging.error(f"Agent directory not found: {agent_dir}")
260 |         logging.error(f"Expected structure: {paths.answers_root}/<agent_name>/<task_id>/answer_*.md")
261 | return
262 |
263 | logging.info("=" * 60)
264 | logging.info("Mind2Web2 Evaluation Runner")
265 | logging.info("=" * 60)
266 | logging.info(f"Agent: {args.agent_name}")
267 | logging.info(f"Answer folder: {paths.answers_root}")
268 | logging.info(f"Eval scripts root: {paths.eval_scripts_root}")
269 | logging.info(f"Eval results root: {paths.eval_results_root}")
270 | logging.info(f"Cache root: {paths.cache_root}")
271 | logging.info(f"LLM Provider: {args.llm_provider}")
272 | logging.info("Concurrency Settings:")
273 | if not args.task_id:
274 | logging.info(f" • Max concurrent tasks: {args.max_concurrent_tasks}")
275 | logging.info(f" • Max concurrent answers per task: {args.max_concurrent_answers}")
276 | logging.info(f" • Max concurrent webpage retrieval (global): {args.max_webpage_retrieval}")
277 | logging.info(f" • Max concurrent LLM requests (global): {args.max_llm_requests}")
278 | logging.info("=" * 60)
279 |
280 | # Run async evaluation
281 | results = asyncio.run(run_evaluation(args, paths))
282 |
283 | # Log summary
284 | logging.info("=" * 60)
285 | logging.info("Evaluation Summary")
286 | logging.info("=" * 60)
287 |
288 | if args.task_id:
289 | task_results = results.get(args.task_id, [])
290 | logging.info(f"Task {args.task_id}: {len(task_results)} results")
291 | for res in task_results:
292 | score = res.get('final_score', 'N/A')
293 | answer = res.get('answer_name', 'unknown')
294 | logging.info(f" - {answer}: score={score}")
295 | else:
296 | total_results = sum(len(r) for r in results.values())
297 | logging.info(f"Evaluated {len(results)} tasks with {total_results} total results")
298 | for task_id, task_results in sorted(results.items()):
299 | if task_results:
300 | avg_score = sum(r.get('final_score', 0) for r in task_results) / len(task_results)
301 | logging.info(f" - {task_id}: {len(task_results)} results, avg_score={avg_score:.2f}")
302 | else:
303 | logging.info(f" - {task_id}: No results")
304 |
305 | # Merge all results if evaluating all tasks
306 | if not args.task_id and results:
307 | logging.info("=" * 60)
308 | logging.info("Merging all results...")
309 | merge_all_results(paths.eval_results_root)
310 | logging.info("✅ Results merged successfully")
311 |
312 | logging.info("=" * 60)
313 | logging.info("🎉 Evaluation completed!")
314 |
315 |
316 | if __name__ == "__main__":
317 | main()
318 |
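319 | 
320 | # Example invocations (illustrative agent/task names and paths):
321 | #
322 | #   # Evaluate a single task for one agent
323 | #   python run_eval.py --agent_name example_agent --answer_folder answers --task_id task_001
324 | #
325 | #   # Evaluate every task that has both answers and an evaluation script
326 | #   python run_eval.py --agent_name example_agent --answer_folder answers --llm_provider openai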
--------------------------------------------------------------------------------