├── .gitattributes ├── .gitignore ├── AgentBench.old ├── configs │ ├── agents │ │ ├── do_nothing.yaml │ │ └── tgi_clients │ │ │ ├── AgentLM-13b.yaml │ │ │ ├── AgentLM-70b.yaml │ │ │ └── AgentLM-7b.yaml │ └── tasks │ │ ├── alfworld │ │ ├── dev.yaml │ │ └── std.yaml │ │ ├── card_game │ │ ├── dev.yaml │ │ ├── ext.yaml │ │ └── std.yaml │ │ ├── dbbench │ │ ├── dev.yaml │ │ └── std.yaml │ │ ├── knowledgegraph │ │ ├── dev.yaml │ │ └── std.yaml │ │ ├── mind2web │ │ ├── dev.yaml │ │ └── std.yaml │ │ ├── os_interaction │ │ ├── dev.yaml │ │ └── std.yaml │ │ └── webshop │ │ ├── dev.yaml │ │ └── std.yaml ├── data │ ├── alfworld │ │ ├── dev.json │ │ └── std.json │ ├── dbbench │ │ ├── dev.jsonl │ │ └── standard.jsonl │ ├── knowledgegraph │ │ ├── dev.json │ │ └── std.json │ ├── mind2web │ │ └── prompt │ │ │ ├── llm_prompt.json │ │ │ └── llm_prompt_cot.json │ └── os_interaction │ │ ├── data │ │ ├── 1 │ │ │ └── stock.json │ │ ├── 2 │ │ │ └── environment.json │ │ ├── 3 │ │ │ └── ac.json │ │ ├── 4 │ │ │ ├── N11.json │ │ │ ├── N225.json │ │ │ ├── N37.json │ │ │ ├── N4.json │ │ │ ├── N41.json │ │ │ ├── Q09.json │ │ │ ├── Q19.json │ │ │ ├── Q30.json │ │ │ ├── Q47.json │ │ │ └── Q49.json │ │ ├── 5 │ │ │ └── new.json │ │ ├── 6 │ │ │ └── new.json │ │ ├── 7 │ │ │ └── 2023-06-07-00-13-14.json │ │ ├── 6-backup.json │ │ └── dev.json │ │ ├── res │ │ └── dockerfiles │ │ │ ├── default │ │ │ ├── packages │ │ │ └── ubuntu │ │ └── scripts │ │ ├── 1 │ │ ├── check │ │ │ ├── 1.sh │ │ │ ├── containing.py │ │ │ ├── in.py │ │ │ ├── integer-match.py │ │ │ ├── size-match.py │ │ │ └── string-match.py │ │ ├── example │ │ │ └── 1.sh │ │ └── init │ │ │ ├── gen_words.sh │ │ │ ├── install_nettools.sh │ │ │ ├── nested_folders.sh │ │ │ └── stock-log.sh │ │ ├── 2 │ │ ├── check │ │ │ ├── 1.sh │ │ │ ├── containing.py │ │ │ ├── in.py │ │ │ ├── integer-match.py │ │ │ ├── size-match.py │ │ │ └── string-match.py │ │ ├── example │ │ │ └── 1.sh │ │ └── init │ │ │ ├── gen_words.sh │ │ │ ├── install_nettools.sh │ │ │ ├── 
nested_folders.sh │ │ │ └── stock-log.sh │ │ ├── 3 │ │ ├── check │ │ │ ├── 1.sh │ │ │ ├── containing.py │ │ │ ├── in.py │ │ │ ├── integer-match.py │ │ │ ├── size-match.py │ │ │ └── string-match.py │ │ ├── example │ │ │ └── 1.sh │ │ └── init │ │ │ ├── gen_words.sh │ │ │ ├── install_nettools.sh │ │ │ ├── nested_folders.sh │ │ │ └── stock-log.sh │ │ ├── 4 │ │ ├── check │ │ │ ├── 1.sh │ │ │ ├── containing.py │ │ │ ├── in.py │ │ │ ├── integer-match.py │ │ │ ├── size-match.py │ │ │ └── string-match.py │ │ ├── example │ │ │ └── 1.sh │ │ └── init │ │ │ ├── gen_words.sh │ │ │ ├── install_nettools.sh │ │ │ ├── nested_folders.sh │ │ │ └── stock-log.sh │ │ ├── 5 │ │ ├── check │ │ │ ├── containing.py │ │ │ ├── in.py │ │ │ ├── integer-match.py │ │ │ ├── size-match.py │ │ │ └── string-match.py │ │ ├── checking │ │ │ ├── 0.sh │ │ │ ├── 1.sh │ │ │ └── 2.sh │ │ ├── example │ │ │ ├── 0.sh │ │ │ ├── 1.sh │ │ │ └── 2.sh │ │ ├── file.json │ │ ├── init │ │ │ └── 1.sh │ │ ├── new.json │ │ └── prompt.md │ │ ├── 6 │ │ ├── collected.json │ │ ├── collecting.py │ │ ├── new.json │ │ ├── permission.json │ │ └── stackoverflow │ │ │ └── spider.py │ │ ├── 7 │ │ ├── check │ │ │ ├── 1.sh │ │ │ ├── containing.py │ │ │ ├── in.py │ │ │ ├── integer-match.py │ │ │ ├── size-match.py │ │ │ └── string-match.py │ │ ├── example │ │ │ └── 1.sh │ │ └── init │ │ │ ├── gen_words.sh │ │ │ ├── install_nettools.sh │ │ │ ├── nested_folders.sh │ │ │ └── stock-log.sh │ │ └── dev │ │ ├── check │ │ ├── 0.sh │ │ ├── containing.py │ │ ├── in.py │ │ ├── integer-match.py │ │ ├── size-match.py │ │ └── string-match.py │ │ ├── example │ │ └── 0.sh │ │ └── init │ │ └── stock-log.sh ├── eval.sh ├── eval │ ├── AgentLM-13b-eval-all.sh │ ├── AgentLM-70b-eval-all.sh │ ├── AgentLM-7b-eval-all.sh │ └── single-task │ │ ├── alfworld.sh │ │ ├── card.sh │ │ ├── db.sh │ │ ├── eval_single_setup.sh │ │ ├── kg.sh │ │ ├── mind2web.sh │ │ ├── os.sh │ │ └── webshop.sh ├── evaluate.py ├── readme-zh.md ├── readme.md ├── requirements.txt └── src │ 
├── __init__.py │ ├── agent.py │ ├── agents │ ├── __init__.py │ ├── do_nothing_agent.py │ └── tgi_client.py │ ├── configs.py │ ├── task.py │ ├── tasks │ ├── __init__.py │ ├── alfworld │ │ ├── __init__.py │ │ ├── configs │ │ │ └── base_config.yaml │ │ ├── environment.py │ │ ├── prompts │ │ │ ├── alfworld_multiturn.json │ │ │ ├── alfworld_multiturn_cot.json │ │ │ ├── alfworld_multiturn_new.json │ │ │ ├── alfworld_multiturn_plan_first.json │ │ │ ├── alfworld_multiturn_pure.json │ │ │ ├── alfworld_multiturn_react.json │ │ │ └── alfworld_singleturn.json │ │ ├── task.py │ │ └── utils.py │ ├── card_game │ │ ├── AI_SDK │ │ │ ├── C++ │ │ │ │ ├── Action.cpp │ │ │ │ ├── Action.hpp │ │ │ │ ├── Action_sample.cpp │ │ │ │ ├── Makefile │ │ │ │ ├── main.cpp │ │ │ │ └── sdk │ │ │ │ │ ├── ai_client.hpp │ │ │ │ │ └── jsoncpp │ │ │ │ │ ├── json │ │ │ │ │ ├── json-forwards.h │ │ │ │ │ └── json.h │ │ │ │ │ └── jsoncpp.cpp │ │ │ └── Python │ │ │ │ ├── AI_Cn.py │ │ │ │ ├── AI_Cn_en.py │ │ │ │ ├── AI_En.py │ │ │ │ ├── Action.py │ │ │ │ ├── action1.py │ │ │ │ ├── action2.py │ │ │ │ ├── action3.py │ │ │ │ ├── basline1.py │ │ │ │ ├── basline2.py │ │ │ │ ├── basline3.py │ │ │ │ ├── client.py │ │ │ │ ├── main.py │ │ │ │ ├── prompt │ │ │ │ ├── cn.py │ │ │ │ ├── cn_en.py │ │ │ │ └── en.py │ │ │ │ ├── sdk │ │ │ │ ├── CMakeLists.txt │ │ │ │ ├── ai_client.cpython-310-x86_64-linux-gnu.so │ │ │ │ ├── ai_client.cpython-38-x86_64-linux-gnu.so │ │ │ │ ├── ai_client.hpp │ │ │ │ ├── ai_client.pyi │ │ │ │ ├── build │ │ │ │ │ ├── CMakeCache.txt │ │ │ │ │ ├── CMakeFiles │ │ │ │ │ │ ├── 3.25.0 │ │ │ │ │ │ │ ├── CMakeCXXCompiler.cmake │ │ │ │ │ │ │ ├── CMakeDetermineCompilerABI_CXX.bin │ │ │ │ │ │ │ ├── CMakeSystem.cmake │ │ │ │ │ │ │ └── CompilerIdCXX │ │ │ │ │ │ │ │ ├── CMakeCXXCompilerId.cpp │ │ │ │ │ │ │ │ └── a.out │ │ │ │ │ │ ├── CMakeDirectoryInformation.cmake │ │ │ │ │ │ ├── CMakeOutput.log │ │ │ │ │ │ ├── Makefile.cmake │ │ │ │ │ │ ├── Makefile2 │ │ │ │ │ │ ├── TargetDirectories.txt │ │ │ │ │ │ ├── 
ai_client.dir │ │ │ │ │ │ │ ├── DependInfo.cmake │ │ │ │ │ │ │ ├── build.make │ │ │ │ │ │ │ ├── cmake_clean.cmake │ │ │ │ │ │ │ ├── compiler_depend.make │ │ │ │ │ │ │ ├── compiler_depend.ts │ │ │ │ │ │ │ ├── depend.make │ │ │ │ │ │ │ ├── flags.make │ │ │ │ │ │ │ ├── jsoncpp │ │ │ │ │ │ │ │ ├── jsoncpp.cpp.o │ │ │ │ │ │ │ │ └── jsoncpp.cpp.o.d │ │ │ │ │ │ │ ├── link.txt │ │ │ │ │ │ │ ├── progress.make │ │ │ │ │ │ │ ├── py_ai_sdk.cpp.o │ │ │ │ │ │ │ └── py_ai_sdk.cpp.o.d │ │ │ │ │ │ ├── cmake.check_cache │ │ │ │ │ │ └── progress.marks │ │ │ │ │ ├── Makefile │ │ │ │ │ ├── bin │ │ │ │ │ │ └── ai_client.cpython-310-x86_64-linux-gnu.so │ │ │ │ │ └── cmake_install.cmake │ │ │ │ ├── jsoncpp │ │ │ │ │ ├── json │ │ │ │ │ │ ├── json-forwards.h │ │ │ │ │ │ └── json.h │ │ │ │ │ └── jsoncpp.cpp │ │ │ │ ├── py_ai_sdk.cpp │ │ │ │ ├── py_json_cast.hpp │ │ │ │ └── pyd │ │ │ │ │ ├── ai_client.cp38-win_amd64.pyd │ │ │ │ │ ├── ai_client.cpython-310-x86_64-linux-gnu.so │ │ │ │ │ ├── ai_client.cpython-36m-x86_64-linux-gnu.so │ │ │ │ │ ├── ai_client.cpython-38-darwin.so │ │ │ │ │ └── ai_client.cpython-38-x86_64-linux-gnu.so │ │ │ │ └── test.py │ │ ├── Tools │ │ │ └── player.py │ │ ├── __init__.py │ │ ├── judger │ │ │ ├── cal_metric.py │ │ │ ├── judger.py │ │ │ ├── rserver.py │ │ │ └── run_all.py │ │ ├── logic │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ ├── bin │ │ │ │ └── main │ │ │ └── src │ │ │ │ ├── aqua_sdk.cpp │ │ │ │ ├── fish.cpp │ │ │ │ ├── fish.h │ │ │ │ ├── fishset.cpp │ │ │ │ ├── fishset.h │ │ │ │ ├── game.cpp │ │ │ │ ├── game.h │ │ │ │ ├── jsoncpp │ │ │ │ ├── json │ │ │ │ │ ├── json-forwards.h │ │ │ │ │ └── json.h │ │ │ │ └── jsoncpp.cpp │ │ │ │ ├── main │ │ │ │ ├── main.cpp │ │ │ │ ├── player.cpp │ │ │ │ ├── player.h │ │ │ │ └── timer.h │ │ ├── server.py │ │ ├── task.py │ │ └── utils.py │ ├── composite_task.py │ ├── dbbench │ │ ├── Interaction.py │ │ ├── __init__.py │ │ └── requirements.txt │ ├── example_task.py │ ├── knowledgegraph │ │ ├── __init__.py │ │ ├── api.py │ │ ├── 
ontology │ │ │ ├── fb_roles │ │ │ ├── fb_types │ │ │ ├── reverse_properties │ │ │ └── vocab.json │ │ ├── requirements.txt │ │ ├── task.py │ │ └── utils │ │ │ ├── logic_form_util.py │ │ │ ├── semparse_util.py │ │ │ └── sparql_executer.py │ ├── mind2web │ │ ├── __init__.py │ │ ├── data_utils │ │ │ ├── __init__.py │ │ │ └── dom_utils.py │ │ ├── dataloader.py │ │ └── task.py │ ├── os_interaction │ │ ├── __init__.py │ │ ├── images.py │ │ ├── requirements.txt │ │ └── task.py │ └── webshop │ │ ├── .github │ │ ├── ISSUE_TEMPLATE.md │ │ ├── PULL_REQUEST_TEMPLATE.md │ │ └── workflows │ │ │ └── pytest.yml │ │ ├── .gitignore │ │ ├── LICENSE.md │ │ ├── README.md │ │ ├── __init__.py │ │ ├── assets │ │ ├── diagram.gif │ │ ├── model_ckpts.png │ │ └── transfer-logic.png │ │ ├── baseline_models │ │ ├── .gitignore │ │ ├── README.md │ │ ├── agent.py │ │ ├── env.py │ │ ├── generate_search.py │ │ ├── logger.py │ │ ├── models │ │ │ ├── bert.py │ │ │ ├── modules.py │ │ │ └── rnn.py │ │ ├── requirements.txt │ │ ├── test.py │ │ ├── train_choice_il.py │ │ ├── train_rl.py │ │ └── train_search_il.py │ │ ├── conftest.py │ │ ├── requirements.txt │ │ ├── run_dev.sh │ │ ├── run_envs │ │ ├── run_web_agent_site_env.py │ │ └── run_web_agent_text_env.py │ │ ├── run_prod.sh │ │ ├── run_web_agent_site_env.sh │ │ ├── run_web_agent_text_env.sh │ │ ├── search_engine │ │ ├── convert_product_file_format.py │ │ ├── lucene_searcher.py │ │ └── run_indexing.sh │ │ ├── setup.sh │ │ ├── tests │ │ ├── transfer │ │ │ ├── mocks │ │ │ │ ├── mock_parse_item_page_amz │ │ │ │ ├── mock_parse_item_page_ebay │ │ │ │ ├── mock_parse_item_page_ws │ │ │ │ ├── mock_parse_item_page_ws_desc │ │ │ │ ├── mock_parse_item_page_ws_feat │ │ │ │ ├── mock_parse_results_amz │ │ │ │ ├── mock_parse_results_ebay │ │ │ │ └── mock_parse_results_ws │ │ │ └── test_predict_help.py │ │ └── web-agent-site │ │ │ ├── engine │ │ │ ├── test_goal.py │ │ │ └── test_normalize.py │ │ │ └── test_utils.py │ │ ├── transfer │ │ ├── README.md │ │ ├── __init__.py 
│ │ ├── app.py │ │ ├── predict_help.py │ │ └── webshop_lite.py │ │ └── web_agent_site │ │ ├── __init__.py │ │ ├── app.py │ │ ├── attributes │ │ ├── annotate.py │ │ └── generate_attrs.py │ │ ├── engine │ │ ├── __init__.py │ │ ├── engine.py │ │ ├── goal.py │ │ └── normalize.py │ │ ├── envs │ │ ├── __init__.py │ │ ├── chromedriver │ │ ├── web_agent_site_env.py │ │ └── web_agent_text_env.py │ │ ├── models │ │ ├── __init__.py │ │ └── models.py │ │ ├── static │ │ ├── images │ │ │ └── no-image-available.png │ │ └── style.css │ │ ├── templates │ │ ├── attributes_page.html │ │ ├── description_page.html │ │ ├── done_page.html │ │ ├── features_page.html │ │ ├── item_page.html │ │ ├── results_page.html │ │ ├── review_page.html │ │ └── search_page.html │ │ └── utils.py │ └── utils.py ├── README-zh.md ├── README.md ├── assets ├── head-figure.png ├── head-figure.svg ├── main-figure.png └── main-figure.svg ├── docker ├── agentlm-13b.yml ├── agentlm-70b.yml └── agentlm-7b.yml ├── docs ├── README.md ├── index.html └── static │ ├── css │ ├── bulma-carousel.min.css │ ├── bulma-slider.min.css │ ├── bulma.css.map.txt │ ├── bulma.min.css │ ├── fontawesome.all.min.css │ └── index.css │ ├── images │ ├── case-study.svg │ ├── error-analysis.svg │ ├── favicon.ico │ ├── head-figure.png │ ├── head-figure.svg │ ├── main-figure.png │ └── main-figure.svg │ └── js │ ├── bulma-carousel.js │ ├── bulma-carousel.min.js │ ├── bulma-slider.js │ ├── bulma-slider.min.js │ ├── fontawesome.all.min.js │ └── index.js ├── eval_general ├── eval_gsm8k_tgi.py ├── eval_mmlu_hf.py ├── eval_mt_bench_tgi.py └── requirements.txt └── eval_heldout ├── hotpotQA ├── eval_hotpot.py ├── requirements.txt └── src │ ├── agent_arch.py │ ├── config.py │ ├── data │ ├── easy.joblib │ ├── hard.joblib │ └── medium.joblib │ ├── evaluate.py │ ├── fewshots.py │ ├── hotpotqa_env.py │ ├── llms.py │ ├── pre_prompt.py │ ├── prompt.txt │ ├── utils.py │ ├── wikienv.py │ └── wrappers.py ├── miniwob++ ├── .gitignore ├── LICENSE ├── README.md 
├── available_tasks.txt ├── computergym │ ├── .gitignore │ ├── README.md │ ├── computergym │ │ ├── __init__.py │ │ └── miniwob │ │ │ ├── base_env.py │ │ │ └── miniwob_interface │ │ │ ├── __init__.py │ │ │ ├── action.py │ │ │ ├── environment.py │ │ │ ├── fields.py │ │ │ ├── html │ │ │ ├── .gitignore │ │ │ ├── common │ │ │ │ ├── shapes.js │ │ │ │ ├── special │ │ │ │ │ ├── book-flight │ │ │ │ │ │ └── domestic.js │ │ │ │ │ ├── checkbox-numbers │ │ │ │ │ │ ├── ch_0.png │ │ │ │ │ │ ├── ch_1.png │ │ │ │ │ │ ├── ch_2.png │ │ │ │ │ │ ├── ch_3.png │ │ │ │ │ │ ├── ch_4.png │ │ │ │ │ │ ├── ch_5.png │ │ │ │ │ │ ├── ch_6.png │ │ │ │ │ │ ├── ch_7.png │ │ │ │ │ │ ├── ch_8.png │ │ │ │ │ │ └── ch_9.png │ │ │ │ │ ├── click-pie │ │ │ │ │ │ ├── raphael.icons.min.js │ │ │ │ │ │ ├── raphael.min.js │ │ │ │ │ │ └── wheelnav.min.js │ │ │ │ │ ├── drag-cube │ │ │ │ │ │ ├── blank.png │ │ │ │ │ │ ├── cube.css │ │ │ │ │ │ └── cube.js │ │ │ │ │ ├── email-inbox-nl │ │ │ │ │ │ └── templates.js │ │ │ │ │ ├── email-inbox │ │ │ │ │ │ ├── delete.png │ │ │ │ │ │ ├── email-inbox.css │ │ │ │ │ │ ├── forward.png │ │ │ │ │ │ ├── left-arrow-white.png │ │ │ │ │ │ ├── left-arrow.png │ │ │ │ │ │ ├── reply.png │ │ │ │ │ │ ├── search.png │ │ │ │ │ │ ├── send.png │ │ │ │ │ │ ├── star-clicked.png │ │ │ │ │ │ └── star.png │ │ │ │ │ ├── navigate-tree │ │ │ │ │ │ ├── images │ │ │ │ │ │ │ ├── ajax-loader.gif │ │ │ │ │ │ │ ├── file.gif │ │ │ │ │ │ │ ├── folder-closed.gif │ │ │ │ │ │ │ ├── folder.gif │ │ │ │ │ │ │ ├── minus.gif │ │ │ │ │ │ │ ├── plus.gif │ │ │ │ │ │ │ ├── treeview-black-line.gif │ │ │ │ │ │ │ ├── treeview-black.gif │ │ │ │ │ │ │ ├── treeview-default-line.gif │ │ │ │ │ │ │ ├── treeview-default.gif │ │ │ │ │ │ │ ├── treeview-famfamfam-line.gif │ │ │ │ │ │ │ ├── treeview-famfamfam.gif │ │ │ │ │ │ │ ├── treeview-gray-line.gif │ │ │ │ │ │ │ ├── treeview-gray.gif │ │ │ │ │ │ │ ├── treeview-red-line.gif │ │ │ │ │ │ │ └── treeview-red.gif │ │ │ │ │ │ ├── jquery.treeview.css │ │ │ │ │ │ └── jquery.treeview.min.js 
│ │ │ │ │ ├── search-engine │ │ │ │ │ │ └── jquery.twbsPagination.min.js │ │ │ │ │ ├── social-media │ │ │ │ │ │ ├── like-hover.png │ │ │ │ │ │ ├── like.png │ │ │ │ │ │ ├── more-hover.png │ │ │ │ │ │ ├── more.png │ │ │ │ │ │ ├── reply-hover.png │ │ │ │ │ │ ├── reply.png │ │ │ │ │ │ ├── retweet-hover.png │ │ │ │ │ │ ├── retweet.png │ │ │ │ │ │ ├── share-hover.png │ │ │ │ │ │ └── share.png │ │ │ │ │ ├── text-editor │ │ │ │ │ │ ├── quill.min.js │ │ │ │ │ │ └── quill.snow.css │ │ │ │ │ └── tic-tac-toe │ │ │ │ │ │ ├── o.png │ │ │ │ │ │ └── x.png │ │ │ │ └── ui_utils.js │ │ │ ├── core │ │ │ │ ├── core.css │ │ │ │ ├── core.js │ │ │ │ ├── d3.v3.min.js │ │ │ │ ├── jquery-ui │ │ │ │ │ ├── external │ │ │ │ │ │ └── jquery │ │ │ │ │ │ │ └── jquery.js │ │ │ │ │ ├── images │ │ │ │ │ │ ├── ui-icons_444444_256x240.png │ │ │ │ │ │ ├── ui-icons_555555_256x240.png │ │ │ │ │ │ ├── ui-icons_777620_256x240.png │ │ │ │ │ │ ├── ui-icons_777777_256x240.png │ │ │ │ │ │ ├── ui-icons_cc0000_256x240.png │ │ │ │ │ │ └── ui-icons_ffffff_256x240.png │ │ │ │ │ ├── jquery-ui.min.css │ │ │ │ │ ├── jquery-ui.min.js │ │ │ │ │ ├── jquery-ui.structure.min.css │ │ │ │ │ └── jquery-ui.theme.min.css │ │ │ │ ├── jscolor.min.js │ │ │ │ └── record.js │ │ │ ├── flight │ │ │ │ ├── AA │ │ │ │ │ ├── apps │ │ │ │ │ │ └── common │ │ │ │ │ │ │ └── js │ │ │ │ │ │ │ ├── aacom.js │ │ │ │ │ │ │ ├── aacomDevice.js │ │ │ │ │ │ │ ├── airportcode.js │ │ │ │ │ │ │ ├── cookieconsent.js │ │ │ │ │ │ │ ├── jquery │ │ │ │ │ │ │ └── aacom │ │ │ │ │ │ │ │ ├── plugins │ │ │ │ │ │ │ │ ├── aaAirportAutoComplete.js │ │ │ │ │ │ │ │ ├── aaCache.js │ │ │ │ │ │ │ │ ├── aaCookie.js │ │ │ │ │ │ │ │ ├── aaCountryLanSelect.js │ │ │ │ │ │ │ │ ├── aaDropdownPanel.js │ │ │ │ │ │ │ │ ├── aaFooterAds.js │ │ │ │ │ │ │ │ └── aaTextBoxMessage.js │ │ │ │ │ │ │ │ └── utilities │ │ │ │ │ │ │ │ ├── aaUtilities-2.1.js │ │ │ │ │ │ │ │ └── aaUtils.js │ │ │ │ │ │ │ └── wa.js │ │ │ │ │ ├── content │ │ │ │ │ │ ├── common │ │ │ │ │ │ │ └── css │ │ │ │ │ │ │ │ └── 
v3 │ │ │ │ │ │ │ │ ├── core.css │ │ │ │ │ │ │ │ ├── jquery-ui-1.10-aa.css │ │ │ │ │ │ │ │ ├── reservation │ │ │ │ │ │ │ │ └── findFlights │ │ │ │ │ │ │ │ │ └── mobile │ │ │ │ │ │ │ │ │ └── findFlights.css │ │ │ │ │ │ │ │ └── responsive.css │ │ │ │ │ │ ├── fonts │ │ │ │ │ │ │ ├── american-v2 │ │ │ │ │ │ │ │ ├── americansans-bold.woff │ │ │ │ │ │ │ │ ├── americansans-light.woff │ │ │ │ │ │ │ │ ├── americansans-medium.woff │ │ │ │ │ │ │ │ └── americansans-regular.woff │ │ │ │ │ │ │ └── icons │ │ │ │ │ │ │ │ └── american-icons-v4-4.woff │ │ │ │ │ │ └── images │ │ │ │ │ │ │ ├── chrome │ │ │ │ │ │ │ ├── icons │ │ │ │ │ │ │ │ └── loading.gif │ │ │ │ │ │ │ └── rebrand │ │ │ │ │ │ │ │ ├── aa-flight-icon.png │ │ │ │ │ │ │ │ ├── aa-icons-flags-sprite.png │ │ │ │ │ │ │ │ ├── aa-logo.png │ │ │ │ │ │ │ │ ├── down-arrow.png │ │ │ │ │ │ │ │ ├── favicon.png │ │ │ │ │ │ │ │ ├── oneworld.png │ │ │ │ │ │ │ │ ├── shadow-down.png │ │ │ │ │ │ │ │ ├── shadow-vertical-150.png │ │ │ │ │ │ │ │ ├── shadow12-down.png │ │ │ │ │ │ │ │ ├── shadow12-up.png │ │ │ │ │ │ │ │ └── shadow3-down.png │ │ │ │ │ │ │ └── graphics │ │ │ │ │ │ │ └── icons │ │ │ │ │ │ │ └── aa-jqueryUIicons-sprite.png │ │ │ │ │ ├── dataset-AA.js │ │ │ │ │ ├── index.html │ │ │ │ │ ├── js │ │ │ │ │ │ ├── aa │ │ │ │ │ │ │ ├── common │ │ │ │ │ │ │ │ ├── aa-utility-menu.js │ │ │ │ │ │ │ │ ├── aacom-ui-1.0.0.js │ │ │ │ │ │ │ │ └── core-2.0.0.js │ │ │ │ │ │ │ ├── modules │ │ │ │ │ │ │ │ ├── airportLookup.js │ │ │ │ │ │ │ │ ├── ajax.js │ │ │ │ │ │ │ │ ├── browserdetect.js │ │ │ │ │ │ │ │ ├── commonsetup.js │ │ │ │ │ │ │ │ ├── mobileDatePicker.js │ │ │ │ │ │ │ │ ├── utilities.js │ │ │ │ │ │ │ │ └── widgets.js │ │ │ │ │ │ │ ├── plugins │ │ │ │ │ │ │ │ └── noBounce.js │ │ │ │ │ │ │ └── shopping │ │ │ │ │ │ │ │ └── mobileSearchFlights.js │ │ │ │ │ │ └── libs │ │ │ │ │ │ │ ├── jquery │ │ │ │ │ │ │ ├── jquery-1.11.1.min.js │ │ │ │ │ │ │ ├── jquery-migrate-1.2.1.min.js │ │ │ │ │ │ │ └── ui │ │ │ │ │ │ │ │ └── 1.10 │ │ │ │ │ │ │ │ ├── i18n │ │ 
│ │ │ │ │ │ └── jquery.ui.datepicker-en-aa.js │ │ │ │ │ │ │ │ └── jquery-ui.min.js │ │ │ │ │ │ │ └── modernizr-2.8.1.js │ │ │ │ │ ├── original.html │ │ │ │ │ ├── surrogate │ │ │ │ │ │ ├── airportLookup.js │ │ │ │ │ │ └── airports.json │ │ │ │ │ └── wrapper.html │ │ │ │ ├── Alaska-auto-medium │ │ │ │ │ ├── images │ │ │ │ │ │ ├── aura.png │ │ │ │ │ │ ├── cal3.png │ │ │ │ │ │ ├── chkboxes3.png │ │ │ │ │ │ ├── clear_text2.png │ │ │ │ │ │ ├── collapse.png │ │ │ │ │ │ ├── expand.png │ │ │ │ │ │ ├── flight_arrow.png │ │ │ │ │ │ ├── geo.png │ │ │ │ │ │ ├── home.png │ │ │ │ │ │ ├── info2.png │ │ │ │ │ │ ├── leftright.png │ │ │ │ │ │ ├── logo2.png │ │ │ │ │ │ └── logos │ │ │ │ │ │ │ ├── AA.png │ │ │ │ │ │ │ ├── AS.png │ │ │ │ │ │ │ ├── DL.png │ │ │ │ │ │ │ └── VX.png │ │ │ │ │ ├── index.html │ │ │ │ │ ├── mobileweb-v3-28-6227-21813.css │ │ │ │ │ ├── scripts │ │ │ │ │ │ ├── datepickr.js │ │ │ │ │ │ ├── main.js │ │ │ │ │ │ └── shopbook.js │ │ │ │ │ ├── stylesheets │ │ │ │ │ │ └── circular │ │ │ │ │ │ │ ├── ASCircularWeb-Bold.woff │ │ │ │ │ │ │ └── ASCircularWeb-Book.woff │ │ │ │ │ ├── surrogate │ │ │ │ │ │ ├── airportLookup.js │ │ │ │ │ │ └── airports.json │ │ │ │ │ └── wrapper.html │ │ │ │ ├── Alaska-auto │ │ │ │ │ ├── images │ │ │ │ │ │ ├── aura.png │ │ │ │ │ │ ├── cal3.png │ │ │ │ │ │ ├── chkboxes3.png │ │ │ │ │ │ ├── clear_text2.png │ │ │ │ │ │ ├── collapse.png │ │ │ │ │ │ ├── expand.png │ │ │ │ │ │ ├── flight_arrow.png │ │ │ │ │ │ ├── geo.png │ │ │ │ │ │ ├── home.png │ │ │ │ │ │ ├── info2.png │ │ │ │ │ │ ├── leftright.png │ │ │ │ │ │ ├── logo2.png │ │ │ │ │ │ └── logos │ │ │ │ │ │ │ ├── AA.png │ │ │ │ │ │ │ ├── AS.png │ │ │ │ │ │ │ ├── DL.png │ │ │ │ │ │ │ └── VX.png │ │ │ │ │ ├── index.html │ │ │ │ │ ├── mobileweb-v3-28-6227-21813.css │ │ │ │ │ ├── scripts │ │ │ │ │ │ ├── datepickr.js │ │ │ │ │ │ ├── main.js │ │ │ │ │ │ └── shopbook.js │ │ │ │ │ ├── stylesheets │ │ │ │ │ │ └── circular │ │ │ │ │ │ │ ├── ASCircularWeb-Bold.woff │ │ │ │ │ │ │ └── ASCircularWeb-Book.woff │ 
│ │ │ │ ├── surrogate │ │ │ │ │ │ ├── airportLookup.js │ │ │ │ │ │ └── airports.json │ │ │ │ │ └── wrapper.html │ │ │ │ ├── Alaska │ │ │ │ │ ├── dataset-Alaska.js │ │ │ │ │ ├── images │ │ │ │ │ │ ├── aura.png │ │ │ │ │ │ ├── cal3.png │ │ │ │ │ │ ├── chkboxes3.png │ │ │ │ │ │ ├── clear_text2.png │ │ │ │ │ │ ├── collapse.png │ │ │ │ │ │ ├── expand.png │ │ │ │ │ │ ├── flight_arrow.png │ │ │ │ │ │ ├── geo.png │ │ │ │ │ │ ├── home.png │ │ │ │ │ │ ├── info2.png │ │ │ │ │ │ ├── leftright.png │ │ │ │ │ │ ├── logo2.png │ │ │ │ │ │ └── logos │ │ │ │ │ │ │ ├── AA.png │ │ │ │ │ │ │ ├── AS.png │ │ │ │ │ │ │ ├── DL.png │ │ │ │ │ │ │ └── VX.png │ │ │ │ │ ├── index.html │ │ │ │ │ ├── mobileweb-v3-28-6227-21813.css │ │ │ │ │ ├── original.html │ │ │ │ │ ├── scripts │ │ │ │ │ │ ├── datepickr.js │ │ │ │ │ │ ├── main.js │ │ │ │ │ │ └── shopbook.js │ │ │ │ │ ├── stylesheets │ │ │ │ │ │ └── circular │ │ │ │ │ │ │ ├── ASCircularWeb-Bold.woff │ │ │ │ │ │ │ └── ASCircularWeb-Book.woff │ │ │ │ │ ├── surrogate │ │ │ │ │ │ ├── airportLookup.js │ │ │ │ │ │ └── airports.json │ │ │ │ │ └── wrapper.html │ │ │ │ └── flight-common │ │ │ │ │ ├── inject.js │ │ │ │ │ ├── wrapper.css │ │ │ │ │ └── wrapper.js │ │ │ └── miniwob │ │ │ │ ├── bisect-angle.html │ │ │ │ ├── book-flight-nodelay.html │ │ │ │ ├── book-flight.html │ │ │ │ ├── chase-circle.html │ │ │ │ ├── choose-date-easy.html │ │ │ │ ├── choose-date-medium.html │ │ │ │ ├── choose-date-nodelay.html │ │ │ │ ├── choose-date.html │ │ │ │ ├── choose-list.html │ │ │ │ ├── circle-center.html │ │ │ │ ├── click-button-sequence.html │ │ │ │ ├── click-button.html │ │ │ │ ├── click-checkboxes-large.html │ │ │ │ ├── click-checkboxes-soft.html │ │ │ │ ├── click-checkboxes-transfer.html │ │ │ │ ├── click-checkboxes.html │ │ │ │ ├── click-collapsible-2-nodelay.html │ │ │ │ ├── click-collapsible-2.html │ │ │ │ ├── click-collapsible-nodelay.html │ │ │ │ ├── click-collapsible.html │ │ │ │ ├── click-color.html │ │ │ │ ├── click-dialog-2.html │ │ │ │ ├── 
click-dialog.html │ │ │ │ ├── click-link.html │ │ │ │ ├── click-menu-2.html │ │ │ │ ├── click-menu.html │ │ │ │ ├── click-option.html │ │ │ │ ├── click-pie-nodelay.html │ │ │ │ ├── click-pie.html │ │ │ │ ├── click-scroll-list.html │ │ │ │ ├── click-shades.html │ │ │ │ ├── click-shape.html │ │ │ │ ├── click-tab-2-easy.html │ │ │ │ ├── click-tab-2-hard.html │ │ │ │ ├── click-tab-2-medium.html │ │ │ │ ├── click-tab-2.html │ │ │ │ ├── click-tab.html │ │ │ │ ├── click-test-2.html │ │ │ │ ├── click-test-transfer.html │ │ │ │ ├── click-test.html │ │ │ │ ├── click-widget.html │ │ │ │ ├── copy-paste-2.html │ │ │ │ ├── copy-paste.html │ │ │ │ ├── count-shape.html │ │ │ │ ├── count-sides.html │ │ │ │ ├── drag-box.html │ │ │ │ ├── drag-cube.html │ │ │ │ ├── drag-item.html │ │ │ │ ├── drag-items-grid.html │ │ │ │ ├── drag-items.html │ │ │ │ ├── drag-shapes.html │ │ │ │ ├── drag-sort-numbers.html │ │ │ │ ├── email-inbox-delete.html │ │ │ │ ├── email-inbox-forward-nl-turk.html │ │ │ │ ├── email-inbox-forward-nl.html │ │ │ │ ├── email-inbox-forward.html │ │ │ │ ├── email-inbox-important.html │ │ │ │ ├── email-inbox-nl-turk.html │ │ │ │ ├── email-inbox-noscroll.html │ │ │ │ ├── email-inbox-reply.html │ │ │ │ ├── email-inbox-star-reply.html │ │ │ │ ├── email-inbox.html │ │ │ │ ├── enter-date.html │ │ │ │ ├── enter-password.html │ │ │ │ ├── enter-text-2.html │ │ │ │ ├── enter-text-dynamic.html │ │ │ │ ├── enter-text.html │ │ │ │ ├── enter-time.html │ │ │ │ ├── find-midpoint.html │ │ │ │ ├── find-word.html │ │ │ │ ├── focus-text-2.html │ │ │ │ ├── focus-text.html │ │ │ │ ├── grid-coordinate.html │ │ │ │ ├── guess-number.html │ │ │ │ ├── highlight-text-2.html │ │ │ │ ├── highlight-text.html │ │ │ │ ├── identify-shape.html │ │ │ │ ├── login-user-popup.html │ │ │ │ ├── login-user.html │ │ │ │ ├── moving-items.html │ │ │ │ ├── multi-layouts.html │ │ │ │ ├── multi-orderings.html │ │ │ │ ├── navigate-tree.html │ │ │ │ ├── number-checkboxes.html │ │ │ │ ├── read-table-2.html │ │ │ │ ├── 
read-table.html │ │ │ │ ├── resize-textarea.html │ │ │ │ ├── right-angle.html │ │ │ │ ├── scroll-text-2.html │ │ │ │ ├── scroll-text.html │ │ │ │ ├── search-engine.html │ │ │ │ ├── simon-says.html │ │ │ │ ├── simple-algebra.html │ │ │ │ ├── simple-arithmetic.html │ │ │ │ ├── social-media-all.html │ │ │ │ ├── social-media-some.html │ │ │ │ ├── social-media.html │ │ │ │ ├── terminal.html │ │ │ │ ├── terminal_reproduce.html │ │ │ │ ├── text-editor.html │ │ │ │ ├── text-transform.html │ │ │ │ ├── tic-tac-toe.html │ │ │ │ ├── unicode-test.html │ │ │ │ ├── use-autocomplete-nodelay.html │ │ │ │ ├── use-autocomplete.html │ │ │ │ ├── use-colorwheel-2.html │ │ │ │ ├── use-colorwheel.html │ │ │ │ ├── use-slider-2.html │ │ │ │ ├── use-slider.html │ │ │ │ ├── use-spinner.html │ │ │ │ └── visual-addition.html │ │ │ ├── instance.py │ │ │ ├── reward.py │ │ │ ├── screenshot.py │ │ │ ├── state.py │ │ │ └── utils.py │ └── setup.py ├── eval-gpt-3.5-turbo.sh ├── eval-gpt-4.sh ├── eval-tgi.sh ├── llm_agent.py ├── main.py ├── metrics.py ├── prompt.py ├── prompt │ ├── action.txt │ ├── base.txt │ ├── choose-list │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-button-sequence │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-button │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-checkboxes-large │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-checkboxes-soft │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt 
│ ├── click-checkboxes-transfer │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-checkboxes │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-collapsible-2 │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-collapsible │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-dialog-2 │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-dialog │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-menu │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-option │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-scroll-list │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-shades │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-shape │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── 
click-tab-2-hard │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-tab-2 │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-tab │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-test │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── click-widget │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── count-shape │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── email-inbox-forward-nl-turk │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── email-inbox-forward-nl │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── email-inbox-nl-turk │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── email-inbox │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── enter-date │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── enter-time │ │ ├── 
action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── example.txt │ ├── few-shot.json │ ├── first_action.txt │ ├── focus-text │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── grid-coordinate │ │ ├── action.txt │ │ ├── base.txt │ │ ├── check_finish.txt │ │ ├── done.txt │ │ ├── edit_plan.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── new_plan.txt │ │ ├── rci_action.txt │ │ ├── rci_answer.txt │ │ └── update_action.txt │ ├── identify-shape │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── initialize_plan.txt │ ├── login-user-popup │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── multi-layouts │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── navigate-tree │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── rci_action.txt │ ├── search-engine │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── simple-algebra │ │ ├── action.txt │ │ ├── base.txt │ │ ├── check_finish.txt │ │ ├── done.txt │ │ ├── edit_plan.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── new_plan.txt │ │ ├── rci_action.txt │ │ ├── rci_answer.txt │ │ └── update_action.txt │ ├── social-media-all │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── 
first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── social-media-some │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── social-media │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── terminal │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── tic-tac-toe │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ ├── update_action.txt │ ├── use-autocomplete │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt │ └── use-spinner │ │ ├── action.txt │ │ ├── base.txt │ │ ├── example.txt │ │ ├── first_action.txt │ │ ├── initialize_plan.txt │ │ ├── rci_action.txt │ │ └── update_action.txt └── requirements.txt ├── rewoo ├── .gitignore ├── LICENSE ├── README.md ├── algos │ ├── PWS.py │ ├── __init__.py │ ├── notool.py │ └── react.py ├── alpaca │ ├── __init__.py │ ├── lora.py │ ├── templates │ │ ├── README.md │ │ ├── alpaca.json │ │ ├── alpaca_legacy.json │ │ ├── alpaca_short.json │ │ └── vigogne.json │ └── utils │ │ ├── README.md │ │ ├── __init__.py │ │ ├── callbacks.py │ │ └── prompter.py ├── eval-gpt-3.5-turbo.sh ├── eval-gpt-4.sh ├── eval-tgi.sh ├── metrics.py ├── nodes │ ├── LLMNode.py │ ├── Node.py │ ├── NodeCofig.py │ ├── Planner.py │ ├── Solver.py │ ├── Worker.py │ └── __init__.py ├── prompts │ ├── __init__.py │ ├── fewshots.py │ ├── planner.py │ ├── solver.py │ └── wiki_prompt.py ├── requirements.txt ├── run_eval.py └── utils │ ├── CustomDocstoreExplorer.py │ ├── DataLoader.py │ ├── 
Evaluator.py │ ├── __init__.py │ └── util.py ├── science-world ├── .gitignore ├── README.md ├── data_utils │ ├── READMD.md │ ├── __init__.py │ ├── data_convert.py │ ├── data_utils.py │ ├── demos.json │ └── goldpaths-all.zip ├── eval-gpt-3.5.sh ├── eval-gpt-4.sh ├── eval-tgi.sh ├── eval.py ├── eval_utils.py ├── fast_agent │ ├── deepspeed_reqs.txt │ ├── ds_train.py │ ├── ds_train.sh │ └── zero_2_bf16.json ├── metrics.py ├── prompts │ ├── README.md │ ├── convert.py │ ├── prompt.json │ └── prompt_orig.json ├── requirements.txt └── slow_agent │ ├── local_llm.py │ ├── run_gradio.py │ └── utils.py └── webarena ├── .gitignore ├── LICENSE ├── README.md ├── agent ├── __init__.py ├── agent.py └── prompts │ ├── README.md │ ├── __init__.py │ ├── prompt_constructor.py │ ├── raw │ ├── p_cot_id_actree_2s.py │ └── p_direct_id_actree_2s.py │ └── to_json.py ├── browser_env ├── __init__.py ├── actions.py ├── async_envs.py ├── auto_login.py ├── constants.py ├── env_config.py ├── envs.py ├── helper_functions.py ├── processors.py ├── py.typed ├── trajectory.py └── utils.py ├── config_files ├── examples │ ├── 1.json │ ├── 2.json │ ├── 3.json │ └── 4.json └── test.raw.json ├── environment_docker ├── README.md └── webarena-homepage │ ├── app.py │ ├── static │ └── figures │ │ ├── calculator.png │ │ ├── cms.png │ │ ├── gitlab.png │ │ ├── manual1.png │ │ ├── manual2.png │ │ ├── map.png │ │ ├── onestopshop.png │ │ ├── password.png │ │ ├── reddit.png │ │ ├── scratchpad.png │ │ └── wikipedia.png │ └── templates │ ├── calculator.html │ ├── index.html │ └── scratchpad.html ├── eval-gpt-3.5-turbo.sh ├── eval-gpt-4.sh ├── eval-tgi.sh ├── evaluation_harness ├── __init__.py ├── evaluators.py └── helper_functions.py ├── llms ├── __init__.py ├── lm_config.py ├── providers │ └── openai_utils.py └── tokenizers.py ├── prepare.sh ├── run.py ├── setup.cfg └── setup.py /.gitattributes: -------------------------------------------------------------------------------- 1 | eval_heldout/rewoo/data/** filter=lfs 
diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | tgi_data/ 2 | __pycache__/ 3 | .DS_Store -------------------------------------------------------------------------------- /AgentBench.old/configs/agents/do_nothing.yaml: -------------------------------------------------------------------------------- 1 | module: "src.agents.DoNothingAgent" 2 | parameters: 3 | name: "Do-Nothing-Agent" 4 | sleep: 0.01 5 | -------------------------------------------------------------------------------- /AgentBench.old/configs/agents/tgi_clients/AgentLM-13b.yaml: -------------------------------------------------------------------------------- 1 | module: "src.agents.TGIAgent" 2 | parameters: 3 | ip: "http://127.0.0.1" 4 | # Also you can set up the AgentLM on many address 5 | # The TGIAgent will automatically balance the load 6 | # address ∈ [address_from, address_to) 7 | address_from: 30013 8 | address_to: 30014 9 | model_name: "AgentLM-13b" 10 | max_tokens: 4096 -------------------------------------------------------------------------------- /AgentBench.old/configs/agents/tgi_clients/AgentLM-70b.yaml: -------------------------------------------------------------------------------- 1 | module: "src.agents.TGIAgent" 2 | parameters: 3 | ip: "http://127.0.0.1" 4 | # Also you can set up the AgentLM on many address 5 | # The TGIAgent will automatically balance the load 6 | # address ∈ [address_from, address_to) 7 | address_from: 30070 8 | address_to: 30071 9 | model_name: "AgentLM-70b" 10 | max_tokens: 4096 -------------------------------------------------------------------------------- /AgentBench.old/configs/agents/tgi_clients/AgentLM-7b.yaml: -------------------------------------------------------------------------------- 1 | module: "src.agents.TGIAgent" 2 | parameters: 3 | ip: "http://127.0.0.1" 4 | # Also you can set up the 
AgentLM on many address 5 | # The TGIAgent will automatically balance the load 6 | # address ∈ [address_from, address_to) 7 | address_from: 30007 8 | address_to: 30008 9 | model_name: "AgentLM-7b" 10 | max_tokens: 4096 -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/alfworld/dev.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.alfworld.ALFWorld" 2 | parameters: 3 | name: "ALFWorld" 4 | data_path: "/AgentBench/data/alfworld" # TODO replace it with your own data path 5 | config_path: "src/tasks/alfworld/configs/base_config.yaml" 6 | prompts_path: "src/tasks/alfworld/prompts/alfworld_multiturn_react.json" 7 | split: "dev" 8 | max_step: 35 9 | 10 | -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/alfworld/std.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.alfworld.ALFWorld" 2 | parameters: 3 | name: "ALFWorld" 4 | data_path: "/AgentBench/data/alfworld" # TODO replace it with your own data path 5 | config_path: "src/tasks/alfworld/configs/base_config.yaml" 6 | prompts_path: "src/tasks/alfworld/prompts/alfworld_multiturn_react.json" 7 | split: "std" 8 | max_step: 35 9 | 10 | -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/card_game/dev.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.CardGame" 2 | 3 | parameters: 4 | name: "CardGame" 5 | port: 12347 6 | test_time: 3 -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/card_game/ext.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.CardGame" 2 | 3 | parameters: 4 | name: "CardGame" 5 | port: 12349 6 | test_time: 50 
-------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/card_game/std.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.CardGame" 2 | 3 | parameters: 4 | name: "CardGame" 5 | port: 12342 6 | test_time: 5 -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/dbbench/dev.yaml: -------------------------------------------------------------------------------- 1 | module: src.tasks.DBBench 2 | 3 | parameters: 4 | name: "DBBench" 5 | data_file: data/dbbench/dev.jsonl 6 | max_round: 15 7 | -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/dbbench/std.yaml: -------------------------------------------------------------------------------- 1 | module: src.tasks.DBBench 2 | 3 | parameters: 4 | name: "DBBench" 5 | data_file: data/dbbench/standard.jsonl 6 | max_round: 15 -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/knowledgegraph/dev.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.KnowledgeGraph" 2 | parameters: 3 | name: "KnowledgeGraph-dev" 4 | round: 15 5 | data_file: "data/knowledgegraph/dev.json" 6 | sparql_url: "http://164.107.116.56:3093/sparql" 7 | -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/knowledgegraph/std.yaml: -------------------------------------------------------------------------------- 1 | module: "src.tasks.KnowledgeGraph" 2 | parameters: 3 | name: "KnowledgeGraph-std" 4 | round: 15 5 | data_file: "data/knowledgegraph/std.json" 6 | sparql_url: "http://164.107.116.56:3093/sparql" 7 | -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/webshop/dev.yaml: 
-------------------------------------------------------------------------------- 1 | module: src.tasks.WebShop 2 | 3 | parameters: 4 | name: "WebShop" 5 | start: 200 6 | end: 280 7 | num_envs: 3 8 | worker_limit: 3 -------------------------------------------------------------------------------- /AgentBench.old/configs/tasks/webshop/std.yaml: -------------------------------------------------------------------------------- 1 | module: src.tasks.WebShop 2 | 3 | parameters: 4 | name: "WebShop" 5 | start: 0 6 | end: 200 7 | num_envs: 3 8 | worker_limit: 3 -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/data/4/N37.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "What's the full path of a file \"TOOLS\"?", 3 | "create": { 4 | "local": "default", 5 | "init": "echo 123>/usr/TOOLS" 6 | }, 7 | "evaluation": { 8 | "check": [ 9 | null, 10 | { 11 | "language": "python", 12 | "file": "check/string-match.py" 13 | } 14 | ], 15 | "example": "find / -name TOOLS" 16 | }, 17 | "labels": ["path", "file"] 18 | } 19 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/data/4/N4.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "execute ~/test and get the output.", 3 | "start": "useradd -s /bin/bash -d /home/jack -m jack && cd /home/jack && echo 'echo love' > test && chmod 000 test && chown jack test && su - jack", 4 | "evaluation": { 5 | "match": "love", 6 | "example": { 7 | "code": "chmod 777 test && ~/test" 8 | } 9 | }, 10 | "labels": ["permission", "user"] 11 | } 12 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/res/dockerfiles/default: -------------------------------------------------------------------------------- 1 | FROM ubuntu 2 | RUN apt update && apt 
install python3 -y && apt install python3-pip -y && apt install git -y && apt install vim -y && apt install curl -y && apt install wget -y && apt install unzip -y && apt install zip -y && apt install tree -y 3 | CMD ["bash"] 4 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/res/dockerfiles/packages: -------------------------------------------------------------------------------- 1 | FROM ubuntu 2 | RUN apt update && apt install python3 -y && apt install python3-pip -y && apt install git -y && apt install vim -y && apt install curl -y && apt install wget -y && apt install unzip -y && apt install zip -y && apt install tree -y && apt install npm -y 3 | CMD ["bash"] 4 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/res/dockerfiles/ubuntu: -------------------------------------------------------------------------------- 1 | FROM ubuntu 2 | RUN echo "tmp" > file.txt 3 | CMD ["bash"] -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v1 in v2: 10 | exit(0) 11 | else: 12 | exit(1) 
-------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/example/1.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | count_files() { 4 | local dir=$1 5 | local count=0 6 | 7 | for file in "$dir"/*; do 8 | if [ -f "$file" ]; then 9 | count=$((count + 1)) 10 | elif [ -d "$file" ]; then 11 | count_sub=$(count_files "$file") 12 | count=$((count + count_sub)) 13 | fi 14 | done 15 | 16 | echo "$count" 17 | } 18 | 19 | directory="$1" 20 | total_count=$(count_files "$directory") 21 | echo "$total_count"' > /usr/local/bin/count 22 | chmod +x /usr/local/bin/count -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/init/gen_words.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -a vocab=('aa' 'aaa' 'ab' 'abc' 'able' 'abut' 'ace' 'ache' 'act' 'acm') 4 | declare -a sep=(' ' ' ' ' ' ' ') 5 | 6 | out='/usr/words.txt' 7 | 8 | echo -n "${vocab[RANDOM % 10]}" > "${out}" 9 | for i in {1..99}; do 10 | echo -n "${sep[RANDOM % 4]}${vocab[RANDOM % 10]}" >> "${out}" 11 
| done 12 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/init/install_nettools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update && apt-get install -y net-tools iproute2 lsof -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/1/init/stock-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the possible values for each field 4 | names=("Alice" "Bob") 5 | actions=("Purchase" "Sell") 6 | 7 | # Generate 400 random lines 8 | for ((i=1; i<=401; i++)) 9 | do 10 | # Randomly select values for each field 11 | name=${names[$RANDOM % ${#names[@]}]} 12 | action=${actions[$RANDOM % ${#actions[@]}]} 13 | stock_index=$((RANDOM % 100)) 14 | count=$((RANDOM % 1000)) 15 | 16 | # Write the line to the file 17 | echo "$name | $action | $stock_index | $count" >> /usr/stock.log 18 | done 19 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v1 in 
v2: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/example/1.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | count_files() { 4 | local dir=$1 5 | local count=0 6 | 7 | for file in "$dir"/*; do 8 | if [ -f "$file" ]; then 9 | count=$((count + 1)) 10 | elif [ -d "$file" ]; then 11 | count_sub=$(count_files "$file") 12 | count=$((count + count_sub)) 13 | fi 14 | done 15 | 16 | echo "$count" 17 | } 18 | 19 | directory="$1" 20 | total_count=$(count_files "$directory") 21 | echo "$total_count"' > /usr/local/bin/count 22 | chmod +x /usr/local/bin/count -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/init/gen_words.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -a vocab=('aa' 'aaa' 'ab' 'abc' 'able' 'abut' 'ace' 'ache' 'act' 'acm') 4 | declare -a sep=(' ' ' ' ' ' ' ') 5 | 6 | out='/usr/words.txt' 7 | 8 | echo -n "${vocab[RANDOM % 10]}" > "${out}" 9 | for i in {1..99}; do 10 | echo -n "${sep[RANDOM % 
4]}${vocab[RANDOM % 10]}" >> "${out}" 11 | done 12 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/init/install_nettools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update && apt-get install -y net-tools iproute2 lsof -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/2/init/stock-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the possible values for each field 4 | names=("Alice" "Bob") 5 | actions=("Purchase" "Sell") 6 | 7 | # Generate 400 random lines 8 | for ((i=1; i<=401; i++)) 9 | do 10 | # Randomly select values for each field 11 | name=${names[$RANDOM % ${#names[@]}]} 12 | action=${actions[$RANDOM % ${#actions[@]}]} 13 | stock_index=$((RANDOM % 100)) 14 | count=$((RANDOM % 1000)) 15 | 16 | # Write the line to the file 17 | echo "$name | $action | $stock_index | $count" >> /usr/stock.log 18 | done 19 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = 
norm_newline(argv[2]).strip() 8 | 9 | if v1 in v2: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/example/1.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | count_files() { 4 | local dir=$1 5 | local count=0 6 | 7 | for file in "$dir"/*; do 8 | if [ -f "$file" ]; then 9 | count=$((count + 1)) 10 | elif [ -d "$file" ]; then 11 | count_sub=$(count_files "$file") 12 | count=$((count + count_sub)) 13 | fi 14 | done 15 | 16 | echo "$count" 17 | } 18 | 19 | directory="$1" 20 | total_count=$(count_files "$directory") 21 | echo "$total_count"' > /usr/local/bin/count 22 | chmod +x /usr/local/bin/count -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/init/gen_words.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -a vocab=('aa' 'aaa' 'ab' 'abc' 'able' 'abut' 'ace' 'ache' 'act' 'acm') 4 | declare -a sep=(' ' ' ' ' ' ' ') 5 | 6 | out='/usr/words.txt' 7 | 8 | echo -n "${vocab[RANDOM % 10]}" > "${out}" 9 | for 
i in {1..99}; do 10 | echo -n "${sep[RANDOM % 4]}${vocab[RANDOM % 10]}" >> "${out}" 11 | done 12 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/init/install_nettools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update && apt-get install -y net-tools iproute2 lsof -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/3/init/stock-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the possible values for each field 4 | names=("Alice" "Bob") 5 | actions=("Purchase" "Sell") 6 | 7 | # Generate 400 random lines 8 | for ((i=1; i<=401; i++)) 9 | do 10 | # Randomly select values for each field 11 | name=${names[$RANDOM % ${#names[@]}]} 12 | action=${actions[$RANDOM % ${#actions[@]}]} 13 | stock_index=$((RANDOM % 100)) 14 | count=$((RANDOM % 1000)) 15 | 16 | # Write the line to the file 17 | echo "$name | $action | $stock_index | $count" >> /usr/stock.log 18 | done 19 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = 
norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v1 in v2: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/example/1.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | count_files() { 4 | local dir=$1 5 | local count=0 6 | 7 | for file in "$dir"/*; do 8 | if [ -f "$file" ]; then 9 | count=$((count + 1)) 10 | elif [ -d "$file" ]; then 11 | count_sub=$(count_files "$file") 12 | count=$((count + count_sub)) 13 | fi 14 | done 15 | 16 | echo "$count" 17 | } 18 | 19 | directory="$1" 20 | total_count=$(count_files "$directory") 21 | echo "$total_count"' > /usr/local/bin/count 22 | chmod +x /usr/local/bin/count -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/init/gen_words.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -a vocab=('aa' 'aaa' 'ab' 'abc' 'able' 'abut' 'ace' 'ache' 'act' 'acm') 4 | declare -a sep=(' ' ' ' ' ' ' ') 5 | 6 | out='/usr/words.txt' 7 | 8 | echo -n 
"${vocab[RANDOM % 10]}" > "${out}" 9 | for i in {1..99}; do 10 | echo -n "${sep[RANDOM % 4]}${vocab[RANDOM % 10]}" >> "${out}" 11 | done 12 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/init/install_nettools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update && apt-get install -y net-tools iproute2 lsof -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/4/init/stock-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the possible values for each field 4 | names=("Alice" "Bob") 5 | actions=("Purchase" "Sell") 6 | 7 | # Generate 400 random lines 8 | for ((i=1; i<=401; i++)) 9 | do 10 | # Randomly select values for each field 11 | name=${names[$RANDOM % ${#names[@]}]} 12 | action=${actions[$RANDOM % ${#actions[@]}]} 13 | stock_index=$((RANDOM % 100)) 14 | count=$((RANDOM % 1000)) 15 | 16 | # Write the line to the file 17 | echo "$name | $action | $stock_index | $count" >> /usr/stock.log 18 | done 19 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return 
s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v1 in v2: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/checking/2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | check() { 4 | target=`date -d "$1" +"$2"` 5 | output=`date-format "$1" "$2"` || exit 1 6 | [ "$output"x != "$target"x ] && exit 1 7 | exit 0 8 | } 9 | 10 | check "2023-5-1" "%Y-%m" || exit 1 11 | check "23-5-2" "%Y-%m-%d" || exit 1 12 | check "2023-5-1" "%Y/%m" || exit 1 13 | check "2023-5-1" "%m/%d" || exit 1 14 | check "2023/5/10" "%d/%m" || exit 1 15 | check "2021/05/1" "Date: %Y-%m-%d" || exit 1 16 | 17 | exit 0 -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/example/0.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | python3 -c "print(\"%.6f\"%($*))"' > calc 3 | chmod +x calc 4 | mv calc /usr/local/bin/ 5 | 
-------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/example/1.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | count_files() { 4 | local dir=$1 5 | local count=0 6 | 7 | for file in "$dir"/*; do 8 | if [ -f "$file" ]; then 9 | count=$((count + 1)) 10 | elif [ -d "$file" ]; then 11 | count_sub=$(count_files "$file") 12 | count=$((count + count_sub)) 13 | fi 14 | done 15 | 16 | echo "$count" 17 | } 18 | 19 | directory="$1" 20 | total_count=$(count_files "$directory") 21 | echo "$total_count"' > /usr/local/bin/count 22 | chmod +x /usr/local/bin/count -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/example/2.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | date -d "$1" +"$2" 4 | 5 | ' > /usr/local/bin/date-format 6 | chmod +x /usr/local/bin/date-format 7 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/5/init/1.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/data/os_interaction/scripts/5/init/1.sh -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- 
/AgentBench.old/data/os_interaction/scripts/7/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v1 in v2: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/example/1.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | 3 | count_files() { 4 | local dir=$1 5 | local count=0 6 | 7 | for file in "$dir"/*; do 8 | if [ -f "$file" ]; then 9 | count=$((count + 1)) 10 | elif [ -d "$file" ]; then 11 | count_sub=$(count_files "$file") 12 | count=$((count + count_sub)) 13 | fi 14 | done 15 | 16 | echo "$count" 17 | } 18 | 19 | directory="$1" 20 | total_count=$(count_files "$directory") 21 | echo "$total_count"' > /usr/local/bin/count 22 | chmod +x /usr/local/bin/count -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/init/gen_words.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | declare -a vocab=('aa' 'aaa' 'ab' 'abc' 'able' 'abut' 'ace' 'ache' 'act' 'acm') 4 | declare -a sep=(' ' ' ' ' ' ' ') 5 | 6 | out='/usr/words.txt' 7 | 8 | echo -n "${vocab[RANDOM % 10]}" > "${out}" 9 | for i in {1..99}; do 10 | echo -n "${sep[RANDOM % 4]}${vocab[RANDOM % 10]}" >> "${out}" 11 | done 12 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/init/install_nettools.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | apt-get update && apt-get install -y net-tools iproute2 lsof -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/7/init/stock-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the possible values for each field 4 | names=("Alice" "Bob") 5 | actions=("Purchase" "Sell") 6 | 7 | # Generate 401 random lines (loop bound is inclusive: i runs 1..401) 8 | for ((i=1; i<=401; i++)) 9 | do 10 | # Randomly select values for each field 11 | name=${names[$RANDOM % ${#names[@]}]} 12 | action=${actions[$RANDOM % ${#actions[@]}]} 13 | stock_index=$((RANDOM % 100)) 14 | count=$((RANDOM % 1000)) 15 | 16 | # Write the line to the file 17 | echo "$name | $action | $stock_index | $count" >> /usr/stock.log 18 | done 19 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/dev/check/containing.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v2 in v1: 10 | exit(0) 11 | else: 12 | exit(1) 
-------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/dev/check/in.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | v1 = norm_newline(argv[1]).strip() 7 | v2 = norm_newline(argv[2]).strip() 8 | 9 | if v1 in v2: 10 | exit(0) 11 | else: 12 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/dev/check/integer-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | if int(argv[1]) == int(argv[2]): exit(0) 3 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/dev/check/string-match.py: -------------------------------------------------------------------------------- 1 | from sys import argv 2 | 3 | def norm_newline(s): 4 | return s.replace("\r\n", "\n").replace("\r", "\n") 5 | 6 | if norm_newline(argv[1]).strip() == norm_newline(argv[2]).strip(): 7 | exit(0) 8 | exit(1) -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/dev/example/0.sh: -------------------------------------------------------------------------------- 1 | echo '#!/bin/bash 2 | python3 -c "print(\"%.6f\"%($*))"' > calc 3 | chmod +x calc 4 | mv calc /usr/local/bin/ 5 | -------------------------------------------------------------------------------- /AgentBench.old/data/os_interaction/scripts/dev/init/stock-log.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Define the possible values for each field 4 | names=("Alice" "Bob") 5 | actions=("Purchase" "Sell") 6 | 7 | # Generate 401 random lines (loop bound is inclusive: i runs 1..401) 8 | for ((i=1; i<=401; i++)) 9 | do 
10 | # Randomly select values for each field 11 | name=${names[$RANDOM % ${#names[@]}]} 12 | action=${actions[$RANDOM % ${#actions[@]}]} 13 | stock_index=$((RANDOM % 100)) 14 | count=$((RANDOM % 1000)) 15 | 16 | # Write the line to the file 17 | echo "$name | $action | $stock_index | $count" >> /usr/stock.log 18 | done 19 | -------------------------------------------------------------------------------- /AgentBench.old/eval/AgentLM-13b-eval-all.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/bash 2 | 3 | export AGENT_CONFIG='configs/agents/tgi_clients/AgentLM-13b.yaml' 4 | export WORKERS=8 5 | eval_time=$(date "+%Y-%m-%d-%H:%M:%S") 6 | export OUTPUT_ROOT_DIR=outputs/AgentLM-13b/$eval_time 7 | 8 | # For Held-in task 9 | export SPLIT='std' 10 | bash eval/single-task/alfworld.sh 11 | bash eval/single-task/webshop.sh 12 | bash eval/single-task/mind2web.sh 13 | bash eval/single-task/kg.sh 14 | bash eval/single-task/db.sh 15 | bash eval/single-task/os.sh 16 | 17 | # For Held-out task 18 | export SPLIT='ext' 19 | bash eval/single-task/card.sh -------------------------------------------------------------------------------- /AgentBench.old/eval/AgentLM-70b-eval-all.sh: -------------------------------------------------------------------------------- 1 | #! 
/usr/bin/bash 2 | 3 | export AGENT_CONFIG='configs/agents/tgi_clients/AgentLM-70b.yaml' 4 | export WORKERS=8 5 | eval_time=$(date "+%Y-%m-%d-%H:%M:%S") 6 | export OUTPUT_ROOT_DIR=outputs/AgentLM-70b/$eval_time 7 | 8 | # For Held-in task 9 | export SPLIT='std' 10 | bash eval/single-task/alfworld.sh 11 | bash eval/single-task/webshop.sh 12 | bash eval/single-task/mind2web.sh 13 | bash eval/single-task/kg.sh 14 | bash eval/single-task/db.sh 15 | bash eval/single-task/os.sh 16 | 17 | # For Held-out task 18 | export SPLIT='ext' 19 | bash eval/single-task/card.sh -------------------------------------------------------------------------------- /AgentBench.old/eval/AgentLM-7b-eval-all.sh: -------------------------------------------------------------------------------- 1 | #! /usr/bin/bash 2 | 3 | export AGENT_CONFIG='configs/agents/tgi_clients/AgentLM-7b.yaml' 4 | export WORKERS=8 5 | eval_time=$(date "+%Y-%m-%d-%H:%M:%S") 6 | export OUTPUT_ROOT_DIR=outputs/AgentLM-7b/$eval_time 7 | 8 | # For Held-in task 9 | export SPLIT='std' 10 | bash eval/single-task/alfworld.sh 11 | bash eval/single-task/webshop.sh 12 | bash eval/single-task/mind2web.sh 13 | bash eval/single-task/kg.sh 14 | bash eval/single-task/db.sh 15 | bash eval/single-task/os.sh 16 | 17 | # # For Held-out task 18 | export SPLIT='ext' 19 | bash eval/single-task/card.sh -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/alfworld.sh: -------------------------------------------------------------------------------- 1 | source eval/single-task/eval_single_setup.sh 2 | 3 | evaluate_in_docker "learningrate/agentbench-alfworld" \ 4 | --task "configs/tasks/alfworld/$SPLIT.yaml" \ 5 | --agent "$AGENT_CONFIG" \ 6 | --workers $WORKERS \ 7 | --output_dir "$OUTPUT_DIR" --no_timestamp \ 8 | --max_new_tokens 128 -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/card.sh: 
-------------------------------------------------------------------------------- 1 | source eval/single-task/eval_single_setup.sh 2 | 3 | evaluate_in_docker "learningrate/agentbench-card_game" \ 4 | --task "configs/tasks/card_game/$SPLIT.yaml" \ 5 | --agent "$AGENT_CONFIG" \ 6 | --workers $WORKERS \ 7 | --output_dir "$OUTPUT_DIR" --no_timestamp\ 8 | --max_new_tokens 512 -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/db.sh: -------------------------------------------------------------------------------- 1 | source eval/single-task/eval_single_setup.sh 2 | 3 | evaluate_directly \ 4 | --task "configs/tasks/dbbench/$SPLIT.yaml" \ 5 | --agent "$AGENT_CONFIG" \ 6 | --workers $WORKERS \ 7 | --output_dir "$OUTPUT_DIR" --no_timestamp\ 8 | --max_new_tokens 128 -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/kg.sh: -------------------------------------------------------------------------------- 1 | source eval/single-task/eval_single_setup.sh 2 | 3 | evaluate_directly \ 4 | --task "configs/tasks/knowledgegraph/$SPLIT.yaml" \ 5 | --agent "$AGENT_CONFIG" \ 6 | --workers $WORKERS \ 7 | --output_dir "$OUTPUT_DIR" --no_timestamp\ 8 | --max_new_tokens 128 -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/mind2web.sh: -------------------------------------------------------------------------------- 1 | source eval/single-task/eval_single_setup.sh 2 | 3 | evaluate_in_docker "learningrate/agentbench-mind2web" \ 4 | --task "configs/tasks/mind2web/$SPLIT.yaml" \ 5 | --agent "$AGENT_CONFIG" \ 6 | --workers $WORKERS \ 7 | --output_dir "$OUTPUT_DIR" --no_timestamp\ 8 | --max_new_tokens 128 -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/os.sh: -------------------------------------------------------------------------------- 1 | 
source eval/single-task/eval_single_setup.sh 2 | set -x 3 | 4 | evaluate_directly \ 5 | --task "configs/tasks/os_interaction/$SPLIT.yaml" \ 6 | --agent "$AGENT_CONFIG" \ 7 | --workers $WORKERS \ 8 | --output_dir "$OUTPUT_DIR" --no_timestamp\ 9 | --max_new_tokens 128 -------------------------------------------------------------------------------- /AgentBench.old/eval/single-task/webshop.sh: -------------------------------------------------------------------------------- 1 | source eval/single-task/eval_single_setup.sh 2 | 3 | evaluate_in_docker "learningrate/agentbench-webshop" \ 4 | --task "configs/tasks/webshop/$SPLIT.yaml" \ 5 | --agent "$AGENT_CONFIG" \ 6 | --workers $WORKERS \ 7 | --output_dir "$OUTPUT_DIR" --no_timestamp \ 8 | --max_new_tokens 128 9 | -------------------------------------------------------------------------------- /AgentBench.old/requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | fschat==0.2.31 3 | dataclass_wizard 4 | jsonlines 5 | tensorboard 6 | openai -------------------------------------------------------------------------------- /AgentBench.old/src/__init__.py: -------------------------------------------------------------------------------- 1 | from .agent import Agent, Session 2 | from .task import * 3 | from .configs import * 4 | from .utils import print_rank_0, JsonEncoder 5 | from .agents import * 6 | -------------------------------------------------------------------------------- /AgentBench.old/src/agents/__init__.py: -------------------------------------------------------------------------------- 1 | from .do_nothing_agent import DoNothingAgent 2 | from . 
import tgi_client 3 | from .tgi_client import TGIAgent -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/alfworld/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import ALFWorld -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/C++/Action.cpp: -------------------------------------------------------------------------------- 1 | #include "Action.hpp" 2 | #include 3 | 4 | std::vector AI::Pick(Game game) 5 | { 6 | // TODO: fill your code 7 | } 8 | 9 | std::pair AI::Assert(Game game) 10 | { 11 | // TODO: fill your code 12 | } 13 | 14 | Action AI::Act(Game game) 15 | { 16 | // TODO: fill your code 17 | } -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/C++/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ main.cpp Action.cpp sdk/jsoncpp/jsoncpp.cpp -o main -std=c++17 -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/C++/main.cpp: -------------------------------------------------------------------------------- 1 | #include "Action.hpp" 2 | 3 | int main() 4 | { 5 | AI *myAI = new AI(); 6 | myAI->run(); 7 | delete (myAI); 8 | return 0; 9 | } -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/basline1.py: -------------------------------------------------------------------------------- 1 | from action1 import AI 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | stage = int(sys.argv[1]) 6 | 7 | myAI = AI(stage) 8 | myAI.run() 9 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/basline2.py: 
-------------------------------------------------------------------------------- 1 | from action2 import AI 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | stage = int(sys.argv[1]) 6 | 7 | myAI = AI(stage) 8 | myAI.run() 9 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/basline3.py: -------------------------------------------------------------------------------- 1 | from action3 import AI 2 | import sys 3 | 4 | if __name__ == "__main__": 5 | stage = int(sys.argv[1]) 6 | 7 | myAI = AI(stage) 8 | myAI.run() 9 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from client import Client 3 | import json 4 | 5 | if __name__ == "__main__": 6 | language = sys.argv[1] 7 | stage = int(sys.argv[2]) 8 | order = int(sys.argv[3]) 9 | save_dir = sys.argv[4] 10 | port = int(sys.argv[5]) 11 | client = Client(port=port) 12 | if language == 'en': 13 | from AI_En import Agent 14 | myAI = Agent(client, stage, order, save_dir) 15 | else: 16 | from AI_Cn import Agent 17 | myAI = Agent(client, stage, order, save_dir) 18 | 19 | myAI.run() 20 | client.quit() -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/ai_client.cpython-310-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/ai_client.cpython-310-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/ai_client.cpython-38-x86_64-linux-gnu.so: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/ai_client.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/3.25.0/CMakeDetermineCompilerABI_CXX.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/3.25.0/CMakeDetermineCompilerABI_CXX.bin -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/3.25.0/CMakeSystem.cmake: -------------------------------------------------------------------------------- 1 | set(CMAKE_HOST_SYSTEM "Linux-5.10.16.3-microsoft-standard-WSL2") 2 | set(CMAKE_HOST_SYSTEM_NAME "Linux") 3 | set(CMAKE_HOST_SYSTEM_VERSION "5.10.16.3-microsoft-standard-WSL2") 4 | set(CMAKE_HOST_SYSTEM_PROCESSOR "x86_64") 5 | 6 | 7 | 8 | set(CMAKE_SYSTEM "Linux-5.10.16.3-microsoft-standard-WSL2") 9 | set(CMAKE_SYSTEM_NAME "Linux") 10 | set(CMAKE_SYSTEM_VERSION "5.10.16.3-microsoft-standard-WSL2") 11 | set(CMAKE_SYSTEM_PROCESSOR "x86_64") 12 | 13 | set(CMAKE_CROSSCOMPILING "FALSE") 14 | 15 | set(CMAKE_SYSTEM_LOADED 1) 16 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/3.25.0/CompilerIdCXX/a.out: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/3.25.0/CompilerIdCXX/a.out 
-------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/TargetDirectories.txt: -------------------------------------------------------------------------------- 1 | /mnt/c/Users/piano/Downloads/AquaWarAI/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir 2 | /mnt/c/Users/piano/Downloads/AquaWarAI/AI_SDK/Python/sdk/build/CMakeFiles/edit_cache.dir 3 | /mnt/c/Users/piano/Downloads/AquaWarAI/AI_SDK/Python/sdk/build/CMakeFiles/rebuild_cache.dir 4 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/cmake_clean.cmake: -------------------------------------------------------------------------------- 1 | file(REMOVE_RECURSE 2 | "CMakeFiles/ai_client.dir/jsoncpp/jsoncpp.cpp.o" 3 | "CMakeFiles/ai_client.dir/jsoncpp/jsoncpp.cpp.o.d" 4 | "CMakeFiles/ai_client.dir/py_ai_sdk.cpp.o" 5 | "CMakeFiles/ai_client.dir/py_ai_sdk.cpp.o.d" 6 | "bin/ai_client.cpython-310-x86_64-linux-gnu.so" 7 | "bin/ai_client.pdb" 8 | ) 9 | 10 | # Per-language clean rules from dependency scanning. 11 | foreach(lang CXX) 12 | include(CMakeFiles/ai_client.dir/cmake_clean_${lang}.cmake OPTIONAL) 13 | endforeach() 14 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/compiler_depend.make: -------------------------------------------------------------------------------- 1 | # Empty compiler generated dependencies file for ai_client. 2 | # This may be replaced when dependencies are built. 3 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/compiler_depend.ts: -------------------------------------------------------------------------------- 1 | # CMAKE generated file: DO NOT EDIT! 
2 | # Timestamp file for compiler generated dependencies management for ai_client. 3 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/depend.make: -------------------------------------------------------------------------------- 1 | # Empty dependencies file for ai_client. 2 | # This may be replaced when dependencies are built. 3 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/flags.make: -------------------------------------------------------------------------------- 1 | # CMAKE generated file: DO NOT EDIT! 2 | # Generated by "Unix Makefiles" Generator, CMake Version 3.25 3 | 4 | # compile CXX with /usr/bin/c++ 5 | CXX_DEFINES = -Dai_client_EXPORTS 6 | 7 | CXX_INCLUDES = -isystem /home/piano/mambaforge/lib/python3.10/site-packages/pybind11/include -isystem /mnt/e/conda/envs/ml/include/python3.10 8 | 9 | CXX_FLAGS = -O3 -DNDEBUG -fPIC -fvisibility=hidden -flto -fno-fat-lto-objects -std=gnu++17 10 | 11 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/jsoncpp/jsoncpp.cpp.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/jsoncpp/jsoncpp.cpp.o -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/link.txt: -------------------------------------------------------------------------------- 1 | /usr/bin/c++ -fPIC -O3 -DNDEBUG -flto -shared -o bin/ai_client.cpython-310-x86_64-linux-gnu.so 
CMakeFiles/ai_client.dir/py_ai_sdk.cpp.o CMakeFiles/ai_client.dir/jsoncpp/jsoncpp.cpp.o 2 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/progress.make: -------------------------------------------------------------------------------- 1 | CMAKE_PROGRESS_1 = 1 2 | CMAKE_PROGRESS_2 = 2 3 | CMAKE_PROGRESS_3 = 3 4 | 5 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/py_ai_sdk.cpp.o: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/ai_client.dir/py_ai_sdk.cpp.o -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/cmake.check_cache: -------------------------------------------------------------------------------- 1 | # This file is generated by cmake for dependency checking of the CMakeCache.txt file 2 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/CMakeFiles/progress.marks: -------------------------------------------------------------------------------- 1 | 3 2 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/bin/ai_client.cpython-310-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/build/bin/ai_client.cpython-310-x86_64-linux-gnu.so 
-------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cp38-win_amd64.pyd: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cp38-win_amd64.pyd -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-310-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-310-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-36m-x86_64-linux-gnu.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-36m-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-38-darwin.so: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-38-darwin.so -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-38-x86_64-linux-gnu.so: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/AI_SDK/Python/sdk/pyd/ai_client.cpython-38-x86_64-linux-gnu.so -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import CardGame -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/logic/.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | cmake-build-debug -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/logic/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | g++ src/aqua_sdk.cpp src/fish.cpp src/fishset.cpp src/game.cpp src/main.cpp src/player.cpp src/jsoncpp/jsoncpp.cpp -o bin/main -std=c++17 -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/logic/bin/main: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/logic/bin/main -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/logic/src/main: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/card_game/logic/src/main -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/logic/src/main.cpp: 
-------------------------------------------------------------------------------- 1 | #include "aqua_sdk.cpp" 2 | 3 | int main(int argc, char* argv[]){ 4 | AquaWarSDK aw; 5 | aw.start(); 6 | aw.run(); 7 | 8 | return 0; 9 | } -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/card_game/logic/src/timer.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | 5 | class Timer { 6 | public: 7 | using BaseClock = std::chrono::steady_clock; 8 | using TimePoint = BaseClock::time_point; 9 | using Duration = BaseClock::duration; 10 | 11 | Timer() : m_start(Timer::now()) {} 12 | int runtime() const { 13 | Duration time = Timer::now() - m_start; 14 | return std::chrono::duration_cast(time).count(); 15 | } 16 | static TimePoint now() { return BaseClock::now(); } 17 | 18 | private: 19 | TimePoint m_start; 20 | }; -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/dbbench/requirements.txt: -------------------------------------------------------------------------------- 1 | mysql-connector-python==8.0.33 2 | docker==6.1.2 -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/knowledgegraph/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import KnowledgeGraph -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/knowledgegraph/requirements.txt: -------------------------------------------------------------------------------- 1 | SPARQLWrapper 2 | networkx -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/mind2web/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import Mind2Web 
-------------------------------------------------------------------------------- /AgentBench.old/src/tasks/mind2web/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/mind2web/data_utils/__init__.py -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/os_interaction/__init__.py: -------------------------------------------------------------------------------- 1 | from .task import OSInteraction -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/os_interaction/requirements.txt: -------------------------------------------------------------------------------- 1 | docker 2 | PyYAML 3 | PyYAML 4 | Requests 5 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/.gitignore: -------------------------------------------------------------------------------- 1 | *.ipynb* 2 | *.pyc 3 | *.swp 4 | 5 | .DS_Store 6 | .idea/ 7 | .pytest_cache/ 8 | .vscode/ 9 | 10 | __pycache__/ 11 | data/ 12 | search_engine/indexes* 13 | search_engine/resources* 14 | transfer/flagged 15 | user_session_logs/ 16 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/assets/diagram.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/assets/diagram.gif -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/assets/model_ckpts.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/assets/model_ckpts.png -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/assets/transfer-logic.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/assets/transfer-logic.png -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/baseline_models/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | datasets 3 | faiss-gpu 4 | transformers 5 | wandb -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/conftest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/conftest.py -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/requirements.txt: -------------------------------------------------------------------------------- 1 | beautifulsoup4==4.11.1 2 | cleantext==1.1.4 3 | env==0.1.0 4 | Flask==2.1.2 5 | gdown 6 | gradio 7 | gym==0.24.0 8 | numpy==1.22.4 9 | pandas==1.4.2 10 | pyserini==0.17.0 11 | pytest 12 | PyYAML==6.0 13 | rank_bm25==0.2.2 14 | requests==2.27.1 15 | requests_mock 16 | rich==12.4.4 17 | scikit_learn==1.1.1 18 | selenium==4.2.0 19 | spacy 20 | thefuzz==0.19.0 21 | torch==1.11.0 22 | tqdm==4.64.0 23 | train==0.0.5 24 | transformers==4.19.2 25 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/run_dev.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | export FLASK_ENV=development 3 | python -m web_agent_site.app --log --attrs 4 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/run_prod.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m web_agent_site.app --log 3 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/run_web_agent_site_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m run_envs.run_web_agent_site_env 3 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/run_web_agent_text_env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python -m run_envs.run_web_agent_text_env 3 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/search_engine/lucene_searcher.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pyserini.search.lucene import LuceneSearcher 3 | from rich import print 4 | 5 | 6 | searcher = LuceneSearcher('indexes') 7 | hits = searcher.search('rubber sole shoes', k=20) 8 | 9 | for hit in hits: 10 | doc = searcher.doc(hit.docid) 11 | print(doc) 12 | obj = json.loads(doc.raw())['product']['Title'] 13 | print(obj) 14 | 15 | print(len(hits)) 16 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/transfer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/transfer/__init__.py 
-------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/web_agent_site/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/web_agent_site/__init__.py -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/web_agent_site/engine/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/web_agent_site/engine/__init__.py -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/web_agent_site/envs/__init__.py: -------------------------------------------------------------------------------- 1 | from gym.envs.registration import register 2 | 3 | from web_agent_site.envs.web_agent_site_env import WebAgentSiteEnv 4 | from web_agent_site.envs.web_agent_text_env import WebAgentTextEnv 5 | 6 | register( 7 | id='WebAgentSiteEnv-v0', 8 | entry_point='web_agent_site.envs:WebAgentSiteEnv', 9 | ) 10 | 11 | register( 12 | id='WebAgentTextEnv-v0', 13 | entry_point='web_agent_site.envs:WebAgentTextEnv', 14 | ) -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/web_agent_site/envs/chromedriver: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/web_agent_site/envs/chromedriver -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/web_agent_site/models/__init__.py: 
-------------------------------------------------------------------------------- 1 | from web_agent_site.models.models import * 2 | -------------------------------------------------------------------------------- /AgentBench.old/src/tasks/webshop/web_agent_site/static/images/no-image-available.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/AgentBench.old/src/tasks/webshop/web_agent_site/static/images/no-image-available.png -------------------------------------------------------------------------------- /assets/head-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/assets/head-figure.png -------------------------------------------------------------------------------- /assets/main-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/assets/main-figure.png -------------------------------------------------------------------------------- /docs/static/images/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/docs/static/images/favicon.ico -------------------------------------------------------------------------------- /docs/static/images/head-figure.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/docs/static/images/head-figure.png -------------------------------------------------------------------------------- /docs/static/images/main-figure.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/docs/static/images/main-figure.png -------------------------------------------------------------------------------- /docs/static/js/index.js: -------------------------------------------------------------------------------- 1 | window.HELP_IMPROVE_VIDEOJS = false; 2 | 3 | 4 | $(document).ready(function() { 5 | // Check for click events on the navbar burger icon 6 | 7 | var options = { 8 | slidesToScroll: 1, 9 | slidesToShow: 1, 10 | loop: true, 11 | infinite: true, 12 | autoplay: true, 13 | autoplaySpeed: 5000, 14 | } 15 | 16 | // Initialize all div with carousel class 17 | var carousels = bulmaCarousel.attach('.carousel', options); 18 | 19 | bulmaSlider.attach(); 20 | 21 | }) 22 | -------------------------------------------------------------------------------- /eval_general/requirements.txt: -------------------------------------------------------------------------------- 1 | fschat[llm-judge]==0.2.31 2 | tqdm 3 | numpy 4 | jsonlines 5 | argparse 6 | datasets 7 | requests 8 | jsonlines 9 | torch==2.0.1 10 | shortuuid 11 | -------------------------------------------------------------------------------- /eval_heldout/hotpotQA/requirements.txt: -------------------------------------------------------------------------------- 1 | langchain==0.0.174 2 | pandas<=2.0.0 -------------------------------------------------------------------------------- /eval_heldout/hotpotQA/src/config.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: Apache License 2.0 5 | For full license text, see the LICENSE file in the repo root or https://www.apache.org/licenses/LICENSE-2.0 6 | """ 7 | 8 | available_agent_names = ["Zeroshot_HotPotQA_run_Agent", "ZeroshotThink_HotPotQA_run_Agent", 9 | "React_HotPotQA_run_Agent", "Planner_HotPotQA_run_Agent", "PlannerReact_HotPotQA_run_Agent"] 10 | OPENAI_API_KEY = "YOUR_API_KEY_HERE" 11 | -------------------------------------------------------------------------------- /eval_heldout/hotpotQA/src/data/easy.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/hotpotQA/src/data/easy.joblib -------------------------------------------------------------------------------- /eval_heldout/hotpotQA/src/data/hard.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/hotpotQA/src/data/hard.joblib -------------------------------------------------------------------------------- /eval_heldout/hotpotQA/src/data/medium.joblib: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/hotpotQA/src/data/medium.joblib -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/.gitignore: -------------------------------------------------------------------------------- 1 | # Distribution / packaging 2 | .Python 3 | build/ 4 | develop-eggs/ 5 | dist/ 6 | downloads/ 7 | eggs/ 8 | .eggs/ 9 | lib/ 10 | lib64/ 11 | parts/ 12 | sdist/ 13 | var/ 14 | wheels/ 15 | share/python-wheels/ 16 | *.egg-info/ 17 | .installed.cfg 18 | *.egg 19 | MANIFEST 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/README.md: -------------------------------------------------------------------------------- 1 | # Computergym 2 | 3 | ## Install 4 | ```sh 5 | pip install -e . 6 | ``` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/__init__.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from gym.envs.registration import register 5 | 6 | sys.path.append(os.path.dirname(os.path.abspath(__file__))) 7 | 8 | 9 | _AVAILABLE_ENVS = { 10 | "MiniWoBEnv-v0": { 11 | "entry_point": "computergym.miniwob.base_env:MiniWoBEnv", 12 | "discription": "MinoWoB++ environments", 13 | }, 14 | } 15 | 16 | for env_id, val in _AVAILABLE_ENVS.items(): 17 | register(id=env_id, entry_point=val.get("entry_point")) 18 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/__init__.py -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/.gitignore: -------------------------------------------------------------------------------- 1 | twistd.pid 2 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_0.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_1.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_3.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_4.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_5.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_6.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_6.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_7.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_7.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_8.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_8.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/checkbox-numbers/ch_9.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/drag-cube/blank.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/drag-cube/blank.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/delete.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/delete.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/forward.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/forward.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/left-arrow-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/left-arrow-white.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/left-arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/left-arrow.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/reply.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/reply.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/search.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/search.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/send.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/send.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/star-clicked.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/star-clicked.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/star.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/email-inbox/star.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/ajax-loader.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/ajax-loader.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/file.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/file.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/folder-closed.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/folder-closed.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/folder.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/folder.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/minus.gif: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/minus.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/plus.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/plus.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-black-line.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-black-line.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-black.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-black.gif -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-default-line.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-default-line.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-default.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-default.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-famfamfam-line.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-famfamfam-line.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-famfamfam.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-famfamfam.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-gray-line.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-gray-line.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-gray.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-gray.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-red-line.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-red-line.gif -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-red.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/navigate-tree/images/treeview-red.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/like-hover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/like-hover.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/like.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/like.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/more-hover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/more-hover.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/more.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/more.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/reply-hover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/reply-hover.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/reply.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/reply.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/retweet-hover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/retweet-hover.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/retweet.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/retweet.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/share-hover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/share-hover.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/share.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/social-media/share.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/tic-tac-toe/o.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/tic-tac-toe/o.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/tic-tac-toe/x.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/common/special/tic-tac-toe/x.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_444444_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_444444_256x240.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_555555_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_555555_256x240.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_777620_256x240.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_777620_256x240.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_777777_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_777777_256x240.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_cc0000_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_cc0000_256x240.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_ffffff_256x240.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/core/jquery-ui/images/ui-icons_ffffff_256x240.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/apps/common/js/jquery/aacom/plugins/aaCache.js: 
-------------------------------------------------------------------------------- 1 | var aaCache=(function($j){var cache={};function _get(key){if(!cache[key]){cache[key]=$j(key);}return cache[key];}function _remove(key){if(cache.hasOwnProperty(key)){return(delete cache[key]);}return true;}return{get:_get,remove:_remove};}(jQuery)); -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-bold.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-light.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-light.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-medium.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-medium.woff 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-regular.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/american-v2/americansans-regular.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/icons/american-icons-v4-4.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/fonts/icons/american-icons-v4-4.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/icons/loading.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/icons/loading.gif -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/aa-flight-icon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/aa-flight-icon.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/aa-icons-flags-sprite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/aa-icons-flags-sprite.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/aa-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/aa-logo.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/down-arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/down-arrow.png -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/favicon.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/oneworld.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/oneworld.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow-down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow-down.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow-vertical-150.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow-vertical-150.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow12-down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow12-down.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow12-up.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow12-up.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow3-down.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/chrome/rebrand/shadow3-down.png -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/graphics/icons/aa-jqueryUIicons-sprite.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/content/images/graphics/icons/aa-jqueryUIicons-sprite.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/AA/js/aa/modules/commonsetup.js: -------------------------------------------------------------------------------- 1 | AAcom.modules.commonsetup=function(AAUI){/* NOT NEEDED */}; 2 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/aura.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/aura.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/cal3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/cal3.png -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/chkboxes3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/chkboxes3.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/clear_text2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/clear_text2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/collapse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/collapse.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/expand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/expand.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/flight_arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/flight_arrow.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/geo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/geo.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/home.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/info2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/info2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/leftright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/leftright.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logo2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/AA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/AA.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/AS.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/AS.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/DL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/DL.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/VX.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/images/logos/VX.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/stylesheets/circular/ASCircularWeb-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/stylesheets/circular/ASCircularWeb-Bold.woff -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/stylesheets/circular/ASCircularWeb-Book.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto-medium/stylesheets/circular/ASCircularWeb-Book.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/aura.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/aura.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/cal3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/cal3.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/chkboxes3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/chkboxes3.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/clear_text2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/clear_text2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/collapse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/collapse.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/expand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/expand.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/flight_arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/flight_arrow.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/geo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/geo.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/home.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/info2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/info2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/leftright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/leftright.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logo2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/AA.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/AA.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/AS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/AS.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/DL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/DL.png 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/VX.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/images/logos/VX.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/stylesheets/circular/ASCircularWeb-Bold.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/stylesheets/circular/ASCircularWeb-Bold.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/stylesheets/circular/ASCircularWeb-Book.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska-auto/stylesheets/circular/ASCircularWeb-Book.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/aura.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/aura.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/cal3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/cal3.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/chkboxes3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/chkboxes3.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/clear_text2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/clear_text2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/collapse.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/collapse.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/expand.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/expand.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/flight_arrow.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/flight_arrow.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/geo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/geo.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/home.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/home.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/info2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/info2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/leftright.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/leftright.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logo2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logo2.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/AA.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/AA.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/AS.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/AS.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/DL.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/DL.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/VX.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/images/logos/VX.png -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/stylesheets/circular/ASCircularWeb-Bold.woff: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/stylesheets/circular/ASCircularWeb-Bold.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/stylesheets/circular/ASCircularWeb-Book.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/computergym/computergym/miniwob/miniwob_interface/html/flight/Alaska/stylesheets/circular/ASCircularWeb-Book.woff -------------------------------------------------------------------------------- /eval_heldout/miniwob++/eval-gpt-3.5-turbo.sh: -------------------------------------------------------------------------------- 1 | export CONTROLLER_ADDR= 2 | for task in $(cat available_tasks.txt) 3 | do 4 | python main.py --env $task --llm chatgpt --num-episodes 4 --erci 1 --irci 3 --sgrounding & 5 | done -------------------------------------------------------------------------------- /eval_heldout/miniwob++/eval-gpt-4.sh: -------------------------------------------------------------------------------- 1 | export CONTROLLER_ADDR= 2 | for task in $(cat available_tasks.txt) 3 | do 4 | python main.py --env $task --llm gpt4 --num-episodes 1 --erci 1 --irci 3 --sgrounding & 5 | done -------------------------------------------------------------------------------- /eval_heldout/miniwob++/eval-tgi.sh: -------------------------------------------------------------------------------- 1 | export CONTROLLER_ADDR=http://127.0.0.1:23333 2 | llm=agent-llama-70b 3 | for task in $(cat available_tasks.txt) 4 | do 5 | python main.py --env $task --llm $llm --num-episodes 10 --erci 1 --irci 3 --sgrounding & 6 | done 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/choose-list/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/choose-list/example.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/choose-list/example.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/choose-list/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/choose-list/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/choose-list/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/choose-list/update_action.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/choose-list/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button-sequence/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button-sequence/example.txt: -------------------------------------------------------------------------------- 1 | task: Click button ONE, then click button TWO. 2 | plan: 3 | 1. Click the button with xpath "//button[@id='subbtn']". 4 | 2. Click the button with xpath "//button[@id='subbtn2']". -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button-sequence/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button-sequence/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button-sequence/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button-sequence/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific 
instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to click "Ok" button. 3 | Here is a plan to solve this example task on the same webpage with the autonomous agent. 4 | 1. clickxpath //button[text()="Ok"] 5 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-button/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction for solving the task should be ` -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/prompt/click-checkboxes-large/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-large/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-large/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-large/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-large/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-soft/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-soft/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | task: select the words similar to bunnies, tiny, scared, swine and click the Submit button. 
3 | plan: we need to select synonyms of the given words and the words themselves. 4 | 1. clickxpath //label[text() = 'rabbit']/input // rabbit is a synonym of bunnies 5 | 2. clickxpath //label[text() = 'pig']/input // pig is a synonym of swine 6 | 3. clickxpath //label[text() = 'panicked']/input // panicked is a synonym of scared 7 | 4. clickxpath //label[text() = 'tiny']/input 8 | 5. clickxpath //button[@id='subbtn'] 9 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-soft/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-soft/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-soft/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-soft/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-transfer/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/prompt/click-checkboxes-transfer/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-transfer/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-transfer/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes-transfer/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/prompt/click-checkboxes/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-checkboxes/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible-2/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible-2/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible-2/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible-2/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible-2/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to expand the section below and click the submit button. 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 1. Click the expanded section by typing the xpath: "//*[@id="ui-id-1"]". 5 | 2. Click the submit button by typing the xpath: "//button[@id='subbtn']". -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-collapsible/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction should be ` 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog-2/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog-2/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog-2/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog-2/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog-2/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/click-dialog-2/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to close the 
dialog box by clicking the "x". 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 5 | 1. Clickxpath //button[@class='ui-button ui-corner-all ui-widget ui-button-icon-only ui-dialog-titlebar-close'] 6 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-dialog/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/click-dialog/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-menu/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-menu/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is for the agent to select Leonelle > Jane > 
Amabel from the HTML code provided. 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. You must use "movemouse" not "clickxpath" to expand option. 4 | 1. movemouse //*[text()="Leonelle"] 5 | 2. movemouse //*[text()="Jane"] 6 | 3. clickxpath //*[text()="Amabel"] 7 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-menu/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-menu/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-menu/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-menu/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/click-menu/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-option/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-option/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The 
task is to select the radio button labeled "Rf" and click the "Submit" button. 3 | Here is a plan to solve this example task on the same webpage with the autonomous agent. 4 | 1. clickxpath //label[input[@id='ch4']] 5 | 2. clickxpath //button[@id='subbtn'] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-option/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-option/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-option/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-option/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction for solving the task should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-scroll-list/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-scroll-list/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to select Canada, Bosnia and Herzegovina from the 
scroll list and click Submit. 3 | Here is a plan to solve this example task on the same webpage with the autonomous agent. 4 | 5 | 1. clickoption //option[text() = 'Canada'] 6 | 2. clickoption //option[text() = 'Bosnia'] 7 | 3. clickoption //option[text() = 'Herzegovina'] 8 | 4. Clickxpath //*[@class="secondary-action"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-scroll-list/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-scroll-list/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-scroll-list/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-scroll-list/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction for solving the task should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shades/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shades/example.txt: -------------------------------------------------------------------------------- 1 | Example 
plans) 2 | The task is to select all the shades of red and press the Submit button. 3 | Here is a plan to solve a task on the webpage with the autonomous agent. The first instruction will click all the shades of red. 4 | 5 | 1. Clickxpath //span[@data-color="red"] 6 | 2. Clickxpath //*[@id="submit"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shades/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shades/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shades/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shades/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, the single specific instruction for solving the task should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shape/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shape/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to click on the 
magenta digit on the webpage. 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 1. Clickxpath //*[name()='svg']//*[name()='text' and @fill='magenta'] 5 | 6 | The task is to click on the large 4 on the webpage. 7 | Here is a plan to solve this example task on the webpage with the autonomous agent. 8 | 1. clickxpath //*[name()='svg']//*[name()='text' and @fill="aqua" and @font-size="20px" and text()="4"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shape/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shape/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shape/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-shape/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2-hard/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2-hard/example.txt: 
-------------------------------------------------------------------------------- 1 | Example plans) 2 | task: switch between the tabs to find and click on the link "amet". 3 | plan: 4 | 1. Clickxpath //a[text()="1"] // first, open the proper tab where the link with a text "amet" is in 5 | 2. Clickxpath //*[@id="tabs-1"]/p/span[text()="amet"] 6 | 7 | task: switch between the tabs to find and click on the link "gur". 8 | plan: 9 | 1. Clickxpath //a[text()="4"] // first, open the proper tab where the link with a text "gur" is in 10 | 2. Clickxpath //*[@id="tabs-4"]/p/span[text()="gur"] 11 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2-hard/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2-hard/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2-hard/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. 
The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2-hard/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | task: switch between the tabs to find and click on the link "vitae." 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 1. clickxpath //a[text()="Tab #2"] // first, open the proper tab where the link with a text "vitae." is in 5 | 2. 
clickxpath //*[@id="tabs-2"]/p/span[text()="vitae."] 6 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab-2/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to click on Tab #3 on the webpage. 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 1. 
clickxpath //a[text()="Tab #3"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-tab/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/click-tab/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-test/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-test/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to click the button with the id "subbtn" on the webpage. 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 1. 
Clickxpath //*[@id="subbtn"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-test/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-test/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-test/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-test/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this single specific instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-widget/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-widget/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-widget/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-widget/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/click-widget/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/count-shape/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/count-shape/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | task: How many small letters are there? 3 | plan: we need to count the number of small letters in the svg area. 4 | clickxpath //*[@id="count-buttons"]/button[5] 5 | task: How many red items are there? 6 | plan: we need to count the number of red items in the svg area. 
7 | clickxpath //*[@id="count-buttons"]/button[2] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/count-shape/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/count-shape/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/count-shape/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/count-shape/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl-turk/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl-turk/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl-turk/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl-turk/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl-turk/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/email-inbox-forward-nl-turk/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl/action.txt: -------------------------------------------------------------------------------- 1 | the next instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-forward-nl/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- 
/eval_heldout/miniwob++/prompt/email-inbox-forward-nl/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/email-inbox-forward-nl/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-nl-turk/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-nl-turk/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-nl-turk/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-nl-turk/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox-nl-turk/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/email-inbox-nl-turk/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox/action.txt: 
-------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/email-inbox/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/email-inbox/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-date/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction to solve the task should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-date/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-date/initialize_plan.txt: 
-------------------------------------------------------------------------------- 1 | plan for the task on the above webpage: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-date/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-date/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-time/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-time/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to enter 9:01 AM as the time in the input box and press the submit button. 3 | Here is a plan to solve this example task on the webpage with the autonomous agent. 4 | 1. clickxpath //*[@id="tt"] 5 | 2. type 0901AM 6 | 3. clickxpath //*[@id="subbtn"] 7 | 8 | The task is to enter 10:07 PM as the time in the input box and press the submit button. 9 | Here is a plan to solve this example task on the webpage with the autonomous agent. 10 | 1. clickxpath //*[@id="tt"] 11 | 2. type 1007PM 12 | 3. 
clickxpath //*[@id="subbtn"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-time/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-time/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-time/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/enter-time/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/enter-time/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/example.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/example.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/focus-text/action.txt: 
-------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/focus-text/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is for the agent to focus into the textbox on the webpage. 3 | Here is a plan to solve the example task. 4 | 1. clickxpath //*[@id="area"]/input[@id="tt"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/focus-text/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/focus-text/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/focus-text/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/focus-text/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/focus-text/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/check_finish.txt: -------------------------------------------------------------------------------- 1 | Do you think you have completed the current plan? -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/done.txt: -------------------------------------------------------------------------------- 1 | Do you think that you accomplished the task on this webpage with the above list of instructions? Answer Yes or No. -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/edit_plan.txt: -------------------------------------------------------------------------------- 1 | The agent is following the above plan. Do you think that you need to edit the current plan for the agent to solve the task on this webpage? Answer yes or no and explain why. -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | The task is to click on the grid coordinate (1,2) on the webpage. 3 | Here is a plan to solve the example task. 4 | 1. 
clickxpath //*[@id="(1,2)"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/new_plan.txt: -------------------------------------------------------------------------------- 1 | You failed with the above plans to solve a task on the below webpage. Based on your answer, what is your new plan to solve the task on this webpage? Note that the new plan should be able to start from the current webpage state. You should not assume that the autonomous agent is in the initial state of the webpage. -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/rci_answer.txt: -------------------------------------------------------------------------------- 1 | Answer Yes or No. 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/grid-coordinate/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/grid-coordinate/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/identify-shape/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/identify-shape/example.txt: -------------------------------------------------------------------------------- 1 | Example plan) 2 | task: click the button that best describes the figure below, which is a number (1). 3 | plan: note that number 0 is a number not circle and the number text inside tag is number not a letter. 4 | 1. clickxpath //button[@data-type="digit"] 5 | 6 | task: click the button that best describes the figure below. 7 | plan: note that a single letter of number 0 is a digit not a circle, and a single number inside tag is a digit not a letter. 8 | 1. 
clickxpath //button[@data-type="triangle"] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/identify-shape/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/identify-shape/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/identify-shape/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/identify-shape/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/identify-shape/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/login-user-popup/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/login-user-popup/first_action.txt: -------------------------------------------------------------------------------- 
1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/login-user-popup/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/login-user-popup/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. Notice that the instruction shouldn't contain quotation marks. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/login-user-popup/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/login-user-popup/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/multi-layouts/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/multi-layouts/example.txt: -------------------------------------------------------------------------------- 1 | Example plan) 2 | task: search for crime movies directed by Mckenzie from year 1970. 3 | plan: 4 | 1. click the input box labeled with "genre" 5 | 2. type crime 6 | 3. click the input box labeled with "director name" 7 | 4. type Mckenzie 8 | 5. click the input box labeled with "year" 9 | 6. type 1970 10 | 7. 
click the search button -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/multi-layouts/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/multi-layouts/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/multi-layouts/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/multi-layouts/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/multi-layouts/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/navigate-tree/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/navigate-tree/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | task: navigate through the file tree. Find and click on the folder or file named "Riley". 3 | plan: 4 | 1. clickxpath //span[text()='Briana'] // Click the expandable hitarea of the folder "Briana" 5 | 2. 
clickxpath //span[text()='Riley'] -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/navigate-tree/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/navigate-tree/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/navigate-tree/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/navigate-tree/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/navigate-tree/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. 
The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/search-engine/action.txt: -------------------------------------------------------------------------------- 1 | the next instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/search-engine/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/search-engine/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/search-engine/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/search-engine/update_action.txt: -------------------------------------------------------------------------------- 1 | The target of click should be a valid XPath. 
Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/action.txt: -------------------------------------------------------------------------------- 1 | the next instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/check_finish.txt: -------------------------------------------------------------------------------- 1 | Do you think you have completed the current plan? -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/done.txt: -------------------------------------------------------------------------------- 1 | Do you think that you accomplished the task on this webpage with the above list of instructions? Answer Yes or No. -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/edit_plan.txt: -------------------------------------------------------------------------------- 1 | The agent is following the above plan. Do you think that you need to edit the current plan for the agent to solve the task on this webpage? Answer yes or no and explain why. -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/example.txt: -------------------------------------------------------------------------------- 1 | task: Solve for x and type your answer into the textbox. Press Submit when done. 2 | plan: 3 | 1. Locate the input box by clicking on it with a click instruction using its ID "math-answer". 4 | 2. Type the answer using a type instruction. 5 | 3. Press the "Enter" key using a press instruction. 6 | 4. Click the "Submit" button using a clickxpath instruction with the xpath "//button[@id='subbtn']". 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/new_plan.txt: -------------------------------------------------------------------------------- 1 | You failed with the above plans to solve a task on the below webpage. Based on your answer, what is your new plan to solve the task on this webpage? Note that the new plan should be able to start from the current webpage state. You should not assume that the autonomous agent is in the initial state of the webpage. -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/rci_action.txt: -------------------------------------------------------------------------------- 1 | Notice that the instruction shouldn't contain quotation marks. Without explanation, the single instruction that matches one of the regular expressions is ' -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/rci_answer.txt: -------------------------------------------------------------------------------- 1 | Answer Yes or No. 
-------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/simple-algebra/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/simple-algebra/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-all/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-all/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-all/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-all/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-all/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the information on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-some/action.txt: 
-------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-some/example.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/social-media-some/example.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-some/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-some/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-some/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. 
The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media-some/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/social-media-some/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media/rci_action.txt: -------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/social-media/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this single specific instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/terminal/action.txt: 
-------------------------------------------------------------------------------- 1 | the next instruction that the agent needs to execute is -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/terminal/example.txt: -------------------------------------------------------------------------------- 1 | Example plans) 2 | task: use the terminal below to delete a file ending with the extension .gpg 3 | plan: 4 | 1. Type "ls" to list all files in the terminal with "type ls" instruction. 5 | 2. Press enter with "press enter" instruction 6 | 3. Identify the filename ending with ".gpg" and type "rm [filename].gpg" to delete the identified file with type instruction. 7 | 4. Press enter every time after typing a command in the terminal. 8 | -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/terminal/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/terminal/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/terminal/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. 
The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/terminal/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the output on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/tic-tac-toe/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/tic-tac-toe/example.txt: -------------------------------------------------------------------------------- 1 | task: for the agent to play as 'X' and win a game of tic-tac-toe. 2 | plan: 3 | 1. Check the board and click the proper position which is empty to newly put X mark on the board to win the game (e.g., clickxpath //*[@id="ttt-i"]) 4 | 2. Start in the middle, play defensively, play offensively, and block your opponent to improve your chances of winning 5 | 3. You should select the position if you can directly win by doing that 6 | 4. 
Repeat these until win -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/tic-tac-toe/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/tic-tac-toe/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/tic-tac-toe/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/tic-tac-toe/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/tic-tac-toe/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/update_action.txt: -------------------------------------------------------------------------------- 1 | Therefore, considering the information on the webpage, this instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-autocomplete/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-autocomplete/first_action.txt: 
-------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-autocomplete/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-autocomplete/rci_action.txt: -------------------------------------------------------------------------------- 1 | This action does not match the regular expressions. The updated instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-autocomplete/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/use-autocomplete/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-spinner/action.txt: -------------------------------------------------------------------------------- 1 | the next proper instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-spinner/first_action.txt: -------------------------------------------------------------------------------- 1 | the first instruction should be ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-spinner/initialize_plan.txt: -------------------------------------------------------------------------------- 1 | plan: -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-spinner/rci_action.txt: 
-------------------------------------------------------------------------------- 1 | Without explanation, the single instruction that matches one of the regular expressions is ` -------------------------------------------------------------------------------- /eval_heldout/miniwob++/prompt/use-spinner/update_action.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/miniwob++/prompt/use-spinner/update_action.txt -------------------------------------------------------------------------------- /eval_heldout/miniwob++/requirements.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | fschat 3 | gym 4 | openai 5 | selenium 6 | transformers 7 | Pillow 8 | regex -------------------------------------------------------------------------------- /eval_heldout/rewoo/.gitignore: -------------------------------------------------------------------------------- 1 | ./keys/ 2 | .idea/ 3 | __pycache__ 4 | .vscode 5 | /keys 6 | /logs 7 | /results 8 | /data -------------------------------------------------------------------------------- /eval_heldout/rewoo/algos/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/rewoo/algos/__init__.py -------------------------------------------------------------------------------- /eval_heldout/rewoo/alpaca/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/rewoo/alpaca/__init__.py -------------------------------------------------------------------------------- /eval_heldout/rewoo/alpaca/templates/alpaca.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "description": "Template used by Alpaca-LoRA.", 3 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 4 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n", 5 | "response_split": "### Response:" 6 | } 7 | -------------------------------------------------------------------------------- /eval_heldout/rewoo/alpaca/templates/alpaca_short.json: -------------------------------------------------------------------------------- 1 | { 2 | "description": "A shorter template to experiment with.", 3 | "prompt_input": "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n", 4 | "prompt_no_input": "### Instruction:\n{instruction}\n\n### Response:\n", 5 | "response_split": "### Response:" 6 | } 7 | -------------------------------------------------------------------------------- /eval_heldout/rewoo/alpaca/utils/README.md: -------------------------------------------------------------------------------- 1 | # Directory for helper modules 2 | 3 | ## prompter.py 4 | 5 | Prompter class, a template manager. 6 | 7 | `from utils.prompter import Prompter` 8 | 9 | ## callbacks.py 10 | 11 | Helpers to support streaming generation output. 
12 | 13 | `from utils.callbacks import Iteratorize, Stream` 14 | -------------------------------------------------------------------------------- /eval_heldout/rewoo/alpaca/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/rewoo/alpaca/utils/__init__.py -------------------------------------------------------------------------------- /eval_heldout/rewoo/metrics.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | HISTORY = 'logs' 4 | 5 | for llm in Path(HISTORY).iterdir(): 6 | correct = 0 7 | total = 0 8 | for task in llm.iterdir(): 9 | if not task.is_dir(): 10 | continue 11 | episode_count = 0 12 | for episode in task.iterdir(): 13 | is_correct = 'succ' in episode.name 14 | correct += is_correct 15 | total += 1 16 | print(f'{llm.name}:\t{correct:3} / {total:3} = {(correct / total):.4}') -------------------------------------------------------------------------------- /eval_heldout/rewoo/nodes/Node.py: -------------------------------------------------------------------------------- 1 | # Basic Node to be inherited from. 
class Node:
    """Basic node meant to be inherited from.

    Stores a name together with the declared input and output types.
    Subclasses provide the actual behaviour by overriding ``run``.
    """

    def __init__(self, name, input_type, output_type):
        """Record the node's name and its input/output types."""
        self.name, self.input_type, self.output_type = (
            name,
            input_type,
            output_type,
        )

    def run(self, input, log=False):
        """Run the node on *input*; the base class always raises.

        Raises:
            NotImplementedError: always, since the base class has no behaviour.
        """
        raise NotImplementedError()
Respond with the answer directly with no extra words.\n\n" -------------------------------------------------------------------------------- /eval_heldout/rewoo/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==2.11.0 2 | geopy==2.3.0 3 | gradio==3.32.0 4 | langchain==0.0.187 5 | numpy==1.24.2 6 | openai==0.27.4 7 | pandas==1.5.2 8 | peft 9 | pytz 10 | requests==2.28.1 11 | tqdm==4.64.1 12 | wikipedia 13 | google-search-results 14 | 15 | accelerate 16 | fschat -------------------------------------------------------------------------------- /eval_heldout/rewoo/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/rewoo/utils/__init__.py -------------------------------------------------------------------------------- /eval_heldout/science-world/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .DS_Store 3 | logs/ 4 | *.pyc 5 | .vscode/ 6 | __pycache__/ -------------------------------------------------------------------------------- /eval_heldout/science-world/data_utils/READMD.md: -------------------------------------------------------------------------------- 1 | ```bash 2 | # wget https://github.com/allenai/ScienceWorld/raw/main/goldpaths/goldpaths-all.zip 3 | wget https://github.com/allenai/ScienceWorld/raw/exhaustivevalidactions/goldpaths/goldpaths-all.zip 4 | unzip goldpaths-all.zip 5 | ``` 6 | 7 | 8 | -------------------------------------------------------------------------------- /eval_heldout/science-world/data_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/science-world/data_utils/__init__.py 
"""Report the mean score per model from ``logs/<model>/task<i>-score.txt``.

Each score file is expected to contain a line of the form
``Average score: <number>``. Missing or unparsable files are reported with a
warning and left out of the mean.
"""
from pathlib import Path
import re

LOGS = 'logs'
NUM_TASKS = 30
# "+" rather than "*" so an empty capture can never reach float().
SCORE_PATTERN = re.compile(r'Average score: ([0-9.]+)')


def read_score(file):
    """Return the average score stored in *file*, or ``None`` if unavailable.

    A warning is printed when the file is missing, cannot be read, or holds
    no parsable ``Average score`` entry.
    """
    try:
        # read_text closes the handle; the original open().read() leaked it.
        text = Path(file).read_text()
    except FileNotFoundError:
        print(f'Warning: {file} not found')
        return None
    except (OSError, UnicodeDecodeError) as err:
        print(f'Warning: cannot read {file}: {err}')
        return None

    match = SCORE_PATTERN.search(text)
    if match is None:
        print(f'Warning: no average score found in {file}')
        return None
    try:
        return float(match[1])
    except ValueError:
        # The character class also accepts malformed input such as "1.2.3".
        print(f'Warning: malformed average score {match[1]!r} in {file}')
        return None


def collect_scores(model_dir, num_tasks=NUM_TASKS):
    """Return the scores that could be read for tasks ``0..num_tasks-1``."""
    scores = []
    for i in range(num_tasks):
        score = read_score(Path(model_dir) / f'task{i}-score.txt')
        if score is not None:
            scores.append(score)
    return scores


def main(logs=LOGS):
    """Print the mean score of every model directory found under *logs*."""
    # Sorted so the report order is stable across runs and platforms.
    for model in sorted(Path(logs).iterdir()):
        # Stray files (e.g. .DS_Store) are not model directories.
        if not model.is_dir():
            continue
        scores = collect_scores(model)
        if not scores:
            # Avoid ZeroDivisionError for a model without any usable score.
            print(f'Warning: no scores found for {model}')
            continue
        print(f'{model}:', sum(scores) / len(scores))


if __name__ == '__main__':
    main()
from typing import Union

from .actions import Action
from .utils import StateInfo

# Type alias: a list whose elements are each either a StateInfo or an Action.
Trajectory = list[Union[StateInfo, Action]]
/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/manual1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/manual1.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/manual2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/manual2.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/map.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/map.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/onestopshop.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/onestopshop.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/password.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/password.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/reddit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/reddit.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/scratchpad.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/scratchpad.png -------------------------------------------------------------------------------- /eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/wikipedia.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/THUDM/AgentTuning/e33a45d7eab2b63cac4d1956da1e6377fca9fcc7/eval_heldout/webarena/environment_docker/webarena-homepage/static/figures/wikipedia.png -------------------------------------------------------------------------------- /eval_heldout/webarena/evaluation_harness/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluators import * 2 | from .helper_functions import ( 3 | shopping_get_latest_order_url, 4 | shopping_get_sku_latest_review_author, 5 | shopping_get_sku_latest_review_rating, 6 | ) 7 | -------------------------------------------------------------------------------- 
class Tokenizer(object):
    """Callable wrapper around the tiktoken encoding of a supported model.

    Calling an instance with a string returns the list of token ids produced
    by the underlying encoding.
    """

    # Accepted model names mapped to the name passed to tiktoken.
    # "gpt-turbo-3.5" is a legacy misspelling that tiktoken does not know;
    # it is kept as an alias so existing callers keep working.
    _TIKTOKEN_MODELS = {
        "gpt-4": "gpt-4",
        "gpt-3.5-turbo": "gpt-3.5-turbo",
        "gpt-turbo-3.5": "gpt-3.5-turbo",
    }

    def __init__(self, model_name: str) -> None:
        """Load the encoding for *model_name*.

        Raises:
            NotImplementedError: if *model_name* is not a supported model.
        """
        try:
            canonical_name = self._TIKTOKEN_MODELS[model_name]
        except KeyError:
            raise NotImplementedError(
                f"Tokenizer does not support model {model_name!r}"
            ) from None
        self.tokenizer = tiktoken.encoding_for_model(canonical_name)

    def __call__(self, text: str) -> list[int]:
        """Return the token ids of *text*."""
        return self.tokenizer.encode(text)
import setup 2 | 3 | if __name__ == "__main__": 4 | setup() 5 | --------------------------------------------------------------------------------