├── .github └── workflows │ ├── codeql.yml │ ├── formatter.yml │ └── publish-website.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .vscode └── launch.json ├── CODE_OF_CONDUCT.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── RESPONSIBLE_AI.md ├── SECURITY.md ├── SUPPORT.md ├── agents ├── human.py ├── llm.py ├── llm_walkthrough.py ├── random.py ├── react.py ├── reasoning.py └── walkthrough.py ├── benchmark.py ├── docs └── website │ ├── Gemfile │ ├── _config.yml │ ├── _includes │ ├── footer.html │ ├── head-custom.html │ ├── simple_table.md │ ├── table.md │ └── test.md │ ├── _layouts │ └── default.html │ ├── _site │ ├── assets │ │ └── css │ │ │ └── style.css │ └── index.html │ ├── assets │ ├── css │ │ ├── custom.css │ │ └── style.scss │ ├── figs │ │ ├── alfworld_all_games.png │ │ ├── alfworld_image.png │ │ ├── all_framework_scores.png │ │ ├── arxiv-logomark-small.svg │ │ ├── arxiv-logomark.svg │ │ ├── figure1_eric.png │ │ ├── github-mark.svg │ │ ├── jericho_all_games.png │ │ ├── jericho_image.png │ │ ├── pull_run_data.ipynb │ │ ├── radar_chart.png │ │ ├── radar_chart_zoom.png │ │ ├── scienceworld_all_games.png │ │ ├── scienceworld_image.png │ │ ├── simon_says_chatgpt.png │ │ ├── static_banner.png │ │ ├── text-benchmark_bar_chart.png │ │ ├── text-benchmark_radar.png │ │ ├── text-benchmark_radar_zoom.png │ │ ├── textworld_all_games.png │ │ ├── textworld_express_all_games.png │ │ ├── textworld_image.png │ │ └── zork1.png │ ├── js │ │ └── tabs.js │ └── videos │ │ └── figure1v4.mp4 │ ├── favicon.ico │ └── index.md ├── print_results.py ├── pyproject.toml ├── requirements.txt ├── scripts └── example_script.sh ├── tales ├── __init__.py ├── agent.py ├── alfworld │ ├── __init__.py │ ├── alfworld_data.py │ └── alfworld_env.py ├── config.py ├── download.py ├── jericho │ ├── __init__.py │ ├── games.json │ ├── jericho_data.py │ └── jericho_env.py ├── logger.py ├── scienceworld │ ├── __init__.py │ ├── scienceworld_data.py │ └── scienceworld_env.py ├── textworld │ ├── __init__.py 
│ ├── textworld_data.py │ └── textworld_env.py ├── textworld_express │ ├── __init__.py │ ├── twx_data.py │ └── twx_env.py ├── token.py ├── utils.py └── version.py └── website ├── Gemfile.lock └── _site ├── assets ├── css │ └── style.css ├── figs │ ├── alfworld_all_games.png │ ├── all_framework_scores.png │ ├── jericho_all_games.png │ ├── pull_run_data.ipynb │ ├── radar_chart.png │ ├── radar_chart_zoom.png │ ├── scienceworld_all_games.png │ ├── text-benchmark_bar_chart.png │ ├── text-benchmark_radar.png │ ├── text-benchmark_radar_zoom.png │ ├── textworld_all_games.png │ └── textworld_express_all_games.png └── js │ └── tabs.js ├── favicon.ico └── index.html /.github/workflows/codeql.yml: -------------------------------------------------------------------------------- 1 | # For most projects, this workflow file will not need changing; you simply need 2 | # to commit it to your repository. 3 | # 4 | # You may wish to alter this file to override the set of languages analyzed, 5 | # or to provide custom queries or build logic. 6 | # 7 | # ******** NOTE ******** 8 | # We have attempted to detect the languages in your repository. Please check 9 | # the `language` matrix defined below to confirm you have the correct set of 10 | # supported CodeQL languages. 
11 | # 12 | name: "CodeQL" 13 | 14 | on: 15 | push: 16 | branches: [ "main" ] 17 | pull_request: 18 | # The branches below must be a subset of the branches above 19 | branches: [ "main" ] 20 | schedule: 21 | - cron: '37 20 * * 3' 22 | 23 | jobs: 24 | analyze: 25 | name: Analyze 26 | runs-on: ubuntu-latest 27 | permissions: 28 | actions: read 29 | contents: read 30 | security-events: write 31 | 32 | strategy: 33 | fail-fast: false 34 | matrix: 35 | language: [ "python" ] 36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ] 37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support 38 | 39 | steps: 40 | - name: Checkout repository 41 | uses: actions/checkout@v4 42 | # Install Python dependencies manually 43 | - name: Set up python 44 | uses: actions/setup-python@v5 45 | with: 46 | python-version: '3.12' 47 | cache: 'pip' 48 | # flash-attn requires torch to be installed 49 | - name: Install dependencies 50 | run: | 51 | pip install --upgrade pip 52 | pip install -e ".[dev]" 53 | # Initializes the CodeQL tools for scanning. 54 | - name: Initialize CodeQL 55 | uses: github/codeql-action/init@v3 56 | with: 57 | languages: python 58 | # languages: ${{ matrix.language }} 59 | # If you wish to specify custom queries, you can do so here or in a config file. 60 | # By default, queries listed here will override any specified in a config file. 61 | # Prefix the list here with "+" to use these queries and those in the config file. 62 | 63 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs 64 | # queries: security-extended,security-and-quality 65 | 66 | 67 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java). 
68 | # If this step fails, then you should remove it and run the build manually (see below) 69 | - name: Autobuild 70 | uses: github/codeql-action/autobuild@v3 71 | 72 | # ℹ️ Command-line programs to run using the OS shell. 73 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun 74 | 75 | # If the Autobuild fails above, remove it and uncomment the following three lines. 76 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance. 77 | 78 | # - run: | 79 | # echo "Run, Build Application using script" 80 | # ./location_of_script_within_repo/buildscript.sh 81 | 82 | - name: Perform CodeQL Analysis 83 | uses: github/codeql-action/analyze@v3 84 | with: 85 | category: "/language:${{matrix.language}}" 86 | -------------------------------------------------------------------------------- /.github/workflows/formatter.yml: -------------------------------------------------------------------------------- 1 | name: "Formatter" 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | schedule: 9 | - cron: '37 20 * * 3' 10 | 11 | jobs: 12 | black: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: psf/black@stable 17 | with: 18 | options: "--check --verbose --line-length 88" 19 | 20 | isort: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | - uses: isort/isort-action@v1 25 | with: 26 | requirements-files: "requirements.txt" 27 | configuration: "--check-only --diff --profile black --filter-files --verbose" 28 | -------------------------------------------------------------------------------- /.github/workflows/publish-website.yml: -------------------------------------------------------------------------------- 1 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages 2 | name: Deploy Website 3 | 4 | on: 5 | # Runs on pushes targeting the default 
branch 6 | push: 7 | branches: ["main"] 8 | 9 | # Allows you to run this workflow manually from the Actions tab 10 | workflow_dispatch: 11 | 12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages 13 | permissions: 14 | contents: read 15 | pages: write 16 | id-token: write 17 | 18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 20 | concurrency: 21 | group: "pages" 22 | cancel-in-progress: false 23 | 24 | jobs: 25 | # Build job 26 | build: 27 | runs-on: ubuntu-latest 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | - name: Setup Pages 32 | uses: actions/configure-pages@v5 33 | - name: Build with Jekyll 34 | uses: actions/jekyll-build-pages@v1 35 | with: 36 | source: ./docs/website 37 | destination: ./_site 38 | - name: Upload artifact 39 | uses: actions/upload-pages-artifact@v3 40 | 41 | # Deployment job 42 | deploy: 43 | environment: 44 | name: github-pages 45 | url: ${{ steps.deployment.outputs.page_url }} 46 | runs-on: ubuntu-latest 47 | needs: build 48 | steps: 49 | - name: Deploy to GitHub Pages 50 | id: deployment 51 | uses: actions/deploy-pages@v4 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller 
builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | docs/website/_site/media 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | db.sqlite3 64 | db.sqlite3-journal 65 | 66 | # Flask stuff: 67 | instance/ 68 | .webassets-cache 69 | 70 | # Scrapy stuff: 71 | .scrapy 72 | 73 | # Sphinx documentation 74 | docs/_build/ 75 | 76 | # PyBuilder 77 | .pybuilder/ 78 | target/ 79 | 80 | # Jupyter Notebook 81 | .ipynb_checkpoints 82 | 83 | # IPython 84 | profile_default/ 85 | ipython_config.py 86 | 87 | # pyenv 88 | # For a library or package, you might want to ignore these files since the code is 89 | # intended to run in multiple environments; otherwise, check them in: 90 | # .python-version 91 | 92 | # pipenv 93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 96 | # install all needed dependencies. 97 | #Pipfile.lock 98 | 99 | # poetry 100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 101 | # This is especially recommended for binary packages to ensure reproducibility, and is more 102 | # commonly ignored for libraries. 103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 104 | #poetry.lock 105 | 106 | # pdm 107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 
108 | #pdm.lock 109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 110 | # in version control. 111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 112 | .pdm.toml 113 | .pdm-python 114 | .pdm-build/ 115 | 116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 117 | __pypackages__/ 118 | 119 | # Celery stuff 120 | celerybeat-schedule 121 | celerybeat.pid 122 | 123 | # SageMath parsed files 124 | *.sage.py 125 | 126 | # Environments 127 | .env 128 | .venv 129 | env/ 130 | venv/ 131 | ENV/ 132 | env.bak/ 133 | venv.bak/ 134 | 135 | # Spyder project settings 136 | .spyderproject 137 | .spyproject 138 | 139 | # Rope project settings 140 | .ropeproject 141 | 142 | # mkdocs documentation 143 | /site 144 | 145 | # mypy 146 | .mypy_cache/ 147 | .dmypy.json 148 | dmypy.json 149 | 150 | # Pyre type checker 151 | .pyre/ 152 | 153 | # pytype static type analyzer 154 | .pytype/ 155 | 156 | # Cython debug symbols 157 | cython_debug/ 158 | 159 | # PyCharm 160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 162 | # and can be added to the global gitignore or merged into this file. For a more nuclear 163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
164 | #.idea/ 165 | 166 | # Logging 167 | wandb/ 168 | logs/ 169 | 170 | # Compute 171 | .amltconfig 172 | .amltignore 173 | amlt/ 174 | 175 | # Website 176 | docs/website/_site 177 | docs/website/Gemfile.lock 178 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pycqa/isort 3 | rev: 6.0.0 4 | hooks: 5 | - id: isort 6 | args: ["--profile", "black", "--filter-files"] 7 | 8 | - repo: https://github.com/psf/black 9 | rev: 24.4.2 10 | hooks: 11 | - id: black 12 | args: ["--line-length", "88"] -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python Debugger: Current File", 9 | "type": "debugpy", 10 | "request": "launch", 11 | "program": "benchmark.py", 12 | "console": "integratedTerminal", 13 | "args": ["--games", "games/detective.z5", "games/advent.z5", "--agent", "agent_llm.py:LLMAgent", "--llm", "azure_openai", "--enable_wandb", "-vv", "--conversation", "--context", "100", "--admissible_commands"] 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 
4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt 2 | include README.md 3 | include LICENSE 4 | include pyproject.toml 5 | 6 | global-exclude */__pycache__/* 7 | 8 | prune wandb 9 | prune logs 10 | prune website 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TALES: Text-Adventure Learning Environment Suite 2 | This repository contains the files needed to benchmark language agents on a curated list of text-based games from the following frameworks: [Jericho](https://github.com/microsoft/jericho), [TextWorld](https://github.com/microsoft/textworld), [TextWorld-Express](https://github.com/cognitiveailab/TextWorldExpress), [ScienceWorld](https://github.com/allenai/ScienceWorld), [ALFWorld](https://github.com/alfworld/alfworld). 3 | 4 | [[Technical Report](https://arxiv.org/abs/2504.14128)] [[Project Page](https://t.co/rFPMRoqO9y)] 5 | 6 | ## 1. Installation 7 | 8 | It is recommended to create and activate a conda or virtual environment. `tales` requires `Python>=3.12`: 9 | 10 | conda create -n tales python=3.12 11 | conda activate tales 12 | 13 | Then, install `tales` directly from PyPI: 14 | 15 | pip install tale-suite 16 | 17 | > [!WARNING] 18 | > The name of the Python package on PyPI is `tale-suite` and not `tales`. 19 | 20 | Alternatively, clone the repository and install locally: 21 | 22 | git clone https://github.com/microsoft/tale-suite 23 | cd tale-suite 24 | pip install -e . 
25 | 26 | > [!WARNING] 27 | > You will need Java 1.8+ installed to run the environments TextWorld-Express and ScienceWorld. 28 | > 29 | > sudo apt update && sudo apt install openjdk-8-jre-headless -y 30 | 31 | Alternatively, if the above isn't working: 32 | 33 | > sudo apt-get update && sudo apt-get install default-jre default-jdk 34 | 35 | ### Using Docker 36 | We provide a pre-built docker image at 37 | 38 | docker pull czcui/twb:prebuilt 39 | 40 | [Please see the following docs page for more details on how to set up a local vllm for use with the text world benchmark.](https://docs.google.com/document/d/1Q5FtcNpYDpMLbyraJ1dSKxJLwOgLvWCECiPsnDkEq2Y/edit?usp=sharing) 41 | 42 | An example script can be found in the scripts folder. 43 | 44 | ## 2. Getting Started 45 | 46 | 1. Run benchmark evaluation on all the games for the specified random agent: 47 | 48 | ```python 49 | python benchmark.py --agent agents/random.py random 50 | ``` 51 | 2. Run benchmark evaluation on a subset of the games: 52 | 53 | ```python 54 | python benchmark.py --agent agents/random.py random --env textworld 55 | ``` 56 | 3. Run benchmark evaluation on specific games: 57 | 58 | ```python 59 | python benchmark.py --agent agents/random.py random --envs JerichoEnvZork1 JerichoEnvDetective 60 | ``` 61 | 4. Run benchmark evaluation as a human player using the HumanAgent: 62 | 63 | ```python 64 | python benchmark.py --agent agents/human.py human --envs TWCookingLevel1 65 | ``` 66 | 5. Run benchmark evaluation where the ground-truth walkthrough is being followed: 67 | 68 | ```python 69 | python benchmark.py --agent agents/walkthrough.py walkthrough --envs JerichoEnvZork1 70 | ``` 71 | 72 | ## 3. Benchmarking LLMs 73 | 74 | In order to benchmark a given LLM acting as a language agent playing text-based games, you will need to first configure it. `tales` leverages the [`llm`](https://llm.datasette.io/en/stable/) library to handle communication with different LLMs. 
75 | 76 | python benchmark.py --agent agents/llm.py zero-shot --envs TWCookingLevel1 77 | 78 | ### API-based LLMs 79 | 80 | `llm` natively supports OpenAI models and self-hosted models that offer an OpenAI-compatible API (e.g., as vLLM does - more on this below). 81 | 82 | ### Adding support for other LLMs 83 | 84 | `llm` offers different plugins to include other LLMs. E.g. 85 | 86 | llm install llm-anthropic 87 | 88 | See the `llm` plugins [page](https://llm.datasette.io/en/stable/plugins/directory.html) for more information. 89 | 90 | ### Deploying a model locally using vLLM 91 | 92 | To serve a custom Hugging Face model with vLLM, one can use the vllm docker image like this: 93 | 94 | docker run --runtime nvidia --gpus all --restart unless-stopped --name vllm-Llama-3.1-8B-Instruct --env "HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}" -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 --ipc=host vllm/vllm-openai:latest --model meta-llama/Llama-3.1-8B-Instruct --tensor-parallel-size 4 --host 0.0.0.0 95 | 96 | Then, add the following entrypoint in `~/.config/io.datasette.llm/extra-openai-models.yaml` 97 | 98 | ``` 99 | - model_id: meta-llama/Llama-3.1-8B-Instruct 100 | model_name: meta-llama/Llama-3.1-8B-Instruct 101 | api_base: "http://0.0.0.0:8000/v1" 102 | ``` 103 | 104 | You can check that everything is working properly with this simple command: 105 | 106 | llm -m meta-llama/Llama-3.1-8B-Instruct "Hi. What's your name?" 107 | 108 | ## 4. Building Custom Agents 109 | 110 | To build a custom agent, you need to create a new file (e.g., `custom.py`) in the agents folder, implement the `Agent` class, and provide the corresponding argument parser. 111 | 112 | ```python 113 | from typing import Dict, Any 114 | import tales 115 | 116 | class CustomAgent(tales.Agent): 117 | 118 | def act(self, obs: str, reward: float, done: bool, infos: Dict[str, Any]) -> str: 119 | # ... 
120 | return "help" 121 | 122 | 123 | def build_argparser(parser=None): 124 | return parser or argparse.ArgumentParser() 125 | 126 | 127 | register( 128 | name="my-agent", 129 | desc=( 130 | "This is a custom agent that always outputs 'help' as a text action." 131 | ), 132 | klass=CustomAgent, 133 | add_arguments=build_argparser, 134 | ) 135 | ``` 136 | 137 | You can then use this agent by specifying the path to the file and the registered agent name in the `--agent` argument. 138 | 139 | python benchmark.py --agent agents/custom.py my-agent 140 | 141 | > [!NOTE] 142 | > See the [agents folder](https://github.com/microsoft/tale-suite/tree/main/agents) for more concrete examples. 143 | 144 | ## Citation 145 | ``` 146 | @article{cui2025tales, 147 | title={TALES: Text-Adventure Learning Environment Suite}, 148 | author={Christopher Cui and Xingdi Yuan and Ziang Xiao and Prithviraj Ammanabrolu and Marc-Alexandre C\^ot\'e}, 149 | journal={arXiv preprint arXiv:2504.14128}, 150 | year={2025}, 151 | url={https://arxiv.org/abs/2504.14128} 152 | } 153 | ``` 154 | 155 | If you use this benchmark, please consider citing the original frameworks as well. 156 | ``` 157 | @article{cote18textworld, 158 | author = {Marc-Alexandre C\^ot\'e and \'Akos K\'ad\'ar and Xingdi Yuan and Ben Kybartas and Tavian Barnes and Emery Fine and James Moore and Ruo Yu Tao and Matthew Hausknecht and Layla El Asri and Mahmoud Adada and Wendy Tay and Adam Trischler}, 159 | title = {TextWorld: A Learning Environment for Text-based Games}, 160 | journal = {CoRR}, 161 | volume = {abs/1806.11532}, 162 | year = {2018} 163 | } 164 | @article{jansen2022textworldexpress, 165 | url = {https://arxiv.org/abs/2208.01174}, 166 | author = {Jansen, Peter A. 
and Côté, Marc-Alexandre}, 167 | title = {TextWorldExpress: Simulating Text Games at One Million Steps Per Second}, 168 | journal = {arXiv}, 169 | year = {2022}, 170 | } 171 | @inproceedings{hausknecht2020interactive, 172 | title={Interactive fiction games: A colossal adventure}, 173 | author={Hausknecht, Matthew and Ammanabrolu, Prithviraj and C{\^o}t{\'e}, Marc-Alexandre and Yuan, Xingdi}, 174 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence}, 175 | volume={34}, 176 | number={05}, 177 | year={2020} 178 | } 179 | @inproceedings{ALFWorld20, 180 | title ={{ALFWorld: Aligning Text and Embodied Environments for Interactive Learning}}, 181 | author={Mohit Shridhar and Xingdi Yuan and Marc-Alexandre C\^ot\'e and Yonatan Bisk and Adam Trischler and Matthew Hausknecht}, 182 | booktitle = {Proceedings of the International 183 | Conference on Learning Representations (ICLR)}, 184 | year = {2021}, 185 | url = {https://arxiv.org/abs/2010.03768}} 186 | @misc{scienceworld2022, 187 | title={ScienceWorld: Is your Agent Smarter than a 5th Grader?}, 188 | author={Ruoyao Wang and Peter Jansen and Marc-Alexandre C{\^o}t{\'e} and Prithviraj Ammanabrolu}, 189 | year={2022}, 190 | eprint={2203.07540}, 191 | archivePrefix={arXiv}, 192 | primaryClass={cs.CL}, 193 | url={https://arxiv.org/abs/2203.07540} 194 | } 195 | ``` 196 | 197 | ## Contributing 198 | 199 | This project welcomes contributions and suggestions. Most contributions require you to agree to a 200 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us 201 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com. 202 | 203 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide 204 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions 205 | provided by the bot. 
You will only need to do this once across all repos using our CLA. 206 | 207 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 208 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or 209 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments. 210 | 211 | ## Trademarks 212 | 213 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft 214 | trademarks or logos is subject to and must follow 215 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general). 216 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship. 217 | Any use of third-party trademarks or logos are subject to those third-party's policies. 218 | 219 | ## Privacy 220 | This framework does not collect users' personal data. For more information about Microsoft's privacy policies, please see the [Microsoft Privacy Statement](https://www.microsoft.com/en-ca/privacy/privacystatement). 221 | 222 | ## Responsible AI 223 | Please see our [Responsible AI Statement](https://github.com/microsoft/tale-suite/blob/main/RESPONSIBLE_AI.md). -------------------------------------------------------------------------------- /RESPONSIBLE_AI.md: -------------------------------------------------------------------------------- 1 | # TALES - Text Adventure Learning Environment Suite 2 | 3 | TALES is a benchmark, which consists of a diverse collection of synthetic and human-written text-adventure games designed to evaluate reasoning capabilities of Large Language Model (LLM)-based agents. 4 | 5 | ### WHAT CAN TALES DO 6 | 7 | TALES was developed to evaluate LLM-based agents’ capabilities to solve text-adventure games. 
Text-adventure games are goal-oriented environments where an agent is required to interact with a game engine in a multi-step setting to understand the goal, explore the game world, find clues, and plan its way towards solving the game. We curated the set of games in TALES in a way to cover a diverse spectrum of reasoning skills an LLM-based agent may need in solving real-world tasks, such as inductive reasoning, deductive reasoning, spatial reasoning, and grounded reasoning. We believe that, while being much more cost-efficient compared to realistic tasks, testing LLM-based agents’ performance on TALES can provide useful insights in evaluating the agents from different aspects, including LLM backbones, agent architecture design, and prompt engineering. These insights can further guide practitioners in developing their agents in use cases beyond text-adventure games. 8 | 9 | A detailed discussion of TALES, including how it was developed and tested, can be found in our paper at: https://arxiv.org/abs/2504.14128 10 | 11 | 12 | ### INTENDED USES 13 | 14 | TALES is best suited for evaluating AI agents’ capability of solving text-adventure games. 15 | 16 | TALES is being shared with the research community to facilitate reproduction of our results and foster further research in this area. 17 | 18 | TALES is intended to be used by domain experts who are independently capable of evaluating the quality of outputs before acting on them. 19 | 20 | ### OUT-OF-SCOPE USES 21 | 22 | TALES is designed exclusively for evaluation; it is not well suited for training AI agents. 23 | 24 | We developed TALES for research purposes only; the benchmark does not cover all necessary criteria for real world decision making. We do not recommend using TALES in any way to make real world decisions. 25 | 26 | ### LIMITATIONS  27 | 28 | TALES was developed for research and experimental purposes. 
The games in the benchmark are exclusively selected to test LLM-based agents’ inductive reasoning, deductive reasoning, spatial reasoning, and grounded reasoning capabilities. We acknowledge that in real-world scenarios, decision making process may require additional context, more complex reasoning, as well as the combination of multiple reasoning types. We do not claim that our research findings can be directly transferred into real-world decision making. Further testing and validation are needed before considering its application in commercial or real-world scenarios. 29 | 30 | 31 | TALES was designed and tested using the English language. Performance in other languages may vary and should be assessed by someone who is both an expert in the expected outputs and a native speaker of that language.  32 | 33 | Outputs generated by AI may include factual errors, fabrication, or speculation. Users are responsible for assessing the accuracy of generated content. All decisions leveraging outputs of the system should be made with human oversight and not be based solely on system outputs. 34 | 35 | ### BEST PRACTICES  36 | 37 | We strongly encourage users to use LLMs/MLLMs that support robust Responsible AI mitigations, such as Azure Open AI (AOAI) services. Such services continually update their safety and RAI mitigations with the latest industry standards for responsible use. 
For more on AOAI’s best practices when employing foundation models for scripts and applications: 38 | 39 | [Blog post on responsible AI features in AOAI that were presented at Ignite 2023](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-new-ai-safety-amp-responsible-ai-features-in-azure/ba-p/3983686) 40 | 41 | [Overview of Responsible AI practices for Azure OpenAI models](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/overview) 42 | 43 | [Azure OpenAI Transparency Note](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/transparency-note) 44 | 45 | [OpenAI’s Usage policies](https://openai.com/policies/usage-policies) 46 | 47 | [Azure OpenAI’s Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct) 48 | 49 | TALES contains a set of text adventure games specifically curated to fulfill our research on LLM-based agents’ capability of performing certain types of reasoning. We refer practitioners to our paper https://arxiv.org/abs/2504.14128 for detailed guidance on how to properly use this benchmark and how to correctly interpret an LLM-based agent’s results on this benchmark. Additionally, we recommend that practitioners use TALES in concert with other benchmarks to understand LLM-based agents’ performance and capabilities from multiple perspectives and thus gain a less biased view. 50 | 51 | ### LICENSE 52 | 53 | We use the MIT license; please see the [license file](https://github.com/microsoft/tale-suite/blob/main/LICENSE). 54 | 55 | ### CONTACT 56 | 57 | We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us via [GitHub issues](https://github.com/microsoft/tale-suite/issues) or at textworld@microsoft.com. 
58 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 
22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd). 40 | 41 | 42 | -------------------------------------------------------------------------------- /SUPPORT.md: -------------------------------------------------------------------------------- 1 | # Support 2 | 3 | ## How to file issues and get help 4 | 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing 6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or 7 | feature request as a new Issue. 8 | 9 | For help and questions about using this project, please email textworld@microsoft.com. 10 | 11 | ## Microsoft Support Policy 12 | 13 | Support for this project is limited to the resources listed above. 
14 | -------------------------------------------------------------------------------- /agents/human.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import sys 3 | 4 | import tales 5 | from tales.agent import register 6 | from tales.token import get_token_counter 7 | from tales.utils import format_messages_to_markdown, merge_messages 8 | 9 | prompt_toolkit_available = False 10 | try: 11 | # For command line history and autocompletion. 12 | from prompt_toolkit import prompt 13 | from prompt_toolkit.completion import WordCompleter 14 | from prompt_toolkit.history import InMemoryHistory 15 | 16 | prompt_toolkit_available = sys.stdout.isatty() 17 | except ImportError: 18 | pass 19 | 20 | 21 | class HumanAgent(tales.Agent): 22 | 23 | def __init__(self, *args, **kwargs): 24 | self.token_counter = get_token_counter() 25 | self.history = [] 26 | 27 | self._history = None 28 | if prompt_toolkit_available: 29 | self._history = InMemoryHistory() 30 | 31 | @property 32 | def uid(self): 33 | return f"HumanAgent" 34 | 35 | @property 36 | def params(self): 37 | return { 38 | "agent_type": "human", 39 | } 40 | 41 | def act(self, obs, reward, done, infos): 42 | available_commands = infos.get("admissible_commands", []) 43 | if prompt_toolkit_available: 44 | actions_completer = WordCompleter( 45 | available_commands, ignore_case=True, sentence=True 46 | ) 47 | response = prompt( 48 | "\n> ", 49 | completer=actions_completer, 50 | history=self._history, 51 | enable_history_search=True, 52 | ) 53 | else: 54 | if available_commands: 55 | print("Available actions: {}\n".format(available_commands)) 56 | 57 | response = input("\n> ") 58 | 59 | messages = self.build_messages(f"{obs}\n> ") 60 | # response = self._llm_call_from_messages( 61 | # messages, 62 | # temperature=self.act_temp, 63 | # max_tokens=100, # Text actions are short phrases. 
64 | # seed=self.seed, 65 | # stream=False, 66 | # ) 67 | 68 | action = response.strip() 69 | self.history.append((f"{obs}\n> ", f"{action}\n")) 70 | 71 | # Compute usage statistics 72 | stats = { 73 | "prompt": format_messages_to_markdown(messages), 74 | "response": response, 75 | "nb_tokens": self.token_counter(messages=messages, text=response), 76 | } 77 | 78 | return action, stats 79 | 80 | def build_messages(self, observation): 81 | messages = [] 82 | 83 | for i, (obs, action) in enumerate(self.history): 84 | messages.append({"role": "user", "content": obs}) 85 | messages.append({"role": "assistant", "content": action}) 86 | 87 | messages.append({"role": "user", "content": observation}) 88 | 89 | # Just in case, let's avoid having multiple messages from the same role. 90 | messages = merge_messages(messages) 91 | 92 | return messages 93 | 94 | 95 | def build_argparser(parser=None): 96 | parser = parser or argparse.ArgumentParser() 97 | group = parser.add_argument_group("HumanAgent settings") 98 | return parser 99 | 100 | 101 | register( 102 | name="human", 103 | desc=("Manually decide which action to take."), 104 | klass=HumanAgent, 105 | add_arguments=build_argparser, 106 | ) 107 | -------------------------------------------------------------------------------- /agents/llm.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import llm 4 | import numpy as np 5 | from tenacity import ( 6 | retry, 7 | retry_if_exception, 8 | stop_after_attempt, 9 | wait_random_exponential, 10 | ) 11 | 12 | import tales 13 | from tales.agent import register 14 | from tales.token import get_token_counter 15 | from tales.utils import ( 16 | format_messages_to_markdown, 17 | is_recoverable_error, 18 | merge_messages, 19 | messages2conversation, 20 | ) 21 | 22 | SYSTEM_PROMPT = ( 23 | "You are playing a text-based game and your goal is to finish it with the highest score." 
24 | " Upon reading the text observation, provide a *single* short phrase to interact with the game, e.g. `get lamp` (without the backticks)." 25 | " When stuck, try using the `help` command to see what commands are available." 26 | ) 27 | 28 | 29 | class LLMAgent(tales.Agent): 30 | 31 | def __init__(self, *args, **kwargs): 32 | self.llm = kwargs["llm"] 33 | self.model = llm.get_model(self.llm) 34 | self.token_counter = get_token_counter(self.model) 35 | self.allows_system_prompt = self.llm not in ["o1-mini", "o1-preview"] 36 | 37 | # Provide the API key, if one is needed and has been provided 38 | self.model.key = llm.get_key( 39 | kwargs.get("key"), kwargs["llm"], self.model.key_env_var 40 | ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var) 41 | 42 | self.seed = kwargs["seed"] 43 | self.rng = np.random.RandomState(self.seed) 44 | 45 | self.history = [] 46 | self.context_limit = kwargs["context_limit"] 47 | if self.context_limit is not None: 48 | assert self.context_limit > 0, "--context-limit must be greater than 0." 49 | 50 | self.act_temp = kwargs["act_temp"] 51 | self.conversation = kwargs["conversation"] 52 | 53 | @property 54 | def uid(self): 55 | return ( 56 | f"LLMAgent_{self.llm}" 57 | f"_s{self.seed}" 58 | f"_c{self.context_limit}" 59 | f"_t{self.act_temp}" 60 | f"_conv{self.conversation}" 61 | ) 62 | 63 | @property 64 | def params(self): 65 | return { 66 | "agent_type": "zero-shot", 67 | "llm": self.llm, 68 | "seed": self.seed, 69 | "context_limit": self.context_limit, 70 | "act_temp": self.act_temp, 71 | "conversation": self.conversation, 72 | } 73 | 74 | @retry( 75 | retry=retry_if_exception(is_recoverable_error), 76 | wait=wait_random_exponential(multiplier=1, max=40), 77 | stop=stop_after_attempt(100), 78 | ) 79 | def _llm_call_from_conversation(self, conversation, *args, **kwargs): 80 | response = conversation.prompt(*args, **kwargs) 81 | response.duration_ms() # Forces the response to be computed. 
82 | return response 83 | 84 | def _llm_call_from_messages(self, messages, *args, **kwargs): 85 | conversation = messages2conversation(self.model, messages) 86 | prompt = messages[-1]["content"] 87 | system = messages[0]["content"] if self.allows_system_prompt else None 88 | 89 | return self._llm_call_from_conversation( 90 | conversation, prompt=prompt, system=system, *args, **kwargs 91 | ) 92 | 93 | def act(self, obs, reward, done, infos): 94 | messages = self.build_messages(f"{obs}\n> ") 95 | llm_kwargs = { 96 | "temperature": self.act_temp, 97 | "max_tokens": 100, # Text actions are short phrases. 98 | "seed": self.seed, 99 | "stream": False, 100 | } 101 | if self.llm in [ 102 | "claude-3.5-haiku", 103 | "claude-3.5-sonnet", 104 | "claude-3.5-sonnet-latest", 105 | ]: 106 | # For these models, we cannot set the seed. 107 | llm_kwargs.pop("seed") 108 | 109 | if "gemini" in self.llm or "gemma" in self.llm: 110 | # For these models, we cannot set the seed and max_tokens has a different name. 111 | llm_kwargs.pop("seed") 112 | llm_kwargs["max_output_tokens"] = llm_kwargs.pop("max_tokens") 113 | 114 | response = self._llm_call_from_messages(messages, **llm_kwargs) 115 | 116 | action = response.text().strip() 117 | self.history.append((f"{obs}\n> ", f"{action}\n")) 118 | 119 | # Compute usage statistics 120 | stats = { 121 | "prompt": format_messages_to_markdown(messages), 122 | "response": response.text(), 123 | "nb_tokens": self.token_counter(messages=messages, text=response.text()), 124 | } 125 | 126 | return action, stats 127 | 128 | def build_messages(self, observation): 129 | messages = [{"role": "system", "content": SYSTEM_PROMPT}] 130 | limit = self.context_limit or len(self.history) + 1 131 | 132 | for i, (obs, action) in enumerate(self.history[-limit:]): 133 | if len(self.history) >= limit and i == 0: 134 | # Add the current observation. 
135 | obs = ( 136 | f"// History has been truncated to the last {limit} steps.\n...\n> " 137 | ) 138 | 139 | messages.append({"role": "user", "content": obs}) 140 | messages.append({"role": "assistant", "content": action}) 141 | 142 | messages.append({"role": "user", "content": observation}) 143 | 144 | # Just in case, let's avoid having multiple messages from the same role. 145 | messages = merge_messages(messages) 146 | 147 | if not self.conversation: 148 | # Merge all messages into a single message except for the system. 149 | content = "".join([msg["content"] for msg in messages[1:]]) 150 | messages = messages[:1] + [{"role": "user", "content": content}] 151 | 152 | if not self.allows_system_prompt: 153 | # Make sure the system prompt is added to the following message. 154 | messages.pop(0) 155 | messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}" 156 | 157 | return messages 158 | 159 | 160 | def build_argparser(parser=None): 161 | parser = parser or argparse.ArgumentParser() 162 | group = parser.add_argument_group("LLMAgent settings") 163 | 164 | group.add_argument( 165 | "--llm", 166 | default="gpt-4o-mini", 167 | help="LLM to be used for evaluation. Default: %(default)s", 168 | ) 169 | group.add_argument( 170 | "--seed", 171 | type=int, 172 | default=20241001, 173 | help="Seed for LLM (not all endpoints support this). Default: %(default)s", 174 | ) 175 | group.add_argument( 176 | "--act-temp", 177 | type=float, 178 | default=0.0, 179 | help="Temperature for LLM when taking actions. Default: %(default)s", 180 | ) 181 | group.add_argument( 182 | "--context-limit", 183 | type=int, 184 | help="Limit context for LLM (in conversation turns). Default: no limit.", 185 | ) 186 | group.add_argument( 187 | "--conversation", 188 | required=True, 189 | action=argparse.BooleanOptionalAction, 190 | help="Enable conversation mode. 
Otherwise, use single prompt.", 191 | ) 192 | 193 | return parser 194 | 195 | 196 | register( 197 | name="zero-shot", 198 | desc=( 199 | "This agent uses a LLM to decide which action to take in a zero-shot manner." 200 | ), 201 | klass=LLMAgent, 202 | add_arguments=build_argparser, 203 | ) 204 | -------------------------------------------------------------------------------- /agents/llm_walkthrough.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import gymnasium as gym 4 | 5 | from agents.llm import LLMAgent 6 | from tales.agent import register 7 | from tales.utils import merge_messages 8 | 9 | 10 | # For the LLMWlkThrAgent, the sysprompt is initialized in the __init__ function as we need to change it once we extract the walkthrough from the env 11 | class LLMWalkThroughAgent(LLMAgent): 12 | 13 | def __init__(self, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | self.sys_prompt = "Not Initialized" 16 | 17 | @property 18 | def uid(self): 19 | return ( 20 | f"LLMAgent_{self.llm}" 21 | f"_s{self.seed}" 22 | f"_c{self.context_limit}" 23 | f"_t{self.act_temp}" 24 | f"_conv{self.conversation is not None}" 25 | f"Walkthrough Agent" 26 | ) 27 | 28 | def build_messages(self, observation): 29 | messages = [{"role": "system", "content": self.sys_prompt}] 30 | limit = self.context_limit or len(self.history) + 1 31 | 32 | for i, (obs, action) in enumerate(self.history[-limit:]): 33 | if len(self.history) >= limit and i == 0: 34 | # Add the current observation. 35 | obs = ( 36 | f"// History has been truncated to the last {limit} steps.\n...\n> " 37 | ) 38 | 39 | messages.append({"role": "user", "content": obs}) 40 | messages.append({"role": "assistant", "content": action}) 41 | 42 | messages.append({"role": "user", "content": observation}) 43 | 44 | # Just in case, let's avoid having multiple messages from the same role. 
45 | messages = merge_messages(messages) 46 | 47 | if not self.conversation: 48 | # Merge all messages into a single message except for the system. 49 | content = "".join([msg["content"] for msg in messages[1:]]) 50 | messages = messages[:1] + [{"role": "user", "content": content}] 51 | 52 | if not self.allows_system_prompt: 53 | # Make sure the system prompt is added to the following message. 54 | messages.pop(0) 55 | messages[1]["content"] = f"{self.sys_prompt}\n\n{messages[1]['content']}" 56 | 57 | return messages 58 | 59 | def reset(self, obs, info, env_name): 60 | walkthrough = info.get("extra.walkthrough") 61 | if walkthrough is None or len(walkthrough) < 1: 62 | raise ValueError("Walkthrough not initalized: Check the environment") 63 | 64 | # Check if the walkthrough is valid. 65 | env = gym.make(f"tales/{env_name}-v0", disable_env_checker=True) 66 | 67 | _, _ = env.reset() 68 | 69 | for act in walkthrough: 70 | _, _, _, info_ = env.step(act) 71 | 72 | if info_["score"] != info_["max_score"]: 73 | raise ValueError( 74 | "Provided walkthrough does not successfully complete game." 75 | ) 76 | 77 | numbered_walkthrough = ", ".join( 78 | f"{i + 1}.){act}" for i, act in enumerate(walkthrough) 79 | ) 80 | self.sys_prompt = ( 81 | "You are playing a text-based game and your goal is to finish it with the highest score." 82 | " The following is a walkthrough in the form of a list of actions to beat the game." 83 | " You should follow this walkthrough as closely as possible to get the maximum score" 84 | " You must ONLY respond with the action you wish to take with no other special tokens." 85 | f"Walkthrough: {numbered_walkthrough}" 86 | ) 87 | 88 | 89 | def build_argparser(parser=None): 90 | parser = parser or argparse.ArgumentParser() 91 | group = parser.add_argument_group("LLMAgent settings") 92 | 93 | group.add_argument( 94 | "--llm", 95 | default="gpt-4o-mini", 96 | help="LLM to be used for evaluation. 
Default: %(default)s", 97 | ) 98 | group.add_argument( 99 | "--seed", 100 | type=int, 101 | default=20241001, 102 | help="Seed for LLM (not all endpoints support this). Default: %(default)s", 103 | ) 104 | group.add_argument( 105 | "--act-temp", 106 | type=float, 107 | default=0.0, 108 | help="Temperature for LLM when taking actions. Default: %(default)s", 109 | ) 110 | group.add_argument( 111 | "--context-limit", 112 | type=int, 113 | default=10, 114 | help="Limit context for LLM (in conversation turns). Default: %(default)s", 115 | ) 116 | group.add_argument( 117 | "--conversation", 118 | action="store_true", 119 | help="Enable conversation mode. Otherwise, use single prompt.", 120 | ) 121 | group.add_argument( 122 | "--wlkthr-limit", 123 | type=int, 124 | default=10000, 125 | help="Number of walkthrough actions to provide the LLM. Default: %(default)s", 126 | ) 127 | 128 | return parser 129 | 130 | 131 | register( 132 | name="llm-walkthrough", 133 | desc=( 134 | "This agent uses the ground-truth walkthrough from the environment to attempt to progress through the game." 
135 | ), 136 | klass=LLMWalkThroughAgent, 137 | add_arguments=build_argparser, 138 | ) 139 | -------------------------------------------------------------------------------- /agents/random.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import re 3 | 4 | import numpy as np 5 | 6 | import tales 7 | from tales.agent import register 8 | from tales.token import get_token_counter 9 | 10 | 11 | class RandomAgent(tales.Agent): 12 | def __init__(self, **kwargs): 13 | self.seed = kwargs.get("seed", 1234) 14 | self.rng = np.random.RandomState(self.seed) 15 | self.token_counter = get_token_counter() 16 | 17 | # fmt:off 18 | self.actions = [ 19 | "north", "south", "east", "west", "up", "down", 20 | "look", "inventory", 21 | "drop", "take", "take all", 22 | "eat", "attack", 23 | "wait", "YES", 24 | ] 25 | # fmt:on 26 | 27 | @property 28 | def uid(self): 29 | return f"RandomAgent_s{self.seed}" 30 | 31 | @property 32 | def params(self): 33 | return { 34 | "agent_type": "random", 35 | "seed": self.seed, 36 | } 37 | 38 | def act(self, obs, reward, done, info): 39 | stats = { 40 | "prompt": None, 41 | "response": None, 42 | "nb_tokens": self.token_counter(text=obs), 43 | } 44 | 45 | if "admissible_commands" in info: 46 | return self.rng.choice(info["admissible_commands"]), stats 47 | 48 | action = self.rng.choice(self.actions) 49 | if action in ["take", "drop", "eat", "attack"]: 50 | words = re.findall( 51 | r"\b[a-zA-Z]{4,}\b", obs 52 | ) # Extract words with 4 or more letters. 53 | if len(words) > 0: 54 | action += " " + self.rng.choice(words) 55 | 56 | return str(action), stats 57 | 58 | 59 | def build_argparser(parser=None): 60 | parser = parser or argparse.ArgumentParser() 61 | group = parser.add_argument_group("RandomAgent settings") 62 | group.add_argument( 63 | "--seed", 64 | type=int, 65 | default=20241001, 66 | help="Random generator seed to select actions. 
Default: %(default)s", 67 | ) 68 | return parser 69 | 70 | 71 | register( 72 | name="random", 73 | desc=( 74 | "This agent will pick an action at random among a predefined set of actions or," 75 | " if available, the admissible commands." 76 | ), 77 | klass=RandomAgent, 78 | add_arguments=build_argparser, 79 | ) 80 | -------------------------------------------------------------------------------- /agents/react.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import llm 4 | import numpy as np 5 | from tenacity import ( 6 | retry, 7 | retry_if_exception, 8 | stop_after_attempt, 9 | wait_random_exponential, 10 | ) 11 | from termcolor import colored 12 | 13 | import tales 14 | from tales.agent import register 15 | from tales.token import get_token_counter 16 | from tales.utils import ( 17 | format_messages_to_markdown, 18 | is_recoverable_error, 19 | log, 20 | merge_messages, 21 | messages2conversation, 22 | ) 23 | 24 | SYSTEM_PROMPT = ( 25 | "You are playing a text-based game and your goal is to finish it with the highest score." 26 | " Upon reading the text observation, generate a plan with subgoals when asked to think step-by-step," 27 | " then provide a *single* short phrase to interact with the game when asked to do so, e.g. `get lamp` (without the backticks)." 28 | " When stuck, try using the `help` command to see what commands are available." 
29 | ) 30 | 31 | 32 | class ReactAgent(tales.Agent): 33 | 34 | def __init__(self, *args, **kwargs): 35 | self.llm = kwargs["llm"] 36 | self.model = llm.get_model(self.llm) 37 | self.token_counter = get_token_counter(self.model) 38 | self.allows_system_prompt = self.llm not in ["o1-mini", "o1-preview"] 39 | 40 | # Provide the API key, if one is needed and has been provided 41 | self.model.key = llm.get_key( 42 | kwargs.get("key"), kwargs["llm"], self.model.key_env_var 43 | ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var) 44 | 45 | self.seed = kwargs["seed"] 46 | self.rng = np.random.RandomState(self.seed) 47 | 48 | self.history = [] 49 | self.context_limit = kwargs["context_limit"] 50 | if self.context_limit is not None: 51 | assert self.context_limit > 0, "--context-limit must be greater than 0." 52 | 53 | self.act_temp = kwargs["act_temp"] 54 | self.cot_temp = kwargs["cot_temp"] 55 | self.cot_max_tokens = kwargs["cot_max_tokens"] 56 | self.conversation = kwargs["conversation"] 57 | 58 | @property 59 | def uid(self): 60 | return ( 61 | f"ReactAgent_{self.llm}" 62 | f"_s{self.seed}" 63 | f"_c{self.context_limit}" 64 | f"_t{self.act_temp}" 65 | f"_cotT{self.cot_temp}" 66 | f"_cotN{self.cot_max_tokens}" 67 | f"_conv{self.conversation}" 68 | ) 69 | 70 | @property 71 | def params(self): 72 | return { 73 | "agent_type": "react", 74 | "llm": self.llm, 75 | "seed": self.seed, 76 | "context_limit": self.context_limit, 77 | "act_temp": self.act_temp, 78 | "cot_temp": self.cot_temp, 79 | "cot_max_tokens": self.cot_max_tokens, 80 | "conversation": self.conversation, 81 | } 82 | 83 | @retry( 84 | retry=retry_if_exception(is_recoverable_error), 85 | wait=wait_random_exponential(multiplier=1, max=40), 86 | stop=stop_after_attempt(100), 87 | ) 88 | def _llm_call_from_conversation(self, conversation, *args, **kwargs): 89 | response = conversation.prompt(*args, **kwargs) 90 | response.duration_ms() # Forces the response to be computed. 
91 | return response 92 | 93 | def _llm_call_from_messages(self, messages, *args, **kwargs): 94 | conversation = messages2conversation(self.model, messages) 95 | prompt = messages[-1]["content"] 96 | system = messages[0]["content"] if self.allows_system_prompt else None 97 | 98 | return self._llm_call_from_conversation( 99 | conversation, prompt=prompt, system=system, *args, **kwargs 100 | ) 101 | 102 | def act(self, obs, reward, done, infos): 103 | question = "// Based on the above information (history), what is the best action to take? Let's think step by step.\n" 104 | messages = self.build_messages(obs, question, []) 105 | response = self._llm_call_from_messages( 106 | messages, 107 | temperature=self.cot_temp, 108 | max_tokens=self.cot_max_tokens, 109 | seed=self.seed, 110 | stream=False, 111 | ) 112 | 113 | answer = response.text().strip() 114 | log.debug(colored(question, "cyan")) 115 | log.debug(colored(answer, "green")) 116 | 117 | # Compute usage statistics for the CoT. 118 | nb_tokens_cot = self.token_counter(messages=messages, text=response.text()) 119 | 120 | prompt = "// Provide your chosen action on a single line while respecting the desired format.\n> " 121 | messages = self.build_messages(obs, prompt, [(question, f"{answer}\n")]) 122 | response = self._llm_call_from_messages( 123 | messages, 124 | temperature=self.act_temp, 125 | max_tokens=100, # Text actions are short phrases. 
126 | seed=self.seed, 127 | stream=False, 128 | ) 129 | 130 | action = response.text().strip() 131 | self.history.append((f"{obs}\n> ", f"{action}\n")) 132 | log.debug(colored(prompt, "cyan")) 133 | 134 | # Compute usage statistics 135 | nb_tokens_act = self.token_counter(messages=messages, text=response.text()) 136 | stats = { 137 | "prompt": format_messages_to_markdown(messages), 138 | "response": response.text(), 139 | "nb_tokens": nb_tokens_cot + nb_tokens_act, 140 | } 141 | 142 | return action, stats 143 | 144 | def build_messages(self, observation, question, qa_history): 145 | messages = [{"role": "system", "content": SYSTEM_PROMPT}] 146 | limit = self.context_limit or len(self.history) + 1 147 | 148 | for i, (obs, action) in enumerate(self.history[-limit:]): 149 | if len(self.history) >= limit and i == 0: 150 | # Add the current observation. 151 | obs = ( 152 | f"// History has been truncated to the last {limit} steps.\n...\n> " 153 | ) 154 | 155 | messages.append({"role": "user", "content": obs}) 156 | messages.append({"role": "assistant", "content": action}) 157 | 158 | messages.append({"role": "user", "content": observation}) 159 | 160 | for q, a in qa_history: 161 | messages.append({"role": "user", "content": q}) 162 | messages.append({"role": "assistant", "content": a}) 163 | 164 | messages.append({"role": "user", "content": question}) 165 | 166 | # Merging the current game observation current and the question. 167 | messages = merge_messages(messages) 168 | 169 | if not self.conversation: 170 | # Merge all messages into a single message except for the system. 171 | content = "".join([msg["content"] for msg in messages[1:]]) 172 | messages = messages[:1] + [{"role": "user", "content": content}] 173 | 174 | if not self.allows_system_prompt: 175 | # Make sure the system prompt is added to the following message. 
176 | messages.pop(0) 177 | messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}" 178 | 179 | return messages 180 | 181 | 182 | def build_argparser(parser=None): 183 | parser = parser or argparse.ArgumentParser() 184 | group = parser.add_argument_group("LLMAgent settings") 185 | 186 | group.add_argument( 187 | "--llm", 188 | default="gpt-4o-mini", 189 | help="LLM to be used for evaluation. Default: %(default)s", 190 | ) 191 | group.add_argument( 192 | "--seed", 193 | type=int, 194 | default=20241001, 195 | help="Seed for LLM (not all endpoints support this). Default: %(default)s", 196 | ) 197 | group.add_argument( 198 | "--cot-temp", 199 | type=float, 200 | default=0.0, 201 | help="Temperature for LLM when doing chain-of-thoughts. Default: %(default)s", 202 | ) 203 | group.add_argument( 204 | "--cot-max-tokens", 205 | type=int, 206 | default=1024, 207 | help="Maximum number of token for chain-of-thoughts. Default: %(default)s", 208 | ) 209 | group.add_argument( 210 | "--act-temp", 211 | type=float, 212 | default=0.0, 213 | help="Temperature for LLM when taking actions. Default: %(default)s", 214 | ) 215 | group.add_argument( 216 | "--context-limit", 217 | type=int, 218 | help="Limit context for LLM (in conversation turns). Default: no limit", 219 | ) 220 | group.add_argument( 221 | "--conversation", 222 | required=True, 223 | action=argparse.BooleanOptionalAction, 224 | help="Enable conversation mode. Otherwise, use single prompt.", 225 | ) 226 | 227 | return parser 228 | 229 | 230 | register( 231 | name="react", 232 | desc=( 233 | "This agent uses a LLM to decide which action to take by following a CoT/ReAct approach." 
234 | ), 235 | klass=ReactAgent, 236 | add_arguments=build_argparser, 237 | ) 238 | -------------------------------------------------------------------------------- /agents/reasoning.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import llm 4 | import numpy as np 5 | from tenacity import ( 6 | retry, 7 | retry_if_exception, 8 | stop_after_attempt, 9 | wait_random_exponential, 10 | ) 11 | from termcolor import colored 12 | 13 | import tales 14 | from tales.agent import register 15 | from tales.token import get_token_counter 16 | from tales.utils import ( 17 | format_messages_to_markdown, 18 | is_recoverable_error, 19 | merge_messages, 20 | messages2conversation, 21 | ) 22 | 23 | SYSTEM_PROMPT = ( 24 | "You are playing a text-based game and your goal is to finish it with the highest score." 25 | " Upon reading the text observation, provide a *single* short phrase to interact with the game, e.g. `get lamp` (without the backticks)." 26 | " When stuck, try using the `help` command to see what commands are available." 
27 | ) 28 | 29 | DEEPSEEK_CHAT_TEMPLATE_NO_THINK = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if 
ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n\\n'}}{% endif %}" 30 | 31 | 32 | class ReasoningAgent(tales.Agent): 33 | 34 | def __init__(self, *args, **kwargs): 35 | self.llm = kwargs["llm"] 36 | self.model = llm.get_model(self.llm) 37 | self.token_counter = get_token_counter(self.model) 38 | self.allows_system_prompt = self.llm not in [ 39 | "o1", 40 | "o1-mini", 41 | "o1-preview", 42 | "o3-mini", 43 | ] 44 | 45 | # Provide the API key, if one is needed and has been provided 46 | self.model.key = llm.get_key( 47 | kwargs.get("key"), kwargs["llm"], self.model.key_env_var 48 | ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var) 49 | 50 | self.seed = kwargs["seed"] 51 | self.rng = np.random.RandomState(self.seed) 52 | 53 | self.history = [] 54 | self.context_limit = kwargs["context_limit"] 55 | if self.context_limit is not None: 56 | assert self.context_limit > 0, "--context-limit must be greater than 0." 
57 | 58 | self.act_temp = kwargs["act_temp"] 59 | self.cot_temp = kwargs["cot_temp"] 60 | self.reasoning_effort = kwargs["reasoning_effort"] 61 | self.conversation = kwargs["conversation"] 62 | 63 | @property 64 | def uid(self): 65 | return ( 66 | f"ReasoningAgent_{self.llm}" 67 | f"_s{self.seed}" 68 | f"_c{self.context_limit}" 69 | f"_conv{self.conversation}" 70 | f"_actT{self.act_temp}" 71 | f"_cotT{self.cot_temp}" 72 | f"_effort{self.reasoning_effort}" 73 | ) 74 | 75 | @property 76 | def params(self): 77 | return { 78 | "agent_type": "react", 79 | "llm": self.llm, 80 | "seed": self.seed, 81 | "context_limit": self.context_limit, 82 | "conversation": self.conversation, 83 | "act_temp": self.act_temp, 84 | "cot_temp": self.cot_temp, 85 | "reasoning_effort": self.reasoning_effort, 86 | } 87 | 88 | @retry( 89 | retry=retry_if_exception(is_recoverable_error), 90 | wait=wait_random_exponential(multiplier=1, max=40), 91 | stop=stop_after_attempt(100), 92 | ) 93 | def _llm_call_from_conversation(self, conversation, *args, **kwargs): 94 | response = conversation.prompt(*args, **kwargs) 95 | response.duration_ms() # Forces the response to be computed. 
96 | return response 97 | 98 | def _llm_call_from_messages(self, messages, *args, **kwargs): 99 | conversation = messages2conversation(self.model, messages) 100 | prompt = messages[-1]["content"] 101 | system = messages[0]["content"] if self.allows_system_prompt else None 102 | 103 | return self._llm_call_from_conversation( 104 | conversation, prompt=prompt, system=system, *args, **kwargs 105 | ) 106 | 107 | def act(self, obs, reward, done, infos): 108 | llm_kwargs = { 109 | "temperature": self.cot_temp, 110 | "seed": self.seed, 111 | "stream": True, # Should prevent openai.APITimeoutError 112 | } 113 | if isinstance(self.reasoning_effort, int): 114 | if self.llm in ["claude-3.7-sonnet"]: 115 | llm_kwargs["thinking_budget"] = self.reasoning_effort 116 | else: 117 | llm_kwargs["max_tokens"] = self.reasoning_effort 118 | 119 | elif self.llm in ["o1", "o1-preview", "o3-mini"]: 120 | llm_kwargs["reasoning_effort"] = self.reasoning_effort 121 | 122 | if self.llm in ["o1", "o1-mini", "o1-preview", "o3-mini", "claude-3.7-sonnet"]: 123 | # For these models, we cannot set the temperature. 124 | llm_kwargs.pop("temperature") 125 | 126 | if self.llm in ["o3-mini"]: 127 | llm_kwargs.pop("stream") 128 | 129 | if self.llm in ["claude-3.7-sonnet"]: 130 | llm_kwargs["thinking"] = 1 131 | llm_kwargs.pop("seed") 132 | 133 | if "gemini" in self.llm or "gemma" in self.llm: 134 | # For these models, we cannot set the seed and max_tokens has a different name. 135 | llm_kwargs.pop("seed") 136 | 137 | messages = self.build_messages(f"{obs}\n> ") 138 | response = self._llm_call_from_messages(messages, **llm_kwargs) 139 | response_text = response.text() 140 | 141 | action = response.text().strip() 142 | 143 | thinking = None 144 | if "DeepSeek-R1" in self.llm: 145 | # Strip the reasoning and . 146 | reasoning_end = action.find("") 147 | if reasoning_end == -1: 148 | # Send another request to get the action with the current reasoning. 
149 | messages.append( 150 | { 151 | "role": "assistant", 152 | "content": "\n" + response_text.strip() + "\n", 153 | } 154 | ) 155 | # prompt = "// Thinking exceeded the length limit. Based on the thoughts so far, provide your chosen action on a single line while respecting the desired format.\n> " 156 | # messages.append({"role": "user", "content": prompt}) 157 | llm_kwargs["max_tokens"] = ( 158 | 100 # Text actions should be short phrases but deepseek forces thought process by starting the generation with . 159 | ) 160 | llm_kwargs["temperature"] = self.act_temp 161 | llm_kwargs["extra_body"] = { 162 | "chat_template": DEEPSEEK_CHAT_TEMPLATE_NO_THINK, 163 | } 164 | response = self._llm_call_from_messages(messages, **llm_kwargs) 165 | response_text += "\n" + response.text() 166 | action = response.text().strip() 167 | reasoning_end = action.find("") 168 | if reasoning_end == -1: 169 | reasoning_end = ( 170 | 0 # Give up and use the entire response as the action. 171 | ) 172 | else: 173 | reasoning_end += len("") 174 | else: 175 | reasoning_end += len("") 176 | 177 | # Extract the reasoning part from the response. 178 | thinking = action[:reasoning_end].strip() 179 | # Extract the action part from the response. 180 | action = action[reasoning_end:].strip() 181 | 182 | elif self.llm in ["claude-3.7-sonnet"]: 183 | # Extract the thinking part from the response JSON. 
184 | thinking = "".join( 185 | [item.get("thinking", "") for item in response.json()["content"]] 186 | ) 187 | 188 | self.history.append((f"{obs}\n> ", f"{action}\n")) 189 | 190 | # Compute usage statistics 191 | stats = { 192 | "prompt": format_messages_to_markdown(messages), 193 | "thinking": thinking, 194 | "response": response_text, 195 | "nb_tokens": self.token_counter(messages=messages, text=response_text), 196 | } 197 | 198 | if thinking is not None: 199 | stats["nb_tokens"] += self.token_counter(text=thinking) 200 | 201 | return action, stats 202 | 203 | def build_messages(self, observation): 204 | messages = [{"role": "system", "content": SYSTEM_PROMPT}] 205 | limit = self.context_limit or len(self.history) + 1 206 | 207 | for i, (obs, action) in enumerate(self.history[-limit:]): 208 | if len(self.history) >= limit and i == 0: 209 | # Add the current observation. 210 | obs = ( 211 | f"// History has been truncated to the last {limit} steps.\n...\n> " 212 | ) 213 | 214 | messages.append({"role": "user", "content": obs}) 215 | messages.append({"role": "assistant", "content": action}) 216 | 217 | messages.append({"role": "user", "content": observation}) 218 | 219 | # Just in case, let's avoid having multiple messages from the same role. 220 | messages = merge_messages(messages) 221 | 222 | if not self.conversation: 223 | # Merge all messages into a single message except for the system. 224 | content = "".join([msg["content"] for msg in messages[1:]]) 225 | messages = messages[:1] + [{"role": "user", "content": content}] 226 | 227 | if not self.allows_system_prompt: 228 | # Make sure the system prompt is added to the following message. 
229 | messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}" 230 | messages.pop(0) 231 | 232 | return messages 233 | 234 | 235 | def build_argparser(parser=None): 236 | parser = parser or argparse.ArgumentParser() 237 | group = parser.add_argument_group("LLMAgent settings") 238 | 239 | group.add_argument( 240 | "--llm", 241 | default="gpt-4o-mini", 242 | help="LLM to be used for evaluation. Default: %(default)s", 243 | ) 244 | group.add_argument( 245 | "--seed", 246 | type=int, 247 | default=20241001, 248 | help="Seed for LLM (not all endpoints support this). Default: %(default)s", 249 | ) 250 | group.add_argument( 251 | "--act-temp", 252 | type=float, 253 | default=0.0, 254 | help="Temperature for LLM when taking actions. Default: %(default)s", 255 | ) 256 | group.add_argument( 257 | "--cot-temp", 258 | type=float, 259 | default=0.0, 260 | help="Temperature for LLM when doing chain-of-thoughts. Default: %(default)s", 261 | ) 262 | subgroup = group.add_mutually_exclusive_group(required=True) 263 | subgroup.add_argument( 264 | "--reasoning-effort", 265 | default="medium", 266 | dest="reasoning_effort", 267 | help="Reasoning effort for reasoning-type LLMs.", 268 | ) 269 | subgroup.add_argument( 270 | "--cot-max-tokens", 271 | type=int, 272 | default=1024, 273 | dest="reasoning_effort", 274 | help="Maximum number of token for chain-of-thoughts. Default: %(default)s", 275 | ) 276 | group.add_argument( 277 | "--context-limit", 278 | type=int, 279 | help="Limit context for LLM (in conversation turns). Default: no limit", 280 | ) 281 | group.add_argument( 282 | "--conversation", 283 | required=True, 284 | action=argparse.BooleanOptionalAction, 285 | help="Enable conversation mode. Otherwise, use single prompt.", 286 | ) 287 | 288 | return parser 289 | 290 | 291 | register( 292 | name="reasoning", 293 | desc=( 294 | "This agent uses reasoning LLM (o1/o3, deepseek-r1, etc.) to do CoT/thinking followed deciding which action to take." 
295 | ), 296 | klass=ReasoningAgent, 297 | add_arguments=build_argparser, 298 | ) 299 | -------------------------------------------------------------------------------- /agents/walkthrough.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | import tales 4 | from tales.agent import register 5 | from tales.token import get_token_counter 6 | 7 | 8 | class WalkthroughAgent(tales.Agent): 9 | def __init__(self, **kwargs): 10 | self.token_counter = get_token_counter() 11 | self.walkthrough = None 12 | 13 | @property 14 | def uid(self): 15 | return f"WalkthroughAgent" 16 | 17 | @property 18 | def params(self): 19 | return {} 20 | 21 | def reset(self, obs, info, env_name): 22 | # Store the walkthrough in reverse order so we can pop from it. 23 | if self.walkthrough is None: 24 | self.walkthrough = info.get("extra.walkthrough")[::-1] 25 | 26 | def act(self, obs, reward, done, info): 27 | stats = { 28 | "prompt": None, 29 | "response": None, 30 | "nb_tokens": self.token_counter(text=obs), 31 | } 32 | 33 | if len(self.walkthrough) == 0: 34 | return "QUIT", stats 35 | 36 | return self.walkthrough.pop(), stats 37 | 38 | 39 | def build_argparser(parser=None): 40 | return parser or argparse.ArgumentParser() 41 | 42 | 43 | register( 44 | name="walkthrough", 45 | desc=("This agent will follow the walkthrough provided by the environment."), 46 | klass=WalkthroughAgent, 47 | add_arguments=build_argparser, 48 | ) 49 | -------------------------------------------------------------------------------- /docs/website/Gemfile: -------------------------------------------------------------------------------- 1 | source "https://rubygems.org" 2 | 3 | gem "github-pages", group: :jekyll_plugins -------------------------------------------------------------------------------- /docs/website/_config.yml: -------------------------------------------------------------------------------- 1 | remote_theme: pages-themes/cayman@v0.2.0 2 | plugins: 
3 | - jekyll-remote-theme # add this line to the plugins list if you already have one 4 | 5 | title: "the Text Adventure Learning Environment Suite" 6 | 7 | # description: "A Text-games Benchmark" -------------------------------------------------------------------------------- /docs/website/_includes/footer.html: -------------------------------------------------------------------------------- 1 | 16 | 17 | -------------------------------------------------------------------------------- /docs/website/_includes/head-custom.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /docs/website/_includes/simple_table.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 |
RankModelOrganizationModel TypeTALES Score
1claude-3.7-sonnetAnthropicReasoning52.5%
2claude-3.5-sonnet-latestAnthropicNon-reasoning50.4%
3gemini-2.5-pro-preview*GoogleNon-reasoning48.8%
4o1OpenAIReasoning44.2%
5gpt-4oOpenAINon-reasoning40.6%
6claude-3.5-haikuAnthropicNon-reasoning39.6%
7Llama-3.1-405B-InstructMetaNon-reasoning36.4%
8gemini-2.0-flashGoogleNon-reasoning35.0%
9Llama-3.3-70B-InstructMetaNon-reasoning32.8%
10Llama-3.1-70B-InstructMetaNon-reasoning32.0%
11Qwen2.5-72B-InstructAlibabaNon-reasoning30.7%
12Mistral-Large-Instruct-2407Mistral AINon-reasoning30.3%
13gpt-4o-miniOpenAINon-reasoning21.8%
14Llama-4-Scout-17B-16E-InstructMetaNon-reasoning19.8%
15Llama-4-Maverick-17B-128E-InstructMetaNon-reasoning15.5%
16Mistral-Small-Instruct-2409Mistral AINon-reasoning14.8%
17Llama-3.1-8B-InstructMetaNon-reasoning13.9%
18DeepSeek-R1DeepSeek AIReasoning12.4%
19Qwen2.5-7B-InstructAlibabaNon-reasoning11.7%
20Llama-3.2-3B-InstructMetaNon-reasoning10.4%
21phi-4MicrosoftNon-reasoning10.3%
22Mistral-Small-24B-Instruct-2501Mistral AINon-reasoning8.8%
23DeepSeek-R1-Distill-Llama-70BDeepSeek AIReasoning8.4%
24Ministral-8B-Instruct-2410Mistral AINon-reasoning4.6%
25Mistral-Small-3.1-24B-Instruct-2503Mistral AINon-reasoning4.5%
26Mixtral-8x22B-Instruct-v0.1Mistral AINon-reasoning3.7%
27Llama-3.2-1B-InstructMetaNon-reasoning3.3%
28Phi-3-mini-128k-instructMicrosoftNon-reasoning2.2%
29Phi-3.5-MoE-instructMicrosoftNon-reasoning1.7%
30Phi-4-mini-instructMicrosoftNon-reasoning1.5%
31Mixtral-8x7B-Instruct-v0.1Mistral AINon-reasoning1.3%
32Phi-3.5-mini-instructMicrosoftNon-reasoning1.0%
33Phi-3-medium-128k-instructMicrosoftNon-reasoning0.7%
246 |
247 | -------------------------------------------------------------------------------- /docs/website/_includes/table.md: -------------------------------------------------------------------------------- 1 |
2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | 153 | 154 | 155 | 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | 168 | 169 | 170 | 171 | 172 | 173 | 174 | 175 | 176 | 177 | 178 | 179 | 180 | 181 | 182 | 183 | 184 | 185 | 186 | 187 | 188 | 189 | 190 | 191 | 192 | 193 | 194 | 195 | 196 | 197 | 198 | 199 | 200 | 201 | 202 | 203 | 204 | 205 | 206 | 207 | 208 | 209 | 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | 240 | 241 | 242 | 243 | 244 | 245 | 246 | 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | 273 | 274 | 275 | 276 | 277 | 278 | 279 | 280 | 281 | 282 | 283 | 284 | 285 | 286 | 287 | 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 |
ModelTextworldTextworld ExpressAlfworldScienceworldJerichoOverall
claude-3.7-sonnet97.3%91.3%83.3%76.5%12.5%52.5%
claude-3.5-sonnet-latest95.5%81.6%75.0%82.3%9.6%50.4%
gemini-2.5-pro-preview*98.5%91.8%75.0%64.2%12.4%48.8%
o197.8%70.2%28.3%80.1%10.3%44.2%
gpt-4o83.6%80.6%56.7%61.4%5.6%40.6%
claude-3.5-haiku94.9%79.8%26.7%67.3%5.0%39.6%
Llama-3.1-405B-Instruct90.9%79.2%31.7%51.8%6.1%36.4%
gemini-2.0-flash80.8%76.1%20.0%57.1%5.4%35.0%
Llama-3.3-70B-Instruct69.6%77.2%15.0%55.1%4.5%32.8%
Llama-3.1-70B-Instruct65.6%81.9%8.3%51.9%5.3%32.0%
Qwen2.5-72B-Instruct76.5%83.8%36.7%35.0%2.9%30.7%
Mistral-Large-Instruct-240782.4%68.3%6.7%46.1%5.8%30.3%
gpt-4o-mini56.5%73.6%0.0%27.2%1.8%21.8%
Llama-4-Scout-17B-16E-Instruct41.1%68.4%0.0%27.0%1.8%19.8%
Llama-4-Maverick-17B-128E-Instruct43.5%56.1%8.3%11.5%2.0%15.5%
Mistral-Small-Instruct-240956.1%27.3%0.0%24.4%1.4%14.8%
Llama-3.1-8B-Instruct29.7%50.3%0.0%15.7%2.3%13.9%
DeepSeek-R137.1%38.6%0.0%15.8%1.0%12.4%
Qwen2.5-7B-Instruct27.7%45.6%0.0%12.6%0.7%11.7%
Llama-3.2-3B-Instruct21.4%42.0%0.0%10.0%1.5%10.4%
phi-420.8%43.8%0.0%8.9%1.6%10.3%
Mistral-Small-24B-Instruct-250115.8%23.0%0.0%15.8%1.4%8.8%
DeepSeek-R1-Distill-Llama-70B8.7%39.8%0.0%7.7%1.3%8.4%
Ministral-8B-Instruct-241010.9%22.8%0.0%2.3%0.4%4.6%
Mistral-Small-3.1-24B-Instruct-25032.5%10.3%0.0%10.5%0.8%4.5%
Mixtral-8x22B-Instruct-v0.117.1%8.4%0.0%4.0%0.4%3.7%
Llama-3.2-1B-Instruct0.0%19.0%0.0%2.4%0.6%3.3%
Phi-3-mini-128k-instruct2.7%9.4%0.0%2.4%0.3%2.2%
Phi-3.5-MoE-instruct0.0%7.0%0.0%2.3%0.4%1.7%
Phi-4-mini-instruct0.0%5.5%0.0%2.3%0.5%1.5%
Mixtral-8x7B-Instruct-v0.10.0%1.6%0.0%4.0%0.3%1.3%
Phi-3.5-mini-instruct0.0%2.0%0.0%2.4%0.5%1.0%
Phi-3-medium-128k-instruct0.0%0.0%0.0%2.3%0.3%0.7%
314 |
315 | -------------------------------------------------------------------------------- /docs/website/_includes/test.md: -------------------------------------------------------------------------------- 1 | This is the string you want to save. -------------------------------------------------------------------------------- /docs/website/_layouts/default.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | {% seo %} 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | {% include head-custom.html %} 20 | 21 | 22 | 23 | 24 | 25 | Skip to the content. 26 | 27 | 40 | 41 |
42 | {{ content }} 43 | 44 | {%- include footer.html -%} 45 | 46 |
47 | 48 | 49 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /docs/website/assets/css/custom.css: -------------------------------------------------------------------------------- 1 | .site-footer-credits { 2 | color: #67747a; 3 | } 4 | -------------------------------------------------------------------------------- /docs/website/assets/css/style.scss: -------------------------------------------------------------------------------- 1 | --- 2 | --- 3 | 4 | @import "{{ site.theme }}"; 5 | 6 | /* Tabs styling */ 7 | .tab-container { 8 | width: 100%; 9 | margin-top: 20px; 10 | } 11 | 12 | .tabs, .nested-tabs { 13 | display: flex; /* Add flexbox display */ 14 | overflow: hidden; 15 | border: 1px solid #ccc; 16 | background-color: #f1f1f1; 17 | border-radius: 4px 4px 0 0; 18 | width: 100%; /* Ensure full width */ 19 | } 20 | 21 | .nested-tabs { 22 | margin-bottom: 15px; 23 | } 24 | 25 | /* Shared styles for both main and nested tab buttons */ 26 | .tab-button, .nested-tab-button { 27 | background-color: inherit; 28 | border: none; 29 | outline: none; 30 | cursor: pointer; 31 | transition: 0.3s; 32 | flex: 1; /* Make tabs grow evenly to fill space */ 33 | text-align: center; /* Center text in tabs */ 34 | } 35 | 36 | /* Main tab button specific styles */ 37 | .tab-button { 38 | padding: 14px 16px; 39 | font-size: 16px; 40 | } 41 | 42 | /* Nested tab button specific styles */ 43 | .nested-tab-button { 44 | padding: 10px 12px; 45 | font-size: 14px; 46 | } 47 | 48 | .tab-button:hover, .nested-tab-button:hover { 49 | background-color: #ddd; 50 | } 51 | 52 | .tab-button.active { 53 | background-color: #157878; 54 | color: white; 55 | } 56 | 57 | .nested-tab-button.active { 58 | background-color: #1a9a9a; /* Slightly different color to distinguish */ 59 | color: white; 60 | } 61 | 62 | .tab-content { 63 | display: none; 64 | padding: 20px; 65 | border: 1px solid #ccc; 66 | border-top: none; 67 | border-radius: 0 0 4px 4px; 68 | 
width: 100%; /* Ensure content takes full width */ 69 | } 70 | 71 | .tab-content.active { 72 | display: block; 73 | } 74 | 75 | .nested-tab-content { 76 | display: none; 77 | padding: 10px 0; 78 | border-top: none; 79 | width: 100%; /* Ensure nested content takes full width */ 80 | } 81 | 82 | .nested-tab-content.active { 83 | display: block; 84 | } 85 | 86 | #main-description { 87 | font-weight: normal; 88 | font-style: normal; 89 | } 90 | 91 | /* If needed, control other text properties */ 92 | #main-description { 93 | font-size: 16px; 94 | line-height: 1.5; 95 | color: #333; 96 | } 97 | 98 | .author-tagline { 99 | text-align: center; 100 | font-style: italic; 101 | color: #666; 102 | margin-bottom: 20px; 103 | } 104 | 105 | .abstract-container { 106 | background-color: #f5f5f5; 107 | border-radius: 8px; 108 | padding: 20px; 109 | margin: 20px 0; 110 | border-left: 4px solid #ddd; 111 | } 112 | 113 | .abstract-container h3 { 114 | margin-top: 0; 115 | color: #333; 116 | } 117 | 118 | .abstract-container p { 119 | margin-bottom: 0; 120 | line-height: 1.6; 121 | } 122 | 123 | .abstract-tagline { 124 | text-align: center; 125 | font-weight: bold; 126 | color: #666; 127 | margin-bottom: 20px; 128 | } 129 | 130 | /* Table styling */ 131 | // .table-container { 132 | // overflow-x: auto; 133 | // margin: 20px 0; 134 | // } 135 | 136 | .model-scores { 137 | width: 100%; 138 | border-collapse: collapse; 139 | font-size: 14px; 140 | } 141 | 142 | .model-scores th { 143 | background-color: #157878; 144 | color: white; 145 | text-align: left; 146 | padding: 10px; 147 | position: sticky; 148 | top: 0; 149 | } 150 | 151 | .model-scores td { 152 | padding: 8px 10px; 153 | border-bottom: 1px solid #ddd; 154 | } 155 | 156 | .model-scores tr:nth-child(even) { 157 | background-color: #f2f2f2; 158 | } 159 | 160 | .model-scores tr:hover { 161 | background-color: #e8f4f4; 162 | } 163 | 164 | /* Responsive design for mobile */ 165 | @media screen and (max-width: 768px) { 166 | 
.model-scores { 167 | font-size: 12px; 168 | } 169 | 170 | .model-scores th, .model-scores td { 171 | padding: 6px; 172 | } 173 | } 174 | 175 | .environment-container { 176 | background-color: #f5f5f5; 177 | border-radius: 8px; 178 | padding: 20px; 179 | margin: 20px 0; 180 | border-left: 4px solid #157878; 181 | } 182 | 183 | .environment-tagline { 184 | text-align: center; 185 | font-weight: bold; 186 | color: #157878; 187 | margin-bottom: 20px; 188 | } 189 | 190 | .environment-container p:not(.environment-tagline) { 191 | margin-bottom: 0; 192 | line-height: 1.6; 193 | } 194 | 195 | .cite-button { 196 | background: none; 197 | border: none; 198 | color: #157878; 199 | cursor: pointer; 200 | font-size: 0.8em; 201 | padding: 0 5px; 202 | vertical-align: middle; 203 | transition: transform 0.2s; 204 | } 205 | 206 | .cite-button:hover { 207 | transform: scale(1.2); 208 | } 209 | 210 | .modal { 211 | display: none; 212 | position: fixed; 213 | z-index: 1000; 214 | left: 0; 215 | top: 0; 216 | width: 100%; 217 | height: 100%; 218 | overflow: auto; 219 | background-color: rgba(0,0,0,0.4); 220 | } 221 | 222 | /* Modal Content */ 223 | .modal-content { 224 | background-color: #fefefe; 225 | margin: 10% auto; 226 | padding: 20px; 227 | border: 1px solid #888; 228 | width: 80%; 229 | max-width: 600px; 230 | border-radius: 8px; 231 | box-shadow: 0 4px 8px rgba(0,0,0,0.2); 232 | } 233 | 234 | /* The Close Button */ 235 | .close-modal { 236 | color: #aaa; 237 | float: right; 238 | font-size: 28px; 239 | font-weight: bold; 240 | cursor: pointer; 241 | } 242 | 243 | .close-modal:hover, 244 | .close-modal:focus { 245 | color: black; 246 | text-decoration: none; 247 | } 248 | 249 | .citation-popup { 250 | display: none; 251 | position: absolute; /* Use absolute instead of fixed */ 252 | z-index: 1000; 253 | background-color: #fefefe; 254 | border: 1px solid #ddd; 255 | border-radius: 8px; 256 | box-shadow: 0 4px 8px rgba(0,0,0,0.2); 257 | width: 400px; 258 | max-width: 90vw; 259 
| padding: 15px; 260 | } 261 | 262 | /* Citation box styling */ 263 | .citation-box { 264 | background-color: #f9f9f9; 265 | border: 1px solid #ddd; 266 | border-radius: 4px; 267 | padding: 10px; 268 | margin: 10px 0; 269 | max-height: 200px; 270 | overflow-y: auto; 271 | } 272 | 273 | .citation-box pre { 274 | white-space: pre-wrap; 275 | word-wrap: break-word; 276 | margin: 0; 277 | font-family: monospace; 278 | font-size: 12px; 279 | } 280 | 281 | /* Popup header */ 282 | .popup-header { 283 | display: flex; 284 | justify-content: space-between; 285 | align-items: center; 286 | margin-bottom: 10px; 287 | } 288 | 289 | .popup-header h3 { 290 | margin: 0; 291 | font-size: 16px; 292 | } 293 | 294 | .close-popup { 295 | cursor: pointer; 296 | color: #888; 297 | font-size: 18px; 298 | font-weight: bold; 299 | } 300 | 301 | .close-popup:hover { 302 | color: #333; 303 | } 304 | 305 | /* Copy button styling */ 306 | .copy-button { 307 | background-color: #157878; 308 | color: white; 309 | border: none; 310 | padding: 8px 16px; 311 | text-align: center; 312 | text-decoration: none; 313 | display: inline-block; 314 | font-size: 14px; 315 | margin: 10px 0 0 0; 316 | cursor: pointer; 317 | border-radius: 4px; 318 | transition: background-color 0.3s; 319 | } 320 | 321 | .copy-button:hover { 322 | background-color: #0b5c5c; 323 | } 324 | 325 | /* Existing button styling */ 326 | .cite-button { 327 | background: none; 328 | border: none; 329 | color: #157878; 330 | cursor: pointer; 331 | font-size: 0.8em; 332 | padding: 0 5px; 333 | vertical-align: middle; 334 | transition: transform 0.2s; 335 | position: relative; /* For positioning the popup */ 336 | } 337 | 338 | .cite-button:hover { 339 | transform: scale(1.2); 340 | } 341 | 342 | .citation-notice { 343 | text-align: center; 344 | font-style: italic; 345 | color: #555; 346 | margin-bottom: 15px; 347 | font-size: 0.9em; 348 | } 349 | 350 | .environment-image { 351 | display: block; 352 | margin: 0 auto; 353 | max-width: 
100%; 354 | height: auto; 355 | } 356 | 357 | .environment-image-container { 358 | text-align: center; 359 | margin: 20px 0; 360 | } 361 | 362 | 363 | .image-caption { 364 | text-align: center; 365 | font-size: 0.85em; 366 | color: #666; 367 | margin-top: 5px; 368 | font-style: italic; 369 | } 370 | 371 | /* Add this to your stylesheet */ 372 | .table-container { 373 | width: 100%; 374 | overflow-x: hidden; 375 | } 376 | 377 | /* Force the table to fit within the container */ 378 | .responsive-table { 379 | width: 100%; 380 | overflow-x: hidden; 381 | } 382 | 383 | /* Style the model-scores table */ 384 | .model-scores { 385 | width: 100%; 386 | table-layout: fixed; 387 | border-collapse: collapse; 388 | font-size: 14px; /* Base font size */ 389 | } 390 | 391 | /* Give more space to model names, less to percentages */ 392 | .model-scores th:first-child, 393 | .model-scores td:first-child { 394 | width: 16%; /* Prioritize model names */ 395 | text-align: left; 396 | font-weight: 500; 397 | padding-right: 8px; 398 | } 399 | 400 | /* Make percentage columns very compact */ 401 | .model-scores th:not(:first-child), 402 | .model-scores td:not(:first-child) { 403 | width: 14%; /* Distribute remaining 65% across 6 columns */ 404 | text-align: center; 405 | padding-left: 2px; 406 | padding-right: 2px; 407 | } 408 | 409 | /* Force text wrapping in all cells */ 410 | .model-scores th, 411 | .model-scores td { 412 | word-break: break-word; 413 | overflow-wrap: break-word; 414 | white-space: normal; 415 | padding-top: 4px; 416 | padding-bottom: 4px; 417 | } 418 | 419 | /* Progressive font size reduction for different screen sizes */ 420 | @media screen and (max-width: 992px) { 421 | .model-scores { 422 | font-size: 13px; 423 | } 424 | } 425 | 426 | @media screen and (max-width: 768px) { 427 | .model-scores { 428 | font-size: 12px; 429 | } 430 | 431 | .model-scores th:first-child, 432 | .model-scores td:first-child { 433 | width: 16%; 434 | } 435 | 436 | .model-scores 
th:not(:first-child), 437 | .model-scores td:not(:first-child) { 438 | width: 14%; 439 | } 440 | } 441 | 442 | @media screen and (max-width: 576px) { 443 | .model-scores { 444 | font-size: 10px; 445 | } 446 | 447 | .model-scores th:first-child, 448 | .model-scores td:first-child { 449 | width: 12.5%; 450 | } 451 | 452 | .model-scores th:not(:first-child), 453 | .model-scores td:not(:first-child) { 454 | width: 25%; 455 | } 456 | } 457 | 458 | /* For extremely small screens */ 459 | @media screen and (max-width: 400px) { 460 | .model-scores { 461 | font-size: 9px; 462 | } 463 | } 464 | 465 | .simplified-table-container { 466 | width: 100%; 467 | overflow-x: hidden; 468 | max-width: 100%; 469 | display: block; 470 | } 471 | 472 | 473 | /* Make sure simplified-table-container is correctly displayed in nested-tab-content */ 474 | #tab6-subtab1 .responsive-table.simplified-table-container { 475 | width: 100%; 476 | overflow-x: auto; /* Change from hidden to auto if table might overflow on small screens */ 477 | } 478 | 479 | /* Column-specific widths for the simplified table */ 480 | .simplified-table-container .model-scores th:nth-child(1), 481 | .simplified-table-container .model-scores td:nth-child(1) { 482 | width: 10%; /* Rank column - very narrow */ 483 | text-align: center; 484 | } 485 | 486 | .simplified-table-container .model-scores th:nth-child(2), 487 | .simplified-table-container .model-scores td:nth-child(2) { 488 | width: 40%; /* Model name column - give it most space */ 489 | text-align: center; 490 | } 491 | 492 | .simplified-table-container .model-scores th:nth-child(3), 493 | .simplified-table-container .model-scores td:nth-child(3) { 494 | width: 20%; /* Score column - moderate space */ 495 | text-align: center; 496 | } 497 | 498 | .simplified-table-container .model-scores th:nth-child(4), 499 | .simplified-table-container .model-scores td:nth-child(4) { 500 | width: 20%; /* Reasoning column - moderate space */ 501 | text-align: center; 502 | } 503 | 
504 | .simplified-table-container .model-scores th:nth-child(5), 505 | .simplified-table-container .model-scores td:nth-child(5) { 506 | width: 20%; /* Reasoning column - moderate space */ 507 | text-align: center; 508 | } 509 | 510 | /* Responsive adjustments for simplified table */ 511 | @media screen and (max-width: 768px) { 512 | .simplified-table-container .model-scores th:nth-child(1), 513 | .simplified-table-container .model-scores td:nth-child(1) { 514 | width: 10%; 515 | } 516 | 517 | .simplified-table-container .model-scores th:nth-child(2), 518 | .simplified-table-container .model-scores td:nth-child(2) { 519 | width: 40%; 520 | } 521 | 522 | .simplified-table-container .model-scores th:nth-child(3), 523 | .simplified-table-container .model-scores td:nth-child(3), 524 | .simplified-table-container .model-scores th:nth-child(4), 525 | .simplified-table-container .model-scores td:nth-child(4), 526 | .simplified-table-container .model-scores td:nth-child(5) 527 | { 528 | width: 10; 529 | } 530 | } 531 | 532 | .asterisk-note { 533 | font-size: 0.85em; 534 | color: #666; 535 | margin-top: -10px; 536 | margin-bottom: 15px; 537 | font-style: italic; 538 | } 539 | 540 | .video-container { 541 | width: 100%; 542 | margin: 1em 0; 543 | position: relative; 544 | } 545 | 546 | .video-container video { 547 | width: 100%; 548 | display: block; 549 | } -------------------------------------------------------------------------------- /docs/website/assets/figs/alfworld_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/alfworld_all_games.png -------------------------------------------------------------------------------- /docs/website/assets/figs/alfworld_image.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/alfworld_image.png -------------------------------------------------------------------------------- /docs/website/assets/figs/all_framework_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/all_framework_scores.png -------------------------------------------------------------------------------- /docs/website/assets/figs/arxiv-logomark-small.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/website/assets/figs/arxiv-logomark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/website/assets/figs/figure1_eric.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/figure1_eric.png -------------------------------------------------------------------------------- /docs/website/assets/figs/github-mark.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/website/assets/figs/jericho_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/jericho_all_games.png -------------------------------------------------------------------------------- /docs/website/assets/figs/jericho_image.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/jericho_image.png -------------------------------------------------------------------------------- /docs/website/assets/figs/radar_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/radar_chart.png -------------------------------------------------------------------------------- /docs/website/assets/figs/radar_chart_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/radar_chart_zoom.png -------------------------------------------------------------------------------- /docs/website/assets/figs/scienceworld_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/scienceworld_all_games.png -------------------------------------------------------------------------------- /docs/website/assets/figs/scienceworld_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/scienceworld_image.png -------------------------------------------------------------------------------- /docs/website/assets/figs/simon_says_chatgpt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/simon_says_chatgpt.png 
-------------------------------------------------------------------------------- /docs/website/assets/figs/static_banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/static_banner.png -------------------------------------------------------------------------------- /docs/website/assets/figs/text-benchmark_bar_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_bar_chart.png -------------------------------------------------------------------------------- /docs/website/assets/figs/text-benchmark_radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_radar.png -------------------------------------------------------------------------------- /docs/website/assets/figs/text-benchmark_radar_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_radar_zoom.png -------------------------------------------------------------------------------- /docs/website/assets/figs/textworld_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_all_games.png -------------------------------------------------------------------------------- /docs/website/assets/figs/textworld_express_all_games.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_express_all_games.png -------------------------------------------------------------------------------- /docs/website/assets/figs/textworld_image.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_image.png -------------------------------------------------------------------------------- /docs/website/assets/figs/zork1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/zork1.png -------------------------------------------------------------------------------- /docs/website/assets/js/tabs.js: -------------------------------------------------------------------------------- 1 | function openTab(evt, tabName) { 2 | var i, tabcontent, tabbuttons; 3 | 4 | // Hide all tab content 5 | tabcontent = document.getElementsByClassName("tab-content"); 6 | for (i = 0; i < tabcontent.length; i++) { 7 | tabcontent[i].style.display = "none"; 8 | } 9 | 10 | // Remove "active" class from all tab buttons 11 | tabbuttons = document.getElementsByClassName("tab-button"); 12 | for (i = 0; i < tabbuttons.length; i++) { 13 | tabbuttons[i].className = tabbuttons[i].className.replace(" active", ""); 14 | } 15 | 16 | // Show the current tab and add "active" class to the button 17 | document.getElementById(tabName).style.display = "block"; 18 | evt.currentTarget.className += " active"; 19 | } 20 | 21 | // Nested tab functionality 22 | function openNestedTab(evt, tabName) { 23 | var i, tabcontent, tabbuttons; 24 | 25 | // Hide all nested tab content within the parent tab 26 | 
var parentTab = evt.currentTarget.closest('.tab-content'); 27 | tabcontent = parentTab.getElementsByClassName("nested-tab-content"); 28 | for (i = 0; i < tabcontent.length; i++) { 29 | tabcontent[i].style.display = "none"; 30 | } 31 | 32 | // Remove "active" class from all nested tab buttons 33 | tabbuttons = parentTab.getElementsByClassName("nested-tab-button"); 34 | for (i = 0; i < tabbuttons.length; i++) { 35 | tabbuttons[i].className = tabbuttons[i].className.replace(" active", ""); 36 | } 37 | 38 | // Show the current nested tab and add "active" class to the button 39 | document.getElementById(tabName).style.display = "block"; 40 | evt.currentTarget.className += " active"; 41 | } 42 | 43 | 44 | function copyTextToClipboard(elementId, event) { 45 | console.log("Citation button clicked for: " + elementId); 46 | 47 | // Get the citation text 48 | var citationText = document.getElementById(elementId); 49 | if (!citationText) { 50 | console.error("Citation element not found: " + elementId); 51 | return; 52 | } 53 | 54 | // Force create popup if not exists 55 | var popup = document.getElementById('citation-popup'); 56 | if (!popup) { 57 | console.log("Creating popup because it doesn't exist yet"); 58 | var popupHTML = 59 | '
' + 60 | '' + 64 | '
' + 65 | '
' +
 66 |                 '
' + 67 | '' + 68 | '
'; 69 | 70 | document.body.insertAdjacentHTML('beforeend', popupHTML); 71 | popup = document.getElementById('citation-popup'); 72 | 73 | // Set up event handlers for the newly created popup 74 | var closeButton = document.querySelector('.close-popup'); 75 | var copyButton = document.getElementById('copy-citation-button'); 76 | 77 | if (closeButton) { 78 | closeButton.onclick = function() { 79 | popup.style.display = 'none'; 80 | }; 81 | } 82 | 83 | if (copyButton) { 84 | copyButton.onclick = function() { 85 | var text = document.getElementById('citation-text').innerText; 86 | navigator.clipboard.writeText(text).then(function() { 87 | copyButton.innerText = 'Copied!'; 88 | setTimeout(function() { 89 | copyButton.innerText = 'Copy to Clipboard'; 90 | }, 1500); 91 | }); 92 | }; 93 | } 94 | } 95 | 96 | // Set the citation text in the popup 97 | var citationTextElement = document.getElementById('citation-text'); 98 | if (citationTextElement) { 99 | citationTextElement.innerText = citationText.innerText; 100 | } 101 | 102 | // Position the popup near the mouse cursor instead of the button 103 | var x = event.clientX; 104 | var y = event.clientY; 105 | 106 | // Get dimensions 107 | var viewportWidth = window.innerWidth || document.documentElement.clientWidth; 108 | var viewportHeight = window.innerHeight || document.documentElement.clientHeight; 109 | var scrollTop = window.pageYOffset || document.documentElement.scrollTop; 110 | var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft; 111 | 112 | // Show the popup temporarily to get its dimensions 113 | popup.style.visibility = 'hidden'; 114 | popup.style.display = 'block'; 115 | var popupWidth = popup.offsetWidth; 116 | var popupHeight = popup.offsetHeight; 117 | 118 | // Calculate position to ensure popup stays in viewport 119 | // Add 10px padding from edges 120 | var padding = 10; 121 | 122 | // Position horizontally 123 | if (x + popupWidth + padding > viewportWidth) { 124 | // If too far right, 
position to the left of cursor 125 | x = Math.max(padding, x - popupWidth - padding); 126 | } else { 127 | // Otherwise position to the right of cursor with padding 128 | x = x + padding; 129 | } 130 | 131 | // Position vertically 132 | if (y + popupHeight + padding > viewportHeight) { 133 | // If too far down, position above cursor 134 | y = Math.max(padding, y - popupHeight - padding); 135 | } else { 136 | // Otherwise position below cursor with padding 137 | y = y + padding; 138 | } 139 | 140 | // Apply the position (convert from viewport coordinates to document coordinates) 141 | popup.style.left = (x + scrollLeft) + 'px'; 142 | popup.style.top = (y + scrollTop) + 'px'; 143 | 144 | // Make the popup visible 145 | popup.style.visibility = 'visible'; 146 | 147 | // Prevent default action and event bubbling 148 | event.preventDefault(); 149 | event.stopPropagation(); 150 | } 151 | 152 | // Initialize tabs and set up the citation popup 153 | document.addEventListener('DOMContentLoaded', function() { 154 | // Make sure the first tab and its first nested tab are active by default 155 | var firstTabButton = document.querySelector('.tab-button'); 156 | if (firstTabButton) { 157 | firstTabButton.click(); 158 | } 159 | 160 | // Create the citation popup HTML if it doesn't exist 161 | if (!document.getElementById('citation-popup')) { 162 | var popupHTML = 163 | '
' + 164 | '' + 168 | '
' + 169 | '
' +
170 |                 '
' + 171 | '' + 172 | '
'; 173 | 174 | document.body.insertAdjacentHTML('beforeend', popupHTML); 175 | 176 | // Now set up the event handlers for the popup 177 | var popup = document.getElementById('citation-popup'); 178 | var closeButton = document.querySelector('.close-popup'); 179 | var copyButton = document.getElementById('copy-citation-button'); 180 | 181 | // Close popup when clicking the close button 182 | if (closeButton) { 183 | closeButton.onclick = function() { 184 | popup.style.display = 'none'; 185 | }; 186 | } 187 | 188 | // When the user clicks the copy button 189 | if (copyButton) { 190 | copyButton.onclick = function() { 191 | var text = document.getElementById('citation-text').innerText; 192 | navigator.clipboard.writeText(text).then(function() { 193 | // Change button text temporarily to provide feedback 194 | var originalText = copyButton.innerText; 195 | copyButton.innerText = 'Copied!'; 196 | setTimeout(function() { 197 | copyButton.innerText = originalText; 198 | }, 1500); 199 | }).catch(function(err) { 200 | console.error('Could not copy text: ', err); 201 | }); 202 | }; 203 | } 204 | 205 | // Close popup when clicking outside 206 | document.addEventListener('click', function(event) { 207 | if (popup && 208 | !popup.contains(event.target) && 209 | !event.target.classList.contains('cite-button') && 210 | popup.style.display === 'block') { 211 | popup.style.display = 'none'; 212 | } 213 | }); 214 | } 215 | }); 216 | 217 | 218 | document.addEventListener('DOMContentLoaded', function() { 219 | // Find the simplified table by its container 220 | const simplifiedTableContainer = document.getElementById('tab6-subtab1'); 221 | if (simplifiedTableContainer) { 222 | const table = simplifiedTableContainer.querySelector('table'); 223 | if (table) { 224 | table.classList.add('simplified-scores'); 225 | } 226 | } 227 | }); -------------------------------------------------------------------------------- /docs/website/assets/videos/figure1v4.mp4: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/videos/figure1v4.mp4 -------------------------------------------------------------------------------- /docs/website/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/favicon.ico -------------------------------------------------------------------------------- /print_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | from os.path import join as pjoin 5 | 6 | import pandas as pd 7 | 8 | 9 | def parse_args(): 10 | # fmt: off 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument("--logs", metavar="path", nargs="+", default=["logs"], 13 | help="Paths within which to look for .jsonl files.") 14 | return parser.parse_args() 15 | # fmt: on 16 | 17 | 18 | def main(): 19 | args = parse_args() 20 | 21 | results = [] 22 | for logpath in args.logs: 23 | for logfile in glob.glob(pjoin(logpath, "**", "*.jsonl"), recursive=True): 24 | 25 | path, _ = os.path.splitext(logfile) 26 | _, agent, env_name, env_params = path.rsplit(os.path.sep, maxsplit=3) 27 | admissible_command, game_seed = env_params.split("_") 28 | admissible_command = bool(int(admissible_command[1])) 29 | agent = agent.split("_", maxsplit=1)[1] 30 | 31 | data = pd.read_json(logfile, lines=True) 32 | 33 | results.append( 34 | { 35 | "agent": agent, 36 | "env_name": env_name, 37 | # "env_params": env_params, 38 | "admissible_command": admissible_command, 39 | "game_seed": game_seed, 40 | "total_tokens": data["Token Usage"].sum(), 41 | "avg_tokens_per_step": data["Token Usage"].mean(), 42 | "norm_score": data["Normalized Score"].max(), 43 | "nb_steps": data["Step"].max(), 
44 | # TODO: add more metrics: duration, nb_resets, nb_wins/losts, nb_invalid_actions, in-game moves 45 | } 46 | ) 47 | df = pd.DataFrame.from_records(results) 48 | 49 | group = df.groupby(["agent", "admissible_command", "env_name"]) 50 | columns = ["total_tokens", "avg_tokens_per_step", "norm_score", "nb_steps"] 51 | print(group[columns].mean()) 52 | print() 53 | 54 | group = df.groupby(["agent", "admissible_command"]) 55 | aggregated_results = group.agg( 56 | { 57 | "total_tokens": "sum", 58 | "avg_tokens_per_step": "mean", 59 | "norm_score": ["mean", "std"], 60 | "nb_steps": "mean", 61 | } 62 | ) 63 | print(aggregated_results) 64 | 65 | 66 | if __name__ == "__main__": 67 | main() 68 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "tale-suite" 7 | version = "1.0.0rc1" 8 | description = "TALES: Text-Adventure Learning Environment Suite" 9 | readme = "README.md" 10 | requires-python = ">=3.12" 11 | dynamic = ["dependencies"] 12 | 13 | classifiers = [ 14 | "Programming Language :: Python :: 3", 15 | "License :: OSI Approved :: MIT License", 16 | "Operating System :: OS Independent", 17 | ] 18 | 19 | [tool.setuptools.dynamic] 20 | dependencies = {file = ["requirements.txt"]} 21 | 22 | [tool.setuptools.packages.find] 23 | exclude = ["wandb/*", "logs/*", "website/*"] 24 | 25 | 26 | [project.optional-dependencies] 27 | dev = [ 28 | "pytest", 29 | "pre-commit", 30 | "black", 31 | "isort", 32 | ] 33 | 34 | [project.urls] 35 | "Homepage" = "https://github.com/microsoft/tale-suite" 36 | "Bug Tracker" = "https://github.com/microsoft/tale-suite/issues" 37 | 38 | [tool.black] 39 | line-length = 88 40 | 41 | [tool.isort] 42 | profile = "black" 43 | known_third_party = ["wandb"] 44 | 
-------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tatsu==5.8.3 2 | gymnasium>=1.0.0 3 | jericho>=3.3.0 4 | textworld[pddl]>=1.6.2rc3 5 | textworld-express>=1.1.0rc5 6 | scienceworld>=1.2.2 7 | discoveryworld 8 | 9 | alfworld>=0.4.0 10 | 11 | termcolor 12 | wandb 13 | numpy 14 | pandas 15 | 16 | # llm>=0.18.0 17 | llm @ git+https://github.com/MarcCote/llm.git@add_extra_body_option 18 | llm-anthropic 19 | llm-gemini 20 | llm-azure-openai @ git+https://github.com/MarcCote/llm-azure-openai.git@generic_ad_auth 21 | anthropic 22 | google-genai 23 | tiktoken 24 | tenacity 25 | transformers 26 | -------------------------------------------------------------------------------- /scripts/example_script.sh: -------------------------------------------------------------------------------- 1 | # This is an example script to show how to use a self-hosted model with vllm to run the twb 2 | model="" 3 | 4 | cat <<EOL >> .config/io.datasette.llm/extra-openai-models.yaml 5 | 6 | - model_id: $model 7 | model_name: $model 8 | api_base: "http://127.0.0.1:8002/v1" 9 | EOL 10 | 11 | export WANDB_API_KEY='' 12 | # Makes a log folder for vllm (-p: no error if the folder already exists) 13 | mkdir -p logs 14 | 15 | # Run the vllm server for the mistralai/Ministral-8B-Instruct-2410 model on port 8002. Make sure you have set your HF token 16 | nohup bash -c 'until ! (python -m vllm.entrypoints.openai.api_server --model mistralai/Ministral-8B-Instruct-2410 --port 8002 --tensor-parallel-size 1 --trust-remote-code --host 0.0.0.0 > logs/vllm_1.log 2>&1); do sleep 120; done' & 17 | 18 | # To make sure this doesn't run forever, we wait at most $timeout seconds and check every $interval seconds 19 | echo "Waiting for VLLM server to start..." 
20 | timeout=500 21 | interval=30 22 | elapsed=0 23 | 24 | # Wait loop with timeout 25 | until curl -s -o /dev/null -w "%{http_code}" http://localhost:8002/v1/models | grep -q "200"; do 26 | if [ "$elapsed" -ge "$timeout" ]; then 27 | echo "Timeout reached! VLLM server did not start within ${timeout} seconds." 28 | exit 1 29 | fi 30 | sleep $interval 31 | echo "Pinging vllm server..." 32 | elapsed=$((elapsed + interval)) 33 | done 34 | 35 | # Send a test request to the API 36 | curl -X POST "http://localhost:8002/v1/completions" -H "Content-Type: application/json" -d '{"model": "mistralai/Ministral-8B-Instruct-2410", "prompt": "You want to play a (text) game?", "max_tokens": 10}' 37 | 38 | # Run the text games benchmark with the model we just set up for Zork1 39 | wandb login 40 | 41 | set -ex 42 | 43 | pids="" 44 | 45 | # Each run is launched in the background so that $! is the PID of that run (not of the vllm server started above) 46 | python_run() { :; }; for i in {1..5}; do python benchmark.py --agent agents/llm.py zero-shot --conversation --llm mistralai/Ministral-8B-Instruct-2410 --envs JerichoEnvZork1 --context 100 --nb-steps 100 --conversation --seed "20241106$((i))" & 47 | pids="$pids $!" 
48 | sleep 60 49 | done 50 | 51 | wait $pids -------------------------------------------------------------------------------- /tales/__init__.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | import traceback 4 | import warnings 5 | from collections import defaultdict 6 | 7 | from termcolor import colored 8 | 9 | from tales.agent import Agent 10 | from tales.version import __version__ 11 | 12 | root_dir = os.path.dirname(os.path.abspath(__file__)) 13 | tasks = [] 14 | envs = [] 15 | envs_per_task = defaultdict(list) 16 | 17 | _exclude_path = ["__pycache__", "tests"] 18 | 19 | for dirname in os.listdir(root_dir): 20 | if not os.path.isdir(os.path.join(root_dir, dirname)): 21 | continue 22 | 23 | if dirname in _exclude_path: 24 | continue 25 | 26 | if "skip" in os.listdir(os.path.join(root_dir, dirname)): 27 | continue 28 | 29 | if "__init__.py" in os.listdir(os.path.join(root_dir, dirname)): 30 | tasks.append(dirname) 31 | 32 | 33 | for task in tasks: 34 | try: 35 | # Load environments 36 | module = importlib.import_module(f".{task}", package="tales") 37 | environments = getattr(module, "environments", None) 38 | if environments: 39 | for env_name, version in environments: 40 | envs.append(env_name) 41 | envs_per_task[task].append(env_name) 42 | else: 43 | warnings.warn( 44 | "Failed to load `{}.environments`. Skipping the task.".format(task), 45 | UserWarning, 46 | ) 47 | continue 48 | 49 | except Exception as e: 50 | warnings.warn( 51 | "Failed to import `{}`. 
Skipping the task.".format(task), UserWarning 52 | ) 53 | warnings.warn(colored(f"{e}", "red"), UserWarning) 54 | warnings.warn(colored(f"{traceback.format_exc()}", "red"), UserWarning) 55 | continue 56 | 57 | envs_per_task = dict(envs_per_task) 58 | env2task = {env: task for task, envs in envs_per_task.items() for env in envs} 59 | 60 | __all__ = ["Agent", "__version__", "envs", "envs_per_task", "tasks"] 61 | -------------------------------------------------------------------------------- /tales/agent.py: -------------------------------------------------------------------------------- 1 | class Agent: 2 | 3 | def reset(self, obs, info, env): 4 | pass 5 | 6 | def act(self, obs, reward, done, info): 7 | raise NotImplementedError("Child class must implement this method.") 8 | 9 | @property 10 | def uid(self): 11 | """Unique identifier for this agent. 12 | 13 | Usually, this is a string that contains the class name and the values of the 14 | parameters used to initialize the agent. 15 | """ 16 | # return f"{self.__class__.__name__}_" + "_".join( 17 | # f"{k}:{v}" for k, v in self.kwargs.items() 18 | # ).strip("_") 19 | raise NotImplementedError("Child class must implement this property.") 20 | 21 | @property 22 | def params(self): 23 | """Parameters used to initialize the agent. 24 | 25 | Returns: 26 | dict: Parameters used to initialize the agent. 27 | """ 28 | # return self.kwargs 29 | raise NotImplementedError("Child class must implement this property.") 30 | 31 | 32 | # Registry for available agents to benchmark. 33 | AGENTS = {} 34 | 35 | 36 | def register(name: str, desc: str, klass: callable, add_arguments: callable) -> None: 37 | """ Register a new type of Agent. 38 | 39 | Arguments: 40 | name: 41 | Name of the agent (must be unique). 42 | desc: 43 | Brief description of how the agent works (for `benchmark.py --help`). 44 | klass: 45 | Class used to instantiate the agent. 
46 | add_arguments: 47 | Function that should add the `argparse` arguments needed for this agent. 48 | The provided function should expect a `argparse.ArgumentParser` object. 49 | 50 | Example: 51 | 52 | >>> from tales.agent import register 53 | >>> from tales.agents import RandomAgent 54 | >>> def _add_arguments(parser): 55 | parser.add_argument("--seed", required=True, type=int, 56 | help="Random seed to use.") 57 | >>> \ 58 | >>> register(name="random", 59 | >>> desc="This agent randomly select actions.", 60 | >>> klass=RandomAgent, 61 | >>> add_arguments=_add_arguments) 62 | """ 63 | if name in AGENTS: 64 | raise ValueError(f"Agent '{name}' already registered.") 65 | 66 | AGENTS[name] = (desc, klass, add_arguments) 67 | -------------------------------------------------------------------------------- /tales/alfworld/__init__.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | from .alfworld_data import TASK_TYPES, prepare_alfworld_data 4 | from .alfworld_env import ALFWorldTask 5 | 6 | environments = [] 7 | 8 | for split in ["seen", "unseen"]: 9 | for task_type in TASK_TYPES: 10 | task_name = task_type.replace("_", " ").title().replace(" ", "") 11 | env_name = f"ALFWorld{task_name}{split.title()}" 12 | environments.append([env_name, "v0"]) 13 | 14 | gym.register( 15 | id=f"tales/{env_name}-v0", 16 | entry_point="tales.alfworld:ALFWorldTask", 17 | kwargs={"task_type": task_type, "split": split}, 18 | ) 19 | 20 | 21 | def download(): 22 | prepare_alfworld_data() 23 | -------------------------------------------------------------------------------- /tales/alfworld/alfworld_data.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import zipfile 4 | from os.path import join as pjoin 5 | 6 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD 7 | from tales.utils import download 8 | 9 | TASK_TYPES = [ 10 | 
"pick_and_place_simple", 11 | "look_at_obj_in_light", 12 | "pick_clean_then_place_in_recep", 13 | "pick_heat_then_place_in_recep", 14 | "pick_cool_then_place_in_recep", 15 | "pick_two_obj_and_place", 16 | ] 17 | 18 | ALFWORLD_DATA_URL = "https://github.com/alfworld/alfworld/releases/download/0.4.2/json_2.1.3_tw-pddl.zip" 19 | TALES_CACHE_ALFWORLD = pjoin(TALES_CACHE_HOME, "alfworld") 20 | TALES_CACHE_ALFWORLD_DATA_ZIP = pjoin(TALES_CACHE_ALFWORLD, "json_2.1.3_tw-pddl.zip") 21 | TALES_CACHE_ALFWORLD_VALID_SEEN = pjoin( 22 | TALES_CACHE_ALFWORLD, "json_2.1.1", "valid_seen" 23 | ) 24 | TALES_CACHE_ALFWORLD_VALID_UNSEEN = pjoin( 25 | TALES_CACHE_ALFWORLD, "json_2.1.1", "valid_unseen" 26 | ) 27 | 28 | 29 | def prepare_alfworld_data(force=TALES_FORCE_DOWNLOAD): 30 | os.makedirs(TALES_CACHE_ALFWORLD, exist_ok=True) 31 | data_exists = os.path.exists(TALES_CACHE_ALFWORLD_VALID_SEEN) and os.path.exists( 32 | TALES_CACHE_ALFWORLD_VALID_UNSEEN 33 | ) 34 | if data_exists and not force: 35 | return 36 | 37 | if not os.path.exists(TALES_CACHE_ALFWORLD_DATA_ZIP) or force: 38 | download( 39 | ALFWORLD_DATA_URL, 40 | dst=TALES_CACHE_ALFWORLD, 41 | desc="Downloading ALFWorld data", 42 | force=force, 43 | ) 44 | 45 | # Extract the content of the folder test from the downloaded file 46 | with zipfile.ZipFile(TALES_CACHE_ALFWORLD_DATA_ZIP, "r") as zip_ref: 47 | # Only extract the test folder 48 | for member in zip_ref.namelist(): 49 | if "valid_seen" in member or "valid_unseen" in member: 50 | zip_ref.extract(member, TALES_CACHE_ALFWORLD) 51 | 52 | 53 | def get_alfworld_game(task_type, split="seen"): 54 | prepare_alfworld_data() # make sure the data is ready 55 | 56 | if split == "seen": 57 | root = TALES_CACHE_ALFWORLD_VALID_SEEN 58 | elif split == "unseen": 59 | root = TALES_CACHE_ALFWORLD_VALID_UNSEEN 60 | else: 61 | raise ValueError(f"Unknown split: {split}") 62 | 63 | game_files = sorted(glob.glob(pjoin(root, f"{task_type}*", "**", "*.tw-pddl"))) 64 | return game_files 65 | 
-------------------------------------------------------------------------------- /tales/alfworld/alfworld_env.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | import textworld 3 | import textworld.gym 4 | from alfworld.agents.environment.alfred_tw_env import AlfredDemangler 5 | from textworld.envs.wrappers import Filter 6 | 7 | from . import alfworld_data 8 | 9 | 10 | class ALFWorldEnv(gym.Env): 11 | 12 | def __init__(self, gamefile, admissible_commands=False, *args, **kwargs): 13 | self.infos = textworld.EnvInfos( 14 | score=True, 15 | max_score=True, 16 | won=True, 17 | lost=True, 18 | feedback=True, 19 | moves=True, 20 | admissible_commands=admissible_commands, 21 | extras=["walkthrough", "expert_plan"], 22 | ) 23 | self.gamefile = gamefile 24 | self.env = None 25 | 26 | def reset(self, *, seed=None, options=None): 27 | super().reset(seed=seed, options=options) 28 | 29 | if self.env is None: 30 | self.env = textworld.start( 31 | self.gamefile, self.infos, wrappers=[Filter, AlfredDemangler()] 32 | ) 33 | 34 | obs, info = self.env.reset() 35 | info["feedback"] = obs 36 | info["score"] = 0 37 | info["max_score"] = 1 38 | return obs, info 39 | 40 | def step(self, action): 41 | obs, done, reward, info = self.env.step(action) 42 | # if obs == "Nothing happens.": 43 | # obs = "Invalid command or this command can't be used in this context. Type 'help' for a list of available commands." 
44 | 45 | info["feedback"] = obs 46 | info["score"] = int(done) 47 | info["max_score"] = 1 48 | return obs, done, reward, info 49 | 50 | 51 | class ALFWorldTask(ALFWorldEnv): 52 | 53 | def __init__(self, task_type, split, *args, **kwargs): 54 | self.gamefiles = sorted(alfworld_data.get_alfworld_game(task_type, split)) 55 | super().__init__(self.gamefiles[0], *args, **kwargs) 56 | 57 | def reset(self, *, seed=None, options=None): 58 | if seed is not None: 59 | self.gamefile = self.gamefiles[seed % len(self.gamefiles)] 60 | if self.env is not None: 61 | self.env.close() 62 | self.env = None 63 | 64 | return super().reset(seed=seed, options=options) 65 | -------------------------------------------------------------------------------- /tales/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DEFAULT_TALES_CACHE_HOME = os.path.expanduser("~/.cache/tales") 4 | TALES_CACHE_HOME = os.getenv("TALES_CACHE_HOME", DEFAULT_TALES_CACHE_HOME) 5 | os.environ["TALES_CACHE_HOME"] = ( 6 | TALES_CACHE_HOME # Set the environment variable, in case it wasn't. 
7 | ) 8 | os.makedirs(TALES_CACHE_HOME, exist_ok=True) 9 | 10 | # Check if cache is flag is set to force download 11 | TALES_FORCE_DOWNLOAD = os.getenv("TALES_FORCE_DOWNLOAD", "false").lower() in ( 12 | "yes", 13 | "true", 14 | "t", 15 | "1", 16 | ) 17 | -------------------------------------------------------------------------------- /tales/download.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import traceback 3 | import warnings 4 | 5 | from termcolor import colored 6 | from tqdm import tqdm 7 | 8 | from tales import tasks 9 | 10 | 11 | def download(): 12 | for task in tqdm(tasks, desc="Downloading data for TALES"): 13 | try: 14 | module = importlib.import_module(f".{task}", package="tales") 15 | module.download() 16 | except Exception as e: 17 | warnings.warn( 18 | "Failed to download data for `{task}`.", 19 | UserWarning, 20 | ) 21 | warnings.warn(colored(f"{e}", "red"), UserWarning) 22 | warnings.warn(colored(f"{traceback.format_exc()}", "red"), UserWarning) 23 | continue 24 | 25 | 26 | if __name__ == "__main__": 27 | download() 28 | -------------------------------------------------------------------------------- /tales/jericho/__init__.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | from .jericho_data import GAMES_INFOS, prepare_jericho_data 4 | from .jericho_env import JerichoEnv 5 | 6 | environments = [] 7 | 8 | for game, infos in GAMES_INFOS.items(): 9 | env_name = f"JerichoEnv{game.title()}" 10 | environments.append([env_name, "v0"]) 11 | 12 | gym.register( 13 | id=f"tales/{env_name}-v0", 14 | entry_point="tales.jericho:JerichoEnv", 15 | kwargs={"game": game}, 16 | ) 17 | 18 | 19 | def download(): 20 | prepare_jericho_data() 21 | -------------------------------------------------------------------------------- /tales/jericho/games.json: -------------------------------------------------------------------------------- 1 | { 
2 | "905": { 3 | "filename": "905.z5", 4 | "info": "http://ifdb.tads.org/viewgame?id=qzftg3j8nh5f34i2", 5 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/905.z5", 6 | "md5": "4c5067169b834d247a30bb08d1039896" 7 | }, 8 | "acorncourt": { 9 | "filename": "acorncourt.z5", 10 | "info": "http://ifdb.tads.org/viewgame?id=tqvambr6vowym20v", 11 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/acorncourt.z5", 12 | "md5": "a61400439aa76f8faba3b8f01edd4a72" 13 | }, 14 | "advent": { 15 | "filename": "advent.z5", 16 | "info": "http://ifdb.tads.org/viewgame?id=fft6pu91j85y4acv", 17 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Advent.z5", 18 | "md5": "ee2242e155fd8910921b0f8e04019a3a" 19 | }, 20 | "adventureland": { 21 | "filename": "adventureland.z5", 22 | "info": "http://ifdb.tads.org/viewgame?id=dy4ok8sdlut6ddj7", 23 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Adventureland.z5", 24 | "md5": "a42545bd17330ae5e6fed02270ccfb4a" 25 | }, 26 | "afflicted": { 27 | "filename": "afflicted.z8", 28 | "info": "http://ifdb.tads.org/viewgame?id=epl4q2933rczoo9x", 29 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2008/zcode/afflicted/afflicted.z8", 30 | "md5": "064272be87de7106192b6fb743c4dfc4" 31 | }, 32 | "anchor": { 33 | "filename": "anchor.z8", 34 | "info": "http://ifdb.tads.org/viewgame?id=op0uw1gn1tjqmjt7", 35 | "link": "http://ifarchive.org/if-archive/games/zcode/anchor.z8", 36 | "md5": "c043df8624e0e1e9fda92f1a74b6e402" 37 | }, 38 | "awaken": { 39 | "filename": "awaken.z5", 40 | "info": "http://ifdb.tads.org/viewgame?id=rwseuddvj1gbo481", 41 | "link": "https://github.com/danielricks/textplayer/raw/master/games/awaken.z5", 42 | "md5": "9ba48c72d96ab3e7956a8570b12d34d6" 43 | }, 44 | "balances": { 45 | "filename": "balances.z5", 46 | "info": "http://ifdb.tads.org/viewgame?id=x6ne0bbd2oqm6h3a", 47 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Balances.z5", 48 | "md5": 
"f2cb8f94a7e8df3b850a758da26fa387" 49 | }, 50 | "ballyhoo": { 51 | "filename": "ballyhoo.z3", 52 | "info": "http://ifdb.tads.org/viewgame?id=b0i6bx7g4rkrekgg", 53 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FBallyhoo%20v97%20%281986%29%28Infocom%29.zip:BALLYHOO.DAT", 54 | "md5": "5d54e326815b0ed3aff8efb8ff02ef2f" 55 | }, 56 | "curses": { 57 | "filename": "curses.z5", 58 | "info": "http://ifdb.tads.org/viewgame?id=plvzam05bmz3enh8", 59 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/curses.z5", 60 | "md5": "f06a42a29a5a4e6aa70958c9ae4c37cd" 61 | }, 62 | "cutthroat": { 63 | "filename": "cutthroat.z3", 64 | "info": "http://ifdb.tads.org/viewgame?id=4ao65o1u0xuvj8jf", 65 | "link": "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite/cutthroat.z3", 66 | "md5": "216eeeba1c8017a77343dc8482f6f185" 67 | }, 68 | "deephome": { 69 | "filename": "deephome.z5", 70 | "info": "http://ifdb.tads.org/viewgame?id=x85otcikhwp8bwup", 71 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/deephome.z5", 72 | "md5": "5e56a6e5cdeecded434a8fd8012fc2c6" 73 | }, 74 | "detective": { 75 | "filename": "detective.z5", 76 | "info": "http://ifdb.tads.org/viewgame?id=1po9rgq2xssupefw", 77 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/detective.z5", 78 | "md5": "822655c9be83e292e06d3d3b1d6a9734" 79 | }, 80 | "dragon": { 81 | "filename": "dragon.z5", 82 | "info": "http://ifdb.tads.org/viewgame?id=sjiyffz8n5patu8l", 83 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/dragon.zip:Dragon.z5", 84 | "md5": "96d314997e5d3a5a793c83845977d44d" 85 | }, 86 | "enchanter": { 87 | "filename": "enchanter.z3", 88 | "info": "http://ifdb.tads.org/viewgame?id=vu4xhul3abknifcr", 89 | "link": 
"https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FEnchanter%20v24%20%281984%29%28Infocom%29%5Bh%5D.zip:ench_24.z3", 90 | "md5": "ad3cdea88d81033fe29167688bd98c31" 91 | }, 92 | "enter": { 93 | "filename": "enter.z5", 94 | "info": "http://ifdb.tads.org/viewgame?id=ld1f3t5epeagilfz", 95 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/enter.z5", 96 | "md5": "4c48ba2c5523d78c5f7f9b7809d16b1d" 97 | }, 98 | "gold": { 99 | "filename": "gold.z5", 100 | "info": "http://ifdb.tads.org/viewgame?id=59ztsy9p01avd6wp", 101 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/gold.z5", 102 | "md5": "f275ddf32ce8a9e744d53c3b99c5a658" 103 | }, 104 | "hhgg": { 105 | "filename": "hhgg.z3", 106 | "info": "http://ifdb.tads.org/viewgame?id=ouv80gvsl32xlion", 107 | "link": "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite/hhgg.z3", 108 | "md5": "6666389f60e0c8e4ceb08242a263bb52" 109 | }, 110 | "hollywood": { 111 | "filename": "hollywood.z3", 112 | "info": "http://ifdb.tads.org/viewgame?id=jnfkbgdgopwfqist", 113 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FHollywood%20Hijinx%20v235%20%281986%29%28Infocom%29%5Bh%5D%5B861118%5D.zip:hollywoo_235.z3", 114 | "md5": "1ea91a064941a3f612b20833f0a47df7" 115 | }, 116 | "huntdark": { 117 | "filename": "huntdark.z5", 118 | "info": "http://ifdb.tads.org/viewgame=mh1a6hizgwjdbeg7", 119 | "link": "http://mirror.ifarchive.org/if-archive/games/competition99/inform/huntdark/huntdark.z5", 120 | "md5": "253b02c8012710577085b9fd3a155cb7" 121 | }, 122 | "infidel": { 123 | "filename": "infidel.z3", 124 | "info": "http://ifdb.tads.org/viewgame?id=anu79a4n1jedg5mm", 125 | "link": 
"https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FInfidel%20v22%20%281983%29%28Infocom%29%5B830916%5D.zip:INFIDEL.DAT", 126 | "md5": "2fe5b5693fa60b0cf8621402423994b1" 127 | }, 128 | "inhumane": { 129 | "filename": "inhumane.z5", 130 | "info": "http://ifdb.tads.org/viewgame?id=wvs2vmbigm9unlpd", 131 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/inhumane.z5", 132 | "md5": "84d3ce7ccfafb873736490811a0cc78c" 133 | }, 134 | "jewel": { 135 | "filename": "jewel.z5", 136 | "info": "http://ifdb.tads.org/viewgame?id=hu60gp1bgkhlo5yx", 137 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/jewel.z5", 138 | "md5": "1eef9c0fa009ca4adf4872cfc5249d45" 139 | }, 140 | "karn": { 141 | "filename": "karn.z5", 142 | "info": "http://ifdb.tads.org/viewgame?id=bx8118ggp6j7nslo", 143 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/karn.z5", 144 | "md5": "ec55791be814db3663ad1aec0d6b7690" 145 | }, 146 | "library": { 147 | "filename": "library.z5", 148 | "info": "http://ifdb.tads.org/viewgame?id=400zakqderzjnu1i", 149 | "link": "http://mirror.ifarchive.org/if-archive/games/competition95/library.z5", 150 | "md5": "389acf3b617a40dc4848da3bda62ce06" 151 | }, 152 | "loose": { 153 | "filename": "loose.z5", 154 | "info": "http://ifdb.tads.org/viewgame?id=4wd3lyaxi4thp8qi", 155 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/loose.z5", 156 | "md5": "31a0c1e360dce94aa5bece5240691d17" 157 | }, 158 | "lostpig": { 159 | "filename": "lostpig.z8", 160 | "info": "http://ifdb.tads.org/viewgame?id=mohwfk47yjzii14w", 161 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/LostPig.z8", 162 | "md5": "aaf0b90fbb31717481c02832bf412070" 163 | }, 164 | "ludicorp": { 165 | "filename": "ludicorp.z5", 166 | "info": "http://ifdb.tads.org/viewgame?id=r6g7pflngn3uxbam", 167 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/ludicorp.z5", 168 | 
"md5": "646a63307f77dcdcd011f330277ae262" 169 | }, 170 | "lurking": { 171 | "filename": "lurking.z3", 172 | "info": "http://ifdb.tads.org/viewgame?id=jhbd0kja1t57uop", 173 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FLurking%20Horror%2C%20The%20v219%20%281987%29%28Infocom%29%5B870912%5D.zip:Lurking.z3", 174 | "md5": "5f42ff092a2f30471ae98150ef4da2e1" 175 | }, 176 | "moonlit": { 177 | "filename": "moonlit.z5", 178 | "info": "http://ifdb.tads.org/viewgame?id=10387w68qlwehbyq", 179 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2002/zcode/moonlit/Moonlit.z5", 180 | "md5": "bf75b9651cff0e2d04302f19c443588e" 181 | }, 182 | "murdac": { 183 | "filename": "murdac.z5", 184 | "info": "http://ifdb.tads.org/viewgame?id=q36lh5np0q9nak28", 185 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Murdac.z5", 186 | "md5": "570179d4f21b2f600862dbffbb5afc3e" 187 | }, 188 | "night": { 189 | "filename": "night.z5", 190 | "info": "http://ifdb.tads.org/viewgame?id=ydhwa11st460g9u3", 191 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/night.z5", 192 | "md5": "72125f159cccd581786ac16a2828d4e3" 193 | }, 194 | "omniquest": { 195 | "filename": "omniquest.z5", 196 | "info": "http://ifdb.tads.org/viewgame?id=mygqz9tzxqvryead", 197 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/omniquest.z5", 198 | "md5": "80ea198bca425b6d819c74bfa854236e" 199 | }, 200 | "partyfoul": { 201 | "filename": "partyfoul.z8", 202 | "info": "http://ifdb.tads.org/viewgame?id=cqwq699i9qiqdju", 203 | "link": "http://mirror.ifarchive.org/if-archive/games/mini-comps/cgdc7/PartyFoul.zblorb", 204 | "md5": "d221daa82708c4e54447f1a884c239ef" 205 | }, 206 | "pentari": { 207 | "filename": "pentari.z5", 208 | "info": "http://ifdb.tads.org/viewgame?id=llchvog0ukwrphih", 209 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/pentari.z5", 210 | "md5": 
"f24c6863468823b744e910ccfe997c6d" 211 | }, 212 | "planetfall": { 213 | "filename": "planetfall.z3", 214 | "info": "http://ifdb.tads.org/viewgame?id=xe6kb3cuqwie2q38", 215 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FPlanetfall%20v29%20%281983%29%28Infocom%29%5B840118%5D.zip:planetfa.z3", 216 | "md5": "6487dc814b280f5603c53155de378d27" 217 | }, 218 | "plundered": { 219 | "filename": "plundered.z3", 220 | "info": "http://ifdb.tads.org/viewgame?id=ddagftras22bnz8h", 221 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FPlundered%20Hearts%20v26%20%281987%29%28Infocom%29%5B870730%5D.zip:PLUNDERE.DAT", 222 | "md5": "29fc7b270af2fbd406a0548a8298da7f" 223 | }, 224 | "reverb": { 225 | "filename": "reverb.z5", 226 | "info": "http://ifdb.tads.org/viewgame?id=dop7nbjl90r5zmf9", 227 | "link": "http://mirror.ifarchive.org/if-archive/games/competition96/reverb/reverb.z5", 228 | "md5": "80d286fbfe624c621266b568c0076717" 229 | }, 230 | "seastalker": { 231 | "filename": "seastalker.z3", 232 | "info": "http://ifdb.tads.org/viewgame?id=56wb8hflec2isvzm", 233 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSeastalker%20v86%20%281984%29%28Infocom%29%28beta%29%5B840320%5D.zip:SEASTALK.z3", 234 | "md5": "ee339dbdbb0792f67e20bd71bafe0ea5" 235 | }, 236 | "sherlock": { 237 | "filename": "sherlock.z5", 238 | "info": "http://ifdb.tads.org/viewgame?id=j8lmspy4iz73mx26", 239 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSherlock%20-%20The%20Riddle%20of%20the%20Crown%20Jewels%20v21%20%281987%29%28Infocom%29%5B871214%5D.zip:SHER.z5", 240 | "md5": 
"35240654d83f9e7073973d338f9657b8" 241 | }, 242 | "snacktime": { 243 | "filename": "snacktime.z8", 244 | "info": "http://ifdb.tads.org/viewgame?id=yr3y8s9k8e40hl5q", 245 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2008/zcode/snack/snacktime.z8", 246 | "md5": "0ff228d12d7cb470dc1a8e9a5151769b" 247 | }, 248 | "sorcerer": { 249 | "filename": "sorcerer.z3", 250 | "info": "http://ifdb.tads.org/viewgame?id=lidg5nx9ig0bwk55", 251 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSorcerer%20v18%20%281984%29%28Infocom%29%5Bh%5D%5B860904%5D.zip:sorcerer_18.z3", 252 | "md5": "20f1468a058d0a6de016ae70022e651c" 253 | }, 254 | "spellbrkr": { 255 | "filename": "spellbrkr.z3", 256 | "info": "http://ifdb.tads.org/viewgame?id=wqsmrahzozosu3r", 257 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSpellbreaker%20v63%20%281985%29%28Infocom%29%5B850916%5D.zip:spelbrkr.z3", 258 | "md5": "7a92ce19a39bedd970d0f1e296981f71" 259 | }, 260 | "spirit": { 261 | "filename": "spirit.z5", 262 | "info": "http://ifdb.tads.org/viewgame?id=tqpowvmdoemtooqf", 263 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/spirit.z5", 264 | "md5": "808039c4e9554bdd15d7793539b3bd97" 265 | }, 266 | "temple": { 267 | "filename": "temple.z5", 268 | "info": "http://ifdb.tads.org/viewgame?id=kq9qgjkf2k6xn1c0", 269 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2002/zcode/temple/temple.z5", 270 | "md5": "047842c7b25c3d477b728cf3412e33de" 271 | }, 272 | "theatre": { 273 | "filename": "theatre.z5", 274 | "info": "http://ifdb.tads.org/viewgame?id=bv8of8y9xeo7307g", 275 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/theatre.z5", 276 | "md5": "33dcc5085acb290d1817e07653c13480" 277 | }, 278 | "trinity": { 279 | "filename": "trinity.z4", 280 | 
"info": "http://ifdb.tads.org/viewgame?id=j18kjz80hxjtyayw", 281 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FTrinity%20v12%20%281986%29%28Infocom%29%5B860926%5D.zip:TRINITY.z4", 282 | "md5": "3bf1a444a1fc2057130ecb9806117233" 283 | }, 284 | "tryst205": { 285 | "filename": "tryst205.z5", 286 | "info": "http://ifdb.tads.org/viewgame?id=ic0ebhbi70bdmyc2", 287 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/tryst205.z5", 288 | "md5": "fc65ad8d4588da92fd39871f6f7463db" 289 | }, 290 | "weapon": { 291 | "filename": "weapon.z5", 292 | "info": "http://ifdb.tads.org/viewgame?id=tcebhl79rlxo3qrk", 293 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/weapon.zip:weapon.z5", 294 | "md5": "c632204be3849d6c5bb6f4eb5aca3cc0" 295 | }, 296 | "wishbringer": { 297 | "filename": "wishbringer.z3", 298 | "info": "http://ifdb.tads.org/viewgame?id=z02joykzh66wfhcl", 299 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FWishbringer%20-%20The%20Magick%20Stone%20of%20Dreams%20v68%20%281985%29%28Infocom%29%5B850501%5D.zip:WISHBRIN.z3", 300 | "md5": "87ed53d854f7e57c36106fca3b9cf5a6" 301 | }, 302 | "yomomma": { 303 | "filename": "yomomma.z8", 304 | "info": "http://ifdb.tads.org/viewgame?id=1iqmpkn009h9gbug", 305 | "link": "http://nitku.net/if/yomomma/yomomma.zblorb", 306 | "md5": "5b10162a7a134e7b4c381ecedfb4bc44" 307 | }, 308 | "zenon": { 309 | "filename": "zenon.z5", 310 | "info": "http://ifdb.tads.org/viewgame?id=rw7zv98mifbr3335", 311 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/zenon.z5", 312 | "md5": "631cc926b4251f5a5f646d3a6bdac8c6" 313 | }, 314 | "zork1": { 315 | "filename": "zork1.z5", 316 | "info": "http://ifdb.tads.org/viewgame?id=0dbnusxunq7fw5ro", 317 | "link": "http://www.batmantis.com/zorks/zork1.z5", 318 | "md5": 
"b732a93a6244ddd92a9b9a3e3a46c687" 319 | }, 320 | "zork2": { 321 | "filename": "zork2.z5", 322 | "info": "http://ifdb.tads.org/viewgame?id=yzzm4puxyjakk8c4", 323 | "link": "http://www.batmantis.com/zorks/zork2.z5", 324 | "md5": "5bcd91ee055e9bd42812617571be227b" 325 | }, 326 | "zork3": { 327 | "filename": "zork3.z5", 328 | "info": "http://ifdb.tads.org/viewgame?id=vrsot1zgy1wfcdru", 329 | "link": "http://www.batmantis.com/zorks/zork3.z5", 330 | "md5": "ffda9ee2d428fa2fa8e75a1914ff6959" 331 | }, 332 | "ztuu": { 333 | "filename": "ztuu.z5", 334 | "info": "http://ifdb.tads.org/viewgame?id=40hswtkhap88gzvn", 335 | "link": "http://www.batmantis.com/zorks/ztuu.z5", 336 | "md5": "d8e1578470cbc676e013e03d72c93141" 337 | } 338 | } -------------------------------------------------------------------------------- /tales/jericho/jericho_data.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from os.path import join as pjoin 4 | 5 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD 6 | from tales.utils import download 7 | 8 | GAMES_URLS = "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite" 9 | TALES_CACHE_JERICHO = pjoin(TALES_CACHE_HOME, "jericho") 10 | 11 | 12 | with open(pjoin(os.path.dirname(__file__), "games.json")) as f: 13 | GAMES_INFOS = json.load(f) 14 | 15 | # Remove known games that are not working. 
def prepare_jericho_data(force=TALES_FORCE_DOWNLOAD):
    """Make sure every game listed in ``GAMES_INFOS`` is in the Jericho cache.

    Args:
        force: When true, every game file is downloaded again even if a file
            with the same name is already cached. Defaults to
            ``TALES_FORCE_DOWNLOAD``.

    Files are fetched from ``GAMES_URLS`` using each entry's ``"filename"``;
    this function does not read the other fields of the entry.
    """
    os.makedirs(TALES_CACHE_JERICHO, exist_ok=True)

    for game_info in GAMES_INFOS.values():
        filename = game_info["filename"]
        already_cached = os.path.isfile(pjoin(TALES_CACHE_JERICHO, filename))

        # Download when forced, or when the file is not cached yet.
        if force or not already_cached:
            download(f"{GAMES_URLS}/{filename}", dst=TALES_CACHE_JERICHO, force=force)


def get_game(game):
    """Return the cached path of the game file registered under ``game``.

    Args:
        game: A key of ``GAMES_INFOS``.

    Returns:
        The path ``TALES_CACHE_JERICHO/<filename>`` for that game.

    Raises:
        KeyError: If ``game`` is not a key of ``GAMES_INFOS``.
    """
    # Make sure the data is ready before handing out a path.
    prepare_jericho_data()

    return pjoin(TALES_CACHE_JERICHO, GAMES_INFOS[game]["filename"])
class JerichoEnv(gym.Env):
    """Gymnasium environment that plays one Jericho game through TextWorld."""

    def __init__(self, game, admissible_commands=False, *args, **kwargs):
        """Start the TextWorld environment for ``game``.

        Args:
            game: Name of the game, resolved to a file path by
                ``jericho_data.get_game``.
            admissible_commands: Forwarded to ``textworld.EnvInfos`` to
                request the list of admissible commands.
            *args, **kwargs: Accepted for signature compatibility; not used.
        """
        gamefile = jericho_data.get_game(game)
        self.infos = textworld.EnvInfos(
            score=True,
            max_score=True,
            won=True,
            lost=True,
            feedback=True,
            moves=True,
            admissible_commands=admissible_commands,
            extras=["walkthrough"],
        )
        self.env = textworld.start(gamefile, self.infos, wrappers=[Filter])

    def reset(self, *, seed=None, options=None):
        """Seed the wrapped environment with ``seed`` and reset it.

        Returns whatever the wrapped environment's ``reset()`` returns;
        ``options`` is accepted but not used.
        """
        self.env.seed(seed)
        return self.env.reset()

    def step(self, action):
        """Forward ``action`` to the wrapped environment and return its result."""
        return self.env.step(action)

    def close(self):
        """Close the wrapped TextWorld environment started in ``__init__``.

        Without this override, closing the wrapper would leave the backend
        environment open.
        """
        self.env.close()
def get_variations(task_name, split, env=None):
    """Return the variations of a ScienceWorld task for one data split.

    Args:
        task_name: Task name, used only to create an environment when ``env``
            is not supplied.
        split: One of ``"train"``, ``"valid"`` or ``"test"``.
        env: Optional existing environment to query. When omitted (or falsy),
            a new ``scienceworld.ScienceWorldEnv(task_name)`` is created.

    Returns:
        The result of the environment's ``get_variations_train()``,
        ``get_variations_dev()`` or ``get_variations_test()`` respectively.

    Raises:
        NotImplementedError: If ``split`` is not one of the supported names.
    """
    env = env or scienceworld.ScienceWorldEnv(task_name)
    if split == "train":
        return env.get_variations_train()
    elif split == "valid":
        # The validation split is called "dev" on the environment side.
        return env.get_variations_dev()
    elif split == "test":
        return env.get_variations_test()
    else:
        # Name the values that are actually accepted ("valid", not "dev") and
        # echo the offending one so the caller can spot the mistake.
        raise NotImplementedError(
            f"Unsupported split {split!r}: "
            "only the 'train', 'valid', and 'test' splits are supported."
        )
31 | obs = info["taskDesc"] + "\n\n" + obs 32 | 33 | info["max_score"] = 100 34 | info["feedback"] = obs 35 | info["won"] = False 36 | info["lost"] = False 37 | info["admissible_commands"] = info["valid"] 38 | info["extra.walkthrough"] = self.env.get_gold_action_sequence() 39 | return obs, info 40 | 41 | def step(self, action): 42 | obs, reward, done, info = self.env.step(action) 43 | info["max_score"] = 100 44 | info["feedback"] = obs 45 | info["won"] = info["score"] == 100 46 | info["lost"] = info["score"] < 0 47 | info["admissible_commands"] = info["valid"] 48 | return obs, reward, done, info 49 | 50 | def close(self): 51 | self.env.close() 52 | -------------------------------------------------------------------------------- /tales/textworld/__init__.py: -------------------------------------------------------------------------------- 1 | import gymnasium as gym 2 | 3 | from .textworld_data import prepare_twcooking_data 4 | from .textworld_env import TextWorldEnv, TWCookingEnv 5 | 6 | environments = [] 7 | 8 | # TWCookingEnv 9 | for difficulty in range(1, 10 + 1): 10 | env_name = f"TWCookingLevel{difficulty}" 11 | environments.append([env_name, "v0"]) 12 | 13 | gym.register( 14 | id=f"tales/{env_name}-v0", 15 | entry_point="tales.textworld:TWCookingEnv", 16 | kwargs={"difficulty": difficulty}, 17 | ) 18 | 19 | 20 | def download(): 21 | prepare_twcooking_data() 22 | -------------------------------------------------------------------------------- /tales/textworld/textworld_data.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import zipfile 4 | from os.path import join as pjoin 5 | 6 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD 7 | from tales.utils import download 8 | 9 | TW_COOKING_URL = ( 10 | "https://github.com/xingdi-eric-yuan/GATA-public/releases/download/data/rl.0.2.zip" 11 | ) 12 | TALES_CACHE_TEXTWORLD = pjoin(TALES_CACHE_HOME, "textworld") 13 | TALES_CACHE_TWCOOKING 
def prepare_twcooking_data(force=TALES_FORCE_DOWNLOAD):
    """Download the TWCooking archive if needed and unpack its test games.

    Args:
        force: When true, the archive is downloaded and extracted again even
            if the extracted ``test`` directory already exists. Defaults to
            ``TALES_FORCE_DOWNLOAD``.
    """
    os.makedirs(TALES_CACHE_TWCOOKING, exist_ok=True)

    # Nothing to do when the extracted data is already there.
    if not force and os.path.exists(TALES_CACHE_TWCOOKING_TEST):
        return

    archive_path = pjoin(TALES_CACHE_TWCOOKING, "rl.0.2.zip")
    if force or not os.path.exists(archive_path):
        download(
            TW_COOKING_URL,
            dst=TALES_CACHE_TWCOOKING,
            desc="Downloading TWCooking",
            force=force,
        )

    # Only the members whose path contains "test" are unpacked.
    with zipfile.ZipFile(archive_path, "r") as archive:
        test_members = [member for member in archive.namelist() if "test" in member]
        archive.extractall(TALES_CACHE_TWCOOKING, members=test_members)


def get_cooking_game(difficulty):
    """Return the ``.z8`` game files of one TWCooking difficulty level.

    Args:
        difficulty: Level number, used to pick the
            ``difficulty_level_<difficulty>`` directory of the test split.

    Returns:
        The list of matching paths as returned by ``glob.glob`` (possibly
        empty).
    """
    # Make sure the data is ready before globbing.
    prepare_twcooking_data()

    pattern = pjoin(
        TALES_CACHE_TWCOOKING_TEST, f"difficulty_level_{difficulty}", "*.z8"
    )
    return glob.glob(pattern)
class TextWorldEnv(gym.Env):
    """Gymnasium wrapper around a single TextWorld game file.

    The underlying TextWorld environment is created lazily on the first call
    to :meth:`reset` and reused afterwards until :meth:`close` is called.
    """

    def __init__(self, gamefile, admissible_commands=False, *args, **kwargs):
        """
        Args:
            gamefile: Path of the game to load with ``textworld.start``.
            admissible_commands (bool, optional): Whether to request the list
                of admissible commands from TextWorld. Defaults to False.
            *args, **kwargs: Accepted for signature compatibility; unused.
        """
        self.infos = textworld.EnvInfos(
            score=True,
            max_score=True,
            won=True,
            lost=True,
            feedback=True,
            moves=True,
            admissible_commands=admissible_commands,
            extras=["walkthrough"],
        )
        self.gamefile = gamefile
        self.env = None  # Created on first reset().

    def reset(self, *, seed=None, options=None):
        """Start (if needed) and reset the game; returns ``self.env.reset()``."""
        super().reset(seed=seed, options=options)

        if self.env is None:
            self.env = textworld.start(self.gamefile, self.infos, wrappers=[Filter])

        return self.env.reset()

    def step(self, action):
        """Send ``action`` to the game and return the wrapped env's result."""
        return self.env.step(action)

    def close(self):
        """Release the underlying TextWorld environment, if one was started.

        Safe to call several times; a later :meth:`reset` starts a fresh one.
        """
        if self.env is not None:
            self.env.close()
            self.env = None


class TWCookingEnv(TextWorldEnv):
    """TextWorld cooking games of a given difficulty, selected by seed."""

    def __init__(self, difficulty, *args, **kwargs):
        """
        Args:
            difficulty: Difficulty level passed to
                ``textworld_data.get_cooking_game``.
            *args, **kwargs: Forwarded to :class:`TextWorldEnv`.
        """
        # Sorted so that a given seed always maps to the same game file.
        self.gamefiles = sorted(textworld_data.get_cooking_game(difficulty))
        super().__init__(self.gamefiles[0], *args, **kwargs)

    def reset(self, *, seed=None, options=None):
        """Reset the game; a non-None ``seed`` selects which game file to play."""
        if seed is not None:
            self.gamefile = self.gamefiles[seed % len(self.gamefiles)]
            # Drop the current environment so the parent reset() starts the
            # newly selected game file.
            self.close()

        return super().reset(seed=seed, options=options)
-------------------------------------------------------------------------------- /tales/textworld_express/twx_data.py: -------------------------------------------------------------------------------- 1 | import textworld_express as twx 2 | 3 | # TASK_NAMES = list(twx.GAME_NAMES) 4 | 5 | TASKS = [ 6 | ( 7 | "CookingWorld", 8 | "cookingworld", 9 | "numLocations=1, numIngredients=2, numDistractorItems=5, includeDoors=0, limitInventorySize=0", 10 | ), 11 | ( 12 | "TextWorldCommonsense", 13 | "twc", 14 | "numLocations=1,numItemsToPutAway=1,includeDoors=0,limitInventorySize=0", 15 | ), 16 | ( 17 | "CoinCollector", 18 | "coin", 19 | "numLocations=1, numDistractorItems=5, limitInventorySize=0", 20 | ), 21 | ("Arithmetic", "arithmetic", ""), 22 | ( 23 | "MapReader", 24 | "mapreader", 25 | "numLocations=2, maxDistanceApart=1, maxDistractorItemsPerLocation=2, includeDoors=0, limitInventorySize=0", 26 | ), 27 | ("Sorting", "sorting", ""), 28 | ("SimonSays10", "simonsays", "gameLength=10, numDistractors=4, memorization=0"), 29 | ("SimonSays50", "simonsays", "gameLength=50, numDistractors=4, memorization=0"), 30 | ("SimonSays100", "simonsays", "gameLength=100, numDistractors=4, memorization=0"), 31 | ( 32 | "SimonSaysWithMemory10", 33 | "simonsays", 34 | "gameLength=10, numDistractors=4, memorization=1, verbose=0", 35 | ), 36 | ( 37 | "SimonSaysWithMemory50", 38 | "simonsays", 39 | "gameLength=50, numDistractors=4, memorization=1, verbose=0", 40 | ), 41 | ( 42 | "SimonSaysWithMemory100", 43 | "simonsays", 44 | "gameLength=100, numDistractors=4, memorization=1, verbose=0", 45 | ), 46 | ( 47 | "SimonSaysWithMemory10Verbose", 48 | "simonsays", 49 | "gameLength=10, numDistractors=4, memorization=1, verbose=1", 50 | ), 51 | ( 52 | "SimonSaysWithMemory50Verbose", 53 | "simonsays", 54 | "gameLength=50, numDistractors=4, memorization=1, verbose=1", 55 | ), 56 | ( 57 | "SimonSaysWithMemory100Verbose", 58 | "simonsays", 59 | "gameLength=100, numDistractors=4, memorization=1, verbose=1", 
def get_seeds(split, env=None):
    """Return the valid seeds of a TextWorldExpress split.

    Args:
        split (str): One of ``"train"``, ``"valid"`` (``"dev"`` is accepted as
            an alias) or ``"test"``.
        env (optional): Object exposing ``getValidSeedsTrain``,
            ``getValidSeedsDev`` and ``getValidSeedsTest``. When omitted (or
            falsy), a new ``twx.TextWorldExpressEnv`` is created.

    Returns:
        Whatever the matching ``getValidSeeds*`` method of ``env`` returns.

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    env = env or twx.TextWorldExpressEnv()
    if split == "train":
        return env.getValidSeedsTrain()
    elif split in ("valid", "dev"):
        # The previous error message advertised "dev" while only "valid" was
        # accepted; both names now map to the dev seeds.
        return env.getValidSeedsDev()
    elif split == "test":
        return env.getValidSeedsTest()
    else:
        raise NotImplementedError(
            "Only plan to support train, valid, and test splits."
        )
def step(self, action):
    """Forward ``action`` to the wrapped environment and normalise ``info``.

    The ``info`` dict returned by the wrapped environment is updated in place:
    its ``score`` is rescaled by 100 and truncated to an ``int``, and the keys
    ``max_score``, ``feedback``, ``won``, ``lost``, ``moves`` and
    ``admissible_commands`` are filled in from the wrapped environment's own
    fields.

    Returns:
        The ``(obs, reward, done, info)`` tuple from the wrapped environment,
        with ``info`` updated as described above.
    """
    observation, reward, done, info = self.env.step(action)

    info.update(
        {
            "max_score": 100,
            "feedback": observation,
            "won": info["tasksuccess"],
            "lost": info["taskfailure"],
            "moves": info["numMoves"],
            # NOTE(review): int() truncates (e.g. 2/3 -> 66); presumably
            # intentional for comparability of reported scores - confirm.
            "score": int(info["score"] * 100),
            "admissible_commands": info["validActions"],
        }
    )
    return observation, reward, done, info


def close(self):
    """Close the wrapped environment."""
    self.env.close()
class TokenCounter:
    """Base class for token counters.

    Subclasses are expected to set ``self.tokenize`` to a callable that maps
    a string to a sequence of tokens; the count is the length of that
    sequence.
    """

    def __call__(self, *, messages=None, text=None):
        """Count the tokens of ``messages`` and/or ``text``.

        Args:
            messages (optional): Iterable of dicts; only their ``"content"``
                entry is tokenized.
            text (str, optional): Additional raw text to count.

        Returns:
            int: Total number of tokens (0 when both arguments are None).
        """
        nb_tokens = 0
        if messages is not None:
            nb_tokens += sum(len(self.tokenize(msg["content"])) for msg in messages)

        if text is not None:
            nb_tokens += len(self.tokenize(text))

        return nb_tokens


class OpenAITokenCounter(TokenCounter):
    """Token counter using the ``tiktoken`` encoding of an OpenAI model."""

    def __init__(self, model: str):
        self.model = model
        if self.model in tiktoken.model.MODEL_TO_ENCODING:
            self.tokenize = tiktoken.encoding_for_model(self.model).encode
        else:
            # Retry with whatever precedes the first "_" in the model name
            # (presumably a variant/deployment suffix - confirm with callers).
            self.tokenize = tiktoken.encoding_for_model(self.model.split("_")[0]).encode


class HuggingFaceTokenCounter(TokenCounter):
    """Token counter using a HuggingFace ``AutoTokenizer``.

    Counting itself is inherited from :class:`TokenCounter`.
    """

    def __init__(self, model: str):
        """
        Raises:
            ValueError: If the tokenizer for ``model`` cannot be loaded
                (``AutoTokenizer.from_pretrained`` raised ``OSError``).
        """
        self.model = model
        try:
            self.tokenize = AutoTokenizer.from_pretrained(self.model).tokenize
        except OSError as err:
            msg = (
                f"Tokenizer not found for model {self.model},"
                " make sure you have access to the model"
                " (e.g., HuggingFace API key is correctly set)."
            )
            # Keep the original OSError as the cause to ease debugging.
            raise ValueError(msg) from err
class GeminiTokenCounter(TokenCounter):
    """Token counter that delegates to the Google GenAI ``count_tokens`` call."""

    def __init__(self, model: Model):
        from google import genai

        self.model = model.model_id
        self.client = genai.Client(api_key=model.get_key())

    def __call__(self, *, messages=None, text=None):
        """Count tokens for ``messages`` followed by ``text`` (if given).

        ``text`` is stripped and appended as an ``assistant`` message. A
        leading ``system`` message is removed from the history and passed as
        the chat's system instruction instead.

        Returns:
            The ``total_tokens`` reported by ``client.models.count_tokens``.
        """
        from google.genai import types

        # Work on a copy so the caller's list is never mutated.
        conversation = list(messages) if messages else []
        if text is not None:
            conversation.append({"role": "assistant", "content": text.strip()})

        system = None
        if conversation and conversation[0]["role"] == "system":
            system = [conversation.pop(0)["content"]]

        history = []
        for msg in conversation:
            # The "assistant" role is renamed to "model" for this API.
            role = msg["role"].replace("assistant", "model")
            history.append(
                types.Content(role=role, parts=[types.Part(text=msg["content"])])
            )

        chat = self.client.chats.create(
            model=self.model,
            history=history,
            config=types.GenerateContentConfig(system_instruction=system),
        )

        # NOTE(review): only chat.get_history() is sent to count_tokens; the
        # system instruction is not passed here - confirm whether its tokens
        # are meant to be included in the total.
        counted = self.client.models.count_tokens(
            model=self.model,
            contents=chat.get_history(),
        )
        return counted.total_tokens
def download(url, dst, desc=None, force=False):
    """Download a remote file using HTTP get request.

    The file is first written to a temporary folder and only moved to ``dst``
    once complete, so the cache never holds a partial download. A partial
    temporary file left by an interrupted run is resumed with a ``Range``
    request.

    Args:
        url (str): URL where to get the file.
        dst (str): Destination folder where to save the file.
        desc (str, optional): Label of the progress bar.
            Defaults to "Downloading <filename>".
        force (bool, optional):
            Download again if it exists. Defaults to False.

    Returns:
        str: Path to the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    Notes:
        This code is inspired by
        https://github.com/huggingface/transformers/blob/v4.0.0/src/transformers/file_utils.py#L1069
    """
    filename = url.split("/")[-1]
    path = pjoin(mkdirs(dst), filename)

    if os.path.isfile(path) and not force:
        return path

    # Download to a temp folder first to avoid corrupting the cache
    # with incomplete downloads.
    temp_dir = mkdirs(pjoin(tempfile.gettempdir(), "tales"))
    temp_path = pjoin(temp_dir, filename)
    with open(temp_path, "ab") as temp_file:
        headers = {}
        resume_size = temp_file.tell()
        if resume_size:
            headers["Range"] = f"bytes={resume_size}-"
            headers["x-ms-version"] = "2020-04-08"  # Needed for Range support.

        with requests.get(url, stream=True, headers=headers) as r:
            # The server reports an invalid range whose total size equals what
            # we already have: the temporary file is already complete.
            content_range = r.headers.get("Content-Range", "")
            already_complete = (
                r.headers.get("x-ms-error-code") == "InvalidRange"
                and content_range.rsplit("/", 1)[-1] == str(resume_size)
            )

            if not already_complete:
                r.raise_for_status()  # Bad request.

                if resume_size and r.status_code != 206:
                    # The server ignored the Range header and is sending the
                    # whole file again: discard the partial data instead of
                    # appending to it.
                    temp_file.seek(0)
                    temp_file.truncate()
                    resume_size = 0

                # Content-Length is optional (e.g. chunked transfer); tqdm
                # accepts total=None for an unknown size.
                content_length = r.headers.get("Content-Length")
                total = None
                if content_length is not None:
                    total = resume_size + int(content_length)

                with tqdm(
                    unit="B",
                    initial=resume_size,
                    unit_scale=True,
                    total=total,
                    desc=desc or "Downloading {}".format(filename),
                    leave=False,
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            pbar.update(len(chunk))
                            temp_file.write(chunk)

    # Move only once the temporary file is closed (required on Windows).
    shutil.move(temp_path, path)
    return path


def merge_messages(messages):
    """Merge messages from the same role into a single message.

    Consecutive messages sharing a ``"role"`` are joined with a blank line.
    The input dicts are copied, never modified.

    Args:
        messages: Sequence of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        list[dict]: The merged messages (empty if ``messages`` is empty).
    """
    messages_out = []
    for message in messages:
        if messages_out and message["role"] == messages_out[-1]["role"]:
            messages_out[-1]["content"] += "\n\n" + message["content"]
        else:
            messages_out.append(dict(message))

    return messages_out
def format_messages_to_markdown(messages):
    """Concatenate messages into a single markdown string.

    Each message becomes a ``#### <Role>`` heading followed by its content in
    a fenced block.

    Args:
        messages: Iterable of dicts with ``"role"`` and ``"content"`` keys.

    Returns:
        str: The markdown text (empty string for no messages).
    """
    return "".join(
        f"#### {message['role'].capitalize()}\n\n```\n{message['content']}\n```\n\n"
        for message in messages
    )


def is_recoverable_error(exception):
    """Tell whether ``exception`` is a known transient error worth retrying.

    The decision is based on the fully qualified class name
    (``<module>.<class name>``) of the exception, which is also logged.

    Returns:
        bool: True if the exception's full name is in the retry list.
    """
    # List of exceptions thrown by various libraries that can be retried.
    recoverable_errors = [
        "openai.APIStatusError",
        "openai.APITimeoutError",
        "openai.error.Timeout",
        "openai.error.RateLimitError",
        "openai.error.ServiceUnavailableError",
        "openai.Timeout",
        "openai.APIError",
        "openai.APIConnectionError",
        "openai.RateLimitError",
        "openai.InternalServerError",
        "anthropic.error.RateLimitError",
        "anthropic.InternalServerError",
        "anthropic.OverloadedError",
        "anthropic.APIStatusError",
        "anthropic._exceptions.OverloadedError",
        "llm.errors.ModelError",  # Gemini
        # Add more as needed
    ]
    exception_full_name = (
        f"{exception.__class__.__module__}.{exception.__class__.__name__}"
    )
    log.warning(f"Exception_full_name: {exception_full_name}")
    log.warning(f"Exception: {exception}")
    return exception_full_name in recoverable_errors


class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also understands NumPy scalars and arrays."""

    def default(self, obj):
        """Convert NumPy values to their plain Python equivalent.

        Integer, floating and boolean scalars become Python scalars and
        arrays become (nested) lists; anything else is delegated to
        ``json.JSONEncoder.default``, which raises ``TypeError``.
        """
        # np.bool_ is not a subclass of np.integer and must be listed
        # explicitly, otherwise boolean scalars are rejected.
        if isinstance(obj, (np.integer, np.floating, np.bool_)):
            return obj.item()
        elif isinstance(obj, np.ndarray):
            return obj.tolist()

        return super().default(obj)
jekyll-github-metadata (= 2.16.1) 63 | jekyll-include-cache (= 0.2.1) 64 | jekyll-mentions (= 1.6.0) 65 | jekyll-optional-front-matter (= 0.3.2) 66 | jekyll-paginate (= 1.1.0) 67 | jekyll-readme-index (= 0.3.0) 68 | jekyll-redirect-from (= 0.16.0) 69 | jekyll-relative-links (= 0.6.1) 70 | jekyll-remote-theme (= 0.4.3) 71 | jekyll-sass-converter (= 1.5.2) 72 | jekyll-seo-tag (= 2.8.0) 73 | jekyll-sitemap (= 1.4.0) 74 | jekyll-swiss (= 1.0.0) 75 | jekyll-theme-architect (= 0.2.0) 76 | jekyll-theme-cayman (= 0.2.0) 77 | jekyll-theme-dinky (= 0.2.0) 78 | jekyll-theme-hacker (= 0.2.0) 79 | jekyll-theme-leap-day (= 0.2.0) 80 | jekyll-theme-merlot (= 0.2.0) 81 | jekyll-theme-midnight (= 0.2.0) 82 | jekyll-theme-minimal (= 0.2.0) 83 | jekyll-theme-modernist (= 0.2.0) 84 | jekyll-theme-primer (= 0.6.0) 85 | jekyll-theme-slate (= 0.2.0) 86 | jekyll-theme-tactile (= 0.2.0) 87 | jekyll-theme-time-machine (= 0.2.0) 88 | jekyll-titles-from-headings (= 0.5.3) 89 | jemoji (= 0.13.0) 90 | kramdown (= 2.4.0) 91 | kramdown-parser-gfm (= 1.1.0) 92 | liquid (= 4.0.4) 93 | mercenary (~> 0.3) 94 | minima (= 2.5.1) 95 | nokogiri (>= 1.16.2, < 2.0) 96 | rouge (= 3.30.0) 97 | terminal-table (~> 1.4) 98 | webrick (~> 1.8) 99 | github-pages-health-check (1.18.2) 100 | addressable (~> 2.3) 101 | dnsruby (~> 1.60) 102 | octokit (>= 4, < 8) 103 | public_suffix (>= 3.0, < 6.0) 104 | typhoeus (~> 1.3) 105 | html-pipeline (2.14.3) 106 | activesupport (>= 2) 107 | nokogiri (>= 1.4) 108 | http_parser.rb (0.8.0) 109 | i18n (1.14.7) 110 | concurrent-ruby (~> 1.0) 111 | jekyll (3.10.0) 112 | addressable (~> 2.4) 113 | colorator (~> 1.0) 114 | csv (~> 3.0) 115 | em-websocket (~> 0.5) 116 | i18n (>= 0.7, < 2) 117 | jekyll-sass-converter (~> 1.0) 118 | jekyll-watch (~> 2.0) 119 | kramdown (>= 1.17, < 3) 120 | liquid (~> 4.0) 121 | mercenary (~> 0.3.3) 122 | pathutil (~> 0.9) 123 | rouge (>= 1.7, < 4) 124 | safe_yaml (~> 1.0) 125 | webrick (>= 1.0) 126 | jekyll-avatar (0.8.0) 127 | jekyll (>= 3.0, < 5.0) 
128 | jekyll-coffeescript (1.2.2) 129 | coffee-script (~> 2.2) 130 | coffee-script-source (~> 1.12) 131 | jekyll-commonmark (1.4.0) 132 | commonmarker (~> 0.22) 133 | jekyll-commonmark-ghpages (0.5.1) 134 | commonmarker (>= 0.23.7, < 1.1.0) 135 | jekyll (>= 3.9, < 4.0) 136 | jekyll-commonmark (~> 1.4.0) 137 | rouge (>= 2.0, < 5.0) 138 | jekyll-default-layout (0.1.5) 139 | jekyll (>= 3.0, < 5.0) 140 | jekyll-feed (0.17.0) 141 | jekyll (>= 3.7, < 5.0) 142 | jekyll-gist (1.5.0) 143 | octokit (~> 4.2) 144 | jekyll-github-metadata (2.16.1) 145 | jekyll (>= 3.4, < 5.0) 146 | octokit (>= 4, < 7, != 4.4.0) 147 | jekyll-include-cache (0.2.1) 148 | jekyll (>= 3.7, < 5.0) 149 | jekyll-mentions (1.6.0) 150 | html-pipeline (~> 2.3) 151 | jekyll (>= 3.7, < 5.0) 152 | jekyll-optional-front-matter (0.3.2) 153 | jekyll (>= 3.0, < 5.0) 154 | jekyll-paginate (1.1.0) 155 | jekyll-readme-index (0.3.0) 156 | jekyll (>= 3.0, < 5.0) 157 | jekyll-redirect-from (0.16.0) 158 | jekyll (>= 3.3, < 5.0) 159 | jekyll-relative-links (0.6.1) 160 | jekyll (>= 3.3, < 5.0) 161 | jekyll-remote-theme (0.4.3) 162 | addressable (~> 2.0) 163 | jekyll (>= 3.5, < 5.0) 164 | jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0) 165 | rubyzip (>= 1.3.0, < 3.0) 166 | jekyll-sass-converter (1.5.2) 167 | sass (~> 3.4) 168 | jekyll-seo-tag (2.8.0) 169 | jekyll (>= 3.8, < 5.0) 170 | jekyll-sitemap (1.4.0) 171 | jekyll (>= 3.7, < 5.0) 172 | jekyll-swiss (1.0.0) 173 | jekyll-theme-architect (0.2.0) 174 | jekyll (> 3.5, < 5.0) 175 | jekyll-seo-tag (~> 2.0) 176 | jekyll-theme-cayman (0.2.0) 177 | jekyll (> 3.5, < 5.0) 178 | jekyll-seo-tag (~> 2.0) 179 | jekyll-theme-dinky (0.2.0) 180 | jekyll (> 3.5, < 5.0) 181 | jekyll-seo-tag (~> 2.0) 182 | jekyll-theme-hacker (0.2.0) 183 | jekyll (> 3.5, < 5.0) 184 | jekyll-seo-tag (~> 2.0) 185 | jekyll-theme-leap-day (0.2.0) 186 | jekyll (> 3.5, < 5.0) 187 | jekyll-seo-tag (~> 2.0) 188 | jekyll-theme-merlot (0.2.0) 189 | jekyll (> 3.5, < 5.0) 190 | jekyll-seo-tag (~> 2.0) 191 | 
jekyll-theme-midnight (0.2.0) 192 | jekyll (> 3.5, < 5.0) 193 | jekyll-seo-tag (~> 2.0) 194 | jekyll-theme-minimal (0.2.0) 195 | jekyll (> 3.5, < 5.0) 196 | jekyll-seo-tag (~> 2.0) 197 | jekyll-theme-modernist (0.2.0) 198 | jekyll (> 3.5, < 5.0) 199 | jekyll-seo-tag (~> 2.0) 200 | jekyll-theme-primer (0.6.0) 201 | jekyll (> 3.5, < 5.0) 202 | jekyll-github-metadata (~> 2.9) 203 | jekyll-seo-tag (~> 2.0) 204 | jekyll-theme-slate (0.2.0) 205 | jekyll (> 3.5, < 5.0) 206 | jekyll-seo-tag (~> 2.0) 207 | jekyll-theme-tactile (0.2.0) 208 | jekyll (> 3.5, < 5.0) 209 | jekyll-seo-tag (~> 2.0) 210 | jekyll-theme-time-machine (0.2.0) 211 | jekyll (> 3.5, < 5.0) 212 | jekyll-seo-tag (~> 2.0) 213 | jekyll-titles-from-headings (0.5.3) 214 | jekyll (>= 3.3, < 5.0) 215 | jekyll-watch (2.2.1) 216 | listen (~> 3.0) 217 | jemoji (0.13.0) 218 | gemoji (>= 3, < 5) 219 | html-pipeline (~> 2.2) 220 | jekyll (>= 3.0, < 5.0) 221 | json (2.10.2) 222 | kramdown (2.4.0) 223 | rexml 224 | kramdown-parser-gfm (1.1.0) 225 | kramdown (~> 2.0) 226 | liquid (4.0.4) 227 | listen (3.9.0) 228 | rb-fsevent (~> 0.10, >= 0.10.3) 229 | rb-inotify (~> 0.9, >= 0.9.10) 230 | logger (1.6.6) 231 | mercenary (0.3.6) 232 | mini_portile2 (2.8.8) 233 | minima (2.5.1) 234 | jekyll (>= 3.5, < 5.0) 235 | jekyll-feed (~> 0.9) 236 | jekyll-seo-tag (~> 2.1) 237 | minitest (5.25.5) 238 | net-http (0.6.0) 239 | uri 240 | nokogiri (1.18.7) 241 | mini_portile2 (~> 2.8.2) 242 | racc (~> 1.4) 243 | nokogiri (1.18.7-arm64-darwin) 244 | racc (~> 1.4) 245 | nokogiri (1.18.7-x86_64-darwin) 246 | racc (~> 1.4) 247 | octokit (4.25.1) 248 | faraday (>= 1, < 3) 249 | sawyer (~> 0.9) 250 | pathutil (0.16.2) 251 | forwardable-extended (~> 2.6) 252 | public_suffix (5.1.1) 253 | racc (1.8.1) 254 | rb-fsevent (0.11.2) 255 | rb-inotify (0.11.1) 256 | ffi (~> 1.0) 257 | rexml (3.4.1) 258 | rouge (3.30.0) 259 | rubyzip (2.4.1) 260 | safe_yaml (1.0.5) 261 | sass (3.7.4) 262 | sass-listen (~> 4.0.0) 263 | sass-listen (4.0.0) 264 | rb-fsevent 
(~> 0.9, >= 0.9.4) 265 | rb-inotify (~> 0.9, >= 0.9.7) 266 | sawyer (0.9.2) 267 | addressable (>= 2.3.5) 268 | faraday (>= 0.17.3, < 3) 269 | securerandom (0.4.1) 270 | simpleidn (0.2.3) 271 | terminal-table (1.8.0) 272 | unicode-display_width (~> 1.1, >= 1.1.1) 273 | typhoeus (1.4.1) 274 | ethon (>= 0.9.0) 275 | tzinfo (2.0.6) 276 | concurrent-ruby (~> 1.0) 277 | unicode-display_width (1.8.0) 278 | uri (1.0.3) 279 | webrick (1.9.1) 280 | 281 | PLATFORMS 282 | arm64-darwin 283 | ruby 284 | x86_64-darwin 285 | x86_64-linux 286 | 287 | DEPENDENCIES 288 | github-pages 289 | 290 | BUNDLED WITH 291 | 2.6.8 292 | -------------------------------------------------------------------------------- /website/_site/assets/figs/alfworld_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/alfworld_all_games.png -------------------------------------------------------------------------------- /website/_site/assets/figs/all_framework_scores.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/all_framework_scores.png -------------------------------------------------------------------------------- /website/_site/assets/figs/jericho_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/jericho_all_games.png -------------------------------------------------------------------------------- /website/_site/assets/figs/radar_chart.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/radar_chart.png -------------------------------------------------------------------------------- /website/_site/assets/figs/radar_chart_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/radar_chart_zoom.png -------------------------------------------------------------------------------- /website/_site/assets/figs/scienceworld_all_games.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/scienceworld_all_games.png -------------------------------------------------------------------------------- /website/_site/assets/figs/text-benchmark_bar_chart.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_bar_chart.png -------------------------------------------------------------------------------- /website/_site/assets/figs/text-benchmark_radar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_radar.png -------------------------------------------------------------------------------- /website/_site/assets/figs/text-benchmark_radar_zoom.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_radar_zoom.png 
/**
 * Show the top-level tab whose element id is `tabName` and mark the clicked
 * button as active. All other ".tab-content" elements are hidden and all
 * other ".tab-button" elements lose their "active" class.
 *
 * @param {Event} evt - Click event; `evt.currentTarget` is the tab button.
 * @param {string} tabName - Id of the tab content element to display.
 */
function openTab(evt, tabName) {
    var i, tabcontent, tabbuttons, target;

    // Hide all tab content
    tabcontent = document.getElementsByClassName("tab-content");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }

    // Remove "active" class from all tab buttons. classList handles the
    // class wherever it sits in the attribute, unlike a string replace.
    tabbuttons = document.getElementsByClassName("tab-button");
    for (i = 0; i < tabbuttons.length; i++) {
        tabbuttons[i].classList.remove("active");
    }

    // Show the current tab and add "active" class to the button
    target = document.getElementById(tabName);
    if (target) {
        target.style.display = "block";
    }
    if (evt && evt.currentTarget) {
        evt.currentTarget.classList.add("active");
    }
}

/**
 * Nested tab functionality: same as openTab, but restricted to the
 * ".nested-tab-content" / ".nested-tab-button" elements located inside the
 * ".tab-content" ancestor of the clicked button.
 *
 * @param {Event} evt - Click event; `evt.currentTarget` is the nested button.
 * @param {string} tabName - Id of the nested tab content element to display.
 */
function openNestedTab(evt, tabName) {
    var i, tabcontent, tabbuttons, target;
    var button = evt && evt.currentTarget;

    // Hide all nested tab content within the parent tab. Fall back to the
    // whole document if the button is not inside a ".tab-content" element.
    var parentTab = (button && button.closest('.tab-content')) || document;
    tabcontent = parentTab.getElementsByClassName("nested-tab-content");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }

    // Remove "active" class from all nested tab buttons
    tabbuttons = parentTab.getElementsByClassName("nested-tab-button");
    for (i = 0; i < tabbuttons.length; i++) {
        tabbuttons[i].classList.remove("active");
    }

    // Show the current nested tab and add "active" class to the button
    target = document.getElementById(tabName);
    if (target) {
        target.style.display = "block";
    }
    if (button) {
        button.classList.add("active");
    }
}

// Initialize tabs
document.addEventListener('DOMContentLoaded', function() {
    // Activate the first top-level tab by default. Pages without any tab
    // button are left untouched instead of throwing.
    // NOTE(review): nothing here activates a nested tab; confirm whether the
    // first nested tab is expected to be opened as well.
    var firstButton = document.querySelector('.tab-button');
    if (firstButton) {
        firstButton.click();
    }
});
55 |
56 |
57 | 58 | 59 | 60 | 61 | 62 |
63 | 64 |
65 | 66 |
67 | 68 | 69 |
70 | 71 |
72 |

Overview

73 | Insert overview description here. 74 |
75 | 76 |
77 |

Environments

78 | 79 |
80 |
81 | 82 |
83 | 84 |
85 | 86 | 87 | 88 | 89 | 90 |
91 | 92 |
93 |

Scores for all Textworld games for Top 9 models

94 |

tw_allgames chart

95 |
96 | 97 |
98 |

Scores for all Textworld Express games for Top 9 models

99 |

twx_allgames chart

100 |
101 | 102 |
103 |

Scores for all Alfworld games for Top 9 models

104 |

alfw_allgames chart

105 |
106 | 107 |
108 |

Scores for all Scienceworld games for Top 9 models

109 |

sciencew_allgames chart

110 |
111 | 112 |
113 |

Scores for all Jericho games for Top 9 models

114 |

jericho_allgames chart

115 |
116 | 117 |
118 |

Scores for all Jericho games for Top 9 models

119 |

jerichoallgames chart

120 |
121 |
122 | 123 |
124 | 125 |

Breakdown of scores per framework

126 |

fws chart

127 |
128 | 129 |
130 | 131 |

Tab 4 Content

132 |

This is where you'll put the content for Tab 4.

133 |
134 | 135 |
136 | 137 |

Tab 5 Content

138 |

This is where you'll put the content for Tab 5.

139 |
140 |
141 |
156 | 157 | 158 | --------------------------------------------------------------------------------