├── .github
└── workflows
│ ├── codeql.yml
│ ├── formatter.yml
│ └── publish-website.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
└── launch.json
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RESPONSIBLE_AI.md
├── SECURITY.md
├── SUPPORT.md
├── agents
├── human.py
├── llm.py
├── llm_walkthrough.py
├── random.py
├── react.py
├── reasoning.py
└── walkthrough.py
├── benchmark.py
├── docs
└── website
│ ├── Gemfile
│ ├── _config.yml
│ ├── _includes
│ ├── footer.html
│ ├── head-custom.html
│ ├── simple_table.md
│ ├── table.md
│ └── test.md
│ ├── _layouts
│ └── default.html
│ ├── _site
│ ├── assets
│ │ └── css
│ │ │ └── style.css
│ └── index.html
│ ├── assets
│ ├── css
│ │ ├── custom.css
│ │ └── style.scss
│ ├── figs
│ │ ├── alfworld_all_games.png
│ │ ├── alfworld_image.png
│ │ ├── all_framework_scores.png
│ │ ├── arxiv-logomark-small.svg
│ │ ├── arxiv-logomark.svg
│ │ ├── figure1_eric.png
│ │ ├── github-mark.svg
│ │ ├── jericho_all_games.png
│ │ ├── jericho_image.png
│ │ ├── pull_run_data.ipynb
│ │ ├── radar_chart.png
│ │ ├── radar_chart_zoom.png
│ │ ├── scienceworld_all_games.png
│ │ ├── scienceworld_image.png
│ │ ├── simon_says_chatgpt.png
│ │ ├── static_banner.png
│ │ ├── text-benchmark_bar_chart.png
│ │ ├── text-benchmark_radar.png
│ │ ├── text-benchmark_radar_zoom.png
│ │ ├── textworld_all_games.png
│ │ ├── textworld_express_all_games.png
│ │ ├── textworld_image.png
│ │ └── zork1.png
│ ├── js
│ │ └── tabs.js
│ └── videos
│ │ └── figure1v4.mp4
│ ├── favicon.ico
│ └── index.md
├── print_results.py
├── pyproject.toml
├── requirements.txt
├── scripts
└── example_script.sh
├── tales
├── __init__.py
├── agent.py
├── alfworld
│ ├── __init__.py
│ ├── alfworld_data.py
│ └── alfworld_env.py
├── config.py
├── download.py
├── jericho
│ ├── __init__.py
│ ├── games.json
│ ├── jericho_data.py
│ └── jericho_env.py
├── logger.py
├── scienceworld
│ ├── __init__.py
│ ├── scienceworld_data.py
│ └── scienceworld_env.py
├── textworld
│ ├── __init__.py
│ ├── textworld_data.py
│ └── textworld_env.py
├── textworld_express
│ ├── __init__.py
│ ├── twx_data.py
│ └── twx_env.py
├── token.py
├── utils.py
└── version.py
└── website
├── Gemfile.lock
└── _site
├── assets
├── css
│ └── style.css
├── figs
│ ├── alfworld_all_games.png
│ ├── all_framework_scores.png
│ ├── jericho_all_games.png
│ ├── pull_run_data.ipynb
│ ├── radar_chart.png
│ ├── radar_chart_zoom.png
│ ├── scienceworld_all_games.png
│ ├── text-benchmark_bar_chart.png
│ ├── text-benchmark_radar.png
│ ├── text-benchmark_radar_zoom.png
│ ├── textworld_all_games.png
│ └── textworld_express_all_games.png
└── js
│ └── tabs.js
├── favicon.ico
└── index.html
/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
1 | # For most projects, this workflow file will not need changing; you simply need
2 | # to commit it to your repository.
3 | #
4 | # You may wish to alter this file to override the set of languages analyzed,
5 | # or to provide custom queries or build logic.
6 | #
7 | # ******** NOTE ********
8 | # We have attempted to detect the languages in your repository. Please check
9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 |
14 | on:
15 | push:
16 | branches: [ "main" ]
17 | pull_request:
18 | # The branches below must be a subset of the branches above
19 | branches: [ "main" ]
20 | schedule:
21 | - cron: '37 20 * * 3'
22 |
23 | jobs:
24 | analyze:
25 | name: Analyze
26 | runs-on: ubuntu-latest
27 | permissions:
28 | actions: read
29 | contents: read
30 | security-events: write
31 |
32 | strategy:
33 | fail-fast: false
34 | matrix:
35 | language: [ "python" ]
36 | # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 | # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 |
39 | steps:
40 | - name: Checkout repository
41 | uses: actions/checkout@v4
42 | # Install Python dependencies manually
43 | - name: Set up python
44 | uses: actions/setup-python@v5
45 | with:
46 | python-version: '3.12'
47 | cache: 'pip'
48 | # flash-attn requires torch to be installed
49 | - name: Install dependencies
50 | run: |
51 | pip install --upgrade pip
52 | pip install -e ".[dev]"
53 | # Initializes the CodeQL tools for scanning.
54 | - name: Initialize CodeQL
55 | uses: github/codeql-action/init@v3
56 | with:
57 | languages: python
58 | # languages: ${{ matrix.language }}
59 | # If you wish to specify custom queries, you can do so here or in a config file.
60 | # By default, queries listed here will override any specified in a config file.
61 | # Prefix the list here with "+" to use these queries and those in the config file.
62 |
63 | # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
64 | # queries: security-extended,security-and-quality
65 |
66 |
67 | # Autobuild attempts to build any compiled languages (C/C++, C#, Go, or Java).
68 | # If this step fails, then you should remove it and run the build manually (see below)
69 | - name: Autobuild
70 | uses: github/codeql-action/autobuild@v3
71 |
72 | # ℹ️ Command-line programs to run using the OS shell.
73 | # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
74 |
75 | # If the Autobuild fails above, remove it and uncomment the following three lines.
76 | # modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
77 |
78 | # - run: |
79 | # echo "Run, Build Application using script"
80 | # ./location_of_script_within_repo/buildscript.sh
81 |
82 | - name: Perform CodeQL Analysis
83 | uses: github/codeql-action/analyze@v3
84 | with:
85 | category: "/language:${{matrix.language}}"
86 |
--------------------------------------------------------------------------------
/.github/workflows/formatter.yml:
--------------------------------------------------------------------------------
1 | name: "Formatter"
2 |
3 | on:
4 | push:
5 | branches: [ "main" ]
6 | pull_request:
7 | branches: [ "main" ]
8 | schedule:
9 | - cron: '37 20 * * 3'
10 |
11 | jobs:
12 | black:
13 | runs-on: ubuntu-latest
14 | steps:
15 | - uses: actions/checkout@v4
16 | - uses: psf/black@stable
17 | with:
18 | options: "--check --verbose --line-length 88"
19 |
20 | isort:
21 | runs-on: ubuntu-latest
22 | steps:
23 | - uses: actions/checkout@v4
24 | - uses: isort/isort-action@v1
25 | with:
26 | requirements-files: "requirements.txt"
27 | configuration: "--check-only --diff --profile black --filter-files --verbose"
28 |
--------------------------------------------------------------------------------
/.github/workflows/publish-website.yml:
--------------------------------------------------------------------------------
1 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages
2 | name: Deploy Website
3 |
4 | on:
5 | # Runs on pushes targeting the default branch
6 | push:
7 | branches: ["main"]
8 |
9 | # Allows you to run this workflow manually from the Actions tab
10 | workflow_dispatch:
11 |
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 | contents: read
15 | pages: write
16 | id-token: write
17 |
18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
20 | concurrency:
21 | group: "pages"
22 | cancel-in-progress: false
23 |
24 | jobs:
25 | # Build job
26 | build:
27 | runs-on: ubuntu-latest
28 | steps:
29 | - name: Checkout
30 | uses: actions/checkout@v4
31 | - name: Setup Pages
32 | uses: actions/configure-pages@v5
33 | - name: Build with Jekyll
34 | uses: actions/jekyll-build-pages@v1
35 | with:
36 | source: ./docs/website
37 | destination: ./_site
38 | - name: Upload artifact
39 | uses: actions/upload-pages-artifact@v3
40 |
41 | # Deployment job
42 | deploy:
43 | environment:
44 | name: github-pages
45 | url: ${{ steps.deployment.outputs.page_url }}
46 | runs-on: ubuntu-latest
47 | needs: build
48 | steps:
49 | - name: Deploy to GitHub Pages
50 | id: deployment
51 | uses: actions/deploy-pages@v4
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | docs/website/_site/media
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
166 | # Logging
167 | wandb/
168 | logs/
169 |
170 | # Compute
171 | .amltconfig
172 | .amltignore
173 | amlt/
174 |
175 | # Website
176 | docs/website/_site
177 | docs/website/Gemfile.lock
178 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | repos:
2 | - repo: https://github.com/pycqa/isort
3 | rev: 6.0.0
4 | hooks:
5 | - id: isort
6 | args: ["--profile", "black", "--filter-files"]
7 |
8 | - repo: https://github.com/psf/black
9 | rev: 24.4.2
10 | hooks:
11 | - id: black
12 | args: ["--line-length", "88"]
--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
1 | {
2 | // Use IntelliSense to learn about possible attributes.
3 | // Hover to view descriptions of existing attributes.
4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5 | "version": "0.2.0",
6 | "configurations": [
7 | {
8 | "name": "Python Debugger: Current File",
9 | "type": "debugpy",
10 | "request": "launch",
11 | "program": "benchmark.py",
12 | "console": "integratedTerminal",
13 | "args": ["--games", "games/detective.z5", "games/advent.z5", "--agent", "agent_llm.py:LLMAgent", "--llm", "azure_openai", "--enable_wandb", "-vv", "--conversation", "--context", "100", "--admissible_commands"]
14 | }
15 | ]
16 | }
--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Microsoft Open Source Code of Conduct
2 |
3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
4 |
5 | Resources:
6 |
7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) Microsoft Corporation.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
2 | include README.md
3 | include LICENSE
4 | include pyproject.toml
5 |
6 | global-exclude */__pycache__/*
7 |
8 | prune wandb
9 | prune logs
10 | prune website
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # TALES: Text-Adventure Learning Environment Suite
2 | This repository contains the files needed to benchmark language agents on a curated list of text-based games from the following frameworks: [Jericho](https://github.com/microsoft/jericho), [TextWorld](https://github.com/microsoft/textworld), [TextWorld-Express](https://github.com/cognitiveailab/TextWorldExpress), [ScienceWorld](https://github.com/allenai/ScienceWorld), [ALFWorld](https://github.com/alfworld/alfworld).
3 |
4 | [[Technical Report](https://arxiv.org/abs/2504.14128)] [[Project Page](https://t.co/rFPMRoqO9y)]
5 |
6 | ## 1. Installation
7 |
8 | It is recommended to create and activate a conda or virtual environment. `tales` requires `Python>=3.12`:
9 |
10 | conda create -n tales python=3.12
11 | conda activate tales
12 |
13 | Then, install `tales` directly from PyPI:
14 |
15 | pip install tale-suite
16 |
17 | > [!WARNING]
18 | > The name of the Python package on PyPI is `tale-suite` and not `tales`.
19 |
20 | Alternatively, clone the repository and install locally:
21 |
22 | git clone https://github.com/microsoft/tale-suite
23 | cd tale-suite
24 | pip install -e .
25 |
26 | > [!WARNING]
27 | > You will need Java 1.8+ installed to run the environments TextWorld-Express and ScienceWorld.
28 | >
29 | > sudo apt update && sudo apt install openjdk-8-jre-headless -y
30 |
31 | Alternatively, if the above isn't working:
32 |
33 | > sudo apt-get update && sudo apt-get install default-jre default-jdk
34 |
35 | ### Using Docker
36 | We provide a pre-built docker image at
37 |
38 | docker pull czcui/twb:prebuilt
39 |
40 | [Please see the following docs page for more details on how to set up a local vllm for use with the text world benchmark.](https://docs.google.com/document/d/1Q5FtcNpYDpMLbyraJ1dSKxJLwOgLvWCECiPsnDkEq2Y/edit?usp=sharing)
41 |
42 | An example script can be found in the scripts folder.
43 |
44 | ## 2. Getting Started
45 |
46 | 1. Run benchmark evaluation on all the games for the specified random agent:
47 |
48 | ```bash
49 | python benchmark.py --agent agents/random.py random
50 | ```
51 | 2. Run benchmark evaluation on a subset of the games:
52 |
53 | ```bash
54 | python benchmark.py --agent agents/random.py random --env textworld
55 | ```
56 | 3. Run benchmark evaluation on specific games:
57 |
58 | ```bash
59 | python benchmark.py --agent agents/random.py random --envs JerichoEnvZork1 JerichoEnvDetective
60 | ```
61 | 4. Run benchmark evaluation as a human player (HumanAgent):
62 |
63 | ```bash
64 | python benchmark.py --agent agents/human.py human --envs TWCookingLevel1
65 | ```
66 | 5. Run benchmark evaluation where the ground-truth walkthrough is being followed:
67 |
68 | ```bash
69 | python benchmark.py --agent agents/walkthrough.py walkthrough --envs JerichoEnvZork1
70 | ```
71 |
72 | ## 3. Benchmarking LLMs
73 |
74 | In order to benchmark a given LLM acting as language agent playing text-based games, you will need to first configure it. `tales` is leveraging the [`llm`](https://llm.datasette.io/en/stable/) library to handle communication with different LLMs.
75 |
76 | python benchmark.py --agent agents/llm.py zero-shot --envs TWCookingLevel1
77 |
78 | ### API-based LLMs
79 |
80 | `llm` natively supports OpenAI models and self-hosted models that offer an OpenAI-compatible API (e.g. like vLLM does - more on this below).
81 |
82 | ### Adding support to other LLMs
83 |
84 | `llm` offers different plugins to include other LLMs. E.g.
85 |
86 | llm install llm-anthropic
87 |
88 | See the `llm` plugins [page](https://llm.datasette.io/en/stable/plugins/directory.html) for more information.
89 |
90 | ### Deploying a model locally using vLLM
91 |
92 | To serve a custom Hugging Face model with vLLM, one can use the vLLM Docker image like this:
93 |
94 | docker run --runtime nvidia --gpus all --restart unless-stopped --name vllm-Llama-3.1-8B-Instruct --env "HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}" -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 --ipc=host vllm/vllm-openai:latest --model meta-llama/Llama-3.1-8B-Instruct --tensor-parallel-size 4 --host 0.0.0.0
95 |
96 | Then, add the following entrypoint in `~/.config/io.datasette.llm/extra-openai-models.yaml`
97 |
98 | ```
99 | - model_id: meta-llama/Llama-3.1-8B-Instruct
100 | model_name: meta-llama/Llama-3.1-8B-Instruct
101 | api_base: "http://0.0.0.0:8000/v1"
102 | ```
103 |
104 | You can check that everything is working properly with this simple command:
105 |
106 | llm -m meta-llama/Llama-3.1-8B-Instruct "Hi. What's your name?"
107 |
108 | ## 4. Building Custom Agents
109 |
110 | To build a custom agent, create a new file (e.g., `custom.py`) in the agents folder, implement a subclass of `tales.Agent`, provide an argument-parser builder, and register the agent under a name.
111 |
112 | ```python
113 | from typing import Dict, Any
114 | import tales
115 |
116 | class CustomAgent(tales.Agent):
117 |
118 | def act(self, obs: str, reward: float, done: bool, infos: Dict[str, Any]) -> str:
119 | # ...
120 | return "help"
121 |
122 |
123 | def build_argparser(parser=None):
124 | return parser or argparse.ArgumentParser()
125 |
126 |
127 | register(
128 | name="my-agent",
129 | desc=(
130 | "This is a custom agent that always output 'help' as a text action."
131 | ),
132 | klass=CustomAgent,
133 | add_arguments=build_argparser,
134 | )
135 | ```
136 |
137 | You can then use this agent by specifying the path to the file and the registered agent name in the `--agent` argument.
138 |
139 | python benchmark.py --agent agents/custom.py my-agent
140 |
141 | > [!NOTE]
142 | > See the [agents folder](https://github.com/microsoft/tale-suite/tree/main/agents) for more concrete examples.
143 |
144 | ## Citation
145 | ```
146 | @article{cui2025tales,
147 | title={TALES: Text-Adventure Learning Environment Suite},
148 | author={Christopher Cui and Xingdi Yuan and Ziang Xiao and Prithviraj Ammanabrolu and Marc-Alexandre C\^ot\'e},
149 | journal={arXiv preprint arXiv:2504.14128},
150 | year={2025},
151 | url={https://arxiv.org/abs/2504.14128}
152 | }
153 | ```
154 |
155 | If you use this benchmark, please consider citing the original frameworks as well.
156 | ```
157 | @article{cote18textworld,
158 | author = {Marc-Alexandre C\^ot\'e and \'Akos K\'ad\'ar and Xingdi Yuan and Ben Kybartas and Tavian Barnes and Emery Fine and James Moore and Ruo Yu Tao and Matthew Hausknecht and Layla El Asri and Mahmoud Adada and Wendy Tay and Adam Trischler},
159 | title = {TextWorld: A Learning Environment for Text-based Games},
160 | journal = {CoRR},
161 | volume = {abs/1806.11532},
162 | year = {2018}
163 | }
164 | @article{jansen2022textworldexpress,
165 | url = {https://arxiv.org/abs/2208.01174},
166 | author = {Jansen, Peter A. and Côté, Marc-Alexandre},
167 | title = {TextWorldExpress: Simulating Text Games at One Million Steps Per Second},
168 | journal = {arXiv},
169 | year = {2022},
170 | }
171 | @inproceedings{hausknecht2020interactive,
172 | title={Interactive fiction games: A colossal adventure},
173 | author={Hausknecht, Matthew and Ammanabrolu, Prithviraj and C{\^o}t{\'e}, Marc-Alexandre and Yuan, Xingdi},
174 | booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
175 | volume={34},
176 | number={05},
177 | year={2020}
178 | }
179 | @inproceedings{ALFWorld20,
180 | title ={{ALFWorld: Aligning Text and Embodied Environments for Interactive Learning}},
181 | author={Mohit Shridhar and Xingdi Yuan and Marc-Alexandre C\^ot\'e and Yonatan Bisk and Adam Trischler and Matthew Hausknecht},
182 | booktitle = {Proceedings of the International
183 | Conference on Learning Representations (ICLR)},
184 | year = {2021},
185 | url = {https://arxiv.org/abs/2010.03768}}
186 | @misc{scienceworld2022,
187 | title={ScienceWorld: Is your Agent Smarter than a 5th Grader?},
188 | author={Ruoyao Wang and Peter Jansen and Marc-Alexandre C{\^o}t{\'e} and Prithviraj Ammanabrolu},
189 | year={2022},
190 | eprint={2203.07540},
191 | archivePrefix={arXiv},
192 | primaryClass={cs.CL},
193 | url={https://arxiv.org/abs/2203.07540}
194 | }
195 | ```
196 |
197 | ## Contributing
198 |
199 | This project welcomes contributions and suggestions. Most contributions require you to agree to a
200 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
201 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
202 |
203 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
204 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
205 | provided by the bot. You will only need to do this once across all repos using our CLA.
206 |
207 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
208 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
209 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
210 |
211 | ## Trademarks
212 |
213 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
214 | trademarks or logos is subject to and must follow
215 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
216 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
217 | Any use of third-party trademarks or logos are subject to those third-party's policies.
218 |
219 | ## Privacy
220 | This framework does not collect users' personal data. For more information about Microsoft's privacy policies, please see the [Microsoft Privacy Statement](https://www.microsoft.com/en-ca/privacy/privacystatement).
221 |
222 | ## Responsible AI
223 | Please see our [Responsible AI Statement](https://github.com/microsoft/tale-suite/blob/main/RESPONSIBLE_AI.md).
--------------------------------------------------------------------------------
/RESPONSIBLE_AI.md:
--------------------------------------------------------------------------------
1 | # TALES - Text Adventure Learning Environment Suite
2 |
3 | TALES is a benchmark, which consists of a diverse collection of synthetic and human-written text-adventure games designed to evaluate reasoning capabilities of Large Language Model (LLM)-based agents.
4 |
5 | ### WHAT CAN TALES DO
6 |
7 | TALES was developed to evaluate LLM-based agents’ capabilities to solve text-adventure games. Text-adventure games are goal-oriented environments where an agent is required to interact with a game engine in a multi-step setting to understand the goal, explore the game world, find clues, and plan its way towards solving the game. We curated the set of games in TALES in a way to cover a diverse spectrum of reasoning skills an LLM-based agent may need in solving real-world tasks, such as inductive reasoning, deductive reasoning, spatial reasoning, and grounded reasoning. We believe that, while being much more cost-efficient than realistic tasks, testing LLM-based agents’ performance on TALES can provide useful insights in evaluating the agents from different aspects, including LLM backbones, agent architecture design, and prompt engineering. These insights can further guide practitioners in developing their agents in use cases beyond text-adventure games.
8 |
9 | A detailed discussion of TALES, including how it was developed and tested, can be found in our paper at: https://arxiv.org/abs/2504.14128
10 |
11 |
12 | ### INTENDED USES
13 |
14 | TALES is best suited for evaluating AI agents’ capability of solving text-adventure games.
15 |
16 | TALES is being shared with the research community to facilitate reproduction of our results and foster further research in this area.
17 |
18 | TALES is intended to be used by domain experts who are independently capable of evaluating the quality of outputs before acting on them.
19 |
20 | ### OUT-OF-SCOPE USES
21 |
22 | TALES is designed exclusively for evaluation; it is not well suited for training AI agents.
23 |
24 | We developed TALES for research purposes only; the benchmark does not cover all necessary criteria for real-world decision making. We do not recommend using TALES in any way to make real-world decisions.
25 |
26 | ### LIMITATIONS
27 |
28 | TALES was developed for research and experimental purposes. The games in the benchmark are exclusively selected to test LLM-based agents’ inductive reasoning, deductive reasoning, spatial reasoning, and grounded reasoning capabilities. We acknowledge that in real-world scenarios, the decision-making process may require additional context, more complex reasoning, as well as the combination of multiple reasoning types. We do not claim that our research findings can be directly transferred into real-world decision making. Further testing and validation are needed before considering its application in commercial or real-world scenarios.
29 |
30 |
31 | TALES was designed and tested using the English language. Performance in other languages may vary and should be assessed by someone who is both an expert in the expected outputs and a native speaker of that language.
32 |
33 | Outputs generated by AI may include factual errors, fabrication, or speculation. Users are responsible for assessing the accuracy of generated content. All decisions leveraging outputs of the system should be made with human oversight and not be based solely on system outputs.
34 |
35 | ### BEST PRACTICES
36 |
37 | We strongly encourage users to use LLMs/MLLMs that support robust Responsible AI mitigations, such as Azure OpenAI (AOAI) services. Such services continually update their safety and RAI mitigations with the latest industry standards for responsible use. For more on AOAI’s best practices when employing foundation models for scripts and applications, see:
38 |
39 | [Blog post on responsible AI features in AOAI that were presented at Ignite 2023](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-new-ai-safety-amp-responsible-ai-features-in-azure/ba-p/3983686)
40 |
41 | [Overview of Responsible AI practices for Azure OpenAI models](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/overview)
42 |
43 | [Azure OpenAI Transparency Note](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/transparency-note)
44 |
45 | [OpenAI’s Usage policies](https://openai.com/policies/usage-policies)
46 |
47 | [Azure OpenAI’s Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct)
48 |
49 | TALES contains a set of text adventure games specifically curated to fulfill our research on LLM-based agents’ capability of performing certain types of reasoning. We refer practitioners to our paper https://arxiv.org/abs/2504.14128 for detailed guidance on how to properly use this benchmark and how to correctly interpret an LLM-based agent’s results on this benchmark. Additionally, we recommend that practitioners use TALES in concert with other benchmarks to understand LLM-based agents’ performance and capabilities from multiple perspectives and thus gain a less biased view.
50 |
51 | ### LICENSE
52 |
53 | We use the MIT license; please see the [license file](https://github.com/microsoft/tale-suite/blob/main/LICENSE).
54 |
55 | ### CONTACT
56 |
57 | We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us via [GitHub issues](https://github.com/microsoft/tale-suite/issues) or at textworld@microsoft.com.
58 |
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ## Security
4 |
5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
6 |
7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
8 |
9 | ## Reporting Security Issues
10 |
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 |
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 |
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 |
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc).
18 |
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 |
21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 | * Full paths of source file(s) related to the manifestation of the issue
23 | * The location of the affected source code (tag/branch/commit or direct URL)
24 | * Any special configuration required to reproduce the issue
25 | * Step-by-step instructions to reproduce the issue
26 | * Proof-of-concept or exploit code (if possible)
27 | * Impact of the issue, including how an attacker might exploit the issue
28 |
29 | This information will help us triage your report more quickly.
30 |
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 |
33 | ## Preferred Languages
34 |
35 | We prefer all communications to be in English.
36 |
37 | ## Policy
38 |
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 |
41 |
42 |
--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
1 | # Support
2 |
3 | ## How to file issues and get help
4 |
5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
6 | issues before filing new issues to avoid duplicates. For new issues, file your bug or
7 | feature request as a new Issue.
8 |
9 | For help and questions about using this project, please email textworld@microsoft.com.
10 |
11 | ## Microsoft Support Policy
12 |
13 | Support for this project is limited to the resources listed above.
14 |
--------------------------------------------------------------------------------
/agents/human.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import sys
3 |
4 | import tales
5 | from tales.agent import register
6 | from tales.token import get_token_counter
7 | from tales.utils import format_messages_to_markdown, merge_messages
8 |
9 | prompt_toolkit_available = False
10 | try:
11 | # For command line history and autocompletion.
12 | from prompt_toolkit import prompt
13 | from prompt_toolkit.completion import WordCompleter
14 | from prompt_toolkit.history import InMemoryHistory
15 |
16 | prompt_toolkit_available = sys.stdout.isatty()
17 | except ImportError:
18 | pass
19 |
20 |
class HumanAgent(tales.Agent):
    """Agent whose actions are typed by a person at the terminal.

    When ``prompt_toolkit`` is importable and stdout is a TTY, the prompt
    offers completion over the admissible commands and an in-memory input
    history; otherwise it falls back to the built-in ``input``.
    """

    def __init__(self, *args, **kwargs):
        # prompt_toolkit history object; stays None when plain input() is used.
        self._history = InMemoryHistory() if prompt_toolkit_available else None
        # (observation, action) pairs of the turns played so far.
        self.history = []
        self.token_counter = get_token_counter()

    @property
    def uid(self):
        """Constant identifier for this agent."""
        return "HumanAgent"

    @property
    def params(self):
        """Parameters describing this agent."""
        return {"agent_type": "human"}

    def act(self, obs, reward, done, infos):
        """Ask the user for the next action.

        Args:
            obs: Text observation for the current step.
            reward: Unused.
            done: Unused.
            infos: Mapping that may hold ``"admissible_commands"``.

        Returns:
            ``(action, stats)`` where ``action`` is the stripped user input
            and ``stats`` holds the markdown-formatted prompt, the raw
            input and the token count.
        """
        commands = infos.get("admissible_commands", [])
        if not prompt_toolkit_available:
            if commands:
                print("Available actions: {}\n".format(commands))

            typed = input("\n> ")
        else:
            typed = prompt(
                "\n> ",
                completer=WordCompleter(commands, ignore_case=True, sentence=True),
                history=self._history,
                enable_history_search=True,
            )

        # No LLM is queried here; the messages are built only so the usage
        # statistics have the same shape as those of the LLM-based agents.
        turn_prompt = f"{obs}\n> "
        messages = self.build_messages(turn_prompt)

        action = typed.strip()
        self.history.append((turn_prompt, f"{action}\n"))

        stats = {
            "prompt": format_messages_to_markdown(messages),
            "response": typed,
            "nb_tokens": self.token_counter(messages=messages, text=typed),
        }
        return action, stats

    def build_messages(self, observation):
        """Return the chat transcript: past turns followed by ``observation``."""
        messages = []
        for past_obs, past_action in self.history:
            messages += [
                {"role": "user", "content": past_obs},
                {"role": "assistant", "content": past_action},
            ]

        messages.append({"role": "user", "content": observation})

        # Collapse consecutive messages coming from the same role, if any.
        return merge_messages(messages)
93 |
94 |
def build_argparser(parser=None):
    """Attach the (currently empty) "HumanAgent settings" group to a parser.

    Args:
        parser: Parser to extend; a new ``ArgumentParser`` is created when
            none is supplied.

    Returns:
        The parser that received the argument group.
    """
    parser = parser if parser else argparse.ArgumentParser()
    parser.add_argument_group("HumanAgent settings")
    return parser
99 |
100 |
# Hand HumanAgent and its argument-parser hook to tales.agent.register
# under the name "human".
register(
    name="human",
    klass=HumanAgent,
    add_arguments=build_argparser,
    desc="Manually decide which action to take.",
)
107 |
--------------------------------------------------------------------------------
/agents/llm.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import llm
4 | import numpy as np
5 | from tenacity import (
6 | retry,
7 | retry_if_exception,
8 | stop_after_attempt,
9 | wait_random_exponential,
10 | )
11 |
12 | import tales
13 | from tales.agent import register
14 | from tales.token import get_token_counter
15 | from tales.utils import (
16 | format_messages_to_markdown,
17 | is_recoverable_error,
18 | merge_messages,
19 | messages2conversation,
20 | )
21 |
# System prompt sent with every request of the zero-shot agent; the sentences
# are joined with single spaces.
SYSTEM_PROMPT = " ".join(
    [
        "You are playing a text-based game and your goal is to finish it with the highest score.",
        "Upon reading the text observation, provide a *single* short phrase to interact with the game, e.g. `get lamp` (without the backticks).",
        "When stuck, try using the `help` command to see what commands are available.",
    ]
)
27 |
28 |
class LLMAgent(tales.Agent):
    """Zero-shot agent: one LLM call per step returns the next game command.

    Expected keyword arguments: ``llm`` (model name), ``seed``,
    ``context_limit`` (``None`` or a positive number of past turns),
    ``act_temp``, ``conversation`` and, optionally, ``key``.
    """

    def __init__(self, *args, **kwargs):
        self.llm = kwargs["llm"]
        self.model = llm.get_model(self.llm)
        self.token_counter = get_token_counter(self.model)
        # These models are treated as not accepting a system prompt; for them
        # the system text is folded into the first user message instead.
        self.allows_system_prompt = self.llm not in ["o1-mini", "o1-preview"]

        # Provide the API key, if one is needed and has been provided
        self.model.key = llm.get_key(
            kwargs.get("key"), kwargs["llm"], self.model.key_env_var
        ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var)

        self.seed = kwargs["seed"]
        self.rng = np.random.RandomState(self.seed)

        # (observation, action) pairs of the turns played so far.
        self.history = []
        self.context_limit = kwargs["context_limit"]
        if self.context_limit is not None:
            assert self.context_limit > 0, "--context-limit must be greater than 0."

        self.act_temp = kwargs["act_temp"]
        self.conversation = kwargs["conversation"]

    @property
    def uid(self):
        """Identifier built from the model name and the agent settings."""
        return (
            f"LLMAgent_{self.llm}"
            f"_s{self.seed}"
            f"_c{self.context_limit}"
            f"_t{self.act_temp}"
            f"_conv{self.conversation}"
        )

    @property
    def params(self):
        """Agent settings as a plain dictionary."""
        return {
            "agent_type": "zero-shot",
            "llm": self.llm,
            "seed": self.seed,
            "context_limit": self.context_limit,
            "act_temp": self.act_temp,
            "conversation": self.conversation,
        }

    # Retried (up to 100 attempts, randomized exponential wait capped by
    # max=40) while `is_recoverable_error` accepts the raised exception.
    @retry(
        retry=retry_if_exception(is_recoverable_error),
        wait=wait_random_exponential(multiplier=1, max=40),
        stop=stop_after_attempt(100),
    )
    def _llm_call_from_conversation(self, conversation, *args, **kwargs):
        """Prompt ``conversation`` and return the fully computed response."""
        response = conversation.prompt(*args, **kwargs)
        response.duration_ms()  # Forces the response to be computed.
        return response

    def _llm_call_from_messages(self, messages, *args, **kwargs):
        """Query the model with ``messages``.

        The last message is sent as the prompt; the first one is sent as
        the system prompt only when the model allows one.
        """
        conversation = messages2conversation(self.model, messages)
        prompt = messages[-1]["content"]
        system = messages[0]["content"] if self.allows_system_prompt else None

        return self._llm_call_from_conversation(
            conversation, prompt=prompt, system=system, *args, **kwargs
        )

    def act(self, obs, reward, done, infos):
        """Choose the next action for observation ``obs``.

        Returns:
            ``(action, stats)`` where ``action`` is the stripped model output
            and ``stats`` holds the markdown-formatted prompt, the raw
            response text and the token count.
        """
        messages = self.build_messages(f"{obs}\n> ")
        llm_kwargs = {
            "temperature": self.act_temp,
            "max_tokens": 100,  # Text actions are short phrases.
            "seed": self.seed,
            "stream": False,
        }
        if self.llm in [
            "claude-3.5-haiku",
            "claude-3.5-sonnet",
            "claude-3.5-sonnet-latest",
        ]:
            # For these models, we cannot set the seed.
            llm_kwargs.pop("seed", None)

        if "gemini" in self.llm or "gemma" in self.llm:
            # For these models, we cannot set the seed and max_tokens has a different name.
            # The default keeps this safe even if the seed was already removed.
            llm_kwargs.pop("seed", None)
            llm_kwargs["max_output_tokens"] = llm_kwargs.pop("max_tokens")

        response = self._llm_call_from_messages(messages, **llm_kwargs)

        text = response.text()
        action = text.strip()
        self.history.append((f"{obs}\n> ", f"{action}\n"))

        # Compute usage statistics
        stats = {
            "prompt": format_messages_to_markdown(messages),
            "response": text,
            "nb_tokens": self.token_counter(messages=messages, text=text),
        }

        return action, stats

    def build_messages(self, observation):
        """Assemble the chat messages for the next LLM call.

        Args:
            observation: Text of the current turn (already ending with the
                ``"> "`` prompt marker when called from ``act``).

        Returns:
            A list of ``{"role", "content"}`` dicts. It starts with the system
            message when the model allows one; otherwise the system prompt is
            prepended to the first user message.
        """
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        limit = self.context_limit or len(self.history) + 1

        for i, (obs, action) in enumerate(self.history[-limit:]):
            if len(self.history) >= limit and i == 0:
                # Replace the oldest retained observation with a truncation
                # notice (its action is kept).
                # NOTE(review): `>=` also fires when exactly `limit` turns
                # exist, i.e. before any turn has been dropped -- confirm
                # this is intended.
                obs = (
                    f"// History has been truncated to the last {limit} steps.\n...\n> "
                )

            messages.append({"role": "user", "content": obs})
            messages.append({"role": "assistant", "content": action})

        messages.append({"role": "user", "content": observation})

        # Just in case, let's avoid having multiple messages from the same role.
        messages = merge_messages(messages)

        if not self.conversation:
            # Merge all messages into a single message except for the system.
            content = "".join([msg["content"] for msg in messages[1:]])
            messages = messages[:1] + [{"role": "user", "content": content}]

        if not self.allows_system_prompt:
            # Drop the system message and prepend its text to the message that
            # followed it. After pop(0) that message sits at index 0; indexing
            # [1] raised IndexError in single-prompt mode and otherwise hit
            # the assistant turn.
            messages.pop(0)
            messages[0]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[0]['content']}"

        return messages
158 |
159 |
def build_argparser(parser=None):
    """Add the "LLMAgent settings" options to a parser.

    Args:
        parser: Parser to extend; a new ``ArgumentParser`` is created when
            none is supplied.

    Returns:
        The parser, now accepting ``--llm``, ``--seed``, ``--act-temp``,
        ``--context-limit`` and the mandatory ``--conversation`` /
        ``--no-conversation`` switch.
    """
    parser = parser if parser else argparse.ArgumentParser()
    settings = parser.add_argument_group("LLMAgent settings")

    # (flag, add_argument keyword arguments), in declaration order.
    option_specs = (
        (
            "--llm",
            {
                "default": "gpt-4o-mini",
                "help": "LLM to be used for evaluation. Default: %(default)s",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 20241001,
                "help": "Seed for LLM (not all endpoints support this). Default: %(default)s",
            },
        ),
        (
            "--act-temp",
            {
                "type": float,
                "default": 0.0,
                "help": "Temperature for LLM when taking actions. Default: %(default)s",
            },
        ),
        (
            "--context-limit",
            {
                "type": int,
                "help": "Limit context for LLM (in conversation turns). Default: no limit.",
            },
        ),
        (
            "--conversation",
            {
                "required": True,
                "action": argparse.BooleanOptionalAction,
                "help": "Enable conversation mode. Otherwise, use single prompt.",
            },
        ),
    )
    for flag, spec in option_specs:
        settings.add_argument(flag, **spec)

    return parser
194 |
195 |
# Hand LLMAgent and its argument-parser hook to tales.agent.register
# under the name "zero-shot".
register(
    name="zero-shot",
    klass=LLMAgent,
    add_arguments=build_argparser,
    desc="This agent uses a LLM to decide which action to take in a zero-shot manner.",
)
204 |
--------------------------------------------------------------------------------
/agents/llm_walkthrough.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import gymnasium as gym
4 |
5 | from agents.llm import LLMAgent
6 | from tales.agent import register
7 | from tales.utils import merge_messages
8 |
9 |
10 | # For the LLMWlkThrAgent, the sysprompt is initialized in the __init__ function as we need to change it once we extract the walkthrough from the env
class LLMWalkThroughAgent(LLMAgent):
    """LLM agent whose system prompt contains the game's walkthrough.

    The system prompt is a placeholder until ``reset`` extracts the
    walkthrough from the environment info and builds the real prompt.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Replaced by `reset` once the walkthrough is known.
        self.sys_prompt = "Not Initialized"

    @property
    def uid(self):
        """Identifier built from the model name and the agent settings."""
        # NOTE(review): `self.conversation is not None` is True for any
        # boolean value, so this uid does not distinguish conversation mode
        # from single-prompt mode. Left unchanged so existing run identifiers
        # stay stable -- confirm whether that is intended.
        return (
            f"LLMAgent_{self.llm}"
            f"_s{self.seed}"
            f"_c{self.context_limit}"
            f"_t{self.act_temp}"
            f"_conv{self.conversation is not None}"
            f"Walkthrough Agent"
        )

    def build_messages(self, observation):
        """Assemble the chat messages for the next LLM call.

        Same layout as ``LLMAgent.build_messages`` but uses the
        walkthrough-specific ``self.sys_prompt`` as system prompt.

        Returns:
            A list of ``{"role", "content"}`` dicts. It starts with the system
            message when the model allows one; otherwise the system prompt is
            prepended to the first user message.
        """
        messages = [{"role": "system", "content": self.sys_prompt}]
        limit = self.context_limit or len(self.history) + 1

        for i, (obs, action) in enumerate(self.history[-limit:]):
            if len(self.history) >= limit and i == 0:
                # Replace the oldest retained observation with a truncation
                # notice (its action is kept).
                obs = (
                    f"// History has been truncated to the last {limit} steps.\n...\n> "
                )

            messages.append({"role": "user", "content": obs})
            messages.append({"role": "assistant", "content": action})

        messages.append({"role": "user", "content": observation})

        # Just in case, let's avoid having multiple messages from the same role.
        messages = merge_messages(messages)

        if not self.conversation:
            # Merge all messages into a single message except for the system.
            content = "".join([msg["content"] for msg in messages[1:]])
            messages = messages[:1] + [{"role": "user", "content": content}]

        if not self.allows_system_prompt:
            # Drop the system message and prepend its text to the message that
            # followed it. After pop(0) that message sits at index 0; indexing
            # [1] raised IndexError in single-prompt mode and otherwise hit
            # the assistant turn.
            messages.pop(0)
            messages[0]["content"] = f"{self.sys_prompt}\n\n{messages[0]['content']}"

        return messages

    def reset(self, obs, info, env_name):
        """Validate the walkthrough and build the system prompt from it.

        Args:
            obs: Unused.
            info: Mapping expected to hold the walkthrough (a sequence of
                actions) under ``"extra.walkthrough"``.
            env_name: Environment name; ``tales/{env_name}-v0`` is
                instantiated to replay the walkthrough.

        Raises:
            ValueError: If the walkthrough is missing or empty, or if
                replaying it does not reach the maximum score.
        """
        walkthrough = info.get("extra.walkthrough")
        if walkthrough is None or len(walkthrough) < 1:
            raise ValueError("Walkthrough not initalized: Check the environment")

        # Check if the walkthrough is valid by replaying it in a scratch env.
        env = gym.make(f"tales/{env_name}-v0", disable_env_checker=True)
        try:
            _, _ = env.reset()

            for act in walkthrough:
                _, _, _, info_ = env.step(act)
        finally:
            # Release the scratch environment even if the replay fails.
            env.close()

        if info_["score"] != info_["max_score"]:
            raise ValueError(
                "Provided walkthrough does not successfully complete game."
            )

        numbered_walkthrough = ", ".join(
            f"{i + 1}.){act}" for i, act in enumerate(walkthrough)
        )
        # NOTE(review): the literals below join without a separator after
        # "maximum score" and before "Walkthrough:". The text is kept
        # byte-identical so prompts stay comparable with earlier runs.
        self.sys_prompt = (
            "You are playing a text-based game and your goal is to finish it with the highest score."
            " The following is a walkthrough in the form of a list of actions to beat the game."
            " You should follow this walkthrough as closely as possible to get the maximum score"
            " You must ONLY respond with the action you wish to take with no other special tokens."
            f"Walkthrough: {numbered_walkthrough}"
        )
87 |
88 |
def build_argparser(parser=None):
    """Add the walkthrough agent's "LLMAgent settings" options to a parser.

    Args:
        parser: Parser to extend; a new ``ArgumentParser`` is created when
            none is supplied.

    Returns:
        The parser, now accepting ``--llm``, ``--seed``, ``--act-temp``,
        ``--context-limit``, ``--conversation`` and ``--wlkthr-limit``.
    """
    parser = parser if parser else argparse.ArgumentParser()
    settings = parser.add_argument_group("LLMAgent settings")

    # (flag, add_argument keyword arguments), in declaration order.
    option_specs = (
        (
            "--llm",
            {
                "default": "gpt-4o-mini",
                "help": "LLM to be used for evaluation. Default: %(default)s",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 20241001,
                "help": "Seed for LLM (not all endpoints support this). Default: %(default)s",
            },
        ),
        (
            "--act-temp",
            {
                "type": float,
                "default": 0.0,
                "help": "Temperature for LLM when taking actions. Default: %(default)s",
            },
        ),
        (
            "--context-limit",
            {
                "type": int,
                "default": 10,
                "help": "Limit context for LLM (in conversation turns). Default: %(default)s",
            },
        ),
        (
            "--conversation",
            {
                "action": "store_true",
                "help": "Enable conversation mode. Otherwise, use single prompt.",
            },
        ),
        (
            "--wlkthr-limit",
            {
                "type": int,
                "default": 10000,
                "help": "Number of walkthrough actions to provide the LLM. Default: %(default)s",
            },
        ),
    )
    for flag, spec in option_specs:
        settings.add_argument(flag, **spec)

    return parser
129 |
130 |
# Hand LLMWalkThroughAgent and its argument-parser hook to
# tales.agent.register under the name "llm-walkthrough".
register(
    name="llm-walkthrough",
    klass=LLMWalkThroughAgent,
    add_arguments=build_argparser,
    desc="This agent uses the ground-truth walkthrough from the environment to attempt to progress through the game.",
)
139 |
--------------------------------------------------------------------------------
/agents/random.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import re
3 |
4 | import numpy as np
5 |
6 | import tales
7 | from tales.agent import register
8 | from tales.token import get_token_counter
9 |
10 |
class RandomAgent(tales.Agent):
    """Agent that picks its actions uniformly at random.

    It samples from the admissible commands when the environment provides a
    non-empty list, and from a fixed set of generic commands otherwise.
    """

    def __init__(self, **kwargs):
        self.seed = kwargs.get("seed", 1234)
        self.rng = np.random.RandomState(self.seed)
        self.token_counter = get_token_counter()

        # Generic commands used when no admissible commands are available.
        # fmt:off
        self.actions = [
            "north", "south", "east", "west", "up", "down",
            "look", "inventory",
            "drop", "take", "take all",
            "eat", "attack",
            "wait", "YES",
        ]
        # fmt:on

    @property
    def uid(self):
        """Identifier built from the seed."""
        return f"RandomAgent_s{self.seed}"

    @property
    def params(self):
        """Agent settings as a plain dictionary."""
        return {
            "agent_type": "random",
            "seed": self.seed,
        }

    def act(self, obs, reward, done, info):
        """Sample the next action.

        Args:
            obs: Text observation; also the source of candidate object words.
            reward: Unused.
            done: Unused.
            info: Mapping that may hold ``"admissible_commands"``.

        Returns:
            ``(action, stats)`` where ``action`` is always a plain ``str``
            and ``stats`` holds the token count of ``obs`` (no prompt or
            response, since no LLM is involved).
        """
        stats = {
            "prompt": None,
            "response": None,
            "nb_tokens": self.token_counter(text=obs),
        }

        admissible = info.get("admissible_commands")
        # Sampling from an empty sequence raises, so an empty or missing list
        # falls back to the generic commands below.
        if admissible is not None and len(admissible) > 0:
            # rng.choice yields a numpy string; convert it so both branches
            # return the same type.
            return str(self.rng.choice(admissible)), stats

        action = self.rng.choice(self.actions)
        if action in ["take", "drop", "eat", "attack"]:
            words = re.findall(
                r"\b[a-zA-Z]{4,}\b", obs
            )  # Extract words with 4 or more letters.
            if len(words) > 0:
                action += " " + self.rng.choice(words)

        return str(action), stats
57 |
58 |
def build_argparser(parser=None):
    """Add the "RandomAgent settings" options to a parser.

    Args:
        parser: Parser to extend; a new ``ArgumentParser`` is created when
            none is supplied.

    Returns:
        The parser, now accepting ``--seed``.
    """
    parser = parser if parser else argparse.ArgumentParser()
    settings = parser.add_argument_group("RandomAgent settings")
    seed_spec = {
        "type": int,
        "default": 20241001,
        "help": "Random generator seed to select actions. Default: %(default)s",
    }
    settings.add_argument("--seed", **seed_spec)
    return parser
69 |
70 |
# Hand RandomAgent and its argument-parser hook to tales.agent.register
# under the name "random".
register(
    name="random",
    klass=RandomAgent,
    add_arguments=build_argparser,
    desc=(
        "This agent will pick an action at random among a predefined set of actions or,"
        " if available, the admissible commands."
    ),
)
80 |
--------------------------------------------------------------------------------
/agents/react.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import llm
4 | import numpy as np
5 | from tenacity import (
6 | retry,
7 | retry_if_exception,
8 | stop_after_attempt,
9 | wait_random_exponential,
10 | )
11 | from termcolor import colored
12 |
13 | import tales
14 | from tales.agent import register
15 | from tales.token import get_token_counter
16 | from tales.utils import (
17 | format_messages_to_markdown,
18 | is_recoverable_error,
19 | log,
20 | merge_messages,
21 | messages2conversation,
22 | )
23 |
# System prompt sent with every request of the ReAct agent; the fragments are
# joined with single spaces.
SYSTEM_PROMPT = " ".join(
    [
        "You are playing a text-based game and your goal is to finish it with the highest score.",
        "Upon reading the text observation, generate a plan with subgoals when asked to think step-by-step,",
        "then provide a *single* short phrase to interact with the game when asked to do so, e.g. `get lamp` (without the backticks).",
        "When stuck, try using the `help` command to see what commands are available.",
    ]
)
30 |
31 |
class ReactAgent(tales.Agent):
    """CoT/ReAct agent: one LLM call to reason, then one to pick the action.

    Expected keyword arguments: ``llm`` (model name), ``seed``,
    ``context_limit`` (``None`` or a positive number of past turns),
    ``act_temp``, ``cot_temp``, ``cot_max_tokens``, ``conversation`` and,
    optionally, ``key``.
    """

    def __init__(self, *args, **kwargs):
        self.llm = kwargs["llm"]
        self.model = llm.get_model(self.llm)
        self.token_counter = get_token_counter(self.model)
        # These models are treated as not accepting a system prompt; for them
        # the system text is folded into the first user message instead.
        self.allows_system_prompt = self.llm not in ["o1-mini", "o1-preview"]

        # Provide the API key, if one is needed and has been provided
        self.model.key = llm.get_key(
            kwargs.get("key"), kwargs["llm"], self.model.key_env_var
        ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var)

        self.seed = kwargs["seed"]
        self.rng = np.random.RandomState(self.seed)

        # (observation, action) pairs of the turns played so far.
        self.history = []
        self.context_limit = kwargs["context_limit"]
        if self.context_limit is not None:
            assert self.context_limit > 0, "--context-limit must be greater than 0."

        self.act_temp = kwargs["act_temp"]
        self.cot_temp = kwargs["cot_temp"]
        self.cot_max_tokens = kwargs["cot_max_tokens"]
        self.conversation = kwargs["conversation"]

    @property
    def uid(self):
        """Identifier built from the model name and the agent settings."""
        return (
            f"ReactAgent_{self.llm}"
            f"_s{self.seed}"
            f"_c{self.context_limit}"
            f"_t{self.act_temp}"
            f"_cotT{self.cot_temp}"
            f"_cotN{self.cot_max_tokens}"
            f"_conv{self.conversation}"
        )

    @property
    def params(self):
        """Agent settings as a plain dictionary."""
        return {
            "agent_type": "react",
            "llm": self.llm,
            "seed": self.seed,
            "context_limit": self.context_limit,
            "act_temp": self.act_temp,
            "cot_temp": self.cot_temp,
            "cot_max_tokens": self.cot_max_tokens,
            "conversation": self.conversation,
        }

    # Retried (up to 100 attempts, randomized exponential wait capped by
    # max=40) while `is_recoverable_error` accepts the raised exception.
    @retry(
        retry=retry_if_exception(is_recoverable_error),
        wait=wait_random_exponential(multiplier=1, max=40),
        stop=stop_after_attempt(100),
    )
    def _llm_call_from_conversation(self, conversation, *args, **kwargs):
        """Prompt ``conversation`` and return the fully computed response."""
        response = conversation.prompt(*args, **kwargs)
        response.duration_ms()  # Forces the response to be computed.
        return response

    def _llm_call_from_messages(self, messages, *args, **kwargs):
        """Query the model with ``messages``.

        The last message is sent as the prompt; the first one is sent as
        the system prompt only when the model allows one.
        """
        conversation = messages2conversation(self.model, messages)
        prompt = messages[-1]["content"]
        system = messages[0]["content"] if self.allows_system_prompt else None

        return self._llm_call_from_conversation(
            conversation, prompt=prompt, system=system, *args, **kwargs
        )

    def act(self, obs, reward, done, infos):
        """Choose the next action for observation ``obs`` in two LLM calls.

        The first call asks for step-by-step reasoning; the second receives
        that reasoning as a question/answer pair and asks for the action.

        Returns:
            ``(action, stats)`` where ``action`` is the stripped output of the
            second call and ``stats`` holds that call's markdown-formatted
            prompt and raw response plus the token count of both calls.
        """
        question = "// Based on the above information (history), what is the best action to take? Let's think step by step.\n"
        messages = self.build_messages(obs, question, [])
        response = self._llm_call_from_messages(
            messages,
            temperature=self.cot_temp,
            max_tokens=self.cot_max_tokens,
            seed=self.seed,
            stream=False,
        )

        cot_text = response.text()
        answer = cot_text.strip()
        log.debug(colored(question, "cyan"))
        log.debug(colored(answer, "green"))

        # Compute usage statistics for the CoT.
        nb_tokens_cot = self.token_counter(messages=messages, text=cot_text)

        prompt = "// Provide your chosen action on a single line while respecting the desired format.\n> "
        messages = self.build_messages(obs, prompt, [(question, f"{answer}\n")])
        response = self._llm_call_from_messages(
            messages,
            temperature=self.act_temp,
            max_tokens=100,  # Text actions are short phrases.
            seed=self.seed,
            stream=False,
        )

        act_text = response.text()
        action = act_text.strip()
        self.history.append((f"{obs}\n> ", f"{action}\n"))
        log.debug(colored(prompt, "cyan"))

        # Compute usage statistics
        nb_tokens_act = self.token_counter(messages=messages, text=act_text)
        stats = {
            "prompt": format_messages_to_markdown(messages),
            "response": act_text,
            "nb_tokens": nb_tokens_cot + nb_tokens_act,
        }

        return action, stats

    def build_messages(self, observation, question, qa_history):
        """Assemble the chat messages for one LLM call.

        Args:
            observation: Text observation of the current turn.
            question: Instruction appended as the final user message.
            qa_history: ``(question, answer)`` pairs from earlier calls in
                this turn, inserted between the observation and ``question``.

        Returns:
            A list of ``{"role", "content"}`` dicts. It starts with the system
            message when the model allows one; otherwise the system prompt is
            prepended to the first user message.
        """
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        limit = self.context_limit or len(self.history) + 1

        for i, (obs, action) in enumerate(self.history[-limit:]):
            if len(self.history) >= limit and i == 0:
                # Replace the oldest retained observation with a truncation
                # notice (its action is kept).
                # NOTE(review): `>=` also fires when exactly `limit` turns
                # exist, i.e. before any turn has been dropped -- confirm
                # this is intended.
                obs = (
                    f"// History has been truncated to the last {limit} steps.\n...\n> "
                )

            messages.append({"role": "user", "content": obs})
            messages.append({"role": "assistant", "content": action})

        messages.append({"role": "user", "content": observation})

        for q, a in qa_history:
            messages.append({"role": "user", "content": q})
            messages.append({"role": "assistant", "content": a})

        messages.append({"role": "user", "content": question})

        # Merge the current game observation with the question that follows it.
        messages = merge_messages(messages)

        if not self.conversation:
            # Merge all messages into a single message except for the system.
            content = "".join([msg["content"] for msg in messages[1:]])
            messages = messages[:1] + [{"role": "user", "content": content}]

        if not self.allows_system_prompt:
            # Drop the system message and prepend its text to the message that
            # followed it. After pop(0) that message sits at index 0; indexing
            # [1] raised IndexError in single-prompt mode and otherwise hit
            # the assistant turn.
            messages.pop(0)
            messages[0]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[0]['content']}"

        return messages
180 |
181 |
def build_argparser(parser=None):
    """Add the ReAct agent's "LLMAgent settings" options to a parser.

    Args:
        parser: Parser to extend; a new ``ArgumentParser`` is created when
            none is supplied.

    Returns:
        The parser, now accepting ``--llm``, ``--seed``, ``--cot-temp``,
        ``--cot-max-tokens``, ``--act-temp``, ``--context-limit`` and the
        mandatory ``--conversation`` / ``--no-conversation`` switch.
    """
    parser = parser if parser else argparse.ArgumentParser()
    settings = parser.add_argument_group("LLMAgent settings")

    # (flag, add_argument keyword arguments), in declaration order.
    option_specs = (
        (
            "--llm",
            {
                "default": "gpt-4o-mini",
                "help": "LLM to be used for evaluation. Default: %(default)s",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 20241001,
                "help": "Seed for LLM (not all endpoints support this). Default: %(default)s",
            },
        ),
        (
            "--cot-temp",
            {
                "type": float,
                "default": 0.0,
                "help": "Temperature for LLM when doing chain-of-thoughts. Default: %(default)s",
            },
        ),
        (
            "--cot-max-tokens",
            {
                "type": int,
                "default": 1024,
                "help": "Maximum number of token for chain-of-thoughts. Default: %(default)s",
            },
        ),
        (
            "--act-temp",
            {
                "type": float,
                "default": 0.0,
                "help": "Temperature for LLM when taking actions. Default: %(default)s",
            },
        ),
        (
            "--context-limit",
            {
                "type": int,
                "help": "Limit context for LLM (in conversation turns). Default: no limit",
            },
        ),
        (
            "--conversation",
            {
                "required": True,
                "action": argparse.BooleanOptionalAction,
                "help": "Enable conversation mode. Otherwise, use single prompt.",
            },
        ),
    )
    for flag, spec in option_specs:
        settings.add_argument(flag, **spec)

    return parser
228 |
229 |
# Hand ReactAgent and its argument-parser hook to tales.agent.register
# under the name "react".
register(
    name="react",
    klass=ReactAgent,
    add_arguments=build_argparser,
    desc="This agent uses a LLM to decide which action to take by following a CoT/ReAct approach.",
)
238 |
--------------------------------------------------------------------------------
/agents/reasoning.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import llm
4 | import numpy as np
5 | from tenacity import (
6 | retry,
7 | retry_if_exception,
8 | stop_after_attempt,
9 | wait_random_exponential,
10 | )
11 | from termcolor import colored
12 |
13 | import tales
14 | from tales.agent import register
15 | from tales.token import get_token_counter
16 | from tales.utils import (
17 | format_messages_to_markdown,
18 | is_recoverable_error,
19 | merge_messages,
20 | messages2conversation,
21 | )
22 |
# Module-level system prompt for the reasoning agent; the sentences are
# joined with single spaces.
SYSTEM_PROMPT = " ".join(
    [
        "You are playing a text-based game and your goal is to finish it with the highest score.",
        "Upon reading the text observation, provide a *single* short phrase to interact with the game, e.g. `get lamp` (without the backticks).",
        "When stuck, try using the `help` command to see what commands are available.",
    ]
)
28 |
29 | DEEPSEEK_CHAT_TEMPLATE_NO_THINK = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '' in content %}{% set content = content.split('')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool 
%}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>\\n\\n'}}{% endif %}"
30 |
31 |
class ReasoningAgent(tales.Agent):
    """Agent that asks a reasoning LLM for the next text command.

    Each step sends the system prompt, the (optionally truncated) interaction
    history and the current observation to the model, then records the reply
    as the action. For DeepSeek-R1 the reasoning enclosed in
    ``<think>...</think>`` is separated from the action; for
    claude-3.7-sonnet the thinking is read from the response JSON.
    """

    # Markers DeepSeek-R1 puts around its reasoning.
    _THINK_OPEN = "<think>"
    _THINK_CLOSE = "</think>"

    def __init__(self, *args, **kwargs):
        """Configure the agent from keyword arguments.

        Required keys: ``llm``, ``seed``, ``context_limit``, ``act_temp``,
        ``cot_temp``, ``reasoning_effort`` and ``conversation``.
        Optional key: ``key`` (API key). Positional arguments are ignored.
        """
        self.llm = kwargs["llm"]
        self.model = llm.get_model(self.llm)
        self.token_counter = get_token_counter(self.model)
        # These models get the system prompt prepended to the first message
        # instead (see build_messages).
        self.allows_system_prompt = self.llm not in [
            "o1",
            "o1-mini",
            "o1-preview",
            "o3-mini",
        ]

        # Provide the API key, if one is needed and has been provided
        self.model.key = llm.get_key(
            kwargs.get("key"), kwargs["llm"], self.model.key_env_var
        ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var)

        self.seed = kwargs["seed"]
        self.rng = np.random.RandomState(self.seed)

        # List of (observation prompt, action) string pairs, one per past step.
        self.history = []
        self.context_limit = kwargs["context_limit"]
        if self.context_limit is not None:
            assert self.context_limit > 0, "--context-limit must be greater than 0."

        self.act_temp = kwargs["act_temp"]
        self.cot_temp = kwargs["cot_temp"]
        # Either an int (token budget, from --cot-max-tokens) or a value passed
        # through as the API's `reasoning_effort` (from --reasoning-effort).
        self.reasoning_effort = kwargs["reasoning_effort"]
        self.conversation = kwargs["conversation"]

    @property
    def uid(self):
        """Identifier string that encodes every setting of this agent."""
        return (
            f"ReasoningAgent_{self.llm}"
            f"_s{self.seed}"
            f"_c{self.context_limit}"
            f"_conv{self.conversation}"
            f"_actT{self.act_temp}"
            f"_cotT{self.cot_temp}"
            f"_effort{self.reasoning_effort}"
        )

    @property
    def params(self):
        """Dictionary of the agent's settings."""
        return {
            # NOTE(review): reports "react" although this is the reasoning agent;
            # looks like a copy-paste leftover. Left unchanged in case downstream
            # result aggregation relies on it — confirm before renaming.
            "agent_type": "react",
            "llm": self.llm,
            "seed": self.seed,
            "context_limit": self.context_limit,
            "conversation": self.conversation,
            "act_temp": self.act_temp,
            "cot_temp": self.cot_temp,
            "reasoning_effort": self.reasoning_effort,
        }

    @retry(
        retry=retry_if_exception(is_recoverable_error),
        wait=wait_random_exponential(multiplier=1, max=40),
        stop=stop_after_attempt(100),
    )
    def _llm_call_from_conversation(self, conversation, *args, **kwargs):
        """Prompt ``conversation`` and return the computed response.

        Retried on errors accepted by ``is_recoverable_error``, with randomized
        exponential waits, for at most 100 attempts.
        """
        response = conversation.prompt(*args, **kwargs)
        response.duration_ms()  # Forces the response to be computed.
        return response

    def _llm_call_from_messages(self, messages, *args, **kwargs):
        """Turn ``messages`` into a conversation and prompt the model.

        The last message's content is used as the prompt. The first message's
        content is passed as the system prompt when the model allows one.
        """
        conversation = messages2conversation(self.model, messages)
        prompt = messages[-1]["content"]
        system = messages[0]["content"] if self.allows_system_prompt else None

        return self._llm_call_from_conversation(
            conversation, prompt=prompt, system=system, *args, **kwargs
        )

    def act(self, obs, reward, done, infos):
        """Choose the next action for observation ``obs``.

        ``reward``, ``done`` and ``infos`` are accepted but not used.

        Returns:
            A tuple ``(action, stats)`` where ``stats`` holds the rendered
            prompt, the extracted thinking (or ``None``), the raw response
            text and a token count.
        """
        llm_kwargs = {
            "temperature": self.cot_temp,
            "seed": self.seed,
            "stream": True,  # Should prevent openai.APITimeoutError
        }
        if isinstance(self.reasoning_effort, int):
            if self.llm in ["claude-3.7-sonnet"]:
                llm_kwargs["thinking_budget"] = self.reasoning_effort
            else:
                llm_kwargs["max_tokens"] = self.reasoning_effort

        elif self.llm in ["o1", "o1-preview", "o3-mini"]:
            llm_kwargs["reasoning_effort"] = self.reasoning_effort

        if self.llm in ["o1", "o1-mini", "o1-preview", "o3-mini", "claude-3.7-sonnet"]:
            # For these models, we cannot set the temperature.
            llm_kwargs.pop("temperature", None)

        if self.llm in ["o3-mini"]:
            llm_kwargs.pop("stream", None)

        if self.llm in ["claude-3.7-sonnet"]:
            llm_kwargs["thinking"] = 1
            llm_kwargs.pop("seed", None)

        if "gemini" in self.llm or "gemma" in self.llm:
            # For these models, we cannot set the seed and max_tokens has a different name.
            # NOTE(review): only the seed is removed here; the max_tokens rename
            # is not handled in this method — confirm it happens elsewhere.
            llm_kwargs.pop("seed", None)

        messages = self.build_messages(f"{obs}\n> ")
        response = self._llm_call_from_messages(messages, **llm_kwargs)
        response_text = response.text()

        action = response_text.strip()

        thinking = None
        if "DeepSeek-R1" in self.llm:
            # Split the reply at the end of the <think> ... </think> block.
            reasoning_end = action.find(self._THINK_CLOSE)
            if reasoning_end == -1:
                # The reasoning was never closed (e.g. cut off by max_tokens).
                # Send another request to get the action with the current reasoning.
                messages.append(
                    {
                        "role": "assistant",
                        "content": (
                            f"{self._THINK_OPEN}\n"
                            + response_text.strip()
                            + f"\n{self._THINK_CLOSE}"
                        ),
                    }
                )
                # prompt = "// Thinking exceeded the length limit. Based on the thoughts so far, provide your chosen action on a single line while respecting the desired format.\n> "
                # messages.append({"role": "user", "content": prompt})

                # Text actions should be short phrases but deepseek forces a
                # thought process by starting the generation with <think>.
                llm_kwargs["max_tokens"] = 100
                llm_kwargs["temperature"] = self.act_temp
                llm_kwargs["extra_body"] = {
                    "chat_template": DEEPSEEK_CHAT_TEMPLATE_NO_THINK,
                }
                response = self._llm_call_from_messages(messages, **llm_kwargs)
                followup_text = response.text()
                response_text += "\n" + followup_text
                action = followup_text.strip()
                reasoning_end = action.find(self._THINK_CLOSE)
                if reasoning_end == -1:
                    # Give up and use the entire response as the action.
                    reasoning_end = 0
                else:
                    reasoning_end += len(self._THINK_CLOSE)
            else:
                reasoning_end += len(self._THINK_CLOSE)

            # Extract the reasoning part from the response.
            thinking = action[:reasoning_end].strip()
            # Extract the action part from the response.
            action = action[reasoning_end:].strip()

        elif self.llm in ["claude-3.7-sonnet"]:
            # Extract the thinking part from the response JSON.
            thinking = "".join(
                [item.get("thinking", "") for item in response.json()["content"]]
            )

        self.history.append((f"{obs}\n> ", f"{action}\n"))

        # Compute usage statistics
        stats = {
            "prompt": format_messages_to_markdown(messages),
            "thinking": thinking,
            "response": response_text,
            "nb_tokens": self.token_counter(messages=messages, text=response_text),
        }

        if thinking is not None:
            stats["nb_tokens"] += self.token_counter(text=thinking)

        return action, stats

    def build_messages(self, observation):
        """Assemble the chat messages to send for the current ``observation``.

        The result starts with the system prompt, followed by at most
        ``context_limit`` past (observation, action) turns and finally the
        current observation. With ``conversation`` disabled, all non-system
        messages are concatenated into one user message. For models without
        system-prompt support, the system prompt is prepended to the first
        remaining message.
        """
        messages = [{"role": "system", "content": SYSTEM_PROMPT}]
        # Without a context limit, `limit` exceeds the history length so the
        # truncation notice below is never triggered.
        limit = self.context_limit or len(self.history) + 1
        truncated = len(self.history) >= limit

        for i, (obs, action) in enumerate(self.history[-limit:]):
            if truncated and i == 0:
                # Replace the oldest kept observation with a truncation notice.
                obs = (
                    f"// History has been truncated to the last {limit} steps.\n...\n> "
                )

            messages.append({"role": "user", "content": obs})
            messages.append({"role": "assistant", "content": action})

        messages.append({"role": "user", "content": observation})

        # Just in case, let's avoid having multiple messages from the same role.
        messages = merge_messages(messages)

        if not self.conversation:
            # Merge all messages into a single message except for the system.
            content = "".join([msg["content"] for msg in messages[1:]])
            messages = messages[:1] + [{"role": "user", "content": content}]

        if not self.allows_system_prompt:
            # Make sure the system prompt is added to the following message.
            messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}"
            messages.pop(0)

        return messages
233 |
234 |
def build_argparser(parser=None):
    """Add the reasoning agent's command-line options to ``parser``.

    A new ``argparse.ArgumentParser`` is created when ``parser`` is omitted.
    ``--reasoning-effort`` and ``--cot-max-tokens`` are mutually exclusive,
    one of them is required, and both store into ``reasoning_effort``.
    ``--conversation`` / ``--no-conversation`` is required as well.

    Returns:
        The parser that received the options.
    """
    if parser is None:
        parser = argparse.ArgumentParser()

    settings = parser.add_argument_group("LLMAgent settings")

    settings.add_argument(
        "--llm",
        default="gpt-4o-mini",
        help="LLM to be used for evaluation. Default: %(default)s",
    )
    settings.add_argument(
        "--seed",
        type=int,
        default=20241001,
        help="Seed for LLM (not all endpoints support this). Default: %(default)s",
    )

    # The two temperature options differ only in flag name and help wording.
    for flag, purpose in (
        ("--act-temp", "taking actions"),
        ("--cot-temp", "doing chain-of-thoughts"),
    ):
        settings.add_argument(
            flag,
            type=float,
            default=0.0,
            help=f"Temperature for LLM when {purpose}. Default: %(default)s",
        )

    # Exactly one way of bounding the reasoning must be chosen.
    effort = settings.add_mutually_exclusive_group(required=True)
    effort.add_argument(
        "--reasoning-effort",
        default="medium",
        dest="reasoning_effort",
        help="Reasoning effort for reasoning-type LLMs.",
    )
    effort.add_argument(
        "--cot-max-tokens",
        type=int,
        default=1024,
        dest="reasoning_effort",
        help="Maximum number of token for chain-of-thoughts. Default: %(default)s",
    )

    settings.add_argument(
        "--context-limit",
        type=int,
        help="Limit context for LLM (in conversation turns). Default: no limit",
    )
    settings.add_argument(
        "--conversation",
        required=True,
        action=argparse.BooleanOptionalAction,
        help="Enable conversation mode. Otherwise, use single prompt.",
    )

    return parser
289 |
290 |
# Register ReasoningAgent under the name "reasoning", with build_argparser
# supplying its command-line options.
register(
    name="reasoning",
    desc=(
        "This agent uses reasoning LLM (o1/o3, deepseek-r1, etc.) to do CoT/thinking followed deciding which action to take."
    ),
    klass=ReasoningAgent,
    add_arguments=build_argparser,
)
299 |
--------------------------------------------------------------------------------
/agents/walkthrough.py:
--------------------------------------------------------------------------------
1 | import argparse
2 |
3 | import tales
4 | from tales.agent import register
5 | from tales.token import get_token_counter
6 |
7 |
class WalkthroughAgent(tales.Agent):
    """Agent that replays the walkthrough supplied by the environment."""

    def __init__(self, **kwargs):
        """Create the agent; keyword arguments are accepted but ignored."""
        self.token_counter = get_token_counter()
        # Remaining commands in reverse order; filled in by the first reset().
        self.walkthrough = None

    @property
    def uid(self):
        """Identifier of this agent (it has no configurable settings)."""
        return "WalkthroughAgent"

    @property
    def params(self):
        """This agent has no parameters."""
        return {}

    def reset(self, obs, info, env_name):
        """Load the walkthrough from ``info["extra.walkthrough"]``.

        The walkthrough is only loaded the first time; later calls keep the
        commands that are still pending.

        Raises:
            ValueError: if ``info`` carries no walkthrough.
        """
        if self.walkthrough is not None:
            return

        walkthrough = info.get("extra.walkthrough")
        if walkthrough is None:
            raise ValueError(
                f"No walkthrough available for environment {env_name!r}: "
                "'extra.walkthrough' is missing from info."
            )

        # Store the walkthrough in reverse order so we can pop from it.
        self.walkthrough = walkthrough[::-1]

    def act(self, obs, reward, done, info):
        """Return the next walkthrough command, or "QUIT" once it is exhausted.

        Returns:
            A tuple ``(action, stats)``; ``stats`` has no prompt or response
            and counts only the tokens of ``obs``.
        """
        stats = {
            "prompt": None,
            "response": None,
            "nb_tokens": self.token_counter(text=obs),
        }

        if len(self.walkthrough) == 0:
            return "QUIT", stats

        return self.walkthrough.pop(), stats
37 |
38 |
def build_argparser(parser=None):
    """Return ``parser`` unchanged, or a fresh ArgumentParser when none is given.

    The walkthrough agent defines no command-line options of its own.
    """
    if parser is None:
        parser = argparse.ArgumentParser()
    return parser
41 |
42 |
# Register WalkthroughAgent under the name "walkthrough"; build_argparser adds
# no extra command-line options for it.
register(
    name="walkthrough",
    desc=("This agent will follow the walkthrough provided by the environment."),
    klass=WalkthroughAgent,
    add_arguments=build_argparser,
)
49 |
--------------------------------------------------------------------------------
/docs/website/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 |
3 | gem "github-pages", group: :jekyll_plugins
--------------------------------------------------------------------------------
/docs/website/_config.yml:
--------------------------------------------------------------------------------
1 | remote_theme: pages-themes/cayman@v0.2.0
2 | plugins:
3 | - jekyll-remote-theme # add this line to the plugins list if you already have one
4 |
5 | title: "the Text Adventure Learning Environment Suite"
6 |
7 | # description: "A Text-games Benchmark"
--------------------------------------------------------------------------------
/docs/website/_includes/footer.html:
--------------------------------------------------------------------------------
1 |
16 |
17 |
--------------------------------------------------------------------------------
/docs/website/_includes/head-custom.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
--------------------------------------------------------------------------------
/docs/website/_includes/simple_table.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Rank |
6 | Model |
7 | Organization |
8 | Model Type |
9 | TALES Score |
10 |
11 |
12 |
13 |
14 | 1 |
15 | claude-3.7-sonnet |
16 | Anthropic |
17 | Reasoning |
18 | 52.5% |
19 |
20 |
21 | 2 |
22 | claude-3.5-sonnet-latest |
23 | Anthropic |
24 | Non-reasoning |
25 | 50.4% |
26 |
27 |
28 | 3 |
29 | gemini-2.5-pro-preview* |
30 | Google |
31 | Non-reasoning |
32 | 48.8% |
33 |
34 |
35 | 4 |
36 | o1 |
37 | OpenAI |
38 | Reasoning |
39 | 44.2% |
40 |
41 |
42 | 5 |
43 | gpt-4o |
44 | OpenAI |
45 | Non-reasoning |
46 | 40.6% |
47 |
48 |
49 | 6 |
50 | claude-3.5-haiku |
51 | Anthropic |
52 | Non-reasoning |
53 | 39.6% |
54 |
55 |
56 | 7 |
57 | Llama-3.1-405B-Instruct |
58 | Meta |
59 | Non-reasoning |
60 | 36.4% |
61 |
62 |
63 | 8 |
64 | gemini-2.0-flash |
65 | Google |
66 | Non-reasoning |
67 | 35.0% |
68 |
69 |
70 | 9 |
71 | Llama-3.3-70B-Instruct |
72 | Meta |
73 | Non-reasoning |
74 | 32.8% |
75 |
76 |
77 | 10 |
78 | Llama-3.1-70B-Instruct |
79 | Meta |
80 | Non-reasoning |
81 | 32.0% |
82 |
83 |
84 | 11 |
85 | Qwen2.5-72B-Instruct |
86 | Alibaba |
87 | Non-reasoning |
88 | 30.7% |
89 |
90 |
91 | 12 |
92 | Mistral-Large-Instruct-2407 |
93 | Mistral AI |
94 | Non-reasoning |
95 | 30.3% |
96 |
97 |
98 | 13 |
99 | gpt-4o-mini |
100 | OpenAI |
101 | Non-reasoning |
102 | 21.8% |
103 |
104 |
105 | 14 |
106 | Llama-4-Scout-17B-16E-Instruct |
107 | Meta |
108 | Non-reasoning |
109 | 19.8% |
110 |
111 |
112 | 15 |
113 | Llama-4-Maverick-17B-128E-Instruct |
114 | Meta |
115 | Non-reasoning |
116 | 15.5% |
117 |
118 |
119 | 16 |
120 | Mistral-Small-Instruct-2409 |
121 | Mistral AI |
122 | Non-reasoning |
123 | 14.8% |
124 |
125 |
126 | 17 |
127 | Llama-3.1-8B-Instruct |
128 | Meta |
129 | Non-reasoning |
130 | 13.9% |
131 |
132 |
133 | 18 |
134 | DeepSeek-R1 |
135 | DeepSeek AI |
136 | Reasoning |
137 | 12.4% |
138 |
139 |
140 | 19 |
141 | Qwen2.5-7B-Instruct |
142 | Alibaba |
143 | Non-reasoning |
144 | 11.7% |
145 |
146 |
147 | 20 |
148 | Llama-3.2-3B-Instruct |
149 | Meta |
150 | Non-reasoning |
151 | 10.4% |
152 |
153 |
154 | 21 |
155 | phi-4 |
156 | Microsoft |
157 | Non-reasoning |
158 | 10.3% |
159 |
160 |
161 | 22 |
162 | Mistral-Small-24B-Instruct-2501 |
163 | Mistral AI |
164 | Non-reasoning |
165 | 8.8% |
166 |
167 |
168 | 23 |
169 | DeepSeek-R1-Distill-Llama-70B |
170 | DeepSeek AI |
171 | Reasoning |
172 | 8.4% |
173 |
174 |
175 | 24 |
176 | Ministral-8B-Instruct-2410 |
177 | Mistral AI |
178 | Non-reasoning |
179 | 4.6% |
180 |
181 |
182 | 25 |
183 | Mistral-Small-3.1-24B-Instruct-2503 |
184 | Mistral AI |
185 | Non-reasoning |
186 | 4.5% |
187 |
188 |
189 | 26 |
190 | Mixtral-8x22B-Instruct-v0.1 |
191 | Mistral AI |
192 | Non-reasoning |
193 | 3.7% |
194 |
195 |
196 | 27 |
197 | Llama-3.2-1B-Instruct |
198 | Meta |
199 | Non-reasoning |
200 | 3.3% |
201 |
202 |
203 | 28 |
204 | Phi-3-mini-128k-instruct |
205 | Microsoft |
206 | Non-reasoning |
207 | 2.2% |
208 |
209 |
210 | 29 |
211 | Phi-3.5-MoE-instruct |
212 | Microsoft |
213 | Non-reasoning |
214 | 1.7% |
215 |
216 |
217 | 30 |
218 | Phi-4-mini-instruct |
219 | Microsoft |
220 | Non-reasoning |
221 | 1.5% |
222 |
223 |
224 | 31 |
225 | Mixtral-8x7B-Instruct-v0.1 |
226 | Mistral AI |
227 | Non-reasoning |
228 | 1.3% |
229 |
230 |
231 | 32 |
232 | Phi-3.5-mini-instruct |
233 | Microsoft |
234 | Non-reasoning |
235 | 1.0% |
236 |
237 |
238 | 33 |
239 | Phi-3-medium-128k-instruct |
240 | Microsoft |
241 | Non-reasoning |
242 | 0.7% |
243 |
244 |
245 |
246 |
247 |
--------------------------------------------------------------------------------
/docs/website/_includes/table.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | Model |
6 | Textworld |
7 | Textworld Express |
8 | Alfworld |
9 | Scienceworld |
10 | Jericho |
11 | Overall |
12 |
13 |
14 |
15 |
16 | claude-3.7-sonnet |
17 | 97.3% |
18 | 91.3% |
19 | 83.3% |
20 | 76.5% |
21 | 12.5% |
22 | 52.5% |
23 |
24 |
25 | claude-3.5-sonnet-latest |
26 | 95.5% |
27 | 81.6% |
28 | 75.0% |
29 | 82.3% |
30 | 9.6% |
31 | 50.4% |
32 |
33 |
34 | gemini-2.5-pro-preview* |
35 | 98.5% |
36 | 91.8% |
37 | 75.0% |
38 | 64.2% |
39 | 12.4% |
40 | 48.8% |
41 |
42 |
43 | o1 |
44 | 97.8% |
45 | 70.2% |
46 | 28.3% |
47 | 80.1% |
48 | 10.3% |
49 | 44.2% |
50 |
51 |
52 | gpt-4o |
53 | 83.6% |
54 | 80.6% |
55 | 56.7% |
56 | 61.4% |
57 | 5.6% |
58 | 40.6% |
59 |
60 |
61 | claude-3.5-haiku |
62 | 94.9% |
63 | 79.8% |
64 | 26.7% |
65 | 67.3% |
66 | 5.0% |
67 | 39.6% |
68 |
69 |
70 | Llama-3.1-405B-Instruct |
71 | 90.9% |
72 | 79.2% |
73 | 31.7% |
74 | 51.8% |
75 | 6.1% |
76 | 36.4% |
77 |
78 |
79 | gemini-2.0-flash |
80 | 80.8% |
81 | 76.1% |
82 | 20.0% |
83 | 57.1% |
84 | 5.4% |
85 | 35.0% |
86 |
87 |
88 | Llama-3.3-70B-Instruct |
89 | 69.6% |
90 | 77.2% |
91 | 15.0% |
92 | 55.1% |
93 | 4.5% |
94 | 32.8% |
95 |
96 |
97 | Llama-3.1-70B-Instruct |
98 | 65.6% |
99 | 81.9% |
100 | 8.3% |
101 | 51.9% |
102 | 5.3% |
103 | 32.0% |
104 |
105 |
106 | Qwen2.5-72B-Instruct |
107 | 76.5% |
108 | 83.8% |
109 | 36.7% |
110 | 35.0% |
111 | 2.9% |
112 | 30.7% |
113 |
114 |
115 | Mistral-Large-Instruct-2407 |
116 | 82.4% |
117 | 68.3% |
118 | 6.7% |
119 | 46.1% |
120 | 5.8% |
121 | 30.3% |
122 |
123 |
124 | gpt-4o-mini |
125 | 56.5% |
126 | 73.6% |
127 | 0.0% |
128 | 27.2% |
129 | 1.8% |
130 | 21.8% |
131 |
132 |
133 | Llama-4-Scout-17B-16E-Instruct |
134 | 41.1% |
135 | 68.4% |
136 | 0.0% |
137 | 27.0% |
138 | 1.8% |
139 | 19.8% |
140 |
141 |
142 | Llama-4-Maverick-17B-128E-Instruct |
143 | 43.5% |
144 | 56.1% |
145 | 8.3% |
146 | 11.5% |
147 | 2.0% |
148 | 15.5% |
149 |
150 |
151 | Mistral-Small-Instruct-2409 |
152 | 56.1% |
153 | 27.3% |
154 | 0.0% |
155 | 24.4% |
156 | 1.4% |
157 | 14.8% |
158 |
159 |
160 | Llama-3.1-8B-Instruct |
161 | 29.7% |
162 | 50.3% |
163 | 0.0% |
164 | 15.7% |
165 | 2.3% |
166 | 13.9% |
167 |
168 |
169 | DeepSeek-R1 |
170 | 37.1% |
171 | 38.6% |
172 | 0.0% |
173 | 15.8% |
174 | 1.0% |
175 | 12.4% |
176 |
177 |
178 | Qwen2.5-7B-Instruct |
179 | 27.7% |
180 | 45.6% |
181 | 0.0% |
182 | 12.6% |
183 | 0.7% |
184 | 11.7% |
185 |
186 |
187 | Llama-3.2-3B-Instruct |
188 | 21.4% |
189 | 42.0% |
190 | 0.0% |
191 | 10.0% |
192 | 1.5% |
193 | 10.4% |
194 |
195 |
196 | phi-4 |
197 | 20.8% |
198 | 43.8% |
199 | 0.0% |
200 | 8.9% |
201 | 1.6% |
202 | 10.3% |
203 |
204 |
205 | Mistral-Small-24B-Instruct-2501 |
206 | 15.8% |
207 | 23.0% |
208 | 0.0% |
209 | 15.8% |
210 | 1.4% |
211 | 8.8% |
212 |
213 |
214 | DeepSeek-R1-Distill-Llama-70B |
215 | 8.7% |
216 | 39.8% |
217 | 0.0% |
218 | 7.7% |
219 | 1.3% |
220 | 8.4% |
221 |
222 |
223 | Ministral-8B-Instruct-2410 |
224 | 10.9% |
225 | 22.8% |
226 | 0.0% |
227 | 2.3% |
228 | 0.4% |
229 | 4.6% |
230 |
231 |
232 | Mistral-Small-3.1-24B-Instruct-2503 |
233 | 2.5% |
234 | 10.3% |
235 | 0.0% |
236 | 10.5% |
237 | 0.8% |
238 | 4.5% |
239 |
240 |
241 | Mixtral-8x22B-Instruct-v0.1 |
242 | 17.1% |
243 | 8.4% |
244 | 0.0% |
245 | 4.0% |
246 | 0.4% |
247 | 3.7% |
248 |
249 |
250 | Llama-3.2-1B-Instruct |
251 | 0.0% |
252 | 19.0% |
253 | 0.0% |
254 | 2.4% |
255 | 0.6% |
256 | 3.3% |
257 |
258 |
259 | Phi-3-mini-128k-instruct |
260 | 2.7% |
261 | 9.4% |
262 | 0.0% |
263 | 2.4% |
264 | 0.3% |
265 | 2.2% |
266 |
267 |
268 | Phi-3.5-MoE-instruct |
269 | 0.0% |
270 | 7.0% |
271 | 0.0% |
272 | 2.3% |
273 | 0.4% |
274 | 1.7% |
275 |
276 |
277 | Phi-4-mini-instruct |
278 | 0.0% |
279 | 5.5% |
280 | 0.0% |
281 | 2.3% |
282 | 0.5% |
283 | 1.5% |
284 |
285 |
286 | Mixtral-8x7B-Instruct-v0.1 |
287 | 0.0% |
288 | 1.6% |
289 | 0.0% |
290 | 4.0% |
291 | 0.3% |
292 | 1.3% |
293 |
294 |
295 | Phi-3.5-mini-instruct |
296 | 0.0% |
297 | 2.0% |
298 | 0.0% |
299 | 2.4% |
300 | 0.5% |
301 | 1.0% |
302 |
303 |
304 | Phi-3-medium-128k-instruct |
305 | 0.0% |
306 | 0.0% |
307 | 0.0% |
308 | 2.3% |
309 | 0.3% |
310 | 0.7% |
311 |
312 |
313 |
314 |
315 |
--------------------------------------------------------------------------------
/docs/website/_includes/test.md:
--------------------------------------------------------------------------------
1 | This is the string you want to save.
--------------------------------------------------------------------------------
/docs/website/_layouts/default.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 | {% seo %}
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 | {% include head-custom.html %}
20 |
21 |
22 |
23 |
24 |
25 | Skip to the content.
26 |
27 |
40 |
41 |
42 | {{ content }}
43 |
44 | {%- include footer.html -%}
45 |
46 |
47 |
48 |
49 |
79 |
80 |
81 |
--------------------------------------------------------------------------------
/docs/website/assets/css/custom.css:
--------------------------------------------------------------------------------
1 | .site-footer-credits {
2 | color: #67747a;
3 | }
4 |
--------------------------------------------------------------------------------
/docs/website/assets/css/style.scss:
--------------------------------------------------------------------------------
1 | ---
2 | ---
3 |
4 | @import "{{ site.theme }}";
5 |
6 | /* Tabs styling */
7 | .tab-container {
8 | width: 100%;
9 | margin-top: 20px;
10 | }
11 |
12 | .tabs, .nested-tabs {
13 | display: flex; /* Add flexbox display */
14 | overflow: hidden;
15 | border: 1px solid #ccc;
16 | background-color: #f1f1f1;
17 | border-radius: 4px 4px 0 0;
18 | width: 100%; /* Ensure full width */
19 | }
20 |
21 | .nested-tabs {
22 | margin-bottom: 15px;
23 | }
24 |
25 | /* Shared styles for both main and nested tab buttons */
26 | .tab-button, .nested-tab-button {
27 | background-color: inherit;
28 | border: none;
29 | outline: none;
30 | cursor: pointer;
31 | transition: 0.3s;
32 | flex: 1; /* Make tabs grow evenly to fill space */
33 | text-align: center; /* Center text in tabs */
34 | }
35 |
36 | /* Main tab button specific styles */
37 | .tab-button {
38 | padding: 14px 16px;
39 | font-size: 16px;
40 | }
41 |
42 | /* Nested tab button specific styles */
43 | .nested-tab-button {
44 | padding: 10px 12px;
45 | font-size: 14px;
46 | }
47 |
48 | .tab-button:hover, .nested-tab-button:hover {
49 | background-color: #ddd;
50 | }
51 |
52 | .tab-button.active {
53 | background-color: #157878;
54 | color: white;
55 | }
56 |
57 | .nested-tab-button.active {
58 | background-color: #1a9a9a; /* Slightly different color to distinguish */
59 | color: white;
60 | }
61 |
62 | .tab-content {
63 | display: none;
64 | padding: 20px;
65 | border: 1px solid #ccc;
66 | border-top: none;
67 | border-radius: 0 0 4px 4px;
68 | width: 100%; /* Ensure content takes full width */
69 | }
70 |
71 | .tab-content.active {
72 | display: block;
73 | }
74 |
75 | .nested-tab-content {
76 | display: none;
77 | padding: 10px 0;
78 | border-top: none;
79 | width: 100%; /* Ensure nested content takes full width */
80 | }
81 |
82 | .nested-tab-content.active {
83 | display: block;
84 | }
85 |
86 | #main-description {
87 | font-weight: normal;
88 | font-style: normal;
89 | }
90 |
91 | /* If needed, control other text properties */
92 | #main-description {
93 | font-size: 16px;
94 | line-height: 1.5;
95 | color: #333;
96 | }
97 |
98 | .author-tagline {
99 | text-align: center;
100 | font-style: italic;
101 | color: #666;
102 | margin-bottom: 20px;
103 | }
104 |
105 | .abstract-container {
106 | background-color: #f5f5f5;
107 | border-radius: 8px;
108 | padding: 20px;
109 | margin: 20px 0;
110 | border-left: 4px solid #ddd;
111 | }
112 |
113 | .abstract-container h3 {
114 | margin-top: 0;
115 | color: #333;
116 | }
117 |
118 | .abstract-container p {
119 | margin-bottom: 0;
120 | line-height: 1.6;
121 | }
122 |
123 | .abstract-tagline {
124 | text-align: center;
125 | font-weight: bold;
126 | color: #666;
127 | margin-bottom: 20px;
128 | }
129 |
130 | /* Table styling */
131 | // .table-container {
132 | // overflow-x: auto;
133 | // margin: 20px 0;
134 | // }
135 |
136 | .model-scores {
137 | width: 100%;
138 | border-collapse: collapse;
139 | font-size: 14px;
140 | }
141 |
142 | .model-scores th {
143 | background-color: #157878;
144 | color: white;
145 | text-align: left;
146 | padding: 10px;
147 | position: sticky;
148 | top: 0;
149 | }
150 |
151 | .model-scores td {
152 | padding: 8px 10px;
153 | border-bottom: 1px solid #ddd;
154 | }
155 |
156 | .model-scores tr:nth-child(even) {
157 | background-color: #f2f2f2;
158 | }
159 |
160 | .model-scores tr:hover {
161 | background-color: #e8f4f4;
162 | }
163 |
164 | /* Responsive design for mobile */
165 | @media screen and (max-width: 768px) {
166 | .model-scores {
167 | font-size: 12px;
168 | }
169 |
170 | .model-scores th, .model-scores td {
171 | padding: 6px;
172 | }
173 | }
174 |
175 | .environment-container {
176 | background-color: #f5f5f5;
177 | border-radius: 8px;
178 | padding: 20px;
179 | margin: 20px 0;
180 | border-left: 4px solid #157878;
181 | }
182 |
183 | .environment-tagline {
184 | text-align: center;
185 | font-weight: bold;
186 | color: #157878;
187 | margin-bottom: 20px;
188 | }
189 |
190 | .environment-container p:not(.environment-tagline) {
191 | margin-bottom: 0;
192 | line-height: 1.6;
193 | }
194 |
195 | .cite-button {
196 | background: none;
197 | border: none;
198 | color: #157878;
199 | cursor: pointer;
200 | font-size: 0.8em;
201 | padding: 0 5px;
202 | vertical-align: middle;
203 | transition: transform 0.2s;
204 | }
205 |
206 | .cite-button:hover {
207 | transform: scale(1.2);
208 | }
209 |
210 | .modal {
211 | display: none;
212 | position: fixed;
213 | z-index: 1000;
214 | left: 0;
215 | top: 0;
216 | width: 100%;
217 | height: 100%;
218 | overflow: auto;
219 | background-color: rgba(0,0,0,0.4);
220 | }
221 |
222 | /* Modal Content */
223 | .modal-content {
224 | background-color: #fefefe;
225 | margin: 10% auto;
226 | padding: 20px;
227 | border: 1px solid #888;
228 | width: 80%;
229 | max-width: 600px;
230 | border-radius: 8px;
231 | box-shadow: 0 4px 8px rgba(0,0,0,0.2);
232 | }
233 |
234 | /* The Close Button */
235 | .close-modal {
236 | color: #aaa;
237 | float: right;
238 | font-size: 28px;
239 | font-weight: bold;
240 | cursor: pointer;
241 | }
242 |
243 | .close-modal:hover,
244 | .close-modal:focus {
245 | color: black;
246 | text-decoration: none;
247 | }
248 |
249 | .citation-popup {
250 | display: none;
251 | position: absolute; /* Use absolute instead of fixed */
252 | z-index: 1000;
253 | background-color: #fefefe;
254 | border: 1px solid #ddd;
255 | border-radius: 8px;
256 | box-shadow: 0 4px 8px rgba(0,0,0,0.2);
257 | width: 400px;
258 | max-width: 90vw;
259 | padding: 15px;
260 | }
261 |
262 | /* Citation box styling */
263 | .citation-box {
264 | background-color: #f9f9f9;
265 | border: 1px solid #ddd;
266 | border-radius: 4px;
267 | padding: 10px;
268 | margin: 10px 0;
269 | max-height: 200px;
270 | overflow-y: auto;
271 | }
272 |
273 | .citation-box pre {
274 | white-space: pre-wrap;
275 | word-wrap: break-word;
276 | margin: 0;
277 | font-family: monospace;
278 | font-size: 12px;
279 | }
280 |
281 | /* Popup header */
282 | .popup-header {
283 | display: flex;
284 | justify-content: space-between;
285 | align-items: center;
286 | margin-bottom: 10px;
287 | }
288 |
289 | .popup-header h3 {
290 | margin: 0;
291 | font-size: 16px;
292 | }
293 |
294 | .close-popup {
295 | cursor: pointer;
296 | color: #888;
297 | font-size: 18px;
298 | font-weight: bold;
299 | }
300 |
301 | .close-popup:hover {
302 | color: #333;
303 | }
304 |
305 | /* Copy button styling */
306 | .copy-button {
307 | background-color: #157878;
308 | color: white;
309 | border: none;
310 | padding: 8px 16px;
311 | text-align: center;
312 | text-decoration: none;
313 | display: inline-block;
314 | font-size: 14px;
315 | margin: 10px 0 0 0;
316 | cursor: pointer;
317 | border-radius: 4px;
318 | transition: background-color 0.3s;
319 | }
320 |
321 | .copy-button:hover {
322 | background-color: #0b5c5c;
323 | }
324 |
325 | /* Existing button styling */
326 | .cite-button {
327 | background: none;
328 | border: none;
329 | color: #157878;
330 | cursor: pointer;
331 | font-size: 0.8em;
332 | padding: 0 5px;
333 | vertical-align: middle;
334 | transition: transform 0.2s;
335 | position: relative; /* For positioning the popup */
336 | }
337 |
338 | .cite-button:hover {
339 | transform: scale(1.2);
340 | }
341 |
342 | .citation-notice {
343 | text-align: center;
344 | font-style: italic;
345 | color: #555;
346 | margin-bottom: 15px;
347 | font-size: 0.9em;
348 | }
349 |
350 | .environment-image {
351 | display: block;
352 | margin: 0 auto;
353 | max-width: 100%;
354 | height: auto;
355 | }
356 |
357 | .environment-image-container {
358 | text-align: center;
359 | margin: 20px 0;
360 | }
361 |
362 |
363 | .image-caption {
364 | text-align: center;
365 | font-size: 0.85em;
366 | color: #666;
367 | margin-top: 5px;
368 | font-style: italic;
369 | }
370 |
371 | /* Add this to your stylesheet */
372 | .table-container {
373 | width: 100%;
374 | overflow-x: hidden;
375 | }
376 |
377 | /* Force the table to fit within the container */
378 | .responsive-table {
379 | width: 100%;
380 | overflow-x: hidden;
381 | }
382 |
383 | /* Style the model-scores table */
384 | .model-scores {
385 | width: 100%;
386 | table-layout: fixed;
387 | border-collapse: collapse;
388 | font-size: 14px; /* Base font size */
389 | }
390 |
391 | /* Give more space to model names, less to percentages */
392 | .model-scores th:first-child,
393 | .model-scores td:first-child {
394 | width: 16%; /* Prioritize model names */
395 | text-align: left;
396 | font-weight: 500;
397 | padding-right: 8px;
398 | }
399 |
/* Make percentage columns very compact */
.model-scores th:not(:first-child),
.model-scores td:not(:first-child) {
  width: 14%; /* 6 columns x 14% = the 84% left after the 16% first column */
  text-align: center;
  padding-left: 2px;
  padding-right: 2px;
}
408 |
409 | /* Force text wrapping in all cells */
410 | .model-scores th,
411 | .model-scores td {
412 | word-break: break-word;
413 | overflow-wrap: break-word;
414 | white-space: normal;
415 | padding-top: 4px;
416 | padding-bottom: 4px;
417 | }
418 |
419 | /* Progressive font size reduction for different screen sizes */
420 | @media screen and (max-width: 992px) {
421 | .model-scores {
422 | font-size: 13px;
423 | }
424 | }
425 |
426 | @media screen and (max-width: 768px) {
427 | .model-scores {
428 | font-size: 12px;
429 | }
430 |
431 | .model-scores th:first-child,
432 | .model-scores td:first-child {
433 | width: 16%;
434 | }
435 |
436 | .model-scores th:not(:first-child),
437 | .model-scores td:not(:first-child) {
438 | width: 14%;
439 | }
440 | }
441 |
442 | @media screen and (max-width: 576px) {
443 | .model-scores {
444 | font-size: 10px;
445 | }
446 |
447 | .model-scores th:first-child,
448 | .model-scores td:first-child {
449 | width: 12.5%;
450 | }
451 |
452 | .model-scores th:not(:first-child),
453 | .model-scores td:not(:first-child) {
454 | width: 25%;
455 | }
456 | }
457 |
458 | /* For extremely small screens */
459 | @media screen and (max-width: 400px) {
460 | .model-scores {
461 | font-size: 9px;
462 | }
463 | }
464 |
465 | .simplified-table-container {
466 | width: 100%;
467 | overflow-x: hidden;
468 | max-width: 100%;
469 | display: block;
470 | }
471 |
472 |
473 | /* Make sure simplified-table-container is correctly displayed in nested-tab-content */
474 | #tab6-subtab1 .responsive-table.simplified-table-container {
475 | width: 100%;
476 | overflow-x: auto; /* Change from hidden to auto if table might overflow on small screens */
477 | }
478 |
/* Column-specific widths for the simplified table.
   Column names below assume the five columns of _includes/simple_table.md
   (Rank, Model, Organization, Model Type, TALES Score) - confirm against the page. */
.simplified-table-container .model-scores th:nth-child(1),
.simplified-table-container .model-scores td:nth-child(1) {
  width: 10%; /* Rank column - very narrow */
  text-align: center;
}

.simplified-table-container .model-scores th:nth-child(2),
.simplified-table-container .model-scores td:nth-child(2) {
  width: 40%; /* Model name column - give it most space */
  text-align: center;
}

.simplified-table-container .model-scores th:nth-child(3),
.simplified-table-container .model-scores td:nth-child(3) {
  width: 20%; /* Organization column - moderate space */
  text-align: center;
}

.simplified-table-container .model-scores th:nth-child(4),
.simplified-table-container .model-scores td:nth-child(4) {
  width: 20%; /* Model type column - moderate space */
  text-align: center;
}

.simplified-table-container .model-scores th:nth-child(5),
.simplified-table-container .model-scores td:nth-child(5) {
  width: 20%; /* Score column - moderate space */
  text-align: center;
}
509 |
/* Responsive adjustments for simplified table */
@media screen and (max-width: 768px) {
  /* Rank column */
  .simplified-table-container .model-scores th:nth-child(1),
  .simplified-table-container .model-scores td:nth-child(1) {
    width: 10%;
  }

  /* Model name column */
  .simplified-table-container .model-scores th:nth-child(2),
  .simplified-table-container .model-scores td:nth-child(2) {
    width: 40%;
  }

  /*
   * Columns 3-5. The previous declaration was `width: 10;`, which has no
   * unit and is therefore discarded by browsers, and the selector list
   * skipped the 5th header cell.
   * NOTE(review): 10% is assumed to be the intended value - confirm.
   */
  .simplified-table-container .model-scores th:nth-child(3),
  .simplified-table-container .model-scores td:nth-child(3),
  .simplified-table-container .model-scores th:nth-child(4),
  .simplified-table-container .model-scores td:nth-child(4),
  .simplified-table-container .model-scores th:nth-child(5),
  .simplified-table-container .model-scores td:nth-child(5) {
    width: 10%;
  }
}
531 |
532 | .asterisk-note {
533 | font-size: 0.85em;
534 | color: #666;
535 | margin-top: -10px;
536 | margin-bottom: 15px;
537 | font-style: italic;
538 | }
539 |
540 | .video-container {
541 | width: 100%;
542 | margin: 1em 0;
543 | position: relative;
544 | }
545 |
546 | .video-container video {
547 | width: 100%;
548 | display: block;
549 | }
--------------------------------------------------------------------------------
/docs/website/assets/figs/alfworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/alfworld_all_games.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/alfworld_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/alfworld_image.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/all_framework_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/all_framework_scores.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/arxiv-logomark-small.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/website/assets/figs/arxiv-logomark.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/website/assets/figs/figure1_eric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/figure1_eric.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/github-mark.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/website/assets/figs/jericho_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/jericho_all_games.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/jericho_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/jericho_image.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/radar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/radar_chart.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/radar_chart_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/radar_chart_zoom.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/scienceworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/scienceworld_all_games.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/scienceworld_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/scienceworld_image.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/simon_says_chatgpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/simon_says_chatgpt.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/static_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/static_banner.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/text-benchmark_bar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_bar_chart.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/text-benchmark_radar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_radar.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/text-benchmark_radar_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_radar_zoom.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/textworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_all_games.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/textworld_express_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_express_all_games.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/textworld_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_image.png
--------------------------------------------------------------------------------
/docs/website/assets/figs/zork1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/zork1.png
--------------------------------------------------------------------------------
/docs/website/assets/js/tabs.js:
--------------------------------------------------------------------------------
/**
 * Show the top-level tab panel whose id is `tabName` and mark the clicked
 * button as active.
 *
 * @param {Event} evt - Click event; `evt.currentTarget` is the tab button.
 * @param {string} tabName - id of the `.tab-content` element to display.
 */
function openTab(evt, tabName) {
    // Hide every top-level tab panel.
    Array.prototype.forEach.call(
        document.getElementsByClassName("tab-content"),
        function (panel) {
            panel.style.display = "none";
        }
    );

    // Strip the " active" marker from every tab button.
    Array.prototype.forEach.call(
        document.getElementsByClassName("tab-button"),
        function (button) {
            button.className = button.className.replace(" active", "");
        }
    );

    // Reveal the requested panel and flag the button that triggered the call.
    document.getElementById(tabName).style.display = "block";
    evt.currentTarget.className += " active";
}
20 |
// Nested tab functionality
/**
 * Show the nested tab panel whose id is `tabName`, limiting the hide/deactivate
 * pass to the `.tab-content` ancestor of the clicked button.
 *
 * @param {Event} evt - Click event; `evt.currentTarget` is the nested tab button.
 * @param {string} tabName - id of the `.nested-tab-content` element to display.
 */
function openNestedTab(evt, tabName) {
    // Only the enclosing top-level tab is affected.
    var parentTab = evt.currentTarget.closest('.tab-content');

    // Hide all nested panels within that parent tab.
    Array.prototype.forEach.call(
        parentTab.getElementsByClassName("nested-tab-content"),
        function (panel) {
            panel.style.display = "none";
        }
    );

    // Strip the " active" marker from the parent's nested tab buttons.
    Array.prototype.forEach.call(
        parentTab.getElementsByClassName("nested-tab-button"),
        function (button) {
            button.className = button.className.replace(" active", "");
        }
    );

    // Reveal the requested nested panel and flag the clicked button.
    document.getElementById(tabName).style.display = "block";
    evt.currentTarget.className += " active";
}
42 |
43 |
44 | function copyTextToClipboard(elementId, event) {
45 | console.log("Citation button clicked for: " + elementId);
46 |
47 | // Get the citation text
48 | var citationText = document.getElementById(elementId);
49 | if (!citationText) {
50 | console.error("Citation element not found: " + elementId);
51 | return;
52 | }
53 |
54 | // Force create popup if not exists
55 | var popup = document.getElementById('citation-popup');
56 | if (!popup) {
57 | console.log("Creating popup because it doesn't exist yet");
58 | var popupHTML =
59 | '';
69 |
70 | document.body.insertAdjacentHTML('beforeend', popupHTML);
71 | popup = document.getElementById('citation-popup');
72 |
73 | // Set up event handlers for the newly created popup
74 | var closeButton = document.querySelector('.close-popup');
75 | var copyButton = document.getElementById('copy-citation-button');
76 |
77 | if (closeButton) {
78 | closeButton.onclick = function() {
79 | popup.style.display = 'none';
80 | };
81 | }
82 |
83 | if (copyButton) {
84 | copyButton.onclick = function() {
85 | var text = document.getElementById('citation-text').innerText;
86 | navigator.clipboard.writeText(text).then(function() {
87 | copyButton.innerText = 'Copied!';
88 | setTimeout(function() {
89 | copyButton.innerText = 'Copy to Clipboard';
90 | }, 1500);
91 | });
92 | };
93 | }
94 | }
95 |
96 | // Set the citation text in the popup
97 | var citationTextElement = document.getElementById('citation-text');
98 | if (citationTextElement) {
99 | citationTextElement.innerText = citationText.innerText;
100 | }
101 |
102 | // Position the popup near the mouse cursor instead of the button
103 | var x = event.clientX;
104 | var y = event.clientY;
105 |
106 | // Get dimensions
107 | var viewportWidth = window.innerWidth || document.documentElement.clientWidth;
108 | var viewportHeight = window.innerHeight || document.documentElement.clientHeight;
109 | var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
110 | var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
111 |
112 | // Show the popup temporarily to get its dimensions
113 | popup.style.visibility = 'hidden';
114 | popup.style.display = 'block';
115 | var popupWidth = popup.offsetWidth;
116 | var popupHeight = popup.offsetHeight;
117 |
118 | // Calculate position to ensure popup stays in viewport
119 | // Add 10px padding from edges
120 | var padding = 10;
121 |
122 | // Position horizontally
123 | if (x + popupWidth + padding > viewportWidth) {
124 | // If too far right, position to the left of cursor
125 | x = Math.max(padding, x - popupWidth - padding);
126 | } else {
127 | // Otherwise position to the right of cursor with padding
128 | x = x + padding;
129 | }
130 |
131 | // Position vertically
132 | if (y + popupHeight + padding > viewportHeight) {
133 | // If too far down, position above cursor
134 | y = Math.max(padding, y - popupHeight - padding);
135 | } else {
136 | // Otherwise position below cursor with padding
137 | y = y + padding;
138 | }
139 |
140 | // Apply the position (convert from viewport coordinates to document coordinates)
141 | popup.style.left = (x + scrollLeft) + 'px';
142 | popup.style.top = (y + scrollTop) + 'px';
143 |
144 | // Make the popup visible
145 | popup.style.visibility = 'visible';
146 |
147 | // Prevent default action and event bubbling
148 | event.preventDefault();
149 | event.stopPropagation();
150 | }
151 |
152 | // Initialize tabs and set up the citation popup
153 | document.addEventListener('DOMContentLoaded', function() {
154 | // Make sure the first tab and its first nested tab are active by default
155 | var firstTabButton = document.querySelector('.tab-button');
156 | if (firstTabButton) {
157 | firstTabButton.click();
158 | }
159 |
160 | // Create the citation popup HTML if it doesn't exist
161 | if (!document.getElementById('citation-popup')) {
162 | var popupHTML =
163 | '';
173 |
174 | document.body.insertAdjacentHTML('beforeend', popupHTML);
175 |
176 | // Now set up the event handlers for the popup
177 | var popup = document.getElementById('citation-popup');
178 | var closeButton = document.querySelector('.close-popup');
179 | var copyButton = document.getElementById('copy-citation-button');
180 |
181 | // Close popup when clicking the close button
182 | if (closeButton) {
183 | closeButton.onclick = function() {
184 | popup.style.display = 'none';
185 | };
186 | }
187 |
188 | // When the user clicks the copy button
189 | if (copyButton) {
190 | copyButton.onclick = function() {
191 | var text = document.getElementById('citation-text').innerText;
192 | navigator.clipboard.writeText(text).then(function() {
193 | // Change button text temporarily to provide feedback
194 | var originalText = copyButton.innerText;
195 | copyButton.innerText = 'Copied!';
196 | setTimeout(function() {
197 | copyButton.innerText = originalText;
198 | }, 1500);
199 | }).catch(function(err) {
200 | console.error('Could not copy text: ', err);
201 | });
202 | };
203 | }
204 |
205 | // Close popup when clicking outside
206 | document.addEventListener('click', function(event) {
207 | if (popup &&
208 | !popup.contains(event.target) &&
209 | !event.target.classList.contains('cite-button') &&
210 | popup.style.display === 'block') {
211 | popup.style.display = 'none';
212 | }
213 | });
214 | }
215 | });
216 |
217 |
// Once the DOM is ready, tag the first table inside #tab6-subtab1 with the
// `simplified-scores` class. Nothing happens if the container or the table
// is missing.
document.addEventListener('DOMContentLoaded', function() {
    const simplifiedTableContainer = document.getElementById('tab6-subtab1');
    if (!simplifiedTableContainer) {
        return;
    }

    const table = simplifiedTableContainer.querySelector('table');
    if (!table) {
        return;
    }

    table.classList.add('simplified-scores');
});
--------------------------------------------------------------------------------
/docs/website/assets/videos/figure1v4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/videos/figure1v4.mp4
--------------------------------------------------------------------------------
/docs/website/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/favicon.ico
--------------------------------------------------------------------------------
/print_results.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os.path import join as pjoin
5 |
6 | import pandas as pd
7 |
8 |
def parse_args():
    """Parse the command-line arguments of the results printer.

    Returns:
        argparse.Namespace: has a `logs` attribute, a list of one or more
        paths (default: `["logs"]`) in which to look for .jsonl files.
    """
    # fmt: off
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--logs", metavar="path", nargs="+", default=["logs"],
        help="Paths within which to look for .jsonl files.",
    )
    # fmt: on
    return cli.parse_args()
16 |
17 |
def main():
    """Collect every .jsonl log under the requested paths and print summaries.

    Each log file path is expected to end with
    `<prefix>_<agent>/<env_name>/<c><0|1>_<game_seed>.jsonl`:

    * the agent directory name is split on its first underscore and the
      remainder is used as the agent name;
    * the file stem is split on "_" into the admissible-command tag (whose
      second character is parsed as 0/1) and the game seed.

    Two tables are printed: per (agent, admissible_command, env_name) means,
    then per (agent, admissible_command) aggregates. If no log file is found,
    a short message is printed instead of failing inside `groupby`.
    """
    args = parse_args()

    results = []
    for logpath in args.logs:
        for logfile in glob.glob(pjoin(logpath, "**", "*.jsonl"), recursive=True):

            path, _ = os.path.splitext(logfile)
            _, agent, env_name, env_params = path.rsplit(os.path.sep, maxsplit=3)
            admissible_command, game_seed = env_params.split("_")
            admissible_command = bool(int(admissible_command[1]))
            agent = agent.split("_", maxsplit=1)[1]

            data = pd.read_json(logfile, lines=True)

            results.append(
                {
                    "agent": agent,
                    "env_name": env_name,
                    # "env_params": env_params,
                    "admissible_command": admissible_command,
                    "game_seed": game_seed,
                    "total_tokens": data["Token Usage"].sum(),
                    "avg_tokens_per_step": data["Token Usage"].mean(),
                    "norm_score": data["Normalized Score"].max(),
                    "nb_steps": data["Step"].max(),
                    # TODO: add more metrics: duration, nb_resets, nb_wins/losts, nb_invalid_actions, in-game moves
                }
            )

    # An empty record list yields a DataFrame without columns, on which the
    # groupby calls below would raise an opaque KeyError.
    if not results:
        print("No .jsonl log files found in: " + ", ".join(args.logs))
        return

    df = pd.DataFrame.from_records(results)

    group = df.groupby(["agent", "admissible_command", "env_name"])
    columns = ["total_tokens", "avg_tokens_per_step", "norm_score", "nb_steps"]
    print(group[columns].mean())
    print()

    group = df.groupby(["agent", "admissible_command"])
    aggregated_results = group.agg(
        {
            "total_tokens": "sum",
            "avg_tokens_per_step": "mean",
            "norm_score": ["mean", "std"],
            "nb_steps": "mean",
        }
    )
    print(aggregated_results)
64 |
65 |
66 | if __name__ == "__main__":
67 | main()
68 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "tale-suite"
7 | version = "1.0.0rc1"
8 | description = "TALES: Text-Adventure Learning Environment Suite"
9 | readme = "README.md"
10 | requires-python = ">=3.12"
11 | dynamic = ["dependencies"]
12 |
13 | classifiers = [
14 | "Programming Language :: Python :: 3",
15 | "License :: OSI Approved :: MIT License",
16 | "Operating System :: OS Independent",
17 | ]
18 |
19 | [tool.setuptools.dynamic]
20 | dependencies = {file = ["requirements.txt"]}
21 |
22 | [tool.setuptools.packages.find]
23 | exclude = ["wandb/*", "logs/*", "website/*"]
24 |
25 |
26 | [project.optional-dependencies]
27 | dev = [
28 | "pytest",
29 | "pre-commit",
30 | "black",
31 | "isort",
32 | ]
33 |
34 | [project.urls]
35 | "Homepage" = "https://github.com/microsoft/tale-suite"
36 | "Bug Tracker" = "https://github.com/microsoft/tale-suite/issues"
37 |
38 | [tool.black]
39 | line-length = 88
40 |
41 | [tool.isort]
42 | profile = "black"
43 | known_third_party = ["wandb"]
44 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tatsu==5.8.3
2 | gymnasium>=1.0.0
3 | jericho>=3.3.0
4 | textworld[pddl]>=1.6.2rc3
5 | textworld-express>=1.1.0rc5
6 | scienceworld>=1.2.2
7 | discoveryworld
8 |
9 | alfworld>=0.4.0
10 |
11 | termcolor
12 | wandb
13 | numpy
14 | pandas
15 |
16 | # llm>=0.18.0
17 | llm @ git+https://github.com/MarcCote/llm.git@add_extra_body_option
18 | llm-anthropic
19 | llm-gemini
20 | llm-azure-openai @ git+https://github.com/MarcCote/llm-azure-openai.git@generic_ad_auth
21 | anthropic
22 | google-genai
23 | tiktoken
24 | tenacity
25 | transformers
26 |
--------------------------------------------------------------------------------
/scripts/example_script.sh:
--------------------------------------------------------------------------------
#!/usr/bin/env bash
# This is an example script to show how to use a self-hosted model with vllm to run the twb
model=""

# Append the self-hosted model to llm's list of extra OpenAI-compatible models.
cat <<EOL >> .config/io.datasette.llm/extra-openai-models.yaml

- model_id: $model
  model_name: $model
  api_base: "http://127.0.0.1:8002/v1"
EOL

export WANDB_API_KEY=''
# Makes a log folder for vllm (-p: no error if the folder already exists)
mkdir -p logs

# Run the vllm server for the mistralai/Ministral-8B-Instruct-2410 model on port 8002. Make sure you have set your HF token
# The `until ! (...)` loop relaunches the server 120 seconds after every clean exit and stops once it exits with an error.
nohup bash -c 'until ! (python -m vllm.entrypoints.openai.api_server --model mistralai/Ministral-8B-Instruct-2410 --port 8002 --tensor-parallel-size 1 --trust-remote-code --host 0.0.0.0 > logs/vllm_1.log 2>&1); do sleep 120; done' &

# To make sure this doesn't run forever, we give up after $timeout seconds and check every $interval seconds
echo "Waiting for VLLM server to start..."
timeout=500
interval=30
elapsed=0

# Wait loop with timeout
until curl -s -o /dev/null -w "%{http_code}" http://localhost:8002/v1/models | grep -q "200"; do
    if [ "$elapsed" -ge "$timeout" ]; then
        echo "Timeout reached! VLLM server did not start within ${timeout} seconds."
        exit 1
    fi
    sleep "$interval"
    echo "Pinging vllm server..."
    elapsed=$((elapsed + interval))
done

# Send a test request to the API
curl -X POST "http://localhost:8002/v1/completions" -H "Content-Type: application/json" -d '{"model": "mistralai/Ministral-8B-Instruct-2410", "prompt": "You want to play a (text) game?", "max_tokens": 10}'

# Run the text games benchmark with the model we just set up for zork1
wandb login

set -ex

pids=""

for i in {1..5}; do
    # Launch each run in the background so that $! is the PID of this run;
    # in the foreground, $! would still point at the vllm supervisor loop
    # started above and `wait` would block on it instead.
    python benchmark.py --agent agents/llm.py zero-shot --conversation --llm mistralai/Ministral-8B-Instruct-2410 --envs JerichoEnvZork1 --context 100 --nb-steps 100 --seed "20241106$((i))" &
    pids="$pids $!"
    # Stagger the launches by one minute.
    sleep 60
done

# Block until every benchmark run has finished.
wait $pids
--------------------------------------------------------------------------------
/tales/__init__.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import os
3 | import traceback
4 | import warnings
5 | from collections import defaultdict
6 |
7 | from termcolor import colored
8 |
9 | from tales.agent import Agent
10 | from tales.version import __version__
11 |
root_dir = os.path.dirname(os.path.abspath(__file__))
tasks = []
envs = []
envs_per_task = defaultdict(list)

_exclude_path = ["__pycache__", "tests"]

# Task discovery: a sub-directory of this package counts as a task when it is
# not excluded, holds no entry named "skip", and contains an `__init__.py`.
for dirname in os.listdir(root_dir):
    _task_dir = os.path.join(root_dir, dirname)
    if not os.path.isdir(_task_dir) or dirname in _exclude_path:
        continue

    _entries = os.listdir(_task_dir)
    if "skip" not in _entries and "__init__.py" in _entries:
        tasks.append(dirname)


# Environment loading: import each task module and collect the names listed
# in its `environments` attribute. Failures only emit warnings so that one
# broken task does not prevent the others from loading.
for task in tasks:
    try:
        module = importlib.import_module(f".{task}", package="tales")
        environments = getattr(module, "environments", None)
        if not environments:
            warnings.warn(
                "Failed to load `{}.environments`. Skipping the task.".format(task),
                UserWarning,
            )
            continue

        for env_name, version in environments:
            envs.append(env_name)
            envs_per_task[task].append(env_name)

    except Exception as e:
        warnings.warn(
            "Failed to import `{}`. Skipping the task.".format(task), UserWarning
        )
        warnings.warn(colored(f"{e}", "red"), UserWarning)
        warnings.warn(colored(f"{traceback.format_exc()}", "red"), UserWarning)
        continue

# Freeze the defaultdict into a plain dict and build the reverse lookup.
envs_per_task = dict(envs_per_task)
env2task = {
    _env: _task for _task, _task_envs in envs_per_task.items() for _env in _task_envs
}

__all__ = ["Agent", "__version__", "envs", "envs_per_task", "tasks"]
61 |
--------------------------------------------------------------------------------
/tales/agent.py:
--------------------------------------------------------------------------------
class Agent:
    """Base class for agents.

    `reset` is a no-op here; `act`, `uid` and `params` raise
    `NotImplementedError` and must be provided by subclasses.
    """

    def reset(self, obs, info, env):
        """Do nothing and return None; subclasses may override."""
        return None

    def act(self, obs, reward, done, info):
        """Must be overridden by subclasses.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Child class must implement this method.")

    @property
    def uid(self):
        """Unique identifier for this agent.

        Usually a string combining the class name with the values of the
        parameters used to initialize the agent, e.g. built as
        `f"{self.__class__.__name__}_" + "_".join(f"{k}:{v}" for k, v in kwargs.items())`.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Child class must implement this property.")

    @property
    def params(self):
        """Parameters used to initialize the agent.

        Returns:
            dict: Parameters used to initialize the agent.

        Raises:
            NotImplementedError: always, in this base class.
        """
        raise NotImplementedError("Child class must implement this property.")
30 |
31 |
# Registry for available agents to benchmark.
# Maps an agent name to its `(desc, klass, add_arguments)` tuple.
AGENTS = {}


def register(name: str, desc: str, klass: callable, add_arguments: callable) -> None:
    """Register a new type of Agent in `AGENTS`.

    Arguments:
        name:
            Name of the agent (must be unique).
        desc:
            Brief description of how the agent works (for `benchmark.py --help`).
        klass:
            Class used to instantiate the agent.
        add_arguments:
            Function that should add the `argparse` arguments needed for this agent.
            The provided function should expect a `argparse.ArgumentParser` object.

    Raises:
        ValueError: If an agent is already registered under `name`.

    Example:

        >>> from tales.agent import register
        >>> from tales.agents import RandomAgent
        >>> def _add_arguments(parser):
        ...     parser.add_argument("--seed", required=True, type=int,
        ...                         help="Random seed to use.")
        ...
        >>> register(name="random",
        ...          desc="This agent randomly select actions.",
        ...          klass=RandomAgent,
        ...          add_arguments=_add_arguments)
    """
    already_known = name in AGENTS
    if already_known:
        raise ValueError(f"Agent '{name}' already registered.")

    entry = (desc, klass, add_arguments)
    AGENTS[name] = entry
67 |
--------------------------------------------------------------------------------
/tales/alfworld/__init__.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 |
3 | from .alfworld_data import TASK_TYPES, prepare_alfworld_data
4 | from .alfworld_env import ALFWorldTask
5 |
# List of `[env_name, version]` pairs, filled by the registration loop below.
environments = []

# Register one gymnasium environment per (split, task type) pair.
for split in ["seen", "unseen"]:
    for task_type in TASK_TYPES:
        # Snake-case task type -> CamelCase fragment of the environment name.
        task_name = task_type.replace("_", " ").title().replace(" ", "")
        env_name = "ALFWorld" + task_name + split.title()
        environments.append([env_name, "v0"])

        gym.register(
            id="tales/" + env_name + "-v0",
            entry_point="tales.alfworld:ALFWorldTask",
            kwargs=dict(task_type=task_type, split=split),
        )
19 |
20 |
def download():
    """Make the ALFWorld data available by delegating to `prepare_alfworld_data`."""
    prepare_alfworld_data()
23 |
--------------------------------------------------------------------------------
/tales/alfworld/alfworld_data.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import zipfile
4 | from os.path import join as pjoin
5 |
6 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD
7 | from tales.utils import download
8 |
9 | TASK_TYPES = [
10 | "pick_and_place_simple",
11 | "look_at_obj_in_light",
12 | "pick_clean_then_place_in_recep",
13 | "pick_heat_then_place_in_recep",
14 | "pick_cool_then_place_in_recep",
15 | "pick_two_obj_and_place",
16 | ]
17 |
18 | ALFWORLD_DATA_URL = "https://github.com/alfworld/alfworld/releases/download/0.4.2/json_2.1.3_tw-pddl.zip"
19 | TALES_CACHE_ALFWORLD = pjoin(TALES_CACHE_HOME, "alfworld")
20 | TALES_CACHE_ALFWORLD_DATA_ZIP = pjoin(TALES_CACHE_ALFWORLD, "json_2.1.3_tw-pddl.zip")
21 | TALES_CACHE_ALFWORLD_VALID_SEEN = pjoin(
22 | TALES_CACHE_ALFWORLD, "json_2.1.1", "valid_seen"
23 | )
24 | TALES_CACHE_ALFWORLD_VALID_UNSEEN = pjoin(
25 | TALES_CACHE_ALFWORLD, "json_2.1.1", "valid_unseen"
26 | )
27 |
28 |
def prepare_alfworld_data(force=TALES_FORCE_DOWNLOAD):
    """Ensure the ALFWorld `valid_seen` / `valid_unseen` data is in the cache.

    Arguments:
        force: When true, download the archive and extract it again even if
            the extracted folders (or the zip file) are already present.
    """
    os.makedirs(TALES_CACHE_ALFWORLD, exist_ok=True)

    # Nothing to do when both extracted folders exist, unless forced.
    already_extracted = all(
        os.path.exists(folder)
        for folder in (
            TALES_CACHE_ALFWORLD_VALID_SEEN,
            TALES_CACHE_ALFWORLD_VALID_UNSEEN,
        )
    )
    if already_extracted and not force:
        return

    # Fetch the archive when it is missing (or when forced).
    if force or not os.path.exists(TALES_CACHE_ALFWORLD_DATA_ZIP):
        download(
            ALFWORLD_DATA_URL,
            dst=TALES_CACHE_ALFWORLD,
            desc="Downloading ALFWorld data",
            force=force,
        )

    # Extract only the archive members that belong to the validation splits.
    with zipfile.ZipFile(TALES_CACHE_ALFWORLD_DATA_ZIP, "r") as archive:
        wanted = [
            member
            for member in archive.namelist()
            if "valid_seen" in member or "valid_unseen" in member
        ]
        for member in wanted:
            archive.extract(member, TALES_CACHE_ALFWORLD)
51 |
52 |
def get_alfworld_game(task_type, split="seen"):
    """Return the sorted `.tw-pddl` game files of a task type for one split.

    Arguments:
        task_type: Prefix of the task folders to look into.
        split: Either "seen" or "unseen".

    Returns:
        list: Sorted paths matching `<root>/<task_type>*/**/*.tw-pddl`, where
        `<root>` is the cached folder of the requested split.

    Raises:
        ValueError: If `split` is neither "seen" nor "unseen".
    """
    prepare_alfworld_data()  # make sure the data is ready

    if split not in ("seen", "unseen"):
        raise ValueError(f"Unknown split: {split}")

    root = (
        TALES_CACHE_ALFWORLD_VALID_SEEN
        if split == "seen"
        else TALES_CACHE_ALFWORLD_VALID_UNSEEN
    )
    pattern = pjoin(root, f"{task_type}*", "**", "*.tw-pddl")
    return sorted(glob.glob(pattern))
65 |
--------------------------------------------------------------------------------
/tales/alfworld/alfworld_env.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 | import textworld
3 | import textworld.gym
4 | from alfworld.agents.environment.alfred_tw_env import AlfredDemangler
5 | from textworld.envs.wrappers import Filter
6 |
7 | from . import alfworld_data
8 |
9 |
class ALFWorldEnv(gym.Env):
    """Gym environment wrapping a single ALFWorld game file played through TextWorld."""

    def __init__(self, gamefile, admissible_commands=False, *args, **kwargs):
        """Store the game file and the information requested from TextWorld.

        Arguments:
            gamefile: Path of the game to play.
            admissible_commands: Whether to request the admissible commands.
            *args, **kwargs: Accepted but not used by this class.
        """
        self.gamefile = gamefile
        # The TextWorld environment is created lazily by the first `reset`.
        self.env = None
        self.infos = textworld.EnvInfos(
            score=True,
            max_score=True,
            won=True,
            lost=True,
            feedback=True,
            moves=True,
            admissible_commands=admissible_commands,
            extras=["walkthrough", "expert_plan"],
        )

    def reset(self, *, seed=None, options=None):
        """Start the game (creating the TextWorld environment if needed).

        Returns:
            tuple: `(obs, info)` where `info` has `feedback` set to `obs`,
            `score` set to 0 and `max_score` set to 1.
        """
        super().reset(seed=seed, options=options)

        if self.env is None:
            wrappers = [Filter, AlfredDemangler()]
            self.env = textworld.start(self.gamefile, self.infos, wrappers=wrappers)

        obs, info = self.env.reset()
        info.update(feedback=obs, score=0, max_score=1)
        return obs, info

    def step(self, action):
        """Send `action` to the game.

        Returns:
            tuple: the four values returned by the wrapped environment, in
            the same order, with `feedback`, `score` and `max_score`
            overridden in the last one.
        """
        # NOTE(review): the local names follow the original code; TextWorld's
        # Filter wrapper appears to return (obs, score, done, infos), in which
        # case `done` holds the score and `reward` the done flag - confirm.
        obs, done, reward, info = self.env.step(action)
        # if obs == "Nothing happens.":
        #     obs = "Invalid command or this command can't be used in this context. Type 'help' for a list of available commands."

        info.update(feedback=obs, score=int(done), max_score=1)
        return obs, done, reward, info
49 |
50 |
class ALFWorldTask(ALFWorldEnv):
    """ALFWorld environment spanning all game files of a task type and split.

    The seed passed to `reset` picks which game file is played.
    """

    def __init__(self, task_type, split, *args, **kwargs):
        """Collect the game files and start from the first one."""
        self.gamefiles = sorted(alfworld_data.get_alfworld_game(task_type, split))
        first_gamefile = self.gamefiles[0]
        super().__init__(first_gamefile, *args, **kwargs)

    def reset(self, *, seed=None, options=None):
        """Select the game file from `seed` (when given), then reset."""
        if seed is not None:
            game_index = seed % len(self.gamefiles)
            self.gamefile = self.gamefiles[game_index]
            # Drop the current environment so the parent's reset starts the
            # newly selected game file.
            if self.env is not None:
                self.env.close()
                self.env = None

        return super().reset(seed=seed, options=options)
65 |
--------------------------------------------------------------------------------
/tales/config.py:
--------------------------------------------------------------------------------
1 | import os
2 |
# Cache location used when the TALES_CACHE_HOME environment variable is unset.
DEFAULT_TALES_CACHE_HOME = os.path.expanduser("~/.cache/tales")
TALES_CACHE_HOME = os.environ.get("TALES_CACHE_HOME", DEFAULT_TALES_CACHE_HOME)
# Set the environment variable, in case it wasn't.
os.environ["TALES_CACHE_HOME"] = TALES_CACHE_HOME
os.makedirs(TALES_CACHE_HOME, exist_ok=True)

# Check whether the TALES_FORCE_DOWNLOAD flag is set to force downloads.
TALES_FORCE_DOWNLOAD = os.environ.get("TALES_FORCE_DOWNLOAD", "false").lower() in {
    "yes",
    "true",
    "t",
    "1",
}
17 |
--------------------------------------------------------------------------------
/tales/download.py:
--------------------------------------------------------------------------------
1 | import importlib
2 | import traceback
3 | import warnings
4 |
5 | from termcolor import colored
6 | from tqdm import tqdm
7 |
8 | from tales import tasks
9 |
10 |
def download():
    """Download the data of every TALES task.

    Each task module is imported from the `tales` package and its `download`
    function is called. A failure is reported through warnings (message,
    exception and traceback) and the loop moves on to the next task.
    """
    for task in tqdm(tasks, desc="Downloading data for TALES"):
        try:
            module = importlib.import_module(f".{task}", package="tales")
            module.download()
        except Exception as e:
            # The f-prefix is required so the failing task's name is shown.
            warnings.warn(
                f"Failed to download data for `{task}`.",
                UserWarning,
            )
            warnings.warn(colored(f"{e}", "red"), UserWarning)
            warnings.warn(colored(f"{traceback.format_exc()}", "red"), UserWarning)
            continue


if __name__ == "__main__":
    download()
28 |
--------------------------------------------------------------------------------
/tales/jericho/__init__.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 |
3 | from .jericho_data import GAMES_INFOS, prepare_jericho_data
4 | from .jericho_env import JerichoEnv
5 |
# [name, version] pairs of the Jericho environments registered below.
environments = []

# Register one Gymnasium environment per entry of GAMES_INFOS.
for game, infos in GAMES_INFOS.items():
    env_name = "JerichoEnv" + game.title()
    environments.append([env_name, "v0"])

    gym.register(
        id=f"tales/{env_name}-v0",
        entry_point="tales.jericho:JerichoEnv",
        kwargs=dict(game=game),
    )


def download():
    """Fetch the Jericho game files by calling `prepare_jericho_data`."""
    prepare_jericho_data()
21 |
--------------------------------------------------------------------------------
/tales/jericho/games.json:
--------------------------------------------------------------------------------
1 | {
2 | "905": {
3 | "filename": "905.z5",
4 | "info": "http://ifdb.tads.org/viewgame?id=qzftg3j8nh5f34i2",
5 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/905.z5",
6 | "md5": "4c5067169b834d247a30bb08d1039896"
7 | },
8 | "acorncourt": {
9 | "filename": "acorncourt.z5",
10 | "info": "http://ifdb.tads.org/viewgame?id=tqvambr6vowym20v",
11 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/acorncourt.z5",
12 | "md5": "a61400439aa76f8faba3b8f01edd4a72"
13 | },
14 | "advent": {
15 | "filename": "advent.z5",
16 | "info": "http://ifdb.tads.org/viewgame?id=fft6pu91j85y4acv",
17 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Advent.z5",
18 | "md5": "ee2242e155fd8910921b0f8e04019a3a"
19 | },
20 | "adventureland": {
21 | "filename": "adventureland.z5",
22 | "info": "http://ifdb.tads.org/viewgame?id=dy4ok8sdlut6ddj7",
23 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Adventureland.z5",
24 | "md5": "a42545bd17330ae5e6fed02270ccfb4a"
25 | },
26 | "afflicted": {
27 | "filename": "afflicted.z8",
28 | "info": "http://ifdb.tads.org/viewgame?id=epl4q2933rczoo9x",
29 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2008/zcode/afflicted/afflicted.z8",
30 | "md5": "064272be87de7106192b6fb743c4dfc4"
31 | },
32 | "anchor": {
33 | "filename": "anchor.z8",
34 | "info": "http://ifdb.tads.org/viewgame?id=op0uw1gn1tjqmjt7",
35 | "link": "http://ifarchive.org/if-archive/games/zcode/anchor.z8",
36 | "md5": "c043df8624e0e1e9fda92f1a74b6e402"
37 | },
38 | "awaken": {
39 | "filename": "awaken.z5",
40 | "info": "http://ifdb.tads.org/viewgame?id=rwseuddvj1gbo481",
41 | "link": "https://github.com/danielricks/textplayer/raw/master/games/awaken.z5",
42 | "md5": "9ba48c72d96ab3e7956a8570b12d34d6"
43 | },
44 | "balances": {
45 | "filename": "balances.z5",
46 | "info": "http://ifdb.tads.org/viewgame?id=x6ne0bbd2oqm6h3a",
47 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Balances.z5",
48 | "md5": "f2cb8f94a7e8df3b850a758da26fa387"
49 | },
50 | "ballyhoo": {
51 | "filename": "ballyhoo.z3",
52 | "info": "http://ifdb.tads.org/viewgame?id=b0i6bx7g4rkrekgg",
53 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FBallyhoo%20v97%20%281986%29%28Infocom%29.zip:BALLYHOO.DAT",
54 | "md5": "5d54e326815b0ed3aff8efb8ff02ef2f"
55 | },
56 | "curses": {
57 | "filename": "curses.z5",
58 | "info": "http://ifdb.tads.org/viewgame?id=plvzam05bmz3enh8",
59 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/curses.z5",
60 | "md5": "f06a42a29a5a4e6aa70958c9ae4c37cd"
61 | },
62 | "cutthroat": {
63 | "filename": "cutthroat.z3",
64 | "info": "http://ifdb.tads.org/viewgame?id=4ao65o1u0xuvj8jf",
65 | "link": "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite/cutthroat.z3",
66 | "md5": "216eeeba1c8017a77343dc8482f6f185"
67 | },
68 | "deephome": {
69 | "filename": "deephome.z5",
70 | "info": "http://ifdb.tads.org/viewgame?id=x85otcikhwp8bwup",
71 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/deephome.z5",
72 | "md5": "5e56a6e5cdeecded434a8fd8012fc2c6"
73 | },
74 | "detective": {
75 | "filename": "detective.z5",
76 | "info": "http://ifdb.tads.org/viewgame?id=1po9rgq2xssupefw",
77 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/detective.z5",
78 | "md5": "822655c9be83e292e06d3d3b1d6a9734"
79 | },
80 | "dragon": {
81 | "filename": "dragon.z5",
82 | "info": "http://ifdb.tads.org/viewgame?id=sjiyffz8n5patu8l",
83 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/dragon.zip:Dragon.z5",
84 | "md5": "96d314997e5d3a5a793c83845977d44d"
85 | },
86 | "enchanter": {
87 | "filename": "enchanter.z3",
88 | "info": "http://ifdb.tads.org/viewgame?id=vu4xhul3abknifcr",
89 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FEnchanter%20v24%20%281984%29%28Infocom%29%5Bh%5D.zip:ench_24.z3",
90 | "md5": "ad3cdea88d81033fe29167688bd98c31"
91 | },
92 | "enter": {
93 | "filename": "enter.z5",
94 | "info": "http://ifdb.tads.org/viewgame?id=ld1f3t5epeagilfz",
95 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/enter.z5",
96 | "md5": "4c48ba2c5523d78c5f7f9b7809d16b1d"
97 | },
98 | "gold": {
99 | "filename": "gold.z5",
100 | "info": "http://ifdb.tads.org/viewgame?id=59ztsy9p01avd6wp",
101 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/gold.z5",
102 | "md5": "f275ddf32ce8a9e744d53c3b99c5a658"
103 | },
104 | "hhgg": {
105 | "filename": "hhgg.z3",
106 | "info": "http://ifdb.tads.org/viewgame?id=ouv80gvsl32xlion",
107 | "link": "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite/hhgg.z3",
108 | "md5": "6666389f60e0c8e4ceb08242a263bb52"
109 | },
110 | "hollywood": {
111 | "filename": "hollywood.z3",
112 | "info": "http://ifdb.tads.org/viewgame?id=jnfkbgdgopwfqist",
113 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FHollywood%20Hijinx%20v235%20%281986%29%28Infocom%29%5Bh%5D%5B861118%5D.zip:hollywoo_235.z3",
114 | "md5": "1ea91a064941a3f612b20833f0a47df7"
115 | },
116 | "huntdark": {
117 | "filename": "huntdark.z5",
118 | "info": "http://ifdb.tads.org/viewgame?id=mh1a6hizgwjdbeg7",
119 | "link": "http://mirror.ifarchive.org/if-archive/games/competition99/inform/huntdark/huntdark.z5",
120 | "md5": "253b02c8012710577085b9fd3a155cb7"
121 | },
122 | "infidel": {
123 | "filename": "infidel.z3",
124 | "info": "http://ifdb.tads.org/viewgame?id=anu79a4n1jedg5mm",
125 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FInfidel%20v22%20%281983%29%28Infocom%29%5B830916%5D.zip:INFIDEL.DAT",
126 | "md5": "2fe5b5693fa60b0cf8621402423994b1"
127 | },
128 | "inhumane": {
129 | "filename": "inhumane.z5",
130 | "info": "http://ifdb.tads.org/viewgame?id=wvs2vmbigm9unlpd",
131 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/inhumane.z5",
132 | "md5": "84d3ce7ccfafb873736490811a0cc78c"
133 | },
134 | "jewel": {
135 | "filename": "jewel.z5",
136 | "info": "http://ifdb.tads.org/viewgame?id=hu60gp1bgkhlo5yx",
137 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/jewel.z5",
138 | "md5": "1eef9c0fa009ca4adf4872cfc5249d45"
139 | },
140 | "karn": {
141 | "filename": "karn.z5",
142 | "info": "http://ifdb.tads.org/viewgame?id=bx8118ggp6j7nslo",
143 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/karn.z5",
144 | "md5": "ec55791be814db3663ad1aec0d6b7690"
145 | },
146 | "library": {
147 | "filename": "library.z5",
148 | "info": "http://ifdb.tads.org/viewgame?id=400zakqderzjnu1i",
149 | "link": "http://mirror.ifarchive.org/if-archive/games/competition95/library.z5",
150 | "md5": "389acf3b617a40dc4848da3bda62ce06"
151 | },
152 | "loose": {
153 | "filename": "loose.z5",
154 | "info": "http://ifdb.tads.org/viewgame?id=4wd3lyaxi4thp8qi",
155 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/loose.z5",
156 | "md5": "31a0c1e360dce94aa5bece5240691d17"
157 | },
158 | "lostpig": {
159 | "filename": "lostpig.z8",
160 | "info": "http://ifdb.tads.org/viewgame?id=mohwfk47yjzii14w",
161 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/LostPig.z8",
162 | "md5": "aaf0b90fbb31717481c02832bf412070"
163 | },
164 | "ludicorp": {
165 | "filename": "ludicorp.z5",
166 | "info": "http://ifdb.tads.org/viewgame?id=r6g7pflngn3uxbam",
167 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/ludicorp.z5",
168 | "md5": "646a63307f77dcdcd011f330277ae262"
169 | },
170 | "lurking": {
171 | "filename": "lurking.z3",
172 | "info": "http://ifdb.tads.org/viewgame?id=jhbd0kja1t57uop",
173 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FLurking%20Horror%2C%20The%20v219%20%281987%29%28Infocom%29%5B870912%5D.zip:Lurking.z3",
174 | "md5": "5f42ff092a2f30471ae98150ef4da2e1"
175 | },
176 | "moonlit": {
177 | "filename": "moonlit.z5",
178 | "info": "http://ifdb.tads.org/viewgame?id=10387w68qlwehbyq",
179 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2002/zcode/moonlit/Moonlit.z5",
180 | "md5": "bf75b9651cff0e2d04302f19c443588e"
181 | },
182 | "murdac": {
183 | "filename": "murdac.z5",
184 | "info": "http://ifdb.tads.org/viewgame?id=q36lh5np0q9nak28",
185 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Murdac.z5",
186 | "md5": "570179d4f21b2f600862dbffbb5afc3e"
187 | },
188 | "night": {
189 | "filename": "night.z5",
190 | "info": "http://ifdb.tads.org/viewgame?id=ydhwa11st460g9u3",
191 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/night.z5",
192 | "md5": "72125f159cccd581786ac16a2828d4e3"
193 | },
194 | "omniquest": {
195 | "filename": "omniquest.z5",
196 | "info": "http://ifdb.tads.org/viewgame?id=mygqz9tzxqvryead",
197 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/omniquest.z5",
198 | "md5": "80ea198bca425b6d819c74bfa854236e"
199 | },
200 | "partyfoul": {
201 | "filename": "partyfoul.z8",
202 | "info": "http://ifdb.tads.org/viewgame?id=cqwq699i9qiqdju",
203 | "link": "http://mirror.ifarchive.org/if-archive/games/mini-comps/cgdc7/PartyFoul.zblorb",
204 | "md5": "d221daa82708c4e54447f1a884c239ef"
205 | },
206 | "pentari": {
207 | "filename": "pentari.z5",
208 | "info": "http://ifdb.tads.org/viewgame?id=llchvog0ukwrphih",
209 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/pentari.z5",
210 | "md5": "f24c6863468823b744e910ccfe997c6d"
211 | },
212 | "planetfall": {
213 | "filename": "planetfall.z3",
214 | "info": "http://ifdb.tads.org/viewgame?id=xe6kb3cuqwie2q38",
215 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FPlanetfall%20v29%20%281983%29%28Infocom%29%5B840118%5D.zip:planetfa.z3",
216 | "md5": "6487dc814b280f5603c53155de378d27"
217 | },
218 | "plundered": {
219 | "filename": "plundered.z3",
220 | "info": "http://ifdb.tads.org/viewgame?id=ddagftras22bnz8h",
221 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FPlundered%20Hearts%20v26%20%281987%29%28Infocom%29%5B870730%5D.zip:PLUNDERE.DAT",
222 | "md5": "29fc7b270af2fbd406a0548a8298da7f"
223 | },
224 | "reverb": {
225 | "filename": "reverb.z5",
226 | "info": "http://ifdb.tads.org/viewgame?id=dop7nbjl90r5zmf9",
227 | "link": "http://mirror.ifarchive.org/if-archive/games/competition96/reverb/reverb.z5",
228 | "md5": "80d286fbfe624c621266b568c0076717"
229 | },
230 | "seastalker": {
231 | "filename": "seastalker.z3",
232 | "info": "http://ifdb.tads.org/viewgame?id=56wb8hflec2isvzm",
233 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSeastalker%20v86%20%281984%29%28Infocom%29%28beta%29%5B840320%5D.zip:SEASTALK.z3",
234 | "md5": "ee339dbdbb0792f67e20bd71bafe0ea5"
235 | },
236 | "sherlock": {
237 | "filename": "sherlock.z5",
238 | "info": "http://ifdb.tads.org/viewgame?id=j8lmspy4iz73mx26",
239 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSherlock%20-%20The%20Riddle%20of%20the%20Crown%20Jewels%20v21%20%281987%29%28Infocom%29%5B871214%5D.zip:SHER.z5",
240 | "md5": "35240654d83f9e7073973d338f9657b8"
241 | },
242 | "snacktime": {
243 | "filename": "snacktime.z8",
244 | "info": "http://ifdb.tads.org/viewgame?id=yr3y8s9k8e40hl5q",
245 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2008/zcode/snack/snacktime.z8",
246 | "md5": "0ff228d12d7cb470dc1a8e9a5151769b"
247 | },
248 | "sorcerer": {
249 | "filename": "sorcerer.z3",
250 | "info": "http://ifdb.tads.org/viewgame?id=lidg5nx9ig0bwk55",
251 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSorcerer%20v18%20%281984%29%28Infocom%29%5Bh%5D%5B860904%5D.zip:sorcerer_18.z3",
252 | "md5": "20f1468a058d0a6de016ae70022e651c"
253 | },
254 | "spellbrkr": {
255 | "filename": "spellbrkr.z3",
256 | "info": "http://ifdb.tads.org/viewgame?id=wqsmrahzozosu3r",
257 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSpellbreaker%20v63%20%281985%29%28Infocom%29%5B850916%5D.zip:spelbrkr.z3",
258 | "md5": "7a92ce19a39bedd970d0f1e296981f71"
259 | },
260 | "spirit": {
261 | "filename": "spirit.z5",
262 | "info": "http://ifdb.tads.org/viewgame?id=tqpowvmdoemtooqf",
263 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/spirit.z5",
264 | "md5": "808039c4e9554bdd15d7793539b3bd97"
265 | },
266 | "temple": {
267 | "filename": "temple.z5",
268 | "info": "http://ifdb.tads.org/viewgame?id=kq9qgjkf2k6xn1c0",
269 | "link": "http://mirror.ifarchive.org/if-archive/games/competition2002/zcode/temple/temple.z5",
270 | "md5": "047842c7b25c3d477b728cf3412e33de"
271 | },
272 | "theatre": {
273 | "filename": "theatre.z5",
274 | "info": "http://ifdb.tads.org/viewgame?id=bv8of8y9xeo7307g",
275 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/theatre.z5",
276 | "md5": "33dcc5085acb290d1817e07653c13480"
277 | },
278 | "trinity": {
279 | "filename": "trinity.z4",
280 | "info": "http://ifdb.tads.org/viewgame?id=j18kjz80hxjtyayw",
281 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FTrinity%20v12%20%281986%29%28Infocom%29%5B860926%5D.zip:TRINITY.z4",
282 | "md5": "3bf1a444a1fc2057130ecb9806117233"
283 | },
284 | "tryst205": {
285 | "filename": "tryst205.z5",
286 | "info": "http://ifdb.tads.org/viewgame?id=ic0ebhbi70bdmyc2",
287 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/tryst205.z5",
288 | "md5": "fc65ad8d4588da92fd39871f6f7463db"
289 | },
290 | "weapon": {
291 | "filename": "weapon.z5",
292 | "info": "http://ifdb.tads.org/viewgame?id=tcebhl79rlxo3qrk",
293 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/weapon.zip:weapon.z5",
294 | "md5": "c632204be3849d6c5bb6f4eb5aca3cc0"
295 | },
296 | "wishbringer": {
297 | "filename": "wishbringer.z3",
298 | "info": "http://ifdb.tads.org/viewgame?id=z02joykzh66wfhcl",
299 | "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FWishbringer%20-%20The%20Magick%20Stone%20of%20Dreams%20v68%20%281985%29%28Infocom%29%5B850501%5D.zip:WISHBRIN.z3",
300 | "md5": "87ed53d854f7e57c36106fca3b9cf5a6"
301 | },
302 | "yomomma": {
303 | "filename": "yomomma.z8",
304 | "info": "http://ifdb.tads.org/viewgame?id=1iqmpkn009h9gbug",
305 | "link": "http://nitku.net/if/yomomma/yomomma.zblorb",
306 | "md5": "5b10162a7a134e7b4c381ecedfb4bc44"
307 | },
308 | "zenon": {
309 | "filename": "zenon.z5",
310 | "info": "http://ifdb.tads.org/viewgame?id=rw7zv98mifbr3335",
311 | "link": "http://mirror.ifarchive.org/if-archive/games/zcode/zenon.z5",
312 | "md5": "631cc926b4251f5a5f646d3a6bdac8c6"
313 | },
314 | "zork1": {
315 | "filename": "zork1.z5",
316 | "info": "http://ifdb.tads.org/viewgame?id=0dbnusxunq7fw5ro",
317 | "link": "http://www.batmantis.com/zorks/zork1.z5",
318 | "md5": "b732a93a6244ddd92a9b9a3e3a46c687"
319 | },
320 | "zork2": {
321 | "filename": "zork2.z5",
322 | "info": "http://ifdb.tads.org/viewgame?id=yzzm4puxyjakk8c4",
323 | "link": "http://www.batmantis.com/zorks/zork2.z5",
324 | "md5": "5bcd91ee055e9bd42812617571be227b"
325 | },
326 | "zork3": {
327 | "filename": "zork3.z5",
328 | "info": "http://ifdb.tads.org/viewgame?id=vrsot1zgy1wfcdru",
329 | "link": "http://www.batmantis.com/zorks/zork3.z5",
330 | "md5": "ffda9ee2d428fa2fa8e75a1914ff6959"
331 | },
332 | "ztuu": {
333 | "filename": "ztuu.z5",
334 | "info": "http://ifdb.tads.org/viewgame?id=40hswtkhap88gzvn",
335 | "link": "http://www.batmantis.com/zorks/ztuu.z5",
336 | "md5": "d8e1578470cbc676e013e03d72c93141"
337 | }
338 | }
--------------------------------------------------------------------------------
/tales/jericho/jericho_data.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | from os.path import join as pjoin
4 |
5 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD
6 | from tales.utils import download
7 |
# Base URL the game files are downloaded from.
GAMES_URLS = "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite"
TALES_CACHE_JERICHO = pjoin(TALES_CACHE_HOME, "jericho")


# Game metadata is read from the games.json file located next to this module.
with open(pjoin(os.path.dirname(__file__), "games.json")) as f:
    GAMES_INFOS = json.load(f)

# Remove known games that are not working.
for _broken_game in ("hollywood", "theatre"):
    GAMES_INFOS.pop(_broken_game, None)
19 |
def prepare_jericho_data(force=TALES_FORCE_DOWNLOAD):
    """Make sure every game listed in GAMES_INFOS is present in the cache.

    Args:
        force: When true, download every game even if its file already exists.
    """
    os.makedirs(TALES_CACHE_JERICHO, exist_ok=True)

    for game_info in GAMES_INFOS.values():
        filename = game_info["filename"]
        already_cached = os.path.isfile(pjoin(TALES_CACHE_JERICHO, filename))

        if force or not already_cached:
            # Files are fetched from GAMES_URLS by filename.
            download(f"{GAMES_URLS}/{filename}", dst=TALES_CACHE_JERICHO, force=force)
33 |
def get_game(game):
    """Return the cached file path of `game`, a key of GAMES_INFOS.

    Raises:
        KeyError: If `game` is not a key of GAMES_INFOS.
    """
    prepare_jericho_data()  # make sure the data is ready

    filename = GAMES_INFOS[game]["filename"]
    return pjoin(TALES_CACHE_JERICHO, filename)
40 |
--------------------------------------------------------------------------------
/tales/jericho/jericho_env.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 | import textworld
3 | from textworld.envs.wrappers import Filter
4 |
5 | from . import jericho_data
6 |
7 |
class JerichoEnv(gym.Env):
    """Gymnasium environment playing a Jericho game through TextWorld."""

    def __init__(self, game, admissible_commands=False, *args, **kwargs):
        """Resolve the game file and start the TextWorld environment.

        Args:
            game: Name of the game, resolved with `jericho_data.get_game`.
            admissible_commands: Whether admissible commands are requested.
            *args, **kwargs: Accepted but not used by this class.
        """
        gamefile = jericho_data.get_game(game)
        self.infos = textworld.EnvInfos(
            score=True,
            max_score=True,
            won=True,
            lost=True,
            feedback=True,
            moves=True,
            admissible_commands=admissible_commands,
            extras=["walkthrough"],
        )
        self.env = textworld.start(gamefile, self.infos, wrappers=[Filter])

    def reset(self, *, seed=None, options=None):
        """Seed the wrapped environment and return the result of its reset."""
        self.env.seed(seed)
        return self.env.reset()

    def step(self, action):
        """Forward `action` to the wrapped environment and return its result."""
        return self.env.step(action)

    def close(self):
        """Close the wrapped TextWorld environment."""
        self.env.close()
30 |
--------------------------------------------------------------------------------
/tales/logger.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import os
4 | import platform
5 | import re
6 | from os.path import join as pjoin
7 |
8 | from tqdm import tqdm
9 |
log = logging.getLogger("tales")


class TqdmLoggingHandler(logging.Handler):
    """Logging handler that writes formatted records with `tqdm.write`."""

    def __init__(self, level=logging.NOTSET):
        super().__init__(level=level)

    def emit(self, record):
        """Format `record` and print it; errors go to `handleError`."""
        # KeyboardInterrupt and SystemExit are not `Exception` subclasses, so
        # they still propagate to the caller untouched.
        try:
            formatted = self.format(record)
            tqdm.write(formatted)
            self.flush()
        except Exception:
            self.handleError(record)
26 |
27 |
class StripAnsiFormatter(logging.Formatter):
    """Formatter that removes ANSI escape sequences from its output."""

    # Pattern of the escape sequences deleted from the formatted message.
    ansi_escape = re.compile(r"\x1B[@-_][0-?]*[ -/]*[@-~]")

    def format(self, record):
        """Format `record` as usual, then delete every `ansi_escape` match."""
        formatted = super().format(record)
        cleaned = re.sub(self.ansi_escape, "", formatted)
        return cleaned
34 |
35 |
def setup_logging(args):
    """Configure the `tales` logger with a log file and a console handler.

    A timestamped log file is created inside `args.log_dir`, and a
    `TqdmLoggingHandler` is attached with level `args.logging_level`. The
    helper that creates file handlers is also exposed as
    `log.add_new_file_handler`.

    Args:
        args: Object providing the `log_dir` and `logging_level` attributes.
    """
    log.setLevel(logging.DEBUG)

    def add_new_file_handler(logfile):
        """Attach a DEBUG-level file handler writing to `logfile`; return it."""
        fh = logging.FileHandler(logfile, mode="w")
        formatter = StripAnsiFormatter("%(asctime)s: %(message)s")
        log.addHandler(fh)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)

        # Log some system information at the top of the log file.
        def _emit_msg(msg):
            fh.emit(
                logging.makeLogRecord(
                    {"name": log.name, "level": logging.DEBUG, "msg": msg}
                )
            )

        # Read the commit hash inside a `with` block so the pipe opened by
        # `os.popen` is closed instead of being left dangling.
        with os.popen("git rev-parse HEAD") as git_pipe:
            git_commit = git_pipe.read().strip()

        _emit_msg("System information:")
        _emit_msg(f"args = {args}")
        _emit_msg(f"system = {platform.system()}")
        _emit_msg(f"server = {platform.uname()[1]}")
        _emit_msg(f"working_dir = {os.getcwd()}")
        _emit_msg(f"datetime = {datetime.datetime.now()}")
        _emit_msg(f"git_commit = {git_commit}")

        return fh

    log.add_new_file_handler = add_new_file_handler

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    logfile = pjoin(args.log_dir, f"{timestamp}.log")
    log.add_new_file_handler(logfile)

    ch = TqdmLoggingHandler()
    formatter = logging.Formatter("%(message)s")
    ch.setLevel(args.logging_level)
    ch.setFormatter(formatter)
    log.addHandler(ch)
75 |
--------------------------------------------------------------------------------
/tales/scienceworld/__init__.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 |
3 | from .scienceworld_env import TASK_NAMES, ScienceWorldEnv
4 |
# [name, version] pairs of the ScienceWorld environments registered below.
environments = []

# Register one Gymnasium environment per ScienceWorld task name.
for task_name in TASK_NAMES:
    camel_name = task_name.title().replace("-", "")
    env_name = f"ScienceWorld{camel_name}"
    environments.append([env_name, "v0"])

    gym.register(
        id=f"tales/{env_name}-v0",
        entry_point="tales.scienceworld:ScienceWorldEnv",
        kwargs=dict(task_name=task_name),
    )


def download():
    """No-op: this download hook does nothing."""
20 |
--------------------------------------------------------------------------------
/tales/scienceworld/scienceworld_data.py:
--------------------------------------------------------------------------------
1 | import scienceworld
2 |
3 |
def get_task_names():
    """Return the `task_names` of a freshly created `ScienceWorldEnv`."""
    probe_env = scienceworld.ScienceWorldEnv()
    return probe_env.task_names
6 |
7 |
def get_variations(task_name, split, env=None):
    """Return the variations of a ScienceWorld task for the given split.

    Args:
        task_name: Task used to create an environment when `env` is not given.
        split: One of "train", "valid" or "test".
        env: Optional existing environment to query instead of creating one.

    Returns:
        Whatever the environment's `get_variations_train`,
        `get_variations_dev` or `get_variations_test` method returns.

    Raises:
        NotImplementedError: If `split` is not one of the supported values.
    """
    # Validate the split first so no environment is created just to fail.
    if split not in ("train", "valid", "test"):
        raise NotImplementedError(
            "Only the train, valid, and test splits are supported."
        )

    env = env or scienceworld.ScienceWorldEnv(task_name)
    if split == "train":
        return env.get_variations_train()
    if split == "valid":
        # ScienceWorld names its validation split "dev".
        return env.get_variations_dev()
    return env.get_variations_test()
18 |
--------------------------------------------------------------------------------
/tales/scienceworld/scienceworld_env.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 | import numpy as np
3 | import scienceworld
4 |
5 | from . import scienceworld_data
6 |
TASK_NAMES = scienceworld_data.get_task_names()


class ScienceWorldEnv(gym.Env):
    """Gymnasium environment for one ScienceWorld task, on its test variations.

    The seed passed to `reset` picks the variation that is loaded.
    """

    def __init__(self, task_name, admissible_commands=False, *args, **kwargs):
        """Create the ScienceWorld environment and list its test variations."""
        self.task_name = task_name
        self.admissible_commands = admissible_commands
        self.env = scienceworld.ScienceWorldEnv(self.task_name, envStepLimit=np.inf)
        self.variations = scienceworld_data.get_variations(
            self.task_name, split="test", env=self.env
        )
        self.variation = self.variations[0]

    def reset(self, *, seed=None, options=None):
        """Load the selected variation and return `(obs, info)`."""
        if seed is not None:
            variation_index = seed % len(self.variations)
            self.variation = self.variations[variation_index]

        self.env.load(
            self.task_name, self.variation, simplificationStr="", generateGoldPath=True
        )
        first_obs, info = self.env.reset()

        # Add task description to the first observation.
        obs = info["taskDesc"] + "\n\n" + first_obs

        info.update(
            {
                "max_score": 100,
                "feedback": obs,
                "won": False,
                "lost": False,
                "admissible_commands": info["valid"],
                "extra.walkthrough": self.env.get_gold_action_sequence(),
            }
        )
        return obs, info

    def step(self, action):
        """Apply `action` and return `(obs, reward, done, info)`."""
        obs, reward, done, info = self.env.step(action)
        score = info["score"]
        info.update(
            {
                "max_score": 100,
                "feedback": obs,
                "won": score == 100,
                "lost": score < 0,
                "admissible_commands": info["valid"],
            }
        )
        return obs, reward, done, info

    def close(self):
        """Close the wrapped ScienceWorld environment."""
        self.env.close()
52 |
--------------------------------------------------------------------------------
/tales/textworld/__init__.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 |
3 | from .textworld_data import prepare_twcooking_data
4 | from .textworld_env import TextWorldEnv, TWCookingEnv
5 |
# [name, version] pairs of the TextWorld environments registered below.
environments = []

# TWCookingEnv: one environment per difficulty level, from 1 to 10.
for difficulty in range(1, 11):
    env_name = "TWCookingLevel" + str(difficulty)
    environments.append([env_name, "v0"])

    gym.register(
        id=f"tales/{env_name}-v0",
        entry_point="tales.textworld:TWCookingEnv",
        kwargs=dict(difficulty=difficulty),
    )


def download():
    """Fetch the TWCooking data by calling `prepare_twcooking_data`."""
    prepare_twcooking_data()
22 |
--------------------------------------------------------------------------------
/tales/textworld/textworld_data.py:
--------------------------------------------------------------------------------
1 | import glob
2 | import os
3 | import zipfile
4 | from os.path import join as pjoin
5 |
6 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD
7 | from tales.utils import download
8 |
# Archive holding the TWCooking games.
TW_COOKING_URL = "https://github.com/xingdi-eric-yuan/GATA-public/releases/download/data/rl.0.2.zip"
# Cache layout: <cache home>/textworld/tw-cooking/test
TALES_CACHE_TEXTWORLD = pjoin(TALES_CACHE_HOME, "textworld")
TALES_CACHE_TWCOOKING = pjoin(TALES_CACHE_HOME, "textworld", "tw-cooking")
TALES_CACHE_TWCOOKING_TEST = pjoin(TALES_CACHE_HOME, "textworld", "tw-cooking", "test")
15 |
16 |
def prepare_twcooking_data(force=TALES_FORCE_DOWNLOAD):
    """Download the TWCooking archive if needed and extract its test games.

    Nothing is done when the test folder already exists, unless `force` is
    true.

    Args:
        force: When true, download and extract again even if data is present.
    """
    os.makedirs(TALES_CACHE_TWCOOKING, exist_ok=True)
    if not force and os.path.exists(TALES_CACHE_TWCOOKING_TEST):
        return

    zip_file = pjoin(TALES_CACHE_TWCOOKING, "rl.0.2.zip")
    if force or not os.path.exists(zip_file):
        download(
            TW_COOKING_URL,
            dst=TALES_CACHE_TWCOOKING,
            desc="Downloading TWCooking",
            force=force,
        )

    # Only extract the archive members whose name contains "test".
    with zipfile.ZipFile(zip_file, "r") as archive:
        test_members = [name for name in archive.namelist() if "test" in name]
        archive.extractall(TALES_CACHE_TWCOOKING, members=test_members)
37 |
38 |
def get_cooking_game(difficulty):
    """Return the `*.z8` game files found for the given difficulty level."""
    prepare_twcooking_data()  # make sure the data is ready

    level_folder = f"difficulty_level_{difficulty}"
    pattern = pjoin(TALES_CACHE_TWCOOKING_TEST, level_folder, "*.z8")
    return glob.glob(pattern)
45 |
--------------------------------------------------------------------------------
/tales/textworld/textworld_env.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 | import numpy as np
3 | import textworld
4 | from textworld.envs.wrappers import Filter
5 |
6 | from . import textworld_data
7 |
8 |
class TextWorldEnv(gym.Env):
    """Gymnasium environment playing a single game file through TextWorld.

    The wrapped TextWorld environment is created lazily on the first call to
    `reset`.
    """

    def __init__(self, gamefile, admissible_commands=False, *args, **kwargs):
        """Store the game file and the information requested from TextWorld.

        Args:
            gamefile: Path of the game file later passed to `textworld.start`.
            admissible_commands: Whether admissible commands are requested.
            *args, **kwargs: Accepted but not used by this class.
        """
        self.infos = textworld.EnvInfos(
            score=True,
            max_score=True,
            won=True,
            lost=True,
            feedback=True,
            moves=True,
            admissible_commands=admissible_commands,
            extras=["walkthrough"],
        )
        self.gamefile = gamefile
        self.env = None  # Created lazily in `reset`.

    def reset(self, *, seed=None, options=None):
        """Start the game if needed and return the result of its reset."""
        super().reset(seed=seed, options=options)

        if self.env is None:
            self.env = textworld.start(self.gamefile, self.infos, wrappers=[Filter])

        return self.env.reset()

    def step(self, action):
        """Forward `action` to the wrapped environment and return its result."""
        return self.env.step(action)

    def close(self):
        """Close the wrapped TextWorld environment, if one was started."""
        if self.env is not None:
            self.env.close()
            self.env = None
35 |
36 |
class TWCookingEnv(TextWorldEnv):
    """TextWorld environment spanning all cooking games of one difficulty.

    The seed passed to `reset` picks which game file is played.
    """

    def __init__(self, difficulty, *args, **kwargs):
        """Collect the game files and start from the first one."""
        self.gamefiles = sorted(textworld_data.get_cooking_game(difficulty))
        first_gamefile = self.gamefiles[0]
        super().__init__(first_gamefile, *args, **kwargs)

    def reset(self, *, seed=None, options=None):
        """Select the game file from `seed` (when given), then reset."""
        if seed is not None:
            game_index = seed % len(self.gamefiles)
            self.gamefile = self.gamefiles[game_index]
            # Drop the current environment so the parent's reset starts the
            # newly selected game file.
            if self.env is not None:
                self.env.close()
                self.env = None

        return super().reset(seed=seed, options=options)
51 |
--------------------------------------------------------------------------------
/tales/textworld_express/__init__.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 |
3 | from .twx_env import TASKS, TextWorldExpressEnv
4 |
# [name, version] pairs of every environment registered below.
environments = []

# Register one gymnasium environment per entry of TASKS, under the id
# "tales/TWX<task_name>-v0".
for task_name, game_name, game_params in TASKS:
    env_name = f"TWX{task_name}"

    gym.register(
        id=f"tales/{env_name}-v0",
        entry_point="tales.textworld_express:TextWorldExpressEnv",
        kwargs=dict(game_name=game_name, game_params=game_params),
    )
    environments.append([env_name, "v0"])
16 |
17 |
def download():
    """Do nothing: this package has no download step implemented here."""
    return None
20 |
--------------------------------------------------------------------------------
/tales/textworld_express/twx_data.py:
--------------------------------------------------------------------------------
1 | import textworld_express as twx
2 |
3 | # TASK_NAMES = list(twx.GAME_NAMES)
4 |
def _simon_says_tasks():
    """Build the SimonSays task entries for game lengths 10, 50 and 100.

    Three families are produced, in this order: no memorization, memorization
    with verbose=0, memorization with verbose=1.
    """
    families = [
        ("SimonSays{n}", "memorization=0"),
        ("SimonSaysWithMemory{n}", "memorization=1, verbose=0"),
        ("SimonSaysWithMemory{n}Verbose", "memorization=1, verbose=1"),
    ]
    return [
        (
            name_template.format(n=length),
            "simonsays",
            f"gameLength={length}, numDistractors=4, {options}",
        )
        for name_template, options in families
        for length in (10, 50, 100)
    ]


# Each entry is (task name, game name, game parameter string).
TASKS = [
    (
        "CookingWorld",
        "cookingworld",
        "numLocations=1, numIngredients=2, numDistractorItems=5, includeDoors=0, limitInventorySize=0",
    ),
    (
        "TextWorldCommonsense",
        "twc",
        "numLocations=1,numItemsToPutAway=1,includeDoors=0,limitInventorySize=0",
    ),
    (
        "CoinCollector",
        "coin",
        "numLocations=1, numDistractorItems=5, limitInventorySize=0",
    ),
    ("Arithmetic", "arithmetic", ""),
    (
        "MapReader",
        "mapreader",
        "numLocations=2, maxDistanceApart=1, maxDistractorItemsPerLocation=2, includeDoors=0, limitInventorySize=0",
    ),
    ("Sorting", "sorting", ""),
    *_simon_says_tasks(),
    ("PeckingOrder", "peckingorder", ""),
]
63 |
64 |
def get_seeds(split, env=None):
    """Return the valid seeds of one TextWorldExpress data split.

    Arguments:
        split: "train", "valid" or "test". "dev" is accepted as an alias of
            "valid", since the underlying getter is ``getValidSeedsDev``.
        env: Object exposing ``getValidSeedsTrain`` / ``getValidSeedsDev`` /
            ``getValidSeedsTest``. When None, a new
            ``twx.TextWorldExpressEnv()`` is created for the query.

    Returns:
        Whatever the matching ``getValidSeeds*`` method of ``env`` returns.

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    if env is None:
        env = twx.TextWorldExpressEnv()

    if split == "train":
        return env.getValidSeedsTrain()
    elif split in ("valid", "dev"):
        return env.getValidSeedsDev()
    elif split == "test":
        return env.getValidSeedsTest()
    else:
        raise NotImplementedError(
            "Only plan to support train, valid (dev), and test splits."
        )
75 |
--------------------------------------------------------------------------------
/tales/textworld_express/twx_env.py:
--------------------------------------------------------------------------------
1 | import gymnasium as gym
2 | import numpy as np
3 | import textworld_express as twx
4 |
5 | from . import twx_data
6 |
7 | TASKS = twx_data.TASKS
8 |
9 |
class TextWorldExpressEnv(gym.Env):
    """Gymnasium environment wrapping one TextWorldExpress game on the test fold.

    ``reset`` and ``step`` add the keys ``max_score``, ``feedback``, ``won``,
    ``lost``, ``moves``, ``score`` and ``admissible_commands`` to the info dict
    returned by the wrapped environment.
    """

    def __init__(
        self, game_name, game_params, admissible_commands=False, *args, **kwargs
    ):
        """Create the wrapped environment and load the test seeds.

        Arguments:
            game_name: Passed as ``gameName`` to the wrapped env on reset.
            game_params: Passed as ``gameParams`` to the wrapped env on reset.
            admissible_commands: Stored on the instance only; ``reset`` and
                ``step`` always fill ``info["admissible_commands"]``.
            *args, **kwargs: Accepted but not used by this class.
        """
        self.game_name = game_name
        self.game_params = game_params
        self.admissible_commands = admissible_commands
        self.env = twx.TextWorldExpressEnv(envStepLimit=np.inf)
        self.seeds = twx_data.get_seeds(split="test", env=self.env)
        self.seed = self.seeds[0]

    @staticmethod
    def _to_percent(score):
        """Scale a raw score by 100 and round it to the nearest integer.

        Rounding (rather than truncating with ``int``) keeps floating-point
        error from shaving a point off, e.g. ``0.29 * 100`` evaluates to
        ``28.999999999999996``.
        NOTE(review): assumes the wrapped env reports a 0-1 fraction, which is
        what the fixed ``max_score`` of 100 suggests - confirm.
        """
        return int(round(score * 100))

    def reset(self, *, seed=None, options=None):
        """Reset the game and return ``(obs, info)``.

        A non-None ``seed`` selects ``self.seeds[seed % len(self.seeds)]`` as
        the game seed; otherwise the previously selected seed is reused.
        """
        if seed is not None:
            self.seed = self.seeds[seed % len(self.seeds)]

        obs, info = self.env.reset(
            seed=self.seed,
            gameFold="test",
            gameName=self.game_name,
            gameParams=self.game_params,
            generateGoldPath=True,
        )

        # Add task description to the first observation.
        obs = info["taskDescription"] + "\n\n" + obs

        info["max_score"] = 100
        info["feedback"] = obs
        info["won"] = False
        info["lost"] = False
        info["moves"] = 0
        info["score"] = self._to_percent(info["score"])
        info["admissible_commands"] = info["validActions"]
        info["extra.walkthrough"] = self.env.getGoldActionSequence()
        return obs, info

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, done, info)``."""
        obs, reward, done, info = self.env.step(action)
        info["max_score"] = 100
        info["feedback"] = obs
        info["won"] = info["tasksuccess"]
        info["lost"] = info["taskfailure"]
        info["moves"] = info["numMoves"]
        info["score"] = self._to_percent(info["score"])
        info["admissible_commands"] = info["validActions"]
        return obs, reward, done, info

    def close(self):
        """Close the wrapped TextWorldExpress environment."""
        self.env.close()
60 |
--------------------------------------------------------------------------------
/tales/token.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Optional
3 |
4 | import tiktoken
5 | from llm import Model
6 |
7 | # Suppress warnings from transformers
8 | os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "True"
9 | from transformers import AutoTokenizer
10 |
11 |
def get_token_counter(model: Optional[Model] = None):
    """Pick the token counter matching ``model``.

    Selection order:
      1. no model, or model id exactly "gpt-4o" -> ``OpenAITokenCounter("gpt-4o")``
      2. "claude-" in the model id -> ``ClaudeTokenCounter``
      3. "gemini" or "gemma" in the model id -> ``GeminiTokenCounter``
      4. ``OpenAITokenCounter`` for the model id, unless building it raises
         ``KeyError``
      5. otherwise ``HuggingFaceTokenCounter`` for the model id.
    """
    if model is None:
        return OpenAITokenCounter("gpt-4o")

    model_id = model.model_id
    if model_id == "gpt-4o":
        return OpenAITokenCounter("gpt-4o")

    if "claude-" in model_id:
        return ClaudeTokenCounter(model)

    if any(family in model_id for family in ("gemini", "gemma")):
        return GeminiTokenCounter(model)

    try:
        return OpenAITokenCounter(model_id)
    except KeyError:
        # Not resolvable as an OpenAI model: fall through to transformers.
        pass

    return HuggingFaceTokenCounter(model_id)
29 |
30 |
class TokenCounter:
    """Callable that counts tokens with ``self.tokenize``.

    ``tokenize`` is not defined by this class; it must be provided on the
    instance and return something ``len()`` can be applied to.
    """

    def __call__(self, *, messages=None, text=None):
        """Return the token count of ``messages`` and/or ``text``.

        Arguments:
            messages: Optional iterable of mappings; only their "content"
                entry is tokenized.
            text: Optional string tokenized as is.

        Returns:
            Sum of the token counts of everything provided (0 if nothing is).
        """
        counts = []
        if messages is not None:
            counts.extend(len(self.tokenize(entry["content"])) for entry in messages)

        if text is not None:
            counts.append(len(self.tokenize(text)))

        return sum(counts)
42 |
43 |
class OpenAITokenCounter(TokenCounter):
    """Token counter backed by the tiktoken encoding of an OpenAI model."""

    def __init__(self, model: str):
        """Resolve the tiktoken encoder for ``model``.

        When ``model`` is not a key of ``tiktoken.model.MODEL_TO_ENCODING``,
        the part of the name before the first underscore is looked up instead.
        """
        self.model = model
        is_known = self.model in tiktoken.model.MODEL_TO_ENCODING
        lookup_name = self.model if is_known else self.model.split("_")[0]
        self.tokenize = tiktoken.encoding_for_model(lookup_name).encode
51 |
52 |
class HuggingFaceTokenCounter(TokenCounter):
    """Token counter backed by a HuggingFace ``AutoTokenizer``.

    Counting itself is inherited from ``TokenCounter.__call__``; the override
    that used to live here was a line-for-line copy of it.
    """

    def __init__(self, model: str):
        """Load the tokenizer for ``model``.

        Raises:
            ValueError: If ``AutoTokenizer.from_pretrained`` fails with an
                ``OSError`` (chained as the cause).
        """
        self.model = model
        try:
            self.tokenize = AutoTokenizer.from_pretrained(self.model).tokenize
        except OSError as err:
            msg = (
                f"Tokenizer not found for model {self.model},"
                " make sure you have access to the model"
                " (e.g., HuggingFace API key is correctly set)."
            )
            raise ValueError(msg) from err
75 |
76 |
class ClaudeTokenCounter(TokenCounter):
    """Token counter that asks the Anthropic API for the input token count."""

    def __init__(self, model: Model):
        """Create an Anthropic client from the model's key and Claude model id."""
        from anthropic import Anthropic

        self.model = model.claude_model_id
        self.client = Anthropic(api_key=model.get_key())

    def __call__(self, *, messages=None, text=None):
        """Return ``input_tokens`` reported by the API for the given content.

        ``text``, when given, is stripped and appended as an assistant message.
        A leading system message is removed from the list and sent through the
        ``system`` argument instead. The caller's list is not modified.
        """
        from anthropic import NOT_GIVEN

        conversation = list(messages) if messages else []
        if text is not None:
            conversation.append({"role": "assistant", "content": text.strip()})

        # Extract system messages, if any.
        system_prompt = NOT_GIVEN
        if conversation and conversation[0]["role"] == "system":
            system_prompt = conversation.pop(0)["content"]

        result = self.client.beta.messages.count_tokens(
            model=self.model,
            messages=conversation,
            system=system_prompt,
        )
        return result.input_tokens
103 |
104 |
class GeminiTokenCounter(TokenCounter):
    """Token counter that asks the Google GenAI API for the total token count."""

    def __init__(self, model: Model):
        """Create a GenAI client from the model's key and model id."""
        from google import genai

        self.model = model.model_id
        self.client = genai.Client(api_key=model.get_key())

    def __call__(self, *, messages=None, text=None):
        """Return ``total_tokens`` reported by the API for the given content.

        ``text``, when given, is stripped and appended as an assistant message.
        A leading system message is removed from the list and passed as the
        system instruction. The "assistant" role is renamed to "model" before
        the history is built. The caller's list is not modified.
        """
        from google.genai import types

        conversation = list(messages) if messages else []
        if text is not None:
            conversation.append({"role": "assistant", "content": text.strip()})

        system_instruction = None
        if conversation and conversation[0]["role"] == "system":
            system_instruction = [conversation.pop(0)["content"]]

        history = []
        for entry in conversation:
            history.append(
                types.Content(
                    role=entry["role"].replace("assistant", "model"),
                    parts=[types.Part(text=entry["content"])],
                )
            )

        chat = self.client.chats.create(
            model=self.model,
            history=history,
            config=types.GenerateContentConfig(system_instruction=system_instruction),
        )

        token_info = self.client.models.count_tokens(
            model=self.model,
            contents=chat.get_history(),
        )
        return token_info.total_tokens
141 |
--------------------------------------------------------------------------------
/tales/utils.py:
--------------------------------------------------------------------------------
1 | import asyncio
2 | import json
3 | import logging
4 | import os
5 | import shutil
6 | import tempfile
7 | from os.path import join as pjoin
8 |
9 | import numpy as np
10 | import requests
11 | from llm import AsyncResponse, Conversation, Prompt, Response
12 | from tqdm import tqdm
13 |
14 | from tales.logger import log
15 |
16 |
def mkdirs(dirpath: str) -> str:
    """Create a directory and all its parents.

    If the folder already exists, its path is returned without raising any exceptions.

    Arguments:
        dirpath: Path where a folder need to be created.

    Returns:
        Path to the (created) folder.

    Raises:
        FileExistsError: If ``dirpath`` exists but is not a directory (this
            used to be silently ignored, returning a path that is not a folder).
    """
    # exist_ok=True tolerates an existing directory but still raises when the
    # path is occupied by something else, e.g. a regular file.
    os.makedirs(dirpath, exist_ok=True)
    return dirpath
34 |
35 |
def download(url, dst, desc=None, force=False):
    """Download a remote file using HTTP get request.

    The file is first written to ``<system temp dir>/tales/<filename>`` and only
    moved to ``dst`` once complete. A partial temp file left by an earlier
    attempt is resumed with a ``Range`` request.

    Args:
        url (str): URL where to get the file.
        dst (str): Destination folder where to save the file.
        desc (str, optional): Progress bar description.
            Defaults to "Downloading <filename>".
        force (bool, optional):
            Download again if it exists. Defaults to False.

    Returns:
        str: Path to the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with an error status.

    Notes:
        This code is inspired by
        https://github.com/huggingface/transformers/blob/v4.0.0/src/transformers/file_utils.py#L1069
    """
    filename = url.split("/")[-1]
    path = pjoin(mkdirs(dst), filename)

    if os.path.isfile(path) and not force:
        return path

    # Download to a temp folder first to avoid corrupting the cache
    # with incomplete downloads.
    temp_dir = mkdirs(pjoin(tempfile.gettempdir(), "tales"))
    temp_path = pjoin(temp_dir, filename)
    with open(temp_path, "ab") as temp_file:
        headers = {}
        resume_size = temp_file.tell()
        if resume_size:
            headers["Range"] = f"bytes={resume_size}-"
            headers["x-ms-version"] = "2020-04-08"  # Needed for Range support.

        with requests.get(url, stream=True, headers=headers) as r:
            # The server rejects the range when the temp file already holds
            # every byte; in that case there is nothing left to fetch.
            reported_size = r.headers.get("Content-Range", "").rsplit("/", 1)[-1]
            already_complete = (
                r.headers.get("x-ms-error-code") == "InvalidRange"
                and reported_size == str(resume_size)
            )

            if not already_complete:
                r.raise_for_status()  # Bad request.

                if resume_size and r.status_code != 206:
                    # The Range header was ignored and the full body is being
                    # sent: restart from scratch instead of appending to the
                    # partial data.
                    temp_file.seek(0)
                    temp_file.truncate()
                    resume_size = 0

                # Content-Length may be missing (e.g. chunked encoding); the
                # progress bar then runs without a known total.
                content_length = r.headers.get("Content-Length")
                total = None
                if content_length is not None:
                    total = resume_size + int(content_length)

                with tqdm(
                    unit="B",
                    initial=resume_size,
                    unit_scale=True,
                    total=total,
                    desc=desc or "Downloading {}".format(filename),
                    leave=False,
                ) as pbar:
                    for chunk in r.iter_content(chunk_size=1024):
                        if chunk:  # filter out keep-alive new chunks
                            pbar.update(len(chunk))
                            temp_file.write(chunk)

    # Move only after the temp file is closed, so everything is flushed to disk
    # and no handle is still open on the file being moved.
    shutil.move(temp_path, path)
    return path
97 |
98 |
def merge_messages(messages):
    """Merge messages from the same role into a single message.

    Consecutive messages sharing a "role" are collapsed into one message whose
    "content" joins the originals with a blank line ("\\n\\n").

    Arguments:
        messages: Iterable of dicts with at least "role" and "content" keys.
            May be empty.

    Returns:
        New list of new dicts; the input messages are not modified.
    """
    merged = []
    for message in messages:
        if merged and message["role"] == merged[-1]["role"]:
            merged[-1]["content"] += "\n\n" + message["content"]
        else:
            # Copy so that later concatenations never touch the caller's dict.
            merged.append(dict(message))

    return merged
109 |
110 |
def messages2conversation(model, messages):
    """Rebuild an ``llm`` ``Conversation`` from a list of chat messages.

    Messages are first merged per role. Each assistant message then becomes an
    already-completed ``Response`` whose prompt is the user message (and
    system message, if any) seen since the previous assistant message.
    User or system messages that are not followed by an assistant message do
    not produce a response.

    Arguments:
        model: Model attached to every prompt, response and the conversation.
        messages: List of dicts with "role" and "content" keys.

    Returns:
        ``Conversation`` holding the reconstructed responses.
    """
    messages = merge_messages(messages)  # Just in case.
    responses = []

    system = None
    # Start as None so an assistant message with no preceding user message
    # does not hit an unbound local.
    # NOTE(review): assumes Prompt accepts None as its prompt text - confirm.
    prompt = None
    for message in messages:
        role = message["role"]

        if role == "system":
            system = message["content"]
        elif role == "user":
            prompt = message["content"]
        elif role == "assistant":
            # Make a fake response object.
            response = Response(
                model=model,
                prompt=Prompt(
                    prompt,
                    system=system,
                    model=model,
                ),
                stream=False,
            )
            # Mark it as finished and inject the known answer.
            response._done = True
            response._chunks = [message["content"]]
            responses.append(response)

            system = None
            prompt = None

    return Conversation(model, responses=responses)
144 |
145 |
def format_messages_to_markdown(messages):
    """Render messages as one markdown string.

    Each message becomes a level-4 heading with the capitalized role, followed
    by the content inside a fenced code block.

    Arguments:
        messages: Iterable of dicts with "role" and "content" keys.

    Returns:
        The concatenated markdown ("" for no messages).
    """
    sections = []
    for entry in messages:
        heading = entry["role"].capitalize()
        body = entry["content"]
        sections.append(f"#### {heading}\n\n```\n{body}\n```\n\n")
    return "".join(sections)
154 |
155 |
def is_recoverable_error(exception):
    """Tell whether ``exception`` is one of the known retryable errors.

    The check compares the exception's fully qualified class name
    ("<module>.<class name>") against a fixed list of names; subclasses of a
    listed error do not match unless listed themselves. The name and the
    exception are logged as warnings on every call.

    Returns:
        True if the qualified class name is in the list, False otherwise.
    """
    # Exceptions thrown by various libraries that can be retried.
    retryable_names = (
        "openai.APIStatusError",
        "openai.APITimeoutError",
        "openai.error.Timeout",
        "openai.error.RateLimitError",
        "openai.error.ServiceUnavailableError",
        "openai.Timeout",
        "openai.APIError",
        "openai.APIConnectionError",
        "openai.RateLimitError",
        "openai.InternalServerError",
        "anthropic.error.RateLimitError",
        "anthropic.InternalServerError",
        "anthropic.OverloadedError",
        "anthropic.APIStatusError",
        "anthropic._exceptions.OverloadedError",
        "llm.errors.ModelError",  # Gemini
        # Add more as needed
    )
    error_class = exception.__class__
    qualified_name = f"{error_class.__module__}.{error_class.__name__}"
    log.warning(f"Exception_full_name: {qualified_name}")
    log.warning(f"Exception: {exception}")
    return qualified_name in retryable_names
183 |
184 |
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes numpy scalars and arrays."""

    def default(self, obj):
        """Convert numpy values to plain Python ones.

        Arrays become (nested) lists, numpy integers and floats become Python
        numbers; anything else is deferred to ``json.JSONEncoder.default``.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (np.integer, np.floating)):
            return obj.item()
        return super().default(obj)
192 |
--------------------------------------------------------------------------------
/tales/version.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.0.1"
2 |
--------------------------------------------------------------------------------
/website/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | activesupport (7.2.2.1)
5 | base64
6 | benchmark (>= 0.3)
7 | bigdecimal
8 | concurrent-ruby (~> 1.0, >= 1.3.1)
9 | connection_pool (>= 2.2.5)
10 | drb
11 | i18n (>= 1.6, < 2)
12 | logger (>= 1.4.2)
13 | minitest (>= 5.1)
14 | securerandom (>= 0.3)
15 | tzinfo (~> 2.0, >= 2.0.5)
16 | addressable (2.8.7)
17 | public_suffix (>= 2.0.2, < 7.0)
18 | base64 (0.2.0)
19 | benchmark (0.4.0)
20 | bigdecimal (3.1.9)
21 | coffee-script (2.4.1)
22 | coffee-script-source
23 | execjs
24 | coffee-script-source (1.12.2)
25 | colorator (1.1.0)
26 | commonmarker (0.23.11)
27 | concurrent-ruby (1.3.5)
28 | connection_pool (2.5.0)
29 | csv (3.3.4)
30 | dnsruby (1.72.4)
31 | base64 (~> 0.2.0)
32 | logger (~> 1.6.5)
33 | simpleidn (~> 0.2.1)
34 | drb (2.2.1)
35 | em-websocket (0.5.3)
36 | eventmachine (>= 0.12.9)
37 | http_parser.rb (~> 0)
38 | ethon (0.16.0)
39 | ffi (>= 1.15.0)
40 | eventmachine (1.2.7)
41 | execjs (2.10.0)
42 | faraday (2.13.0)
43 | faraday-net_http (>= 2.0, < 3.5)
44 | json
45 | logger
46 | faraday-net_http (3.4.0)
47 | net-http (>= 0.5.0)
48 | ffi (1.17.1)
49 | ffi (1.17.1-arm64-darwin)
50 | ffi (1.17.1-x86_64-darwin)
51 | forwardable-extended (2.6.0)
52 | gemoji (4.1.0)
53 | github-pages (232)
54 | github-pages-health-check (= 1.18.2)
55 | jekyll (= 3.10.0)
56 | jekyll-avatar (= 0.8.0)
57 | jekyll-coffeescript (= 1.2.2)
58 | jekyll-commonmark-ghpages (= 0.5.1)
59 | jekyll-default-layout (= 0.1.5)
60 | jekyll-feed (= 0.17.0)
61 | jekyll-gist (= 1.5.0)
62 | jekyll-github-metadata (= 2.16.1)
63 | jekyll-include-cache (= 0.2.1)
64 | jekyll-mentions (= 1.6.0)
65 | jekyll-optional-front-matter (= 0.3.2)
66 | jekyll-paginate (= 1.1.0)
67 | jekyll-readme-index (= 0.3.0)
68 | jekyll-redirect-from (= 0.16.0)
69 | jekyll-relative-links (= 0.6.1)
70 | jekyll-remote-theme (= 0.4.3)
71 | jekyll-sass-converter (= 1.5.2)
72 | jekyll-seo-tag (= 2.8.0)
73 | jekyll-sitemap (= 1.4.0)
74 | jekyll-swiss (= 1.0.0)
75 | jekyll-theme-architect (= 0.2.0)
76 | jekyll-theme-cayman (= 0.2.0)
77 | jekyll-theme-dinky (= 0.2.0)
78 | jekyll-theme-hacker (= 0.2.0)
79 | jekyll-theme-leap-day (= 0.2.0)
80 | jekyll-theme-merlot (= 0.2.0)
81 | jekyll-theme-midnight (= 0.2.0)
82 | jekyll-theme-minimal (= 0.2.0)
83 | jekyll-theme-modernist (= 0.2.0)
84 | jekyll-theme-primer (= 0.6.0)
85 | jekyll-theme-slate (= 0.2.0)
86 | jekyll-theme-tactile (= 0.2.0)
87 | jekyll-theme-time-machine (= 0.2.0)
88 | jekyll-titles-from-headings (= 0.5.3)
89 | jemoji (= 0.13.0)
90 | kramdown (= 2.4.0)
91 | kramdown-parser-gfm (= 1.1.0)
92 | liquid (= 4.0.4)
93 | mercenary (~> 0.3)
94 | minima (= 2.5.1)
95 | nokogiri (>= 1.16.2, < 2.0)
96 | rouge (= 3.30.0)
97 | terminal-table (~> 1.4)
98 | webrick (~> 1.8)
99 | github-pages-health-check (1.18.2)
100 | addressable (~> 2.3)
101 | dnsruby (~> 1.60)
102 | octokit (>= 4, < 8)
103 | public_suffix (>= 3.0, < 6.0)
104 | typhoeus (~> 1.3)
105 | html-pipeline (2.14.3)
106 | activesupport (>= 2)
107 | nokogiri (>= 1.4)
108 | http_parser.rb (0.8.0)
109 | i18n (1.14.7)
110 | concurrent-ruby (~> 1.0)
111 | jekyll (3.10.0)
112 | addressable (~> 2.4)
113 | colorator (~> 1.0)
114 | csv (~> 3.0)
115 | em-websocket (~> 0.5)
116 | i18n (>= 0.7, < 2)
117 | jekyll-sass-converter (~> 1.0)
118 | jekyll-watch (~> 2.0)
119 | kramdown (>= 1.17, < 3)
120 | liquid (~> 4.0)
121 | mercenary (~> 0.3.3)
122 | pathutil (~> 0.9)
123 | rouge (>= 1.7, < 4)
124 | safe_yaml (~> 1.0)
125 | webrick (>= 1.0)
126 | jekyll-avatar (0.8.0)
127 | jekyll (>= 3.0, < 5.0)
128 | jekyll-coffeescript (1.2.2)
129 | coffee-script (~> 2.2)
130 | coffee-script-source (~> 1.12)
131 | jekyll-commonmark (1.4.0)
132 | commonmarker (~> 0.22)
133 | jekyll-commonmark-ghpages (0.5.1)
134 | commonmarker (>= 0.23.7, < 1.1.0)
135 | jekyll (>= 3.9, < 4.0)
136 | jekyll-commonmark (~> 1.4.0)
137 | rouge (>= 2.0, < 5.0)
138 | jekyll-default-layout (0.1.5)
139 | jekyll (>= 3.0, < 5.0)
140 | jekyll-feed (0.17.0)
141 | jekyll (>= 3.7, < 5.0)
142 | jekyll-gist (1.5.0)
143 | octokit (~> 4.2)
144 | jekyll-github-metadata (2.16.1)
145 | jekyll (>= 3.4, < 5.0)
146 | octokit (>= 4, < 7, != 4.4.0)
147 | jekyll-include-cache (0.2.1)
148 | jekyll (>= 3.7, < 5.0)
149 | jekyll-mentions (1.6.0)
150 | html-pipeline (~> 2.3)
151 | jekyll (>= 3.7, < 5.0)
152 | jekyll-optional-front-matter (0.3.2)
153 | jekyll (>= 3.0, < 5.0)
154 | jekyll-paginate (1.1.0)
155 | jekyll-readme-index (0.3.0)
156 | jekyll (>= 3.0, < 5.0)
157 | jekyll-redirect-from (0.16.0)
158 | jekyll (>= 3.3, < 5.0)
159 | jekyll-relative-links (0.6.1)
160 | jekyll (>= 3.3, < 5.0)
161 | jekyll-remote-theme (0.4.3)
162 | addressable (~> 2.0)
163 | jekyll (>= 3.5, < 5.0)
164 | jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
165 | rubyzip (>= 1.3.0, < 3.0)
166 | jekyll-sass-converter (1.5.2)
167 | sass (~> 3.4)
168 | jekyll-seo-tag (2.8.0)
169 | jekyll (>= 3.8, < 5.0)
170 | jekyll-sitemap (1.4.0)
171 | jekyll (>= 3.7, < 5.0)
172 | jekyll-swiss (1.0.0)
173 | jekyll-theme-architect (0.2.0)
174 | jekyll (> 3.5, < 5.0)
175 | jekyll-seo-tag (~> 2.0)
176 | jekyll-theme-cayman (0.2.0)
177 | jekyll (> 3.5, < 5.0)
178 | jekyll-seo-tag (~> 2.0)
179 | jekyll-theme-dinky (0.2.0)
180 | jekyll (> 3.5, < 5.0)
181 | jekyll-seo-tag (~> 2.0)
182 | jekyll-theme-hacker (0.2.0)
183 | jekyll (> 3.5, < 5.0)
184 | jekyll-seo-tag (~> 2.0)
185 | jekyll-theme-leap-day (0.2.0)
186 | jekyll (> 3.5, < 5.0)
187 | jekyll-seo-tag (~> 2.0)
188 | jekyll-theme-merlot (0.2.0)
189 | jekyll (> 3.5, < 5.0)
190 | jekyll-seo-tag (~> 2.0)
191 | jekyll-theme-midnight (0.2.0)
192 | jekyll (> 3.5, < 5.0)
193 | jekyll-seo-tag (~> 2.0)
194 | jekyll-theme-minimal (0.2.0)
195 | jekyll (> 3.5, < 5.0)
196 | jekyll-seo-tag (~> 2.0)
197 | jekyll-theme-modernist (0.2.0)
198 | jekyll (> 3.5, < 5.0)
199 | jekyll-seo-tag (~> 2.0)
200 | jekyll-theme-primer (0.6.0)
201 | jekyll (> 3.5, < 5.0)
202 | jekyll-github-metadata (~> 2.9)
203 | jekyll-seo-tag (~> 2.0)
204 | jekyll-theme-slate (0.2.0)
205 | jekyll (> 3.5, < 5.0)
206 | jekyll-seo-tag (~> 2.0)
207 | jekyll-theme-tactile (0.2.0)
208 | jekyll (> 3.5, < 5.0)
209 | jekyll-seo-tag (~> 2.0)
210 | jekyll-theme-time-machine (0.2.0)
211 | jekyll (> 3.5, < 5.0)
212 | jekyll-seo-tag (~> 2.0)
213 | jekyll-titles-from-headings (0.5.3)
214 | jekyll (>= 3.3, < 5.0)
215 | jekyll-watch (2.2.1)
216 | listen (~> 3.0)
217 | jemoji (0.13.0)
218 | gemoji (>= 3, < 5)
219 | html-pipeline (~> 2.2)
220 | jekyll (>= 3.0, < 5.0)
221 | json (2.10.2)
222 | kramdown (2.4.0)
223 | rexml
224 | kramdown-parser-gfm (1.1.0)
225 | kramdown (~> 2.0)
226 | liquid (4.0.4)
227 | listen (3.9.0)
228 | rb-fsevent (~> 0.10, >= 0.10.3)
229 | rb-inotify (~> 0.9, >= 0.9.10)
230 | logger (1.6.6)
231 | mercenary (0.3.6)
232 | mini_portile2 (2.8.8)
233 | minima (2.5.1)
234 | jekyll (>= 3.5, < 5.0)
235 | jekyll-feed (~> 0.9)
236 | jekyll-seo-tag (~> 2.1)
237 | minitest (5.25.5)
238 | net-http (0.6.0)
239 | uri
240 | nokogiri (1.18.7)
241 | mini_portile2 (~> 2.8.2)
242 | racc (~> 1.4)
243 | nokogiri (1.18.7-arm64-darwin)
244 | racc (~> 1.4)
245 | nokogiri (1.18.7-x86_64-darwin)
246 | racc (~> 1.4)
247 | octokit (4.25.1)
248 | faraday (>= 1, < 3)
249 | sawyer (~> 0.9)
250 | pathutil (0.16.2)
251 | forwardable-extended (~> 2.6)
252 | public_suffix (5.1.1)
253 | racc (1.8.1)
254 | rb-fsevent (0.11.2)
255 | rb-inotify (0.11.1)
256 | ffi (~> 1.0)
257 | rexml (3.4.1)
258 | rouge (3.30.0)
259 | rubyzip (2.4.1)
260 | safe_yaml (1.0.5)
261 | sass (3.7.4)
262 | sass-listen (~> 4.0.0)
263 | sass-listen (4.0.0)
264 | rb-fsevent (~> 0.9, >= 0.9.4)
265 | rb-inotify (~> 0.9, >= 0.9.7)
266 | sawyer (0.9.2)
267 | addressable (>= 2.3.5)
268 | faraday (>= 0.17.3, < 3)
269 | securerandom (0.4.1)
270 | simpleidn (0.2.3)
271 | terminal-table (1.8.0)
272 | unicode-display_width (~> 1.1, >= 1.1.1)
273 | typhoeus (1.4.1)
274 | ethon (>= 0.9.0)
275 | tzinfo (2.0.6)
276 | concurrent-ruby (~> 1.0)
277 | unicode-display_width (1.8.0)
278 | uri (1.0.3)
279 | webrick (1.9.1)
280 |
281 | PLATFORMS
282 | arm64-darwin
283 | ruby
284 | x86_64-darwin
285 | x86_64-linux
286 |
287 | DEPENDENCIES
288 | github-pages
289 |
290 | BUNDLED WITH
291 | 2.6.8
292 |
--------------------------------------------------------------------------------
/website/_site/assets/figs/alfworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/alfworld_all_games.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/all_framework_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/all_framework_scores.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/jericho_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/jericho_all_games.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/radar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/radar_chart.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/radar_chart_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/radar_chart_zoom.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/scienceworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/scienceworld_all_games.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/text-benchmark_bar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_bar_chart.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/text-benchmark_radar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_radar.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/text-benchmark_radar_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_radar_zoom.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/textworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/textworld_all_games.png
--------------------------------------------------------------------------------
/website/_site/assets/figs/textworld_express_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/textworld_express_all_games.png
--------------------------------------------------------------------------------
/website/_site/assets/js/tabs.js:
--------------------------------------------------------------------------------
/**
 * Show the top-level tab panel with id `tabName` and mark the clicked button
 * as active.
 *
 * @param {Event} evt - Event whose `currentTarget` is the clicked tab button.
 * @param {string} tabName - Id of the panel to display.
 */
function openTab(evt, tabName) {
    var i, tabcontent, tabbuttons, target;

    // Hide all tab content
    tabcontent = document.getElementsByClassName("tab-content");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }

    // Remove "active" class from all tab buttons. classList works wherever
    // "active" sits in the class attribute and leaves other class names alone.
    tabbuttons = document.getElementsByClassName("tab-button");
    for (i = 0; i < tabbuttons.length; i++) {
        tabbuttons[i].classList.remove("active");
    }

    // Show the current tab (if it exists) and add "active" class to the button
    target = document.getElementById(tabName);
    if (target) {
        target.style.display = "block";
    }
    evt.currentTarget.classList.add("active");
}
20 |
21 | // Nested tab functionality
/**
 * Show the nested tab panel with id `tabName` and mark the clicked nested
 * button as active. Only nested panels and buttons inside the enclosing
 * `.tab-content` element of the clicked button are reset; if the button has no
 * such ancestor, the whole document is used instead.
 *
 * @param {Event} evt - Event whose `currentTarget` is the clicked button.
 * @param {string} tabName - Id of the nested panel to display.
 */
function openNestedTab(evt, tabName) {
    var i, tabcontent, tabbuttons, target;

    // Hide all nested tab content within the parent tab
    var parentTab = evt.currentTarget.closest('.tab-content') || document;
    tabcontent = parentTab.getElementsByClassName("nested-tab-content");
    for (i = 0; i < tabcontent.length; i++) {
        tabcontent[i].style.display = "none";
    }

    // Remove "active" class from all nested tab buttons
    tabbuttons = parentTab.getElementsByClassName("nested-tab-button");
    for (i = 0; i < tabbuttons.length; i++) {
        tabbuttons[i].classList.remove("active");
    }

    // Show the current nested tab (if it exists) and add "active" class to the button
    target = document.getElementById(tabName);
    if (target) {
        target.style.display = "block";
    }
    evt.currentTarget.classList.add("active");
}
42 |
43 | // Initialize tabs
document.addEventListener('DOMContentLoaded', function() {
    // Activate the first tab by simulating a click on its button.
    // Pages without any tab button are left untouched.
    var firstTabButton = document.querySelector('.tab-button');
    if (firstTabButton) {
        firstTabButton.click();
    }
});
--------------------------------------------------------------------------------
/website/_site/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/favicon.ico
--------------------------------------------------------------------------------
/website/_site/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | T A L E S | the Text Adventure Learning Environment Suite
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
21 |
22 |
23 |
24 |
25 |
26 |
27 |
28 |
29 |
30 |
31 |
32 |
33 |
34 |
35 |
36 |
37 |
38 |
39 |
40 |
41 |
42 |
43 | Skip to the content.
44 |
45 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 |
61 |
62 |
63 |
64 |
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
Overview
73 | Insert overview description here.
74 |
75 |
76 |
77 |
Environments
78 |
79 |
80 |
81 |
82 |
83 |
84 |
85 |
86 |
87 |
88 |
89 |
90 |
91 |
92 |
93 |
Scores for all Textworld games for Top 9 models
94 |

95 |
96 |
97 |
98 |
Scores for all Textworld Express games for Top 9 models
99 |

100 |
101 |
102 |
103 |
Scores for all Alfworld games for Top 9 models
104 |

105 |
106 |
107 |
108 |
Scores for all Scienceworld games for Top 9 models
109 |

110 |
111 |
112 |
113 |
Scores for all Jericho games for Top 9 models
114 |

115 |
116 |
117 |
118 |
Scores for all Jericho games for Top 9 models
119 |

120 |
121 |
122 |
123 |
124 |
125 |
Breakdown of scores per framework
126 |

127 |
128 |
129 |
130 |
131 |
Tab 4 Content
132 |
This is where you'll put the content for Tab 4.
133 |
134 |
135 |
136 |
137 |
Tab 5 Content
138 |
This is where you'll put the content for Tab 5.
139 |
140 |
141 |
156 |
157 |
158 |
--------------------------------------------------------------------------------