├── .github
    └── workflows
    │   ├── codeql.yml
    │   ├── formatter.yml
    │   └── publish-website.yml
├── .gitignore
├── .pre-commit-config.yaml
├── .vscode
    └── launch.json
├── CODE_OF_CONDUCT.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── RESPONSIBLE_AI.md
├── SECURITY.md
├── SUPPORT.md
├── agents
    ├── human.py
    ├── llm.py
    ├── llm_walkthrough.py
    ├── random.py
    ├── react.py
    ├── reasoning.py
    └── walkthrough.py
├── benchmark.py
├── docs
    └── website
    │   ├── Gemfile
    │   ├── _config.yml
    │   ├── _includes
    │       ├── footer.html
    │       ├── head-custom.html
    │       ├── simple_table.md
    │       ├── table.md
    │       └── test.md
    │   ├── _layouts
    │       └── default.html
    │   ├── _site
    │       ├── assets
    │       │   └── css
    │       │   │   └── style.css
    │       └── index.html
    │   ├── assets
    │       ├── css
    │       │   ├── custom.css
    │       │   └── style.scss
    │       ├── figs
    │       │   ├── alfworld_all_games.png
    │       │   ├── alfworld_image.png
    │       │   ├── all_framework_scores.png
    │       │   ├── arxiv-logomark-small.svg
    │       │   ├── arxiv-logomark.svg
    │       │   ├── figure1_eric.png
    │       │   ├── github-mark.svg
    │       │   ├── jericho_all_games.png
    │       │   ├── jericho_image.png
    │       │   ├── pull_run_data.ipynb
    │       │   ├── radar_chart.png
    │       │   ├── radar_chart_zoom.png
    │       │   ├── scienceworld_all_games.png
    │       │   ├── scienceworld_image.png
    │       │   ├── simon_says_chatgpt.png
    │       │   ├── static_banner.png
    │       │   ├── text-benchmark_bar_chart.png
    │       │   ├── text-benchmark_radar.png
    │       │   ├── text-benchmark_radar_zoom.png
    │       │   ├── textworld_all_games.png
    │       │   ├── textworld_express_all_games.png
    │       │   ├── textworld_image.png
    │       │   └── zork1.png
    │       ├── js
    │       │   └── tabs.js
    │       └── videos
    │       │   └── figure1v4.mp4
    │   ├── favicon.ico
    │   └── index.md
├── print_results.py
├── pyproject.toml
├── requirements.txt
├── scripts
    └── example_script.sh
├── tales
    ├── __init__.py
    ├── agent.py
    ├── alfworld
    │   ├── __init__.py
    │   ├── alfworld_data.py
    │   └── alfworld_env.py
    ├── config.py
    ├── download.py
    ├── jericho
    │   ├── __init__.py
    │   ├── games.json
    │   ├── jericho_data.py
    │   └── jericho_env.py
    ├── logger.py
    ├── scienceworld
    │   ├── __init__.py
    │   ├── scienceworld_data.py
    │   └── scienceworld_env.py
    ├── textworld
    │   ├── __init__.py
    │   ├── textworld_data.py
    │   └── textworld_env.py
    ├── textworld_express
    │   ├── __init__.py
    │   ├── twx_data.py
    │   └── twx_env.py
    ├── token.py
    ├── utils.py
    └── version.py
└── website
    ├── Gemfile.lock
    └── _site
        ├── assets
            ├── css
            │   └── style.css
            ├── figs
            │   ├── alfworld_all_games.png
            │   ├── all_framework_scores.png
            │   ├── jericho_all_games.png
            │   ├── pull_run_data.ipynb
            │   ├── radar_chart.png
            │   ├── radar_chart_zoom.png
            │   ├── scienceworld_all_games.png
            │   ├── text-benchmark_bar_chart.png
            │   ├── text-benchmark_radar.png
            │   ├── text-benchmark_radar_zoom.png
            │   ├── textworld_all_games.png
            │   └── textworld_express_all_games.png
            └── js
            │   └── tabs.js
        ├── favicon.ico
        └── index.html


/.github/workflows/codeql.yml:
--------------------------------------------------------------------------------
 1 | # For most projects, this workflow file will not need changing; you simply need
 2 | # to commit it to your repository.
 3 | #
 4 | # You may wish to alter this file to override the set of languages analyzed,
 5 | # or to provide custom queries or build logic.
 6 | #
 7 | # ******** NOTE ********
 8 | # We have attempted to detect the languages in your repository. Please check
 9 | # the `language` matrix defined below to confirm you have the correct set of
10 | # supported CodeQL languages.
11 | #
12 | name: "CodeQL"
13 | 
14 | on:
15 |   push:
16 |     branches: [ "main" ]
17 |   pull_request:
18 |     # The branches below must be a subset of the branches above
19 |     branches: [ "main" ]
20 |   schedule:
21 |     - cron: '37 20 * * 3'
22 | 
23 | jobs:
24 |   analyze:
25 |     name: Analyze
26 |     runs-on: ubuntu-latest
27 |     permissions:
28 |       actions: read
29 |       contents: read
30 |       security-events: write
31 | 
32 |     strategy:
33 |       fail-fast: false
34 |       matrix:
35 |         language: [ "python" ]
36 |         # CodeQL supports [ 'cpp', 'csharp', 'go', 'java', 'javascript', 'python', 'ruby' ]
37 |         # Learn more about CodeQL language support at https://aka.ms/codeql-docs/language-support
38 | 
39 |     steps:
40 |     - name: Checkout repository
41 |       uses: actions/checkout@v4
42 |     # Install Python dependencies manually
43 |     - name: Set up python
44 |       uses: actions/setup-python@v5
45 |       with:
46 |         python-version: '3.12'
47 |         cache: 'pip'
48 |     # Install the package in editable mode together with its `dev` extras
49 |     - name: Install dependencies
50 |       run: |
51 |         pip install --upgrade pip
52 |         pip install -e ".[dev]"
53 |     # Initializes the CodeQL tools for scanning.
54 |     - name: Initialize CodeQL
55 |       uses: github/codeql-action/init@v3
56 |       with:
57 |         languages: python
58 |         # languages: ${{ matrix.language }}
59 |         # If you wish to specify custom queries, you can do so here or in a config file.
60 |         # By default, queries listed here will override any specified in a config file.
61 |         # Prefix the list here with "+" to use these queries and those in the config file.
62 | 
63 |         # Details on CodeQL's query packs refer to : https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
64 |         # queries: security-extended,security-and-quality
65 | 
66 | 
67 |     # Autobuild attempts to build any compiled languages  (C/C++, C#, Go, or Java).
68 |     # If this step fails, then you should remove it and run the build manually (see below)
69 |     - name: Autobuild
70 |       uses: github/codeql-action/autobuild@v3
71 | 
72 |     # ℹ️ Command-line programs to run using the OS shell.
73 |     # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
74 | 
75 |     #   If the Autobuild fails above, remove it and uncomment the following three lines.
76 |     #   modify them (or add more) to build your code if your project, please refer to the EXAMPLE below for guidance.
77 | 
78 |     # - run: |
79 |     #   echo "Run, Build Application using script"
80 |     #   ./location_of_script_within_repo/buildscript.sh
81 | 
82 |     - name: Perform CodeQL Analysis
83 |       uses: github/codeql-action/analyze@v3
84 |       with:
85 |         category: "/language:${{matrix.language}}"
86 | 


--------------------------------------------------------------------------------
/.github/workflows/formatter.yml:
--------------------------------------------------------------------------------
 1 | name: "Formatter"
 2 | 
 3 | on:
 4 |   push:
 5 |     branches: [ "main" ]
 6 |   pull_request:
 7 |     branches: [ "main" ]
 8 |   schedule:
 9 |     - cron: '37 20 * * 3'
10 | 
11 | jobs:
12 |   black:
13 |     runs-on: ubuntu-latest
14 |     steps:
15 |       - uses: actions/checkout@v4
16 |       - uses: psf/black@stable
17 |         with:
18 |             options: "--check --verbose --line-length 88"
19 | 
20 |   isort:
21 |     runs-on: ubuntu-latest
22 |     steps:
23 |       - uses: actions/checkout@v4
24 |       - uses: isort/isort-action@v1
25 |         with:
26 |             requirements-files: "requirements.txt"
27 |             configuration: "--check-only --diff --profile black --filter-files --verbose"
28 | 


--------------------------------------------------------------------------------
/.github/workflows/publish-website.yml:
--------------------------------------------------------------------------------
 1 | # Sample workflow for building and deploying a Jekyll site to GitHub Pages
 2 | name: Deploy Website
 3 | 
 4 | on:
 5 |   # Runs on pushes targeting the default branch
 6 |   push:
 7 |     branches: ["main"]
 8 | 
 9 |   # Allows you to run this workflow manually from the Actions tab
10 |   workflow_dispatch:
11 | 
12 | # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
13 | permissions:
14 |   contents: read
15 |   pages: write
16 |   id-token: write
17 | 
18 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued.
19 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete.
20 | concurrency:
21 |   group: "pages"
22 |   cancel-in-progress: false
23 | 
24 | jobs:
25 |   # Build job
26 |   build:
27 |     runs-on: ubuntu-latest
28 |     steps:
29 |       - name: Checkout
30 |         uses: actions/checkout@v4
31 |       - name: Setup Pages
32 |         uses: actions/configure-pages@v5
33 |       - name: Build with Jekyll
34 |         uses: actions/jekyll-build-pages@v1
35 |         with:
36 |           source: ./docs/website
37 |           destination: ./_site
38 |       - name: Upload artifact
39 |         uses: actions/upload-pages-artifact@v3
40 | 
41 |   # Deployment job
42 |   deploy:
43 |     environment:
44 |       name: github-pages
45 |       url: ${{ steps.deployment.outputs.page_url }}
46 |     runs-on: ubuntu-latest
47 |     needs: build
48 |     steps:
49 |       - name: Deploy to GitHub Pages
50 |         id: deployment
51 |         uses: actions/deploy-pages@v4
52 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | docs/website/_site/media
 55 | 
 56 | # Translations
 57 | *.mo
 58 | *.pot
 59 | 
 60 | # Django stuff:
 61 | *.log
 62 | local_settings.py
 63 | db.sqlite3
 64 | db.sqlite3-journal
 65 | 
 66 | # Flask stuff:
 67 | instance/
 68 | .webassets-cache
 69 | 
 70 | # Scrapy stuff:
 71 | .scrapy
 72 | 
 73 | # Sphinx documentation
 74 | docs/_build/
 75 | 
 76 | # PyBuilder
 77 | .pybuilder/
 78 | target/
 79 | 
 80 | # Jupyter Notebook
 81 | .ipynb_checkpoints
 82 | 
 83 | # IPython
 84 | profile_default/
 85 | ipython_config.py
 86 | 
 87 | # pyenv
 88 | #   For a library or package, you might want to ignore these files since the code is
 89 | #   intended to run in multiple environments; otherwise, check them in:
 90 | # .python-version
 91 | 
 92 | # pipenv
 93 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 94 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 95 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 96 | #   install all needed dependencies.
 97 | #Pipfile.lock
 98 | 
 99 | # poetry
100 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
102 | #   commonly ignored for libraries.
103 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 | 
106 | # pdm
107 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | #   in version control.
111 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 | 
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 | 
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 | 
123 | # SageMath parsed files
124 | *.sage.py
125 | 
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 | 
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 | 
139 | # Rope project settings
140 | .ropeproject
141 | 
142 | # mkdocs documentation
143 | /site
144 | 
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 | 
150 | # Pyre type checker
151 | .pyre/
152 | 
153 | # pytype static type analyzer
154 | .pytype/
155 | 
156 | # Cython debug symbols
157 | cython_debug/
158 | 
159 | # PyCharm
160 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
163 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 | 
166 | # Logging
167 | wandb/
168 | logs/
169 | 
170 | # Compute
171 | .amltconfig
172 | .amltignore
173 | amlt/
174 | 
175 | # Website
176 | docs/website/_site
177 | docs/website/Gemfile.lock
178 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | - repo: https://github.com/pycqa/isort
 3 |   rev: 6.0.0
 4 |   hooks:
 5 |   - id: isort
 6 |     args: ["--profile", "black", "--filter-files"]
 7 | 
 8 | - repo: https://github.com/psf/black
 9 |   rev: 24.4.2
10 |   hooks:
11 |   - id: black
12 |     args: ["--line-length", "88"]


--------------------------------------------------------------------------------
/.vscode/launch.json:
--------------------------------------------------------------------------------
 1 | {
 2 |     // Use IntelliSense to learn about possible attributes.
 3 |     // Hover to view descriptions of existing attributes.
 4 |     // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
 5 |     "version": "0.2.0",
 6 |     "configurations": [
 7 |         {
 8 |             "name": "Python Debugger: Current File",
 9 |             "type": "debugpy",
10 |             "request": "launch",
11 |             "program": "benchmark.py",
12 |             "console": "integratedTerminal",
13 |             "args": ["--games", "games/detective.z5", "games/advent.z5", "--agent", "agent_llm.py:LLMAgent", "--llm", "azure_openai", "--enable_wandb", "-vv", "--conversation", "--context", "100", "--admissible_commands"]
14 |         }
15 |     ]
16 | }


--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
 1 | # Microsoft Open Source Code of Conduct
 2 | 
 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
 4 | 
 5 | Resources:
 6 | 
 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/)
 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/)
 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns
10 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 |     MIT License
 2 | 
 3 |     Copyright (c) Microsoft Corporation.
 4 | 
 5 |     Permission is hereby granted, free of charge, to any person obtaining a copy
 6 |     of this software and associated documentation files (the "Software"), to deal
 7 |     in the Software without restriction, including without limitation the rights
 8 |     to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 |     copies of the Software, and to permit persons to whom the Software is
10 |     furnished to do so, subject to the following conditions:
11 | 
12 |     The above copyright notice and this permission notice shall be included in all
13 |     copies or substantial portions of the Software.
14 | 
15 |     THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 |     IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 |     FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 |     AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 |     LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 |     OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 |     SOFTWARE
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
 1 | include requirements.txt
 2 | include README.md
 3 | include LICENSE
 4 | include pyproject.toml
 5 | 
 6 | global-exclude */__pycache__/*
 7 | 
 8 | prune wandb
 9 | prune logs
10 | prune website
11 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # TALES: Text-Adventure Learning Environment Suite
  2 | This repository contains the files needed to benchmark language agents on a curated list of text-based games from the following frameworks: [Jericho](https://github.com/microsoft/jericho), [TextWorld](https://github.com/microsoft/textworld), [TextWorld-Express](https://github.com/cognitiveailab/TextWorldExpress), [ScienceWorld](https://github.com/allenai/ScienceWorld), and [ALFWorld](https://github.com/alfworld/alfworld).
  3 | 
  4 | [[Technical Report](https://arxiv.org/abs/2504.14128)] [[Project Page](https://t.co/rFPMRoqO9y)]
  5 | 
  6 | ## 1. Installation
  7 | 
  8 | It is recommended to create and activate a conda or virtual environment. `tales` requires `Python>=3.12`:
  9 | 
 10 |     conda create -n tales python=3.12
 11 |     conda activate tales
 12 | 
 13 | Then, install `tales` directly from PyPI:
 14 | 
 15 |     pip install tale-suite
 16 | 
 17 | > [!WARNING]
 18 | > The name of the Python package on PyPI is `tale-suite` and not `tales`.
 19 | 
 20 | Alternatively, clone the repository and install locally:
 21 | 
 22 |     git clone https://github.com/microsoft/tale-suite
 23 |     cd tale-suite
 24 |     pip install -e .
 25 | 
 26 | > [!WARNING]
 27 | > You will need Java 1.8+ installed to run the environments TextWorld-Express and ScienceWorld.
 28 | >
 29 | >     sudo apt update && sudo apt install openjdk-8-jre-headless -y
 30 | 
 31 | Alternatively, if the above isn't working:
 32 | 
 33 | >      sudo apt-get update && sudo apt-get install default-jre default-jdk
 34 | 
 35 | ### Using Docker
 36 | We provide a pre-built docker image at
 37 | 
 38 |     docker pull czcui/twb:prebuilt
 39 | 
 40 | [Please see the following docs page for more details on how to set up a local vllm for use with the text world benchmark.](https://docs.google.com/document/d/1Q5FtcNpYDpMLbyraJ1dSKxJLwOgLvWCECiPsnDkEq2Y/edit?usp=sharing)
 41 | 
 42 | An example script can be found in the scripts folder.
 43 | 
 44 | ## 2. Getting Started
 45 | 
 46 | 1.	Run benchmark evaluation on all the games for the specified random agent:
 47 | 
 48 |     ```python
 49 |     python benchmark.py --agent agents/random.py random
 50 | 
 51 | 2.	Run benchmark evaluation on a subset of the games:
 52 | 
 53 |     ```python
 54 |     python benchmark.py --agent agents/random.py random --env textworld
 55 | 
 56 | 3.	Run benchmark evaluation on specific games:
 57 | 
 58 |     ```python
 59 |     python benchmark.py --agent agents/random.py random --envs JerichoEnvZork1 JerichoEnvDetective
 60 | 
 61 | 4.	Run benchmark evaluation using the HumanAgent:
 62 | 
 63 |     ```python
 64 |     python benchmark.py --agent agents/human.py human --envs TWCookingLevel1
 65 | 
 66 | 5.	Run benchmark evaluation where the ground-truth walkthrough is being followed:
 67 | 
 68 |     ```python
 69 |     python benchmark.py --agent agents/walkthrough.py walkthrough --envs JerichoEnvZork1
 70 | 
 71 | 
 72 | ## 3. Benchmarking LLMs
 73 | 
 74 | In order to benchmark a given LLM acting as a language agent playing text-based games, you will first need to configure it. `tales` leverages the [`llm`](https://llm.datasette.io/en/stable/) library to handle communication with different LLMs.
 75 | 
 76 |     python benchmark.py --agent agents/llm.py zero-shot --envs TWCookingLevel1
 77 | 
 78 | ### API-based LLMs
 79 | 
 80 | `llm` natively supports OpenAI models and self-hosted models that offer an OpenAI-compatible API (e.g. like vLLM does - more on this below).
 81 | 
 82 | ### Adding support to other LLMs
 83 | 
 84 | `llm` offers different plugins to include other LLMs. E.g.
 85 | 
 86 |     llm install llm-anthropic
 87 | 
 88 | See the `llm` plugins [page](https://llm.datasette.io/en/stable/plugins/directory.html) for more information.
 89 | 
 90 | ### Deploying a model locally using vLLM
 91 | 
 92 | To serve a custom Hugging Face model with vLLM, one can use the vLLM Docker image like this:
 93 | 
 94 |     docker run --runtime nvidia --gpus all --restart unless-stopped --name vllm-Llama-3.1-8B-Instruct --env "HUGGING_FACE_HUB_TOKEN=${HUGGING_FACE_HUB_TOKEN}" -v ~/.cache/huggingface:/root/.cache/huggingface -p 8000:8000 --ipc=host vllm/vllm-openai:latest --model meta-llama/Llama-3.1-8B-Instruct --tensor-parallel-size 4 --host 0.0.0.0
 95 | 
 96 | Then, add the following entrypoint in `~/.config/io.datasette.llm/extra-openai-models.yaml`
 97 | 
 98 | ```
 99 | - model_id: meta-llama/Llama-3.1-8B-Instruct
100 |   model_name: meta-llama/Llama-3.1-8B-Instruct
101 |   api_base: "http://0.0.0.0:8000/v1"
102 | ```
103 | 
104 | You can check that everything is working properly with this simple command:
105 | 
106 |     llm -m meta-llama/Llama-3.1-8B-Instruct "Hi. What's your name?"
107 | 
108 | ## 4. Building Custom Agents
109 | 
110 | To build a custom agent, create a new file (e.g., `custom.py`) in the agents folder, subclass `tales.Agent`, provide an argument-parser builder, and register the agent:
111 | 
112 | ```python
113 | from typing import Dict, Any
114 | import tales
115 | 
116 | class CustomAgent(tales.Agent):
117 | 
118 |     def act(self, obs: str, reward: float, done: bool, infos: Dict[str, Any]) -> str:
119 |         # ...
120 |         return "help"
121 | 
122 | 
123 | def build_argparser(parser=None):
124 |     return parser or argparse.ArgumentParser()
125 | 
126 | 
127 | register(
128 |     name="my-agent",
129 |     desc=(
130 |         "This is a custom agent that always output 'help' as a text action."
131 |     ),
132 |     klass=CustomAgent,
133 |     add_arguments=build_argparser,
134 | )
135 | ```
136 | 
137 | You can then use this agent by specifying the path to the file and the registered agent name in the `--agent` argument.
138 | 
139 |         python benchmark.py --agent agents/custom.py my-agent
140 | 
141 | > [!NOTE]
142 | > See the [agents folder](https://github.com/microsoft/tale-suite/tree/main/agents) for more concrete examples.
143 | 
144 | ## Citation
145 | ```
146 | @article{cui2025tales,
147 |   title={TALES: Text-Adventure Learning Environment Suite},
148 |   author={Christopher Cui and Xingdi Yuan and Ziang Xiao and Prithviraj Ammanabrolu and Marc-Alexandre C\^ot\'e},
149 |   journal={arXiv preprint arXiv:2504.14128},
150 |   year={2025},
151 |   url={https://arxiv.org/abs/2504.14128}
152 | }
153 | ```
154 | 
155 | If you use this benchmark, please consider citing the original frameworks as well.
156 | ```
157 | @article{cote18textworld,
158 |   author = {Marc-Alexandre C\^ot\'e and \'Akos K\'ad\'ar and Xingdi Yuan and Ben Kybartas and Tavian Barnes and Emery Fine and James Moore and Ruo Yu Tao and Matthew Hausknecht and Layla El Asri and Mahmoud Adada and Wendy Tay and Adam Trischler},
159 |   title = {TextWorld: A Learning Environment for Text-based Games},
160 |   journal = {CoRR},
161 |   volume = {abs/1806.11532},
162 |   year = {2018}
163 | }
164 | @article{jansen2022textworldexpress,
165 |   url = {https://arxiv.org/abs/2208.01174},
166 |   author = {Jansen, Peter A. and Côté, Marc-Alexandre},
167 |   title = {TextWorldExpress: Simulating Text Games at One Million Steps Per Second},
168 |   journal = {arXiv},
169 |   year = {2022},
170 | }
171 | @inproceedings{hausknecht2020interactive,
172 |   title={Interactive fiction games: A colossal adventure},
173 |   author={Hausknecht, Matthew and Ammanabrolu, Prithviraj and C{\^o}t{\'e}, Marc-Alexandre and Yuan, Xingdi},
174 |   booktitle={Proceedings of the AAAI Conference on Artificial Intelligence},
175 |   volume={34},
176 |   number={05},
177 |   year={2020}
178 | }
179 | @inproceedings{ALFWorld20,
180 |                title ={{ALFWorld: Aligning Text and Embodied Environments for Interactive Learning}},
181 |                author={Mohit Shridhar and Xingdi Yuan and Marc-Alexandre C\^ot\'e and Yonatan Bisk and Adam Trischler and Matthew Hausknecht},
182 |                booktitle = {Proceedings of the International
183 |                Conference on Learning Representations (ICLR)},
184 |                year = {2021},
185 |                url = {https://arxiv.org/abs/2010.03768}}
186 | @misc{scienceworld2022,
187 |     title={ScienceWorld: Is your Agent Smarter than a 5th Grader?},
188 |     author={Ruoyao Wang and Peter Jansen and Marc-Alexandre C{\^o}t{\'e} and Prithviraj Ammanabrolu},
189 |     year={2022},
190 |     eprint={2203.07540},
191 |     archivePrefix={arXiv},
192 |     primaryClass={cs.CL},
193 |     url={https://arxiv.org/abs/2203.07540}
194 | }
195 | ```
196 | 
197 | ## Contributing
198 | 
199 | This project welcomes contributions and suggestions.  Most contributions require you to agree to a
200 | Contributor License Agreement (CLA) declaring that you have the right to, and actually do, grant us
201 | the rights to use your contribution. For details, visit https://cla.opensource.microsoft.com.
202 | 
203 | When you submit a pull request, a CLA bot will automatically determine whether you need to provide
204 | a CLA and decorate the PR appropriately (e.g., status check, comment). Simply follow the instructions
205 | provided by the bot. You will only need to do this once across all repos using our CLA.
206 | 
207 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/).
208 | For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or
209 | contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.
210 | 
211 | ## Trademarks
212 | 
213 | This project may contain trademarks or logos for projects, products, or services. Authorized use of Microsoft
214 | trademarks or logos is subject to and must follow
215 | [Microsoft's Trademark & Brand Guidelines](https://www.microsoft.com/en-us/legal/intellectualproperty/trademarks/usage/general).
216 | Use of Microsoft trademarks or logos in modified versions of this project must not cause confusion or imply Microsoft sponsorship.
217 | Any use of third-party trademarks or logos are subject to those third-party's policies.
218 | 
219 | ## Privacy
220 | This framework does not collect users' personal data. For more information about Microsoft's privacy policies, please see the [Microsoft Privacy Statement](https://www.microsoft.com/en-ca/privacy/privacystatement).
221 | 
222 | ## Responsible AI
223 | Please see our [Responsible AI Statement](https://github.com/microsoft/tale-suite/blob/main/RESPONSIBLE_AI.md).


--------------------------------------------------------------------------------
/RESPONSIBLE_AI.md:
--------------------------------------------------------------------------------
 1 | # TALES - Text Adventure Learning Environment Suite
 2 | 
 3 | TALES is a benchmark, which consists of a diverse collection of synthetic and human-written text-adventure games designed to evaluate reasoning capabilities of Large Language Model (LLM)-based agents.
 4 | 
 5 | ### WHAT CAN TALES DO
 6 | 
 7 | TALES was developed to evaluate LLM-based agents’ capabilities to solve text-adventure games. Text-adventure games are goal-oriented environments where an agent is required to interact with a game engine in a multi-step setting to understand the goal, explore the game world, find clues, and plan its way towards solving the game. We curated the set of games in TALES to cover a diverse spectrum of reasoning skills an LLM-based agent may need in solving real-world tasks, such as inductive reasoning, deductive reasoning, spatial reasoning, and grounded reasoning. We believe that, while being much more cost-efficient than realistic tasks, testing LLM-based agents’ performance on TALES can provide useful insights in evaluating the agents from different aspects, including LLM backbones, agent architecture design, and prompt engineering. These insights can further guide practitioners in developing their agents in use cases beyond text-adventure games.
 8 | 
 9 | A detailed discussion of TALES, including how it was developed and tested, can be found in our paper at: https://arxiv.org/abs/2504.14128
10 | 
11 | 
12 | ### INTENDED USES
13 | 
14 | TALES is best suited for evaluating AI agents’ capability of solving text-adventure games.
15 | 
16 | TALES is being shared with the research community to facilitate reproduction of our results and foster further research in this area.
17 | 
18 | TALES is intended to be used by domain experts who are independently capable of evaluating the quality of outputs before acting on them.
19 | 
20 | ### OUT-OF-SCOPE USES
21 | 
22 | TALES is designed exclusively for evaluation; it is not well suited for training AI agents.
23 | 
24 | We develop TALES for research purposes only; the benchmark does not cover all necessary criteria for real-world decision making. We do not recommend using TALES in any way to make real-world decisions.
25 | 
26 | ### LIMITATIONS 
27 | 
28 | TALES was developed for research and experimental purposes. The games in the benchmark are exclusively selected to test LLM-based agents’ inductive reasoning, deductive reasoning, spatial reasoning, and grounded reasoning capabilities. We acknowledge that in real-world scenarios, decision making process may require additional context, more complex reasoning, as well as the combination of multiple reasoning types. We do not claim that our research findings can be directly transferred into real-world decision making. Further testing and validation are needed before considering its application in commercial or real-world scenarios.
29 | 
30 | 
31 | TALES was designed and tested using the English language. Performance in other languages may vary and should be assessed by someone who is both an expert in the expected outputs and a native speaker of that language. 
32 | 
33 | Outputs generated by AI may include factual errors, fabrication, or speculation. Users are responsible for assessing the accuracy of generated content. All decisions leveraging outputs of the system should be made with human oversight and not be based solely on system outputs.
34 | 
35 | ### BEST PRACTICES 
36 | 
37 | We strongly encourage users to use LLMs/MLLMs that support robust Responsible AI mitigations, such as Azure OpenAI (AOAI) services. Such services continually update their safety and RAI mitigations with the latest industry standards for responsible use. For more on AOAI’s best practices when employing foundation models for scripts and applications, see:
38 | 
39 | [Blog post on responsible AI features in AOAI that were presented at Ignite 2023](https://techcommunity.microsoft.com/t5/ai-azure-ai-services-blog/announcing-new-ai-safety-amp-responsible-ai-features-in-azure/ba-p/3983686)
40 | 
41 | [Overview of Responsible AI practices for Azure OpenAI models](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/overview)
42 | 
43 | [Azure OpenAI Transparency Note](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/transparency-note)
44 | 
45 | [OpenAI’s Usage policies](https://openai.com/policies/usage-policies)
46 | 
47 | [Azure OpenAI’s Code of Conduct](https://learn.microsoft.com/en-us/legal/cognitive-services/openai/code-of-conduct)
48 | 
49 | TALES contains a set of text adventure games specifically curated to fulfill our research on LLM-based agents’ capability of performing certain types of reasoning. We refer practitioners to our paper https://arxiv.org/abs/2504.14128 for detailed guidance on how to properly use this benchmark and how to correctly interpret an LLM-based agent’s results on this benchmark. Additionally, we recommend that practitioners use TALES in concert with other benchmarks to understand LLM-based agents’ performance and capabilities from multiple perspectives and thus gain a less biased view.
50 | 
51 | ### LICENSE
52 | 
53 | TALES is released under the MIT license; please see the [license file](https://github.com/microsoft/tale-suite/blob/main/LICENSE).
54 | 
55 | ### CONTACT
56 | 
57 | We welcome feedback and collaboration from our audience. If you have suggestions, questions, or observe unexpected/offensive behavior in our technology, please contact us via [GitHub issues](https://github.com/microsoft/tale-suite/issues) or at textworld@microsoft.com.
58 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | <!-- BEGIN MICROSOFT SECURITY.MD V0.0.9 BLOCK -->
 2 | 
 3 | ## Security
 4 | 
 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet) and [Xamarin](https://github.com/xamarin).
 6 | 
 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://aka.ms/security.md/definition), please report it to us as described below.
 8 | 
 9 | ## Reporting Security Issues
10 | 
11 | **Please do not report security vulnerabilities through public GitHub issues.**
12 | 
13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://aka.ms/security.md/msrc/create-report).
14 | 
15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com).  If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://aka.ms/security.md/msrc/pgp).
16 | 
17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 
18 | 
19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue:
20 | 
21 |   * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.)
22 |   * Full paths of source file(s) related to the manifestation of the issue
23 |   * The location of the affected source code (tag/branch/commit or direct URL)
24 |   * Any special configuration required to reproduce the issue
25 |   * Step-by-step instructions to reproduce the issue
26 |   * Proof-of-concept or exploit code (if possible)
27 |   * Impact of the issue, including how an attacker might exploit the issue
28 | 
29 | This information will help us triage your report more quickly.
30 | 
31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. Please visit our [Microsoft Bug Bounty Program](https://aka.ms/security.md/msrc/bounty) page for more details about our active programs.
32 | 
33 | ## Preferred Languages
34 | 
35 | We prefer all communications to be in English.
36 | 
37 | ## Policy
38 | 
39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://aka.ms/security.md/cvd).
40 | 
41 | <!-- END MICROSOFT SECURITY.MD BLOCK -->
42 | 


--------------------------------------------------------------------------------
/SUPPORT.md:
--------------------------------------------------------------------------------
 1 | # Support
 2 | 
 3 | ## How to file issues and get help
 4 | 
 5 | This project uses GitHub Issues to track bugs and feature requests. Please search the existing
 6 | issues before filing new issues to avoid duplicates.  For new issues, file your bug or
 7 | feature request as a new Issue.
 8 | 
 9 | For help and questions about using this project, please email textworld@microsoft.com.
10 | 
11 | ## Microsoft Support Policy
12 | 
13 | Support for this project is limited to the resources listed above.
14 | 


--------------------------------------------------------------------------------
/agents/human.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import sys
  3 | 
  4 | import tales
  5 | from tales.agent import register
  6 | from tales.token import get_token_counter
  7 | from tales.utils import format_messages_to_markdown, merge_messages
  8 | 
# Whether the interactive prompt (history + autocompletion) can be used.
# Stays False unless prompt_toolkit imports successfully and stdout is a TTY.
prompt_toolkit_available = False
try:
    # For command line history and autocompletion.
    from prompt_toolkit import prompt
    from prompt_toolkit.completion import WordCompleter
    from prompt_toolkit.history import InMemoryHistory

    # Only enable the rich prompt when stdout is attached to a terminal.
    prompt_toolkit_available = sys.stdout.isatty()
except ImportError:
    # prompt_toolkit is optional; HumanAgent.act falls back to input().
    pass
 19 | 
 20 | 
 21 | class HumanAgent(tales.Agent):
 22 | 
 23 |     def __init__(self, *args, **kwargs):
 24 |         self.token_counter = get_token_counter()
 25 |         self.history = []
 26 | 
 27 |         self._history = None
 28 |         if prompt_toolkit_available:
 29 |             self._history = InMemoryHistory()
 30 | 
 31 |     @property
 32 |     def uid(self):
 33 |         return f"HumanAgent"
 34 | 
 35 |     @property
 36 |     def params(self):
 37 |         return {
 38 |             "agent_type": "human",
 39 |         }
 40 | 
 41 |     def act(self, obs, reward, done, infos):
 42 |         available_commands = infos.get("admissible_commands", [])
 43 |         if prompt_toolkit_available:
 44 |             actions_completer = WordCompleter(
 45 |                 available_commands, ignore_case=True, sentence=True
 46 |             )
 47 |             response = prompt(
 48 |                 "\n> ",
 49 |                 completer=actions_completer,
 50 |                 history=self._history,
 51 |                 enable_history_search=True,
 52 |             )
 53 |         else:
 54 |             if available_commands:
 55 |                 print("Available actions: {}\n".format(available_commands))
 56 | 
 57 |             response = input("\n> ")
 58 | 
 59 |         messages = self.build_messages(f"{obs}\n> ")
 60 |         # response = self._llm_call_from_messages(
 61 |         #     messages,
 62 |         #     temperature=self.act_temp,
 63 |         #     max_tokens=100,  # Text actions are short phrases.
 64 |         #     seed=self.seed,
 65 |         #     stream=False,
 66 |         # )
 67 | 
 68 |         action = response.strip()
 69 |         self.history.append((f"{obs}\n> ", f"{action}\n"))
 70 | 
 71 |         # Compute usage statistics
 72 |         stats = {
 73 |             "prompt": format_messages_to_markdown(messages),
 74 |             "response": response,
 75 |             "nb_tokens": self.token_counter(messages=messages, text=response),
 76 |         }
 77 | 
 78 |         return action, stats
 79 | 
 80 |     def build_messages(self, observation):
 81 |         messages = []
 82 | 
 83 |         for i, (obs, action) in enumerate(self.history):
 84 |             messages.append({"role": "user", "content": obs})
 85 |             messages.append({"role": "assistant", "content": action})
 86 | 
 87 |         messages.append({"role": "user", "content": observation})
 88 | 
 89 |         # Just in case, let's avoid having multiple messages from the same role.
 90 |         messages = merge_messages(messages)
 91 | 
 92 |         return messages
 93 | 
 94 | 
 95 | def build_argparser(parser=None):
 96 |     parser = parser or argparse.ArgumentParser()
 97 |     group = parser.add_argument_group("HumanAgent settings")
 98 |     return parser
 99 | 
100 | 
# Register HumanAgent under the name "human", passing build_argparser as the
# hook that declares its command-line arguments.
register(
    name="human",
    desc=("Manually decide which action to take."),
    klass=HumanAgent,
    add_arguments=build_argparser,
)
107 | 


--------------------------------------------------------------------------------
/agents/llm.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | import llm
  4 | import numpy as np
  5 | from tenacity import (
  6 |     retry,
  7 |     retry_if_exception,
  8 |     stop_after_attempt,
  9 |     wait_random_exponential,
 10 | )
 11 | 
 12 | import tales
 13 | from tales.agent import register
 14 | from tales.token import get_token_counter
 15 | from tales.utils import (
 16 |     format_messages_to_markdown,
 17 |     is_recoverable_error,
 18 |     merge_messages,
 19 |     messages2conversation,
 20 | )
 21 | 
# System prompt used by LLMAgent.build_messages. The adjacent string literals
# are concatenated into a single string; each continuation starts with a space.
SYSTEM_PROMPT = (
    "You are playing a text-based game and your goal is to finish it with the highest score."
    " Upon reading the text observation, provide a *single* short phrase to interact with the game, e.g. `get lamp` (without the backticks)."
    " When stuck, try using the `help` command to see what commands are available."
)
 27 | 
 28 | 
 29 | class LLMAgent(tales.Agent):
 30 | 
 31 |     def __init__(self, *args, **kwargs):
 32 |         self.llm = kwargs["llm"]
 33 |         self.model = llm.get_model(self.llm)
 34 |         self.token_counter = get_token_counter(self.model)
 35 |         self.allows_system_prompt = self.llm not in ["o1-mini", "o1-preview"]
 36 | 
 37 |         # Provide the API key, if one is needed and has been provided
 38 |         self.model.key = llm.get_key(
 39 |             kwargs.get("key"), kwargs["llm"], self.model.key_env_var
 40 |         ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var)
 41 | 
 42 |         self.seed = kwargs["seed"]
 43 |         self.rng = np.random.RandomState(self.seed)
 44 | 
 45 |         self.history = []
 46 |         self.context_limit = kwargs["context_limit"]
 47 |         if self.context_limit is not None:
 48 |             assert self.context_limit > 0, "--context-limit must be greater than 0."
 49 | 
 50 |         self.act_temp = kwargs["act_temp"]
 51 |         self.conversation = kwargs["conversation"]
 52 | 
 53 |     @property
 54 |     def uid(self):
 55 |         return (
 56 |             f"LLMAgent_{self.llm}"
 57 |             f"_s{self.seed}"
 58 |             f"_c{self.context_limit}"
 59 |             f"_t{self.act_temp}"
 60 |             f"_conv{self.conversation}"
 61 |         )
 62 | 
 63 |     @property
 64 |     def params(self):
 65 |         return {
 66 |             "agent_type": "zero-shot",
 67 |             "llm": self.llm,
 68 |             "seed": self.seed,
 69 |             "context_limit": self.context_limit,
 70 |             "act_temp": self.act_temp,
 71 |             "conversation": self.conversation,
 72 |         }
 73 | 
 74 |     @retry(
 75 |         retry=retry_if_exception(is_recoverable_error),
 76 |         wait=wait_random_exponential(multiplier=1, max=40),
 77 |         stop=stop_after_attempt(100),
 78 |     )
 79 |     def _llm_call_from_conversation(self, conversation, *args, **kwargs):
 80 |         response = conversation.prompt(*args, **kwargs)
 81 |         response.duration_ms()  # Forces the response to be computed.
 82 |         return response
 83 | 
 84 |     def _llm_call_from_messages(self, messages, *args, **kwargs):
 85 |         conversation = messages2conversation(self.model, messages)
 86 |         prompt = messages[-1]["content"]
 87 |         system = messages[0]["content"] if self.allows_system_prompt else None
 88 | 
 89 |         return self._llm_call_from_conversation(
 90 |             conversation, prompt=prompt, system=system, *args, **kwargs
 91 |         )
 92 | 
 93 |     def act(self, obs, reward, done, infos):
 94 |         messages = self.build_messages(f"{obs}\n> ")
 95 |         llm_kwargs = {
 96 |             "temperature": self.act_temp,
 97 |             "max_tokens": 100,  # Text actions are short phrases.
 98 |             "seed": self.seed,
 99 |             "stream": False,
100 |         }
101 |         if self.llm in [
102 |             "claude-3.5-haiku",
103 |             "claude-3.5-sonnet",
104 |             "claude-3.5-sonnet-latest",
105 |         ]:
106 |             # For these models, we cannot set the seed.
107 |             llm_kwargs.pop("seed")
108 | 
109 |         if "gemini" in self.llm or "gemma" in self.llm:
110 |             # For these models, we cannot set the seed and max_tokens has a different name.
111 |             llm_kwargs.pop("seed")
112 |             llm_kwargs["max_output_tokens"] = llm_kwargs.pop("max_tokens")
113 | 
114 |         response = self._llm_call_from_messages(messages, **llm_kwargs)
115 | 
116 |         action = response.text().strip()
117 |         self.history.append((f"{obs}\n> ", f"{action}\n"))
118 | 
119 |         # Compute usage statistics
120 |         stats = {
121 |             "prompt": format_messages_to_markdown(messages),
122 |             "response": response.text(),
123 |             "nb_tokens": self.token_counter(messages=messages, text=response.text()),
124 |         }
125 | 
126 |         return action, stats
127 | 
128 |     def build_messages(self, observation):
129 |         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
130 |         limit = self.context_limit or len(self.history) + 1
131 | 
132 |         for i, (obs, action) in enumerate(self.history[-limit:]):
133 |             if len(self.history) >= limit and i == 0:
134 |                 # Add the current observation.
135 |                 obs = (
136 |                     f"// History has been truncated to the last {limit} steps.\n...\n> "
137 |                 )
138 | 
139 |             messages.append({"role": "user", "content": obs})
140 |             messages.append({"role": "assistant", "content": action})
141 | 
142 |         messages.append({"role": "user", "content": observation})
143 | 
144 |         # Just in case, let's avoid having multiple messages from the same role.
145 |         messages = merge_messages(messages)
146 | 
147 |         if not self.conversation:
148 |             # Merge all messages into a single message except for the system.
149 |             content = "".join([msg["content"] for msg in messages[1:]])
150 |             messages = messages[:1] + [{"role": "user", "content": content}]
151 | 
152 |         if not self.allows_system_prompt:
153 |             # Make sure the system prompt is added to the following message.
154 |             messages.pop(0)
155 |             messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}"
156 | 
157 |         return messages
158 | 
159 | 
def build_argparser(parser=None):
    """Declare the command-line options of the zero-shot LLM agent.

    Args:
        parser: Parser to extend; a new `argparse.ArgumentParser` is created
            when none is given.

    Returns:
        The parser, with the "LLMAgent settings" group added.
    """
    if not parser:
        parser = argparse.ArgumentParser()

    settings = parser.add_argument_group("LLMAgent settings")

    # (flag, add_argument keyword arguments), in declaration order.
    option_specs = (
        (
            "--llm",
            {
                "default": "gpt-4o-mini",
                "help": "LLM to be used for evaluation. Default: %(default)s",
            },
        ),
        (
            "--seed",
            {
                "type": int,
                "default": 20241001,
                "help": "Seed for LLM (not all endpoints support this). Default: %(default)s",
            },
        ),
        (
            "--act-temp",
            {
                "type": float,
                "default": 0.0,
                "help": "Temperature for LLM when taking actions. Default: %(default)s",
            },
        ),
        (
            "--context-limit",
            {
                "type": int,
                "help": "Limit context for LLM (in conversation turns). Default: no limit.",
            },
        ),
        (
            # Required on/off switch: --conversation / --no-conversation.
            "--conversation",
            {
                "required": True,
                "action": argparse.BooleanOptionalAction,
                "help": "Enable conversation mode. Otherwise, use single prompt.",
            },
        ),
    )
    for flag, spec in option_specs:
        settings.add_argument(flag, **spec)

    return parser
194 | 
195 | 
# Register LLMAgent under the name "zero-shot", passing build_argparser as the
# hook that declares its command-line arguments.
register(
    name="zero-shot",
    desc=(
        "This agent uses a LLM to decide which action to take in a zero-shot manner."
    ),
    klass=LLMAgent,
    add_arguments=build_argparser,
)
204 | 


--------------------------------------------------------------------------------
/agents/llm_walkthrough.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | import gymnasium as gym
  4 | 
  5 | from agents.llm import LLMAgent
  6 | from tales.agent import register
  7 | from tales.utils import merge_messages
  8 | 
  9 | 
 10 | # For the LLMWlkThrAgent, the sysprompt is initialized in the __init__ function as we need to change it once we extract the walkthrough from the env
 11 | class LLMWalkThroughAgent(LLMAgent):
 12 | 
 13 |     def __init__(self, *args, **kwargs):
 14 |         super().__init__(*args, **kwargs)
 15 |         self.sys_prompt = "Not Initialized"
 16 | 
 17 |     @property
 18 |     def uid(self):
 19 |         return (
 20 |             f"LLMAgent_{self.llm}"
 21 |             f"_s{self.seed}"
 22 |             f"_c{self.context_limit}"
 23 |             f"_t{self.act_temp}"
 24 |             f"_conv{self.conversation is not None}"
 25 |             f"Walkthrough Agent"
 26 |         )
 27 | 
 28 |     def build_messages(self, observation):
 29 |         messages = [{"role": "system", "content": self.sys_prompt}]
 30 |         limit = self.context_limit or len(self.history) + 1
 31 | 
 32 |         for i, (obs, action) in enumerate(self.history[-limit:]):
 33 |             if len(self.history) >= limit and i == 0:
 34 |                 # Add the current observation.
 35 |                 obs = (
 36 |                     f"// History has been truncated to the last {limit} steps.\n...\n> "
 37 |                 )
 38 | 
 39 |             messages.append({"role": "user", "content": obs})
 40 |             messages.append({"role": "assistant", "content": action})
 41 | 
 42 |         messages.append({"role": "user", "content": observation})
 43 | 
 44 |         # Just in case, let's avoid having multiple messages from the same role.
 45 |         messages = merge_messages(messages)
 46 | 
 47 |         if not self.conversation:
 48 |             # Merge all messages into a single message except for the system.
 49 |             content = "".join([msg["content"] for msg in messages[1:]])
 50 |             messages = messages[:1] + [{"role": "user", "content": content}]
 51 | 
 52 |         if not self.allows_system_prompt:
 53 |             # Make sure the system prompt is added to the following message.
 54 |             messages.pop(0)
 55 |             messages[1]["content"] = f"{self.sys_prompt}\n\n{messages[1]['content']}"
 56 | 
 57 |         return messages
 58 | 
 59 |     def reset(self, obs, info, env_name):
 60 |         walkthrough = info.get("extra.walkthrough")
 61 |         if walkthrough is None or len(walkthrough) < 1:
 62 |             raise ValueError("Walkthrough not initalized: Check the environment")
 63 | 
 64 |         # Check if the walkthrough is valid.
 65 |         env = gym.make(f"tales/{env_name}-v0", disable_env_checker=True)
 66 | 
 67 |         _, _ = env.reset()
 68 | 
 69 |         for act in walkthrough:
 70 |             _, _, _, info_ = env.step(act)
 71 | 
 72 |         if info_["score"] != info_["max_score"]:
 73 |             raise ValueError(
 74 |                 "Provided walkthrough does not successfully complete game."
 75 |             )
 76 | 
 77 |         numbered_walkthrough = ", ".join(
 78 |             f"{i + 1}.){act}" for i, act in enumerate(walkthrough)
 79 |         )
 80 |         self.sys_prompt = (
 81 |             "You are playing a text-based game and your goal is to finish it with the highest score."
 82 |             " The following is a walkthrough in the form of a list of actions to beat the game."
 83 |             " You should follow this walkthrough as closely as possible to get the maximum score"
 84 |             " You must ONLY respond with the action you wish to take with no other special tokens."
 85 |             f"Walkthrough: {numbered_walkthrough}"
 86 |         )
 87 | 
 88 | 
 89 | def build_argparser(parser=None):
 90 |     parser = parser or argparse.ArgumentParser()
 91 |     group = parser.add_argument_group("LLMAgent settings")
 92 | 
 93 |     group.add_argument(
 94 |         "--llm",
 95 |         default="gpt-4o-mini",
 96 |         help="LLM to be used for evaluation. Default: %(default)s",
 97 |     )
 98 |     group.add_argument(
 99 |         "--seed",
100 |         type=int,
101 |         default=20241001,
102 |         help="Seed for LLM (not all endpoints support this). Default: %(default)s",
103 |     )
104 |     group.add_argument(
105 |         "--act-temp",
106 |         type=float,
107 |         default=0.0,
108 |         help="Temperature for LLM when taking actions. Default: %(default)s",
109 |     )
110 |     group.add_argument(
111 |         "--context-limit",
112 |         type=int,
113 |         default=10,
114 |         help="Limit context for LLM (in conversation turns). Default: %(default)s",
115 |     )
116 |     group.add_argument(
117 |         "--conversation",
118 |         action="store_true",
119 |         help="Enable conversation mode. Otherwise, use single prompt.",
120 |     )
121 |     group.add_argument(
122 |         "--wlkthr-limit",
123 |         type=int,
124 |         default=10000,
125 |         help="Number of walkthrough actions to provide the LLM. Default: %(default)s",
126 |     )
127 | 
128 |     return parser
129 | 
130 | 
# Register LLMWalkThroughAgent under the name "llm-walkthrough", passing
# build_argparser as the hook that declares its command-line arguments.
register(
    name="llm-walkthrough",
    desc=(
        "This agent uses the ground-truth walkthrough from the environment to attempt to progress through the game."
    ),
    klass=LLMWalkThroughAgent,
    add_arguments=build_argparser,
)
139 | 


--------------------------------------------------------------------------------
/agents/random.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import re
 3 | 
 4 | import numpy as np
 5 | 
 6 | import tales
 7 | from tales.agent import register
 8 | from tales.token import get_token_counter
 9 | 
10 | 
class RandomAgent(tales.Agent):
    """Baseline agent that picks its actions at random.

    It samples from the environment's admissible commands when those are
    provided, and from a small built-in list of commands otherwise.
    """

    def __init__(self, **kwargs):
        """Create the agent; `seed` (default 1234) seeds the random generator."""
        self.seed = kwargs.get("seed", 1234)
        self.rng = np.random.RandomState(self.seed)
        self.token_counter = get_token_counter()

        # fmt:off
        self.actions = [
            "north", "south", "east", "west", "up", "down",
            "look", "inventory",
            "drop", "take", "take all",
            "eat", "attack",
            "wait", "YES",
        ]
        # fmt:on

    @property
    def uid(self):
        """Identifier built from the seed."""
        return f"RandomAgent_s{self.seed}"

    @property
    def params(self):
        """Settings of this agent as a plain dict."""
        return {
            "agent_type": "random",
            "seed": self.seed,
        }

    def act(self, obs, reward, done, info):
        """Pick a random action.

        Args:
            obs: Text observation for the current step.
            reward: Unused.
            done: Unused.
            info: Dict; a non-empty "admissible_commands" entry, when present,
                is the set of commands sampled from.

        Returns:
            A tuple `(action, stats)` where `action` is a plain `str` and
            `stats` has the keys "prompt", "response" and "nb_tokens".
        """
        stats = {
            "prompt": None,
            "response": None,
            "nb_tokens": self.token_counter(text=obs),
        }

        admissible = info.get("admissible_commands")
        if admissible is not None and len(admissible) > 0:
            # rng.choice returns a numpy string; convert it so both code paths
            # hand back a built-in str.
            return str(self.rng.choice(admissible)), stats

        # No (or empty) admissible commands: fall back to the built-in list
        # rather than failing on an empty choice.
        action = self.rng.choice(self.actions)
        if action in ["take", "drop", "eat", "attack"]:
            words = re.findall(
                r"\b[a-zA-Z]{4,}\b", obs
            )  # Extract words with 4 or more letters.
            if len(words) > 0:
                # Use a random word from the observation as the verb's object.
                action += " " + self.rng.choice(words)

        return str(action), stats
57 | 
58 | 
def build_argparser(parser=None):
    """Declare the command-line options of the random agent.

    Args:
        parser: Parser to extend; a new `argparse.ArgumentParser` is created
            when none is given.

    Returns:
        The parser, with the "RandomAgent settings" group added.
    """
    if not parser:
        parser = argparse.ArgumentParser()

    seed_option = {
        "type": int,
        "default": 20241001,
        "help": "Random generator seed to select actions. Default: %(default)s",
    }
    settings = parser.add_argument_group("RandomAgent settings")
    settings.add_argument("--seed", **seed_option)
    return parser
69 | 
70 | 
# Register RandomAgent under the name "random", passing build_argparser as the
# hook that declares its command-line arguments.
register(
    name="random",
    desc=(
        "This agent will pick an action at random among a predefined set of actions or,"
        " if available, the admissible commands."
    ),
    klass=RandomAgent,
    add_arguments=build_argparser,
)
80 | 


--------------------------------------------------------------------------------
/agents/react.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | import llm
  4 | import numpy as np
  5 | from tenacity import (
  6 |     retry,
  7 |     retry_if_exception,
  8 |     stop_after_attempt,
  9 |     wait_random_exponential,
 10 | )
 11 | from termcolor import colored
 12 | 
 13 | import tales
 14 | from tales.agent import register
 15 | from tales.token import get_token_counter
 16 | from tales.utils import (
 17 |     format_messages_to_markdown,
 18 |     is_recoverable_error,
 19 |     log,
 20 |     merge_messages,
 21 |     messages2conversation,
 22 | )
 23 | 
# System prompt used by ReactAgent.build_messages. It covers both phases of a
# step: the step-by-step plan and the final single-phrase action. The adjacent
# string literals are concatenated into a single string.
SYSTEM_PROMPT = (
    "You are playing a text-based game and your goal is to finish it with the highest score."
    " Upon reading the text observation, generate a plan with subgoals when asked to think step-by-step,"
    " then provide a *single* short phrase to interact with the game when asked to do so, e.g. `get lamp` (without the backticks)."
    " When stuck, try using the `help` command to see what commands are available."
)
 30 | 
 31 | 
 32 | class ReactAgent(tales.Agent):
 33 | 
 34 |     def __init__(self, *args, **kwargs):
 35 |         self.llm = kwargs["llm"]
 36 |         self.model = llm.get_model(self.llm)
 37 |         self.token_counter = get_token_counter(self.model)
 38 |         self.allows_system_prompt = self.llm not in ["o1-mini", "o1-preview"]
 39 | 
 40 |         # Provide the API key, if one is needed and has been provided
 41 |         self.model.key = llm.get_key(
 42 |             kwargs.get("key"), kwargs["llm"], self.model.key_env_var
 43 |         ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var)
 44 | 
 45 |         self.seed = kwargs["seed"]
 46 |         self.rng = np.random.RandomState(self.seed)
 47 | 
 48 |         self.history = []
 49 |         self.context_limit = kwargs["context_limit"]
 50 |         if self.context_limit is not None:
 51 |             assert self.context_limit > 0, "--context-limit must be greater than 0."
 52 | 
 53 |         self.act_temp = kwargs["act_temp"]
 54 |         self.cot_temp = kwargs["cot_temp"]
 55 |         self.cot_max_tokens = kwargs["cot_max_tokens"]
 56 |         self.conversation = kwargs["conversation"]
 57 | 
 58 |     @property
 59 |     def uid(self):
 60 |         return (
 61 |             f"ReactAgent_{self.llm}"
 62 |             f"_s{self.seed}"
 63 |             f"_c{self.context_limit}"
 64 |             f"_t{self.act_temp}"
 65 |             f"_cotT{self.cot_temp}"
 66 |             f"_cotN{self.cot_max_tokens}"
 67 |             f"_conv{self.conversation}"
 68 |         )
 69 | 
 70 |     @property
 71 |     def params(self):
 72 |         return {
 73 |             "agent_type": "react",
 74 |             "llm": self.llm,
 75 |             "seed": self.seed,
 76 |             "context_limit": self.context_limit,
 77 |             "act_temp": self.act_temp,
 78 |             "cot_temp": self.cot_temp,
 79 |             "cot_max_tokens": self.cot_max_tokens,
 80 |             "conversation": self.conversation,
 81 |         }
 82 | 
    # Retry on errors accepted by is_recoverable_error, with random
    # exponential backoff (multiplier=1, max=40), for up to 100 attempts.
    @retry(
        retry=retry_if_exception(is_recoverable_error),
        wait=wait_random_exponential(multiplier=1, max=40),
        stop=stop_after_attempt(100),
    )
    def _llm_call_from_conversation(self, conversation, *args, **kwargs):
        """Prompt `conversation` and return the response.

        All positional and keyword arguments are forwarded to
        `conversation.prompt`.
        """
        response = conversation.prompt(*args, **kwargs)
        response.duration_ms()  # Forces the response to be computed.
        return response
 92 | 
 93 |     def _llm_call_from_messages(self, messages, *args, **kwargs):
 94 |         conversation = messages2conversation(self.model, messages)
 95 |         prompt = messages[-1]["content"]
 96 |         system = messages[0]["content"] if self.allows_system_prompt else None
 97 | 
 98 |         return self._llm_call_from_conversation(
 99 |             conversation, prompt=prompt, system=system, *args, **kwargs
100 |         )
101 | 
102 |     def act(self, obs, reward, done, infos):  # reward, done and infos are not used here.
103 |         question = "// Based on the above information (history), what is the best action to take? Let's think step by step.\n"
104 |         messages = self.build_messages(obs, question, [])  # Step 1: ask the model to reason about the next action.
105 |         response = self._llm_call_from_messages(
106 |             messages,
107 |             temperature=self.cot_temp,
108 |             max_tokens=self.cot_max_tokens,
109 |             seed=self.seed,
110 |             stream=False,
111 |         )
112 | 
113 |         answer = response.text().strip()
114 |         log.debug(colored(question, "cyan"))
115 |         log.debug(colored(answer, "green"))
116 | 
117 |         # Compute usage statistics for the CoT.
118 |         nb_tokens_cot = self.token_counter(messages=messages, text=response.text())
119 | 
120 |         prompt = "// Provide your chosen action on a single line while respecting the desired format.\n> "
121 |         messages = self.build_messages(obs, prompt, [(question, f"{answer}\n")])  # Step 2: replay the reasoning as a Q/A turn and ask for the final action.
122 |         response = self._llm_call_from_messages(
123 |             messages,
124 |             temperature=self.act_temp,
125 |             max_tokens=100,  # Text actions are short phrases.
126 |             seed=self.seed,
127 |             stream=False,
128 |         )
129 | 
130 |         action = response.text().strip()
131 |         self.history.append((f"{obs}\n> ", f"{action}\n"))  # Only the observation/action pair is stored; the reasoning is not kept in history.
132 |         log.debug(colored(prompt, "cyan"))
133 | 
134 |         # Compute usage statistics
135 |         nb_tokens_act = self.token_counter(messages=messages, text=response.text())
136 |         stats = {
137 |             "prompt": format_messages_to_markdown(messages),  # Messages of the second (action) call, which include the reasoning.
138 |             "response": response.text(),  # Raw text of the second (action) call.
139 |             "nb_tokens": nb_tokens_cot + nb_tokens_act,  # Token counts of both LLM calls combined.
140 |         }
141 | 
142 |         return action, stats
143 | 
144 |     def build_messages(self, observation, question, qa_history):
145 |         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
146 |         limit = self.context_limit or len(self.history) + 1
147 | 
148 |         for i, (obs, action) in enumerate(self.history[-limit:]):
149 |             if len(self.history) >= limit and i == 0:
150 |                 # Add the current observation.
151 |                 obs = (
152 |                     f"// History has been truncated to the last {limit} steps.\n...\n> "
153 |                 )
154 | 
155 |             messages.append({"role": "user", "content": obs})
156 |             messages.append({"role": "assistant", "content": action})
157 | 
158 |         messages.append({"role": "user", "content": observation})
159 | 
160 |         for q, a in qa_history:
161 |             messages.append({"role": "user", "content": q})
162 |             messages.append({"role": "assistant", "content": a})
163 | 
164 |         messages.append({"role": "user", "content": question})
165 | 
166 |         # Merging the current game observation current and the question.
167 |         messages = merge_messages(messages)
168 | 
169 |         if not self.conversation:
170 |             # Merge all messages into a single message except for the system.
171 |             content = "".join([msg["content"] for msg in messages[1:]])
172 |             messages = messages[:1] + [{"role": "user", "content": content}]
173 | 
174 |         if not self.allows_system_prompt:
175 |             # Make sure the system prompt is added to the following message.
176 |             messages.pop(0)
177 |             messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}"
178 | 
179 |         return messages
180 | 
181 | 
182 | def build_argparser(parser=None):  # Adds this agent's CLI options to `parser` and returns it.
183 |     parser = parser or argparse.ArgumentParser()  # A fresh parser is created when none is given.
184 |     group = parser.add_argument_group("LLMAgent settings")
185 | 
186 |     group.add_argument(
187 |         "--llm",
188 |         default="gpt-4o-mini",
189 |         help="LLM to be used for evaluation. Default: %(default)s",
190 |     )
191 |     group.add_argument(
192 |         "--seed",
193 |         type=int,
194 |         default=20241001,
195 |         help="Seed for LLM (not all endpoints support this). Default: %(default)s",
196 |     )
197 |     group.add_argument(
198 |         "--cot-temp",
199 |         type=float,
200 |         default=0.0,
201 |         help="Temperature for LLM when doing chain-of-thoughts. Default: %(default)s",
202 |     )
203 |     group.add_argument(
204 |         "--cot-max-tokens",
205 |         type=int,
206 |         default=1024,
207 |         help="Maximum number of token for chain-of-thoughts. Default: %(default)s",
208 |     )
209 |     group.add_argument(
210 |         "--act-temp",
211 |         type=float,
212 |         default=0.0,
213 |         help="Temperature for LLM when taking actions. Default: %(default)s",
214 |     )
215 |     group.add_argument(
216 |         "--context-limit",
217 |         type=int,  # No default is set, so the value is None when the flag is omitted.
218 |         help="Limit context for LLM (in conversation turns). Default: no limit",
219 |     )
220 |     group.add_argument(
221 |         "--conversation",
222 |         required=True,  # Must be given explicitly, as --conversation or --no-conversation.
223 |         action=argparse.BooleanOptionalAction,
224 |         help="Enable conversation mode. Otherwise, use single prompt.",
225 |     )
226 | 
227 |     return parser
228 | 
229 | 
230 | register(
231 |     name="react",
232 |     desc=(
233 |         "This agent uses a LLM to decide which action to take by following a CoT/ReAct approach."
234 |     ),
235 |     klass=ReactAgent,
236 |     add_arguments=build_argparser,
237 | )
238 | 


--------------------------------------------------------------------------------
/agents/reasoning.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | 
  3 | import llm
  4 | import numpy as np
  5 | from tenacity import (
  6 |     retry,
  7 |     retry_if_exception,
  8 |     stop_after_attempt,
  9 |     wait_random_exponential,
 10 | )
 11 | from termcolor import colored
 12 | 
 13 | import tales
 14 | from tales.agent import register
 15 | from tales.token import get_token_counter
 16 | from tales.utils import (
 17 |     format_messages_to_markdown,
 18 |     is_recoverable_error,
 19 |     merge_messages,
 20 |     messages2conversation,
 21 | )
 22 | 
 23 | SYSTEM_PROMPT = (
 24 |     "You are playing a text-based game and your goal is to finish it with the highest score."
 25 |     " Upon reading the text observation, provide a *single* short phrase to interact with the game, e.g. `get lamp` (without the backticks)."
 26 |     " When stuck, try using the `help` command to see what commands are available."
 27 | )
 28 | 
 29 | DEEPSEEK_CHAT_TEMPLATE_NO_THINK = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<｜tool▁call▁end｜>'}}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% 
if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜><think>\\n</think>\\n'}}{% endif %}"
 30 | 
 31 | 
 32 | class ReasoningAgent(tales.Agent):
 33 | 
 34 |     def __init__(self, *args, **kwargs):  # Settings are read from kwargs; positional args are ignored.
 35 |         self.llm = kwargs["llm"]
 36 |         self.model = llm.get_model(self.llm)
 37 |         self.token_counter = get_token_counter(self.model)
 38 |         self.allows_system_prompt = self.llm not in [  # For the models below, the system prompt is folded into a user message instead.
 39 |             "o1",
 40 |             "o1-mini",
 41 |             "o1-preview",
 42 |             "o3-mini",
 43 |         ]
 44 | 
 45 |         # Provide the API key, if one is needed and has been provided
 46 |         self.model.key = llm.get_key(
 47 |             kwargs.get("key"), kwargs["llm"], self.model.key_env_var
 48 |         ) or llm.get_key(None, self.model.needs_key, self.model.key_env_var)
 49 | 
 50 |         self.seed = kwargs["seed"]
 51 |         self.rng = np.random.RandomState(self.seed)
 52 | 
 53 |         self.history = []  # List of (observation, action) string pairs, appended to by act().
 54 |         self.context_limit = kwargs["context_limit"]
 55 |         if self.context_limit is not None:
 56 |             assert self.context_limit > 0, "--context-limit must be greater than 0."
 57 | 
 58 |         self.act_temp = kwargs["act_temp"]
 59 |         self.cot_temp = kwargs["cot_temp"]
 60 |         self.reasoning_effort = kwargs["reasoning_effort"]  # Either an int (token budget) or a string effort level; act() branches on the type.
 61 |         self.conversation = kwargs["conversation"]
 62 | 
 63 |     @property
 64 |     def uid(self):  # Identifier string built from the model name and the agent's settings.
 65 |         return (
 66 |             f"ReasoningAgent_{self.llm}"
 67 |             f"_s{self.seed}"
 68 |             f"_c{self.context_limit}"
 69 |             f"_conv{self.conversation}"
 70 |             f"_actT{self.act_temp}"
 71 |             f"_cotT{self.cot_temp}"
 72 |             f"_effort{self.reasoning_effort}"
 73 |         )
 74 | 
 75 |     @property
 76 |     def params(self):
 77 |         return {
 78 |             "agent_type": "react",
 79 |             "llm": self.llm,
 80 |             "seed": self.seed,
 81 |             "context_limit": self.context_limit,
 82 |             "conversation": self.conversation,
 83 |             "act_temp": self.act_temp,
 84 |             "cot_temp": self.cot_temp,
 85 |             "reasoning_effort": self.reasoning_effort,
 86 |         }
 87 | 
 88 |     @retry(
 89 |         retry=retry_if_exception(is_recoverable_error),  # Retry only when is_recoverable_error accepts the exception.
 90 |         wait=wait_random_exponential(multiplier=1, max=40),  # Randomized exponential backoff between attempts, capped at 40.
 91 |         stop=stop_after_attempt(100),  # Give up after 100 attempts.
 92 |     )
 93 |     def _llm_call_from_conversation(self, conversation, *args, **kwargs):
 94 |         response = conversation.prompt(*args, **kwargs)  # All arguments are forwarded unchanged.
 95 |         response.duration_ms()  # Forces the response to be computed.
 96 |         return response
 97 | 
 98 |     def _llm_call_from_messages(self, messages, *args, **kwargs):
 99 |         conversation = messages2conversation(self.model, messages)  # presumably rebuilds the earlier turns — verify in tales.utils
100 |         prompt = messages[-1]["content"]  # The last message is sent as the new prompt.
101 |         system = messages[0]["content"] if self.allows_system_prompt else None  # First message doubles as the system prompt when the model allows one.
102 | 
103 |         return self._llm_call_from_conversation(
104 |             conversation, prompt=prompt, system=system, *args, **kwargs
105 |         )
106 | 
107 |     def act(self, obs, reward, done, infos):  # reward, done and infos are not used here.
108 |         llm_kwargs = {
109 |             "temperature": self.cot_temp,
110 |             "seed": self.seed,
111 |             "stream": True,  # Should prevent openai.APITimeoutError
112 |         }
113 |         if isinstance(self.reasoning_effort, int):  # An int is a token budget (--cot-max-tokens is parsed with type=int).
114 |             if self.llm in ["claude-3.7-sonnet"]:
115 |                 llm_kwargs["thinking_budget"] = self.reasoning_effort
116 |             else:
117 |                 llm_kwargs["max_tokens"] = self.reasoning_effort
118 | 
119 |         elif self.llm in ["o1", "o1-preview", "o3-mini"]:  # A non-int effort value is only forwarded to these models.
120 |             llm_kwargs["reasoning_effort"] = self.reasoning_effort
121 | 
122 |         if self.llm in ["o1", "o1-mini", "o1-preview", "o3-mini", "claude-3.7-sonnet"]:
123 |             # For these models, we cannot set the temperature.
124 |             llm_kwargs.pop("temperature")
125 | 
126 |         if self.llm in ["o3-mini"]:
127 |             llm_kwargs.pop("stream")
128 | 
129 |         if self.llm in ["claude-3.7-sonnet"]:
130 |             llm_kwargs["thinking"] = 1
131 |             llm_kwargs.pop("seed")
132 | 
133 |         if "gemini" in self.llm or "gemma" in self.llm:
134 |             # For these models, we cannot set the seed and max_tokens has a different name.
135 |             llm_kwargs.pop("seed")  # NOTE(review): only the seed is removed here; max_tokens is not renamed — confirm.
136 | 
137 |         messages = self.build_messages(f"{obs}\n> ")
138 |         response = self._llm_call_from_messages(messages, **llm_kwargs)
139 |         response_text = response.text()
140 | 
141 |         action = response.text().strip()
142 | 
143 |         thinking = None
144 |         if "DeepSeek-R1" in self.llm:
145 |             # Strip the reasoning <think> and </think>.
146 |             reasoning_end = action.find("</think>")
147 |             if reasoning_end == -1:  # No closing tag: presumably the reasoning hit the length limit.
148 |                 # Send another request to get the action with the current reasoning.
149 |                 messages.append(
150 |                     {
151 |                         "role": "assistant",
152 |                         "content": "<think>\n" + response_text.strip() + "\n</think>",
153 |                     }
154 |                 )
155 |                 # prompt = "// Thinking exceeded the length limit. Based on the thoughts so far, provide your chosen action on a single line while respecting the desired format.\n> "
156 |                 # messages.append({"role": "user", "content": prompt})
157 |                 llm_kwargs["max_tokens"] = (
158 |                     100  # Text actions should be short phrases but deepseek forces thought process by starting the generation with <think>.
159 |                 )
160 |                 llm_kwargs["temperature"] = self.act_temp
161 |                 llm_kwargs["extra_body"] = {
162 |                     "chat_template": DEEPSEEK_CHAT_TEMPLATE_NO_THINK,
163 |                 }
164 |                 response = self._llm_call_from_messages(messages, **llm_kwargs)
165 |                 response_text += "\n" + response.text()  # Keep both responses in the reported text.
166 |                 action = response.text().strip()  # From here on, only the second response is parsed.
167 |                 reasoning_end = action.find("</think>")
168 |                 if reasoning_end == -1:
169 |                     reasoning_end = (
170 |                         0  # Give up and use the entire response as the action.
171 |                     )
172 |                 else:
173 |                     reasoning_end += len("</think>")
174 |             else:
175 |                 reasoning_end += len("</think>")
176 | 
177 |             # Extract the reasoning part from the response.
178 |             thinking = action[:reasoning_end].strip()
179 |             # Extract the action part from the response.
180 |             action = action[reasoning_end:].strip()
181 | 
182 |         elif self.llm in ["claude-3.7-sonnet"]:
183 |             # Extract the thinking part from the response JSON.
184 |             thinking = "".join(
185 |                 [item.get("thinking", "") for item in response.json()["content"]]
186 |             )
187 | 
188 |         self.history.append((f"{obs}\n> ", f"{action}\n"))  # Only the observation/action pair is stored; the thinking is not kept in history.
189 | 
190 |         # Compute usage statistics
191 |         stats = {
192 |             "prompt": format_messages_to_markdown(messages),
193 |             "thinking": thinking,
194 |             "response": response_text,
195 |             "nb_tokens": self.token_counter(messages=messages, text=response_text),
196 |         }
197 | 
198 |         if thinking is not None:
199 |             stats["nb_tokens"] += self.token_counter(text=thinking)  # NOTE(review): for DeepSeek-R1, `thinking` is sliced from text already in response_text, so it looks counted twice — confirm.
200 | 
201 |         return action, stats
202 | 
203 |     def build_messages(self, observation):  # Returns the list of chat messages (role/content dicts) for one LLM call.
204 |         messages = [{"role": "system", "content": SYSTEM_PROMPT}]
205 |         limit = self.context_limit or len(self.history) + 1  # Without a context limit, the whole history is kept.
206 | 
207 |         for i, (obs, action) in enumerate(self.history[-limit:]):
208 |             if len(self.history) >= limit and i == 0:
209 |                 # Replace the oldest kept observation with a truncation notice.
210 |                 obs = (
211 |                     f"// History has been truncated to the last {limit} steps.\n...\n> "
212 |                 )
213 | 
214 |             messages.append({"role": "user", "content": obs})
215 |             messages.append({"role": "assistant", "content": action})
216 | 
217 |         messages.append({"role": "user", "content": observation})
218 | 
219 |         # Just in case, let's avoid having multiple messages from the same role.
220 |         messages = merge_messages(messages)
221 | 
222 |         if not self.conversation:
223 |             # Merge all messages into a single message except for the system.
224 |             content = "".join([msg["content"] for msg in messages[1:]])
225 |             messages = messages[:1] + [{"role": "user", "content": content}]
226 | 
227 |         if not self.allows_system_prompt:
228 |             # Make sure the system prompt is added to the following message.
229 |             messages[1]["content"] = f"{SYSTEM_PROMPT}\n\n{messages[1]['content']}"
230 |             messages.pop(0)  # Then drop the standalone system message.
231 | 
232 |         return messages
233 | 
234 | 
235 | def build_argparser(parser=None):  # Adds this agent's CLI options to `parser` and returns it.
236 |     parser = parser or argparse.ArgumentParser()  # A fresh parser is created when none is given.
237 |     group = parser.add_argument_group("LLMAgent settings")
238 | 
239 |     group.add_argument(
240 |         "--llm",
241 |         default="gpt-4o-mini",
242 |         help="LLM to be used for evaluation. Default: %(default)s",
243 |     )
244 |     group.add_argument(
245 |         "--seed",
246 |         type=int,
247 |         default=20241001,
248 |         help="Seed for LLM (not all endpoints support this). Default: %(default)s",
249 |     )
250 |     group.add_argument(
251 |         "--act-temp",
252 |         type=float,
253 |         default=0.0,
254 |         help="Temperature for LLM when taking actions. Default: %(default)s",
255 |     )
256 |     group.add_argument(
257 |         "--cot-temp",
258 |         type=float,
259 |         default=0.0,
260 |         help="Temperature for LLM when doing chain-of-thoughts. Default: %(default)s",
261 |     )
262 |     subgroup = group.add_mutually_exclusive_group(required=True)  # Exactly one of the two options below must be passed.
263 |     subgroup.add_argument(
264 |         "--reasoning-effort",
265 |         default="medium",
266 |         dest="reasoning_effort",  # No type is given, so the value stays a string.
267 |         help="Reasoning effort for reasoning-type LLMs.",
268 |     )
269 |     subgroup.add_argument(
270 |         "--cot-max-tokens",
271 |         type=int,
272 |         default=1024,
273 |         dest="reasoning_effort",  # Same destination as --reasoning-effort; the agent tells them apart by type (int vs. str).
274 |         help="Maximum number of token for chain-of-thoughts. Default: %(default)s",
275 |     )
276 |     group.add_argument(
277 |         "--context-limit",
278 |         type=int,  # No default is set, so the value is None when the flag is omitted.
279 |         help="Limit context for LLM (in conversation turns). Default: no limit",
280 |     )
281 |     group.add_argument(
282 |         "--conversation",
283 |         required=True,  # Must be given explicitly, as --conversation or --no-conversation.
284 |         action=argparse.BooleanOptionalAction,
285 |         help="Enable conversation mode. Otherwise, use single prompt.",
286 |     )
287 | 
288 |     return parser
289 | 
290 | 
291 | register(
292 |     name="reasoning",
293 |     desc=(
294 |         "This agent uses reasoning LLM (o1/o3, deepseek-r1, etc.) to do CoT/thinking followed deciding which action to take."
295 |     ),
296 |     klass=ReasoningAgent,
297 |     add_arguments=build_argparser,
298 | )
299 | 


--------------------------------------------------------------------------------
/agents/walkthrough.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | 
 3 | import tales
 4 | from tales.agent import register
 5 | from tales.token import get_token_counter
 6 | 
 7 | 
 8 | class WalkthroughAgent(tales.Agent):
 9 |     def __init__(self, **kwargs):
10 |         self.token_counter = get_token_counter()
11 |         self.walkthrough = None
12 | 
13 |     @property
14 |     def uid(self):
15 |         return f"WalkthroughAgent"
16 | 
17 |     @property
18 |     def params(self):
19 |         return {}
20 | 
21 |     def reset(self, obs, info, env_name):
22 |         # Store the walkthrough in reverse order so we can pop from it.
23 |         if self.walkthrough is None:
24 |             self.walkthrough = info.get("extra.walkthrough")[::-1]
25 | 
26 |     def act(self, obs, reward, done, info):
27 |         stats = {
28 |             "prompt": None,
29 |             "response": None,
30 |             "nb_tokens": self.token_counter(text=obs),
31 |         }
32 | 
33 |         if len(self.walkthrough) == 0:
34 |             return "QUIT", stats
35 | 
36 |         return self.walkthrough.pop(), stats
37 | 
38 | 
39 | def build_argparser(parser=None):  # This agent adds no command-line options.
40 |     return parser or argparse.ArgumentParser()
41 | 
42 | 
43 | register(
44 |     name="walkthrough",
45 |     desc=("This agent will follow the walkthrough provided by the environment."),
46 |     klass=WalkthroughAgent,
47 |     add_arguments=build_argparser,
48 | )
49 | 


--------------------------------------------------------------------------------
/docs/website/Gemfile:
--------------------------------------------------------------------------------
1 | source "https://rubygems.org"
2 | 
3 | gem "github-pages", group: :jekyll_plugins


--------------------------------------------------------------------------------
/docs/website/_config.yml:
--------------------------------------------------------------------------------
1 | remote_theme: pages-themes/cayman@v0.2.0
2 | plugins:
3 | - jekyll-remote-theme # add this line to the plugins list if you already have one
4 | 
5 | title: "the Text Adventure Learning Environment Suite"
6 | 
7 | # description: "A Text-games Benchmark"


--------------------------------------------------------------------------------
/docs/website/_includes/footer.html:
--------------------------------------------------------------------------------
 1 | <footer class="site-footer" style="text-align: center;">
 2 |     <span class="site-footer-credits">
 3 |         <a href="https://www.microsoft.com" id="copyright">&copy; 2025 Microsoft</a>
 4 |         |
 5 |         <a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
 6 |         |
 7 |         <a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
 8 |         |
 9 |         <a id="cookiesManager" onClick="manageConsent();">Cookies</a>
10 |         |
11 |         <a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
12 |         |
13 |         <a href="https://www.microsoft.com/trademarks">Trademarks</a>
14 |     </span>
15 | </footer>
16 | 
17 | <script>
18 |   function manageConsent() {  // Click handler of the "Cookies" footer link.
19 |       if (typeof WcpConsent !== 'undefined' && WcpConsent.siteConsent) {  // Only call the consent library once it is loaded and set up.
20 |           WcpConsent.siteConsent.manageConsent();
21 |       } else {
22 |           console.warn('WcpConsent library not fully initialized');
23 |           alert('Cookie preferences management is currently unavailable. Please try again later.');
24 |       }
25 |   }
26 | </script>


--------------------------------------------------------------------------------
/docs/website/_includes/head-custom.html:
--------------------------------------------------------------------------------
 1 | <!-- start custom head snippets, customize with your own _includes/head-custom.html file -->
 2 | 
 3 | <!-- Setup Analytics -->
 4 | 
 5 | <!-- You can set your favicon here -->
 6 | <!-- link rel="shortcut icon" type="image/x-icon" href="{{ '/favicon.ico' | relative_url }}" -->
 7 | 
 8 | <!-- Microsoft Cookie Consent Banner -->
 9 | 
10 | <!-- Add your custom head content here -->
11 | <script src="{{ '/assets/js/tabs.js' | relative_url }}"></script>
12 | 
13 | <!-- end custom head snippets -->
14 | 


--------------------------------------------------------------------------------
/docs/website/_includes/simple_table.md:
--------------------------------------------------------------------------------
  1 | <div class="table-container">
  2 | <table class="model-scores simplified-scores">
  3 |     <thead>
  4 |     <tr>
  5 |         <th>Rank</th>
  6 |         <th>Model</th>
  7 |         <th>Organization</th>
  8 |         <th>Model Type</th>
  9 |         <th>TALES Score</th>
 10 |     </tr>
 11 |     </thead>
 12 |     <tbody>
 13 |     <tr>
 14 |         <td>1</td>
 15 |         <td><strong>claude-3.7-sonnet</strong></td>
 16 |         <td>Anthropic</td>
 17 |         <td>Reasoning</td>
 18 |         <td>52.5%</td>
 19 |     </tr>
 20 |     <tr>
 21 |         <td>2</td>
 22 |         <td><strong>claude-3.5-sonnet-latest</strong></td>
 23 |         <td>Anthropic</td>
 24 |         <td>Non-reasoning</td>
 25 |         <td>50.4%</td>
 26 |     </tr>
 27 |     <tr>
 28 |         <td>3</td>
 29 |         <td><strong>gemini-2.5-pro-preview*</strong></td>
 30 |         <td>Google</td>
 31 |         <td>Non-reasoning</td>
 32 |         <td>48.8%</td>
 33 |     </tr>
 34 |     <tr>
 35 |         <td>4</td>
 36 |         <td><strong>o1</strong></td>
 37 |         <td>OpenAI</td>
 38 |         <td>Reasoning</td>
 39 |         <td>44.2%</td>
 40 |     </tr>
 41 |     <tr>
 42 |         <td>5</td>
 43 |         <td><strong>gpt-4o</strong></td>
 44 |         <td>OpenAI</td>
 45 |         <td>Non-reasoning</td>
 46 |         <td>40.6%</td>
 47 |     </tr>
 48 |     <tr>
 49 |         <td>6</td>
 50 |         <td><strong>claude-3.5-haiku</strong></td>
 51 |         <td>Anthropic</td>
 52 |         <td>Non-reasoning</td>
 53 |         <td>39.6%</td>
 54 |     </tr>
 55 |     <tr>
 56 |         <td>7</td>
 57 |         <td><strong>Llama-3.1-405B-Instruct</strong></td>
 58 |         <td>Meta</td>
 59 |         <td>Non-reasoning</td>
 60 |         <td>36.4%</td>
 61 |     </tr>
 62 |     <tr>
 63 |         <td>8</td>
 64 |         <td><strong>gemini-2.0-flash</strong></td>
 65 |         <td>Google</td>
 66 |         <td>Non-reasoning</td>
 67 |         <td>35.0%</td>
 68 |     </tr>
 69 |     <tr>
 70 |         <td>9</td>
 71 |         <td><strong>Llama-3.3-70B-Instruct</strong></td>
 72 |         <td>Meta</td>
 73 |         <td>Non-reasoning</td>
 74 |         <td>32.8%</td>
 75 |     </tr>
 76 |     <tr>
 77 |         <td>10</td>
 78 |         <td><strong>Llama-3.1-70B-Instruct</strong></td>
 79 |         <td>Meta</td>
 80 |         <td>Non-reasoning</td>
 81 |         <td>32.0%</td>
 82 |     </tr>
 83 |     <tr>
 84 |         <td>11</td>
 85 |         <td><strong>Qwen2.5-72B-Instruct</strong></td>
 86 |         <td>Alibaba</td>
 87 |         <td>Non-reasoning</td>
 88 |         <td>30.7%</td>
 89 |     </tr>
 90 |     <tr>
 91 |         <td>12</td>
 92 |         <td><strong>Mistral-Large-Instruct-2407</strong></td>
 93 |         <td>Mistral AI</td>
 94 |         <td>Non-reasoning</td>
 95 |         <td>30.3%</td>
 96 |     </tr>
 97 |     <tr>
 98 |         <td>13</td>
 99 |         <td><strong>gpt-4o-mini</strong></td>
100 |         <td>OpenAI</td>
101 |         <td>Non-reasoning</td>
102 |         <td>21.8%</td>
103 |     </tr>
104 |     <tr>
105 |         <td>14</td>
106 |         <td><strong>Llama-4-Scout-17B-16E-Instruct</strong></td>
107 |         <td>Meta</td>
108 |         <td>Non-reasoning</td>
109 |         <td>19.8%</td>
110 |     </tr>
111 |     <tr>
112 |         <td>15</td>
113 |         <td><strong>Llama-4-Maverick-17B-128E-Instruct</strong></td>
114 |         <td>Meta</td>
115 |         <td>Non-reasoning</td>
116 |         <td>15.5%</td>
117 |     </tr>
118 |     <tr>
119 |         <td>16</td>
120 |         <td><strong>Mistral-Small-Instruct-2409</strong></td>
121 |         <td>Mistral AI</td>
122 |         <td>Non-reasoning</td>
123 |         <td>14.8%</td>
124 |     </tr>
125 |     <tr>
126 |         <td>17</td>
127 |         <td><strong>Llama-3.1-8B-Instruct</strong></td>
128 |         <td>Meta</td>
129 |         <td>Non-reasoning</td>
130 |         <td>13.9%</td>
131 |     </tr>
132 |     <tr>
133 |         <td>18</td>
134 |         <td><strong>DeepSeek-R1</strong></td>
135 |         <td>DeepSeek AI</td>
136 |         <td>Reasoning</td>
137 |         <td>12.4%</td>
138 |     </tr>
139 |     <tr>
140 |         <td>19</td>
141 |         <td><strong>Qwen2.5-7B-Instruct</strong></td>
142 |         <td>Alibaba</td>
143 |         <td>Non-reasoning</td>
144 |         <td>11.7%</td>
145 |     </tr>
146 |     <tr>
147 |         <td>20</td>
148 |         <td><strong>Llama-3.2-3B-Instruct</strong></td>
149 |         <td>Meta</td>
150 |         <td>Non-reasoning</td>
151 |         <td>10.4%</td>
152 |     </tr>
153 |     <tr>
154 |         <td>21</td>
155 |         <td><strong>phi-4</strong></td>
156 |         <td>Microsoft</td>
157 |         <td>Non-reasoning</td>
158 |         <td>10.3%</td>
159 |     </tr>
160 |     <tr>
161 |         <td>22</td>
162 |         <td><strong>Mistral-Small-24B-Instruct-2501</strong></td>
163 |         <td>Mistral AI</td>
164 |         <td>Non-reasoning</td>
165 |         <td>8.8%</td>
166 |     </tr>
167 |     <tr>
168 |         <td>23</td>
169 |         <td><strong>DeepSeek-R1-Distill-Llama-70B</strong></td>
170 |         <td>DeepSeek AI</td>
171 |         <td>Reasoning</td>
172 |         <td>8.4%</td>
173 |     </tr>
174 |     <tr>
175 |         <td>24</td>
176 |         <td><strong>Ministral-8B-Instruct-2410</strong></td>
177 |         <td>Mistral AI</td>
178 |         <td>Non-reasoning</td>
179 |         <td>4.6%</td>
180 |     </tr>
181 |     <tr>
182 |         <td>25</td>
183 |         <td><strong>Mistral-Small-3.1-24B-Instruct-2503</strong></td>
184 |         <td>Mistral AI</td>
185 |         <td>Non-reasoning</td>
186 |         <td>4.5%</td>
187 |     </tr>
188 |     <tr>
189 |         <td>26</td>
190 |         <td><strong>Mixtral-8x22B-Instruct-v0.1</strong></td>
191 |         <td>Mistral AI</td>
192 |         <td>Non-reasoning</td>
193 |         <td>3.7%</td>
194 |     </tr>
195 |     <tr>
196 |         <td>27</td>
197 |         <td><strong>Llama-3.2-1B-Instruct</strong></td>
198 |         <td>Meta</td>
199 |         <td>Non-reasoning</td>
200 |         <td>3.3%</td>
201 |     </tr>
202 |     <tr>
203 |         <td>28</td>
204 |         <td><strong>Phi-3-mini-128k-instruct</strong></td>
205 |         <td>Microsoft</td>
206 |         <td>Non-reasoning</td>
207 |         <td>2.2%</td>
208 |     </tr>
209 |     <tr>
210 |         <td>29</td>
211 |         <td><strong>Phi-3.5-MoE-instruct</strong></td>
212 |         <td>Microsoft</td>
213 |         <td>Non-reasoning</td>
214 |         <td>1.7%</td>
215 |     </tr>
216 |     <tr>
217 |         <td>30</td>
218 |         <td><strong>Phi-4-mini-instruct</strong></td>
219 |         <td>Microsoft</td>
220 |         <td>Non-reasoning</td>
221 |         <td>1.5%</td>
222 |     </tr>
223 |     <tr>
224 |         <td>31</td>
225 |         <td><strong>Mixtral-8x7B-Instruct-v0.1</strong></td>
226 |         <td>Mistral AI</td>
227 |         <td>Non-reasoning</td>
228 |         <td>1.3%</td>
229 |     </tr>
230 |     <tr>
231 |         <td>32</td>
232 |         <td><strong>Phi-3.5-mini-instruct</strong></td>
233 |         <td>Microsoft</td>
234 |         <td>Non-reasoning</td>
235 |         <td>1.0%</td>
236 |     </tr>
237 |     <tr>
238 |         <td>33</td>
239 |         <td><strong>Phi-3-medium-128k-instruct</strong></td>
240 |         <td>Microsoft</td>
241 |         <td>Non-reasoning</td>
242 |         <td>0.7%</td>
243 |     </tr>
244 |     </tbody>
245 | </table>
246 | </div>
247 | 


--------------------------------------------------------------------------------
/docs/website/_includes/table.md:
--------------------------------------------------------------------------------
  1 | <div class="table-container">
  2 | <table class="model-scores">
  3 |     <thead>
  4 |     <tr>
  5 |         <th>Model</th>
  6 |         <th>Textworld</th>
  7 |         <th>Textworld Express</th>
  8 |         <th>Alfworld</th>
  9 |         <th>Scienceworld</th>
 10 |         <th>Jericho</th>
 11 |         <th>Overall</th>
 12 |     </tr>
 13 |     </thead>
 14 |     <tbody>
 15 |     <tr>
 16 |         <td><strong>claude-3.7-sonnet</strong></td>
 17 |         <td>97.3%</td>
 18 |         <td>91.3%</td>
 19 |         <td>83.3%</td>
 20 |         <td>76.5%</td>
 21 |         <td>12.5%</td>
 22 |         <td>52.5%</td>
 23 |     </tr>
 24 |     <tr>
 25 |         <td>claude-3.5-sonnet-latest</td>
 26 |         <td>95.5%</td>
 27 |         <td>81.6%</td>
 28 |         <td>75.0%</td>
 29 |         <td>82.3%</td>
 30 |         <td>9.6%</td>
 31 |         <td>50.4%</td>
 32 |     </tr>
 33 |     <tr>
 34 |         <td>gemini-2.5-pro-preview*</td>
 35 |         <td>98.5%</td>
 36 |         <td>91.8%</td>
 37 |         <td>75.0%</td>
 38 |         <td>64.2%</td>
 39 |         <td>12.4%</td>
 40 |         <td>48.8%</td>
 41 |     </tr>
 42 |     <tr>
 43 |         <td>o1</td>
 44 |         <td>97.8%</td>
 45 |         <td>70.2%</td>
 46 |         <td>28.3%</td>
 47 |         <td>80.1%</td>
 48 |         <td>10.3%</td>
 49 |         <td>44.2%</td>
 50 |     </tr>
 51 |     <tr>
 52 |         <td>gpt-4o</td>
 53 |         <td>83.6%</td>
 54 |         <td>80.6%</td>
 55 |         <td>56.7%</td>
 56 |         <td>61.4%</td>
 57 |         <td>5.6%</td>
 58 |         <td>40.6%</td>
 59 |     </tr>
 60 |     <tr>
 61 |         <td>claude-3.5-haiku</td>
 62 |         <td>94.9%</td>
 63 |         <td>79.8%</td>
 64 |         <td>26.7%</td>
 65 |         <td>67.3%</td>
 66 |         <td>5.0%</td>
 67 |         <td>39.6%</td>
 68 |     </tr>
 69 |     <tr>
 70 |         <td>Llama-3.1-405B-Instruct</td>
 71 |         <td>90.9%</td>
 72 |         <td>79.2%</td>
 73 |         <td>31.7%</td>
 74 |         <td>51.8%</td>
 75 |         <td>6.1%</td>
 76 |         <td>36.4%</td>
 77 |     </tr>
 78 |     <tr>
 79 |         <td>gemini-2.0-flash</td>
 80 |         <td>80.8%</td>
 81 |         <td>76.1%</td>
 82 |         <td>20.0%</td>
 83 |         <td>57.1%</td>
 84 |         <td>5.4%</td>
 85 |         <td>35.0%</td>
 86 |     </tr>
 87 |     <tr>
 88 |         <td>Llama-3.3-70B-Instruct</td>
 89 |         <td>69.6%</td>
 90 |         <td>77.2%</td>
 91 |         <td>15.0%</td>
 92 |         <td>55.1%</td>
 93 |         <td>4.5%</td>
 94 |         <td>32.8%</td>
 95 |     </tr>
 96 |     <tr>
 97 |         <td>Llama-3.1-70B-Instruct</td>
 98 |         <td>65.6%</td>
 99 |         <td>81.9%</td>
100 |         <td>8.3%</td>
101 |         <td>51.9%</td>
102 |         <td>5.3%</td>
103 |         <td>32.0%</td>
104 |     </tr>
105 |     <tr>
106 |         <td>Qwen2.5-72B-Instruct</td>
107 |         <td>76.5%</td>
108 |         <td>83.8%</td>
109 |         <td>36.7%</td>
110 |         <td>35.0%</td>
111 |         <td>2.9%</td>
112 |         <td>30.7%</td>
113 |     </tr>
114 |     <tr>
115 |         <td>Mistral-Large-Instruct-2407</td>
116 |         <td>82.4%</td>
117 |         <td>68.3%</td>
118 |         <td>6.7%</td>
119 |         <td>46.1%</td>
120 |         <td>5.8%</td>
121 |         <td>30.3%</td>
122 |     </tr>
123 |     <tr>
124 |         <td>gpt-4o-mini</td>
125 |         <td>56.5%</td>
126 |         <td>73.6%</td>
127 |         <td>0.0%</td>
128 |         <td>27.2%</td>
129 |         <td>1.8%</td>
130 |         <td>21.8%</td>
131 |     </tr>
132 |     <tr>
133 |         <td>Llama-4-Scout-17B-16E-Instruct</td>
134 |         <td>41.1%</td>
135 |         <td>68.4%</td>
136 |         <td>0.0%</td>
137 |         <td>27.0%</td>
138 |         <td>1.8%</td>
139 |         <td>19.8%</td>
140 |     </tr>
141 |     <tr>
142 |         <td>Llama-4-Maverick-17B-128E-Instruct-FP8</td>
143 |         <td>43.5%</td>
144 |         <td>56.1%</td>
145 |         <td>8.3%</td>
146 |         <td>11.5%</td>
147 |         <td>2.0%</td>
148 |         <td>15.5%</td>
149 |     </tr>
150 |     <tr>
151 |         <td>Mistral-Small-Instruct-2409</td>
152 |         <td>56.1%</td>
153 |         <td>27.3%</td>
154 |         <td>0.0%</td>
155 |         <td>24.4%</td>
156 |         <td>1.4%</td>
157 |         <td>14.8%</td>
158 |     </tr>
159 |     <tr>
160 |         <td>Llama-3.1-8B-Instruct</td>
161 |         <td>29.7%</td>
162 |         <td>50.3%</td>
163 |         <td>0.0%</td>
164 |         <td>15.7%</td>
165 |         <td>2.3%</td>
166 |         <td>13.9%</td>
167 |     </tr>
168 |     <tr>
169 |         <td>DeepSeek-R1</td>
170 |         <td>37.1%</td>
171 |         <td>38.6%</td>
172 |         <td>0.0%</td>
173 |         <td>15.8%</td>
174 |         <td>1.0%</td>
175 |         <td>12.4%</td>
176 |     </tr>
177 |     <tr>
178 |         <td>Qwen2.5-7B-Instruct</td>
179 |         <td>27.7%</td>
180 |         <td>45.6%</td>
181 |         <td>0.0%</td>
182 |         <td>12.6%</td>
183 |         <td>0.7%</td>
184 |         <td>11.7%</td>
185 |     </tr>
186 |     <tr>
187 |         <td>Llama-3.2-3B-Instruct</td>
188 |         <td>21.4%</td>
189 |         <td>42.0%</td>
190 |         <td>0.0%</td>
191 |         <td>10.0%</td>
192 |         <td>1.5%</td>
193 |         <td>10.4%</td>
194 |     </tr>
195 |     <tr>
196 |         <td>phi-4</td>
197 |         <td>20.8%</td>
198 |         <td>43.8%</td>
199 |         <td>0.0%</td>
200 |         <td>8.9%</td>
201 |         <td>1.6%</td>
202 |         <td>10.3%</td>
203 |     </tr>
204 |     <tr>
205 |         <td>Mistral-Small-24B-Instruct-2501</td>
206 |         <td>15.8%</td>
207 |         <td>23.0%</td>
208 |         <td>0.0%</td>
209 |         <td>15.8%</td>
210 |         <td>1.4%</td>
211 |         <td>8.8%</td>
212 |     </tr>
213 |     <tr>
214 |         <td>DeepSeek-R1-Distill-Llama-70B</td>
215 |         <td>8.7%</td>
216 |         <td>39.8%</td>
217 |         <td>0.0%</td>
218 |         <td>7.7%</td>
219 |         <td>1.3%</td>
220 |         <td>8.4%</td>
221 |     </tr>
222 |     <tr>
223 |         <td>Ministral-8B-Instruct-2410</td>
224 |         <td>10.9%</td>
225 |         <td>22.8%</td>
226 |         <td>0.0%</td>
227 |         <td>2.3%</td>
228 |         <td>0.4%</td>
229 |         <td>4.6%</td>
230 |     </tr>
231 |     <tr>
232 |         <td>Mistral-Small-3.1-24B-Instruct-2503</td>
233 |         <td>2.5%</td>
234 |         <td>10.3%</td>
235 |         <td>0.0%</td>
236 |         <td>10.5%</td>
237 |         <td>0.8%</td>
238 |         <td>4.5%</td>
239 |     </tr>
240 |     <tr>
241 |         <td>Mixtral-8x22B-Instruct-v0.1</td>
242 |         <td>17.1%</td>
243 |         <td>8.4%</td>
244 |         <td>0.0%</td>
245 |         <td>4.0%</td>
246 |         <td>0.4%</td>
247 |         <td>3.7%</td>
248 |     </tr>
249 |     <tr>
250 |         <td>Llama-3.2-1B-Instruct</td>
251 |         <td>0.0%</td>
252 |         <td>19.0%</td>
253 |         <td>0.0%</td>
254 |         <td>2.4%</td>
255 |         <td>0.6%</td>
256 |         <td>3.3%</td>
257 |     </tr>
258 |     <tr>
259 |         <td>Phi-3-mini-128k-instruct</td>
260 |         <td>2.7%</td>
261 |         <td>9.4%</td>
262 |         <td>0.0%</td>
263 |         <td>2.4%</td>
264 |         <td>0.3%</td>
265 |         <td>2.2%</td>
266 |     </tr>
267 |     <tr>
268 |         <td>Phi-3.5-MoE-instruct</td>
269 |         <td>0.0%</td>
270 |         <td>7.0%</td>
271 |         <td>0.0%</td>
272 |         <td>2.3%</td>
273 |         <td>0.4%</td>
274 |         <td>1.7%</td>
275 |     </tr>
276 |     <tr>
277 |         <td>Phi-4-mini-instruct</td>
278 |         <td>0.0%</td>
279 |         <td>5.5%</td>
280 |         <td>0.0%</td>
281 |         <td>2.3%</td>
282 |         <td>0.5%</td>
283 |         <td>1.5%</td>
284 |     </tr>
285 |     <tr>
286 |         <td>Mixtral-8x7B-Instruct-v0.1</td>
287 |         <td>0.0%</td>
288 |         <td>1.6%</td>
289 |         <td>0.0%</td>
290 |         <td>4.0%</td>
291 |         <td>0.3%</td>
292 |         <td>1.3%</td>
293 |     </tr>
294 |     <tr>
295 |         <td>Phi-3.5-mini-instruct</td>
296 |         <td>0.0%</td>
297 |         <td>2.0%</td>
298 |         <td>0.0%</td>
299 |         <td>2.4%</td>
300 |         <td>0.5%</td>
301 |         <td>1.0%</td>
302 |     </tr>
303 |     <tr>
304 |         <td>Phi-3-medium-128k-instruct</td>
305 |         <td>0.0%</td>
306 |         <td>0.0%</td>
307 |         <td>0.0%</td>
308 |         <td>2.3%</td>
309 |         <td>0.3%</td>
310 |         <td>0.7%</td>
311 |     </tr>
312 |     </tbody>
313 | </table>
314 | </div>
315 | 


--------------------------------------------------------------------------------
/docs/website/_includes/test.md:
--------------------------------------------------------------------------------
1 | This is the string you want to save.


--------------------------------------------------------------------------------
/docs/website/_layouts/default.html:
--------------------------------------------------------------------------------
 1 | <!DOCTYPE html>
 2 | <html lang="{{ site.lang | default: "en-US" }}">
 3 |   <head>
 4 |     <meta charset="UTF-8">
 5 | 
 6 | {% seo %}
 7 |     <link rel="preconnect" href="https://fonts.gstatic.com">
 8 |     <link rel="preload" href="https://fonts.googleapis.com/css?family=Open+Sans:400,700&display=swap" as="style" type="text/css" crossorigin>
 9 |     <meta name="viewport" content="width=device-width, initial-scale=1">
10 |     <meta name="theme-color" content="#157878">
11 |     <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
12 |     <link rel="stylesheet" href="{{ '/assets/css/style.css?v=' | append: site.github.build_revision | relative_url }}">
13 |     <link rel="stylesheet" href="{{ '/assets/css/custom.css' | relative_url }}">
14 |     <link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css">
15 | 
16 |     <!-- Microsoft WcpConsent library -->
17 |     <script src="https://wcpstatic.microsoft.com/mscc/lib/v2/wcp-consent.js"></script>
18 | 
19 |     {% include head-custom.html %}
20 |   </head>
21 |   <body>
22 |     <!-- Microsoft cookie banner placeholder -->
23 |     <div id="cookie-banner"></div>
24 | 
25 |     <a id="skip-to-content" href="#content">Skip to the content.</a>
26 | 
27 |     <header class="page-header" role="banner">
28 |       <h1 class="project-name">{{ page.title | default: "TALES" }}</h1>{%- comment -%} "TALES" is quoted on purpose: a bare TALES is an undefined Liquid variable, so the fallback rendered empty {%- endcomment %}
29 |       <p><img src="{{ '/assets/figs/static_banner.png' | relative_url }}" alt="banner" /></p>{%- comment -%} relative_url keeps asset paths valid on pages below the site root, matching the stylesheet links in head {%- endcomment %}
30 |       <h2 class="project-tagline">{{ page.description | default: site.description | default: site.github.project_tagline }}</h2>
31 |       <a href="{{ site.github.repository_url }}" class="btn">
32 |         <img src="{{ '/assets/figs/github-mark.svg' | relative_url }}" alt="GitHub" width="20" height="20" style="vertical-align: middle; margin-right: 5px;">
33 |         Code
34 |       </a>
35 |       <a href="https://arxiv.org/abs/2504.14128" class="btn">
36 |         <img src="{{ '/assets/figs/arxiv-logomark-small.svg' | relative_url }}" alt="ArXiV" width="20" height="20" style="vertical-align: middle; margin-right: 5px;">
37 |         Paper
38 |       </a>
39 |     </header>
40 | 
41 |     <main id="content" class="main-content" role="main">
42 |       {{ content }}
43 | 
44 |       {%- include footer.html -%}
45 | 
46 |     </main>
47 | 
48 |     <!-- Initialize Microsoft WCP Consent (targets the #cookie-banner placeholder above) -->
49 |     <script>
50 |       (function() {
51 |         var retriesLeft = 20; // Cap the retries: if the library is blocked or fails to load, stop polling instead of looping forever
52 |         function initializeWcp() { // Initialize WcpConsent once its global is defined
53 |           if (typeof WcpConsent !== 'undefined') {
54 |             WcpConsent.init("en-US", "cookie-banner", function(err, siteConsent) {
55 |               if (err) {
56 |                 console.error("WCP initialization failed: ", err);
57 |               } else {
58 |                 console.log("WCP initialization succeeded!");
59 |               }
60 |             }, onConsentChanged);
61 |           } else {
62 |             console.error("WcpConsent library not loaded");
63 |             if (retriesLeft-- > 0) setTimeout(initializeWcp, 500); // Try again after 500ms, at most 20 more times
64 |           }
65 |         }
66 | 
67 |         // Callback handed to WcpConsent.init; currently it only logs the new consent value
68 |         function onConsentChanged(newConsent) {
69 |           console.log("Consent changed: ", newConsent);
70 |         }
71 | 
72 |         // Run immediately if the DOM is already parsed, otherwise wait for DOMContentLoaded
73 |         if (document.readyState === 'loading') {
74 |           document.addEventListener('DOMContentLoaded', initializeWcp);
75 |         } else {
76 |           initializeWcp();
77 |         }
78 |       })();
79 |     </script>
79 |   </body>
80 | </html>
81 | 


--------------------------------------------------------------------------------
/docs/website/assets/css/custom.css:
--------------------------------------------------------------------------------
1 | .site-footer-credits {
2 |     color: #67747a;
3 | }
4 | 


--------------------------------------------------------------------------------
/docs/website/assets/css/style.scss:
--------------------------------------------------------------------------------
  1 | ---
  2 | ---
  3 | 
  4 | @import "{{ site.theme }}";
  5 | 
  6 | /* Tabs styling */
  7 | .tab-container {
  8 |     width: 100%;
  9 |     margin-top: 20px;
 10 | }
 11 | 
 12 | .tabs, .nested-tabs {
 13 |     display: flex; /* Add flexbox display */
 14 |     overflow: hidden;
 15 |     border: 1px solid #ccc;
 16 |     background-color: #f1f1f1;
 17 |     border-radius: 4px 4px 0 0;
 18 |     width: 100%; /* Ensure full width */
 19 | }
 20 | 
 21 | .nested-tabs {
 22 |     margin-bottom: 15px;
 23 | }
 24 | 
 25 | /* Shared styles for both main and nested tab buttons */
 26 | .tab-button, .nested-tab-button {
 27 |     background-color: inherit;
 28 |     border: none;
 29 |     outline: none;
 30 |     cursor: pointer;
 31 |     transition: 0.3s;
 32 |     flex: 1; /* Make tabs grow evenly to fill space */
 33 |     text-align: center; /* Center text in tabs */
 34 | }
 35 | 
 36 | /* Main tab button specific styles */
 37 | .tab-button {
 38 |     padding: 14px 16px;
 39 |     font-size: 16px;
 40 | }
 41 | 
 42 | /* Nested tab button specific styles */
 43 | .nested-tab-button {
 44 |     padding: 10px 12px;
 45 |     font-size: 14px;
 46 | }
 47 | 
 48 | .tab-button:hover, .nested-tab-button:hover {
 49 |     background-color: #ddd;
 50 | }
 51 | 
 52 | .tab-button.active {
 53 |     background-color: #157878;
 54 |     color: white;
 55 | }
 56 | 
 57 | .nested-tab-button.active {
 58 |     background-color: #1a9a9a; /* Slightly different color to distinguish */
 59 |     color: white;
 60 | }
 61 | 
 62 | .tab-content {
 63 |     display: none;
 64 |     padding: 20px;
 65 |     border: 1px solid #ccc;
 66 |     border-top: none;
 67 |     border-radius: 0 0 4px 4px;
 68 |     width: 100%; /* Ensure content takes full width */
 69 | }
 70 | 
 71 | .tab-content.active {
 72 |     display: block;
 73 | }
 74 | 
 75 | .nested-tab-content {
 76 |     display: none;
 77 |     padding: 10px 0;
 78 |     border-top: none;
 79 |     width: 100%; /* Ensure nested content takes full width */
 80 | }
 81 | 
 82 | .nested-tab-content.active {
 83 |     display: block;
 84 | }
 85 | 
 86 | #main-description {
 87 |     font-weight: normal;
 88 |     font-style: normal;
 89 | }
 90 | 
 91 | /* If needed, control other text properties */
 92 | #main-description {
 93 |     font-size: 16px;
 94 |     line-height: 1.5;
 95 |     color: #333;
 96 | }
 97 | 
 98 | .author-tagline {
 99 |     text-align: center;
100 |     font-style: italic;
101 |     color: #666;
102 |     margin-bottom: 20px;
103 | }
104 | 
105 | .abstract-container {
106 |     background-color: #f5f5f5;
107 |     border-radius: 8px;
108 |     padding: 20px;
109 |     margin: 20px 0;
110 |     border-left: 4px solid #ddd;
111 | }
112 | 
113 | .abstract-container h3 {
114 |     margin-top: 0;
115 |     color: #333;
116 | }
117 | 
118 | .abstract-container p {
119 |     margin-bottom: 0;
120 |     line-height: 1.6;
121 | }
122 | 
123 | .abstract-tagline {
124 |     text-align: center;
125 |     font-weight: bold;
126 |     color: #666;
127 |     margin-bottom: 20px;
128 | }
129 | 
130 | /* Table styling */
131 | // .table-container {
132 | //     overflow-x: auto;
133 | //     margin: 20px 0;
134 | //   }
135 |   
136 |   .model-scores {
137 |     width: 100%;
138 |     border-collapse: collapse;
139 |     font-size: 14px;
140 |   }
141 |   
142 |   .model-scores th {
143 |     background-color: #157878;
144 |     color: white;
145 |     text-align: left;
146 |     padding: 10px;
147 |     position: sticky;
148 |     top: 0;
149 |   }
150 |   
151 |   .model-scores td {
152 |     padding: 8px 10px;
153 |     border-bottom: 1px solid #ddd;
154 |   }
155 |   
156 |   .model-scores tr:nth-child(even) {
157 |     background-color: #f2f2f2;
158 |   }
159 |   
160 |   .model-scores tr:hover {
161 |     background-color: #e8f4f4;
162 |   }
163 |   
164 |   /* Responsive design for mobile */
165 |   @media screen and (max-width: 768px) {
166 |     .model-scores {
167 |       font-size: 12px;
168 |     }
169 |     
170 |     .model-scores th, .model-scores td {
171 |       padding: 6px;
172 |     }
173 |   }
174 | 
175 |   .environment-container {
176 |     background-color: #f5f5f5;
177 |     border-radius: 8px;
178 |     padding: 20px;
179 |     margin: 20px 0;
180 |     border-left: 4px solid #157878;
181 | }
182 | 
183 | .environment-tagline {
184 |     text-align: center;
185 |     font-weight: bold;
186 |     color: #157878;
187 |     margin-bottom: 20px;
188 | }
189 | 
190 | .environment-container p:not(.environment-tagline) {
191 |     margin-bottom: 0;
192 |     line-height: 1.6;
193 | }
194 | 
195 | .cite-button {
196 |     background: none;
197 |     border: none;
198 |     color: #157878;
199 |     cursor: pointer;
200 |     font-size: 0.8em;
201 |     padding: 0 5px;
202 |     vertical-align: middle;
203 |     transition: transform 0.2s;
204 | }
205 | 
206 | .cite-button:hover {
207 |     transform: scale(1.2);
208 | }
209 | 
210 | .modal {
211 |     display: none;
212 |     position: fixed;
213 |     z-index: 1000;
214 |     left: 0;
215 |     top: 0;
216 |     width: 100%;
217 |     height: 100%;
218 |     overflow: auto;
219 |     background-color: rgba(0,0,0,0.4);
220 |   }
221 |   
222 |   /* Modal Content */
223 |   .modal-content {
224 |     background-color: #fefefe;
225 |     margin: 10% auto;
226 |     padding: 20px;
227 |     border: 1px solid #888;
228 |     width: 80%;
229 |     max-width: 600px;
230 |     border-radius: 8px;
231 |     box-shadow: 0 4px 8px rgba(0,0,0,0.2);
232 |   }
233 |   
234 |   /* The Close Button */
235 |   .close-modal {
236 |     color: #aaa;
237 |     float: right;
238 |     font-size: 28px;
239 |     font-weight: bold;
240 |     cursor: pointer;
241 |   }
242 |   
243 |   .close-modal:hover,
244 |   .close-modal:focus {
245 |     color: black;
246 |     text-decoration: none;
247 |   }
248 |   
249 |   .citation-popup {
250 |     display: none;
251 |     position: absolute;  /* Use absolute instead of fixed */
252 |     z-index: 1000;
253 |     background-color: #fefefe;
254 |     border: 1px solid #ddd;
255 |     border-radius: 8px;
256 |     box-shadow: 0 4px 8px rgba(0,0,0,0.2);
257 |     width: 400px;
258 |     max-width: 90vw;
259 |     padding: 15px;
260 | }
261 |   
262 |   /* Citation box styling */
263 |   .citation-box {
264 |     background-color: #f9f9f9;
265 |     border: 1px solid #ddd;
266 |     border-radius: 4px;
267 |     padding: 10px;
268 |     margin: 10px 0;
269 |     max-height: 200px;
270 |     overflow-y: auto;
271 |   }
272 |   
273 |   .citation-box pre {
274 |     white-space: pre-wrap;
275 |     word-wrap: break-word;
276 |     margin: 0;
277 |     font-family: monospace;
278 |     font-size: 12px;
279 |   }
280 |   
281 |   /* Popup header */
282 |   .popup-header {
283 |     display: flex;
284 |     justify-content: space-between;
285 |     align-items: center;
286 |     margin-bottom: 10px;
287 |   }
288 |   
289 |   .popup-header h3 {
290 |     margin: 0;
291 |     font-size: 16px;
292 |   }
293 |   
294 |   .close-popup {
295 |     cursor: pointer;
296 |     color: #888;
297 |     font-size: 18px;
298 |     font-weight: bold;
299 |   }
300 |   
301 |   .close-popup:hover {
302 |     color: #333;
303 |   }
304 |   
305 |   /* Copy button styling */
306 |   .copy-button {
307 |     background-color: #157878;
308 |     color: white;
309 |     border: none;
310 |     padding: 8px 16px;
311 |     text-align: center;
312 |     text-decoration: none;
313 |     display: inline-block;
314 |     font-size: 14px;
315 |     margin: 10px 0 0 0;
316 |     cursor: pointer;
317 |     border-radius: 4px;
318 |     transition: background-color 0.3s;
319 |   }
320 |   
321 |   .copy-button:hover {
322 |     background-color: #0b5c5c;
323 |   }
324 |   
325 |   /* Existing button styling */
326 |   .cite-button {
327 |     background: none;
328 |     border: none;
329 |     color: #157878;
330 |     cursor: pointer;
331 |     font-size: 0.8em;
332 |     padding: 0 5px;
333 |     vertical-align: middle;
334 |     transition: transform 0.2s;
335 |     position: relative; /* For positioning the popup */
336 |   }
337 |   
338 |   .cite-button:hover {
339 |     transform: scale(1.2);
340 |   }
341 | 
342 |   .citation-notice {
343 |     text-align: center;
344 |     font-style: italic;
345 |     color: #555;
346 |     margin-bottom: 15px;
347 |     font-size: 0.9em;
348 |   }
349 | 
350 |   .environment-image {
351 |     display: block;
352 |     margin: 0 auto;
353 |     max-width: 100%;
354 |     height: auto;
355 |   }
356 |   
357 |   .environment-image-container {
358 |     text-align: center;
359 |     margin: 20px 0;
360 |   }
361 | 
362 | 
363 |   .image-caption {
364 |     text-align: center;
365 |     font-size: 0.85em;
366 |     color: #666;
367 |     margin-top: 5px;
368 |     font-style: italic;
369 | }
370 | 
371 | /* Add this to your stylesheet */
372 | .table-container {
373 |   width: 100%;
374 |   overflow-x: hidden;
375 | }
376 | 
377 | /* Force the table to fit within the container */
378 | .responsive-table {
379 |   width: 100%;
380 |   overflow-x: hidden;
381 | }
382 | 
383 | /* Style the model-scores table */
384 | .model-scores {
385 |   width: 100%;
386 |   table-layout: fixed;
387 |   border-collapse: collapse;
388 |   font-size: 14px; /* Base font size */
389 | }
390 | 
391 | /* Give more space to model names, less to percentages */
392 | .model-scores th:first-child,
393 | .model-scores td:first-child {
394 |   width: 16%; /* Prioritize model names */
395 |   text-align: left;
396 |   font-weight: 500;
397 |   padding-right: 8px;
398 | }
399 | 
400 | /* Make percentage columns compact: every cell except the first in a row gets the same narrow, centred width */
401 | .model-scores th:not(:first-child),
402 | .model-scores td:not(:first-child) {
403 |   width: 14%; /* 6 score columns x 14% = 84%, i.e. what is left beside the 16% first column */
404 |   text-align: center;
405 |   padding-left: 2px;
406 |   padding-right: 2px;
407 | }
408 | 
409 | /* Force text wrapping in all cells */
410 | .model-scores th,
411 | .model-scores td {
412 |   word-break: break-word;
413 |   overflow-wrap: break-word;
414 |   white-space: normal;
415 |   padding-top: 4px;
416 |   padding-bottom: 4px;
417 | }
418 | 
419 | /* Progressive font size reduction for different screen sizes */
420 | @media screen and (max-width: 992px) {
421 |   .model-scores {
422 |     font-size: 13px;
423 |   }
424 | }
425 | 
426 | @media screen and (max-width: 768px) {
427 |   .model-scores {
428 |     font-size: 12px;
429 |   }
430 |   
431 |   .model-scores th:first-child,
432 |   .model-scores td:first-child {
433 |     width: 16%;
434 |   }
435 |   
436 |   .model-scores th:not(:first-child),
437 |   .model-scores td:not(:first-child) {
438 |     width: 14%;
439 |   }
440 | }
441 | 
442 | @media screen and (max-width: 576px) {
443 |   .model-scores {
444 |     font-size: 10px;
445 |   }
446 |   
447 |   .model-scores th:first-child,
448 |   .model-scores td:first-child {
449 |     width: 12.5%;
450 |   }
451 |   
452 |   .model-scores th:not(:first-child),
453 |   .model-scores td:not(:first-child) {
454 |     width: 25%;
455 |   }
456 | }
457 | 
458 | /* For extremely small screens */
459 | @media screen and (max-width: 400px) {
460 |   .model-scores {
461 |     font-size: 9px;
462 |   }
463 | }
464 | 
465 | .simplified-table-container {
466 |   width: 100%;
467 |   overflow-x: hidden;
468 |   max-width: 100%;
469 |   display: block;
470 | }
471 | 
472 | 
473 | /* Make sure simplified-table-container is correctly displayed in nested-tab-content */
474 | #tab6-subtab1 .responsive-table.simplified-table-container {
475 |   width: 100%;
476 |   overflow-x: auto; /* Change from hidden to auto if table might overflow on small screens */
477 | }
478 | 
479 | /* Column-specific widths for the simplified table. NOTE(review): the five widths below total 110% - confirm the intended split */
480 | .simplified-table-container .model-scores th:nth-child(1),
481 | .simplified-table-container .model-scores td:nth-child(1) {
482 |   width: 10%; /* Rank column - very narrow */
483 |   text-align: center;
484 | }
485 | 
486 | .simplified-table-container .model-scores th:nth-child(2),
487 | .simplified-table-container .model-scores td:nth-child(2) {
488 |   width: 40%; /* Model name column - give it most space */
489 |   text-align: center;
490 | }
491 | 
492 | .simplified-table-container .model-scores th:nth-child(3),
493 | .simplified-table-container .model-scores td:nth-child(3) {
494 |   width: 20%; /* Third column - moderate space (presumably the organization; verify against the table markup) */
495 |   text-align: center;
496 | }
497 | 
498 | .simplified-table-container .model-scores th:nth-child(4),
499 | .simplified-table-container .model-scores td:nth-child(4) {
500 |   width: 20%; /* Fourth column - moderate space (presumably reasoning / non-reasoning model type; verify) */
501 |   text-align: center;
502 | }
503 | 
504 | .simplified-table-container .model-scores th:nth-child(5),
505 | .simplified-table-container .model-scores td:nth-child(5) {
506 |   width: 20%; /* Fifth column - moderate space (presumably the score; verify against the table markup) */
507 |   text-align: center;
508 | }
509 | 
510 | /* Responsive adjustments for simplified table: column widths for viewports up to 768px wide */
511 | @media screen and (max-width: 768px) {
512 |   .simplified-table-container .model-scores th:nth-child(1),
513 |   .simplified-table-container .model-scores td:nth-child(1) {
514 |     width: 10%;
515 |   }
516 |   
517 |   .simplified-table-container .model-scores th:nth-child(2),
518 |   .simplified-table-container .model-scores td:nth-child(2) {
519 |     width: 40%;
520 |   }
521 |   
522 |   .simplified-table-container .model-scores th:nth-child(3),
523 |   .simplified-table-container .model-scores td:nth-child(3),
524 |   .simplified-table-container .model-scores th:nth-child(4),
525 |   .simplified-table-container .model-scores td:nth-child(4),
526 |   .simplified-table-container .model-scores th:nth-child(5), /* header cell was missing from this list */
527 |   .simplified-table-container .model-scores td:nth-child(5) {
528 |     width: 10%; /* was a unitless `10`, which is not a valid width and so was discarded */
529 |   }
530 | }
531 | 
532 | .asterisk-note {
533 |   font-size: 0.85em;
534 |   color: #666;
535 |   margin-top: -10px;
536 |   margin-bottom: 15px;
537 |   font-style: italic;
538 | }
539 | 
540 | .video-container {
541 |   width: 100%;
542 |   margin: 1em 0;
543 |   position: relative;
544 | }
545 | 
546 | .video-container video {
547 |   width: 100%;
548 |   display: block;
549 | }


--------------------------------------------------------------------------------
/docs/website/assets/figs/alfworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/alfworld_all_games.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/alfworld_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/alfworld_image.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/all_framework_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/all_framework_scores.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/arxiv-logomark-small.svg:
--------------------------------------------------------------------------------
1 | <svg id="logomark" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 17.732 24.269"><g id="tiny"><path d="M573.549,280.916l2.266,2.738,6.674-7.84c.353-.47.52-.717.353-1.117a1.218,1.218,0,0,0-1.061-.748h0a.953.953,0,0,0-.712.262Z" transform="translate(-566.984 -271.548)" fill="#bdb9b4"/><path d="M579.525,282.225l-10.606-10.174a1.413,1.413,0,0,0-.834-.5,1.09,1.09,0,0,0-1.027.66c-.167.4-.047.681.319,1.206l8.44,10.242h0l-6.282,7.716a1.336,1.336,0,0,0-.323,1.3,1.114,1.114,0,0,0,1.04.69A.992.992,0,0,0,571,293l8.519-7.92A1.924,1.924,0,0,0,579.525,282.225Z" transform="translate(-566.984 -271.548)" fill="#b31b1b"/><path d="M584.32,293.912l-8.525-10.275,0,0L573.53,280.9l-1.389,1.254a2.063,2.063,0,0,0,0,2.965l10.812,10.419a.925.925,0,0,0,.742.282,1.039,1.039,0,0,0,.953-.667A1.261,1.261,0,0,0,584.32,293.912Z" transform="translate(-566.984 -271.548)" fill="#bdb9b4"/></g></svg>


--------------------------------------------------------------------------------
/docs/website/assets/figs/arxiv-logomark.svg:
--------------------------------------------------------------------------------
1 | <svg id="logomark" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 135.611 201"><g id="medium"><path d="M456.756,249.1l-42.324,49.375c-1.885,2.01-3.054,5.535-2,8.066a6.9,6.9,0,0,0,6.443,4.27c1.6,0,2.911-.56,4.631-2.289l52.364-54.278a8.927,8.927,0,0,0,.061-12.584Z" transform="translate(-391.029 -166.577)" fill="#aa142d"/><path d="M455.876,248.071l39.168-49.545c2.186-2.914,3.219-4.44,2.186-6.921a7.536,7.536,0,0,0-6.566-4.633h0a5.9,5.9,0,0,0-4.408,1.624l-50.072,53.061c-3.816,3.816-3.808,8.753.023,12.584l70.031,73.438a5.738,5.738,0,0,0,4.6,1.748c2.833,0,4.67-1.667,5.9-4.131,1.054-2.531-.112-5.038-2.056-7.664l-58.8-69.561" transform="translate(-391.029 -166.577)" fill="#bdb9b4"/><path d="M475.931,241.657l-68.388-71.966s-2.511-3.048-5.165-3.112a6.75,6.75,0,0,0-6.357,4.084c-1.033,2.481-.291,4.223,1.977,7.47L456.756,249.1Z" transform="translate(-391.029 -166.577)" fill="#aa142d"/><path d="M472.39,367.577a2.2,2.2,0,0,1-1.895-1.1l-8.528-14.161-8.576,14.161a2.152,2.152,0,0,1-1.9,1.1,2.33,2.33,0,0,1-2.343-2.3,2.269,2.269,0,0,1,.348-1.246l9.925-15.907L451.3,335.114a2.722,2.722,0,0,1-.351-1.247,2.331,2.331,0,0,1,2.347-2.292,2.093,2.093,0,0,1,1.894,1.046l6.779,11.42,6.734-11.42a2.011,2.011,0,0,1,1.843-1.046,2.268,2.268,0,0,1,2.346,2.292,2.7,2.7,0,0,1-.3,1.247l-8.079,13.015,9.825,15.907a1.956,1.956,0,0,1,.4,1.246A2.363,2.363,0,0,1,472.39,367.577Z" transform="translate(-391.029 -166.577)"/><path d="M429.138,342.132c1.021,0,1.7.681,2.188,1.9a2.817,2.817,0,0,1,2.578-1.9h8.169a2.231,2.231,0,0,1,2.237,2.237v4.3a1.983,1.983,0,0,1-2.237,2.237,2,2,0,0,1-2.237-2.237v-2.061h-5.787a1.324,1.324,0,0,0-1.458,1.507v14.929h5.494a2.237,2.237,0,1,1,0,4.473H423.6a2.237,2.237,0,0,1,0-4.473h4.522V346.606H424.13a2.238,2.238,0,0,1,0-4.474Z" transform="translate(-391.029 -166.577)"/><path 
d="M490.449,342.132a2,2,0,0,1,2.237,2.237v18.673h5.543a2.237,2.237,0,1,1,0,4.473H482.523a2.237,2.237,0,1,1,0-4.473h5.69V346.606h-4.686a2.237,2.237,0,0,1,0-4.474Zm2.317-9.672a3.217,3.217,0,1,1-3.194-3.195A3.241,3.241,0,0,1,492.766,332.46Z" transform="translate(-391.029 -166.577)"/><path d="M526.64,344.369a2.527,2.527,0,0,1-.147.875l-8.169,20.91a2.042,2.042,0,0,1-2.043,1.361h-3.258a2.027,2.027,0,0,1-2.091-1.361l-8.266-20.91a1.78,1.78,0,0,1-.195-.875,2.242,2.242,0,0,1,2.286-2.237,2.086,2.086,0,0,1,2.042,1.459l7.878,19.84,7.634-19.84a2.087,2.087,0,0,1,2.043-1.459A2.242,2.242,0,0,1,526.64,344.369Z" transform="translate(-391.029 -166.577)"/><path d="M408.844,342.132a4.268,4.268,0,0,1,4.506,4.611v20.772h-19.5a2.643,2.643,0,0,1-2.824-2.706v-7.153a4.533,4.533,0,0,1,3.842-4.474l14-1.961v-4.615H393.3a2.2,2.2,0,0,1-2.274-2.237,2.173,2.173,0,0,1,2.391-2.237Zm.032,20.91v-7.3L395.5,357.607v5.435Z" transform="translate(-391.029 -166.577)"/></g></svg>


--------------------------------------------------------------------------------
/docs/website/assets/figs/figure1_eric.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/figure1_eric.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/github-mark.svg:
--------------------------------------------------------------------------------
1 | <svg width="98" height="96" xmlns="http://www.w3.org/2000/svg"><path fill-rule="evenodd" clip-rule="evenodd" d="M48.854 0C21.839 0 0 22 0 49.217c0 21.756 13.993 40.172 33.405 46.69 2.427.49 3.316-1.059 3.316-2.362 0-1.141-.08-5.052-.08-9.127-13.59 2.934-16.42-5.867-16.42-5.867-2.184-5.704-5.42-7.17-5.42-7.17-4.448-3.015.324-3.015.324-3.015 4.934.326 7.523 5.052 7.523 5.052 4.367 7.496 11.404 5.378 14.235 4.074.404-3.178 1.699-5.378 3.074-6.6-10.839-1.141-22.243-5.378-22.243-24.283 0-5.378 1.94-9.778 5.014-13.2-.485-1.222-2.184-6.275.486-13.038 0 0 4.125-1.304 13.426 5.052a46.97 46.97 0 0 1 12.214-1.63c4.125 0 8.33.571 12.213 1.63 9.302-6.356 13.427-5.052 13.427-5.052 2.67 6.763.97 11.816.485 13.038 3.155 3.422 5.015 7.822 5.015 13.2 0 18.905-11.404 23.06-22.324 24.283 1.78 1.548 3.316 4.481 3.316 9.126 0 6.6-.08 11.897-.08 13.526 0 1.304.89 2.853 3.316 2.364 19.412-6.52 33.405-24.935 33.405-46.691C97.707 22 75.788 0 48.854 0z" fill="#24292f"/></svg>


--------------------------------------------------------------------------------
/docs/website/assets/figs/jericho_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/jericho_all_games.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/jericho_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/jericho_image.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/radar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/radar_chart.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/radar_chart_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/radar_chart_zoom.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/scienceworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/scienceworld_all_games.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/scienceworld_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/scienceworld_image.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/simon_says_chatgpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/simon_says_chatgpt.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/static_banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/static_banner.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/text-benchmark_bar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_bar_chart.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/text-benchmark_radar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_radar.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/text-benchmark_radar_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/text-benchmark_radar_zoom.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/textworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_all_games.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/textworld_express_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_express_all_games.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/textworld_image.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/textworld_image.png


--------------------------------------------------------------------------------
/docs/website/assets/figs/zork1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/figs/zork1.png


--------------------------------------------------------------------------------
/docs/website/assets/js/tabs.js:
--------------------------------------------------------------------------------
  1 | function openTab(evt, tabName) {
  2 |     var i, tabcontent, tabbuttons;
  3 |     
  4 |     // Hide all tab content
  5 |     tabcontent = document.getElementsByClassName("tab-content");
  6 |     for (i = 0; i < tabcontent.length; i++) {
  7 |         tabcontent[i].style.display = "none";
  8 |     }
  9 |     
 10 |     // Remove "active" class from all tab buttons
 11 |     tabbuttons = document.getElementsByClassName("tab-button");
 12 |     for (i = 0; i < tabbuttons.length; i++) {
 13 |         tabbuttons[i].className = tabbuttons[i].className.replace(" active", "");
 14 |     }
 15 |     
 16 |     // Show the current tab and add "active" class to the button
 17 |     document.getElementById(tabName).style.display = "block";
 18 |     evt.currentTarget.className += " active";
 19 | }
 20 | 
 21 | // Nested tab functionality
 22 | function openNestedTab(evt, tabName) {
 23 |     var i, tabcontent, tabbuttons;
 24 |     
 25 |     // Hide all nested tab content within the parent tab
 26 |     var parentTab = evt.currentTarget.closest('.tab-content');
 27 |     tabcontent = parentTab.getElementsByClassName("nested-tab-content");
 28 |     for (i = 0; i < tabcontent.length; i++) {
 29 |         tabcontent[i].style.display = "none";
 30 |     }
 31 |     
 32 |     // Remove "active" class from all nested tab buttons
 33 |     tabbuttons = parentTab.getElementsByClassName("nested-tab-button");
 34 |     for (i = 0; i < tabbuttons.length; i++) {
 35 |         tabbuttons[i].className = tabbuttons[i].className.replace(" active", "");
 36 |     }
 37 |     
 38 |     // Show the current nested tab and add "active" class to the button
 39 |     document.getElementById(tabName).style.display = "block";
 40 |     evt.currentTarget.className += " active";
 41 | }
 42 | 
 43 | 
 44 | function copyTextToClipboard(elementId, event) {
 45 |     console.log("Citation button clicked for: " + elementId);
 46 |     
 47 |     // Get the citation text
 48 |     var citationText = document.getElementById(elementId);
 49 |     if (!citationText) {
 50 |         console.error("Citation element not found: " + elementId);
 51 |         return;
 52 |     }
 53 |     
 54 |     // Force create popup if not exists
 55 |     var popup = document.getElementById('citation-popup');
 56 |     if (!popup) {
 57 |         console.log("Creating popup because it doesn't exist yet");
 58 |         var popupHTML = 
 59 |             '<div class="citation-popup" id="citation-popup">' +
 60 |                 '<div class="popup-header">' +
 61 |                     '<h3>Citation</h3>' +
 62 |                     '<span class="close-popup">&times;</span>' +
 63 |                 '</div>' +
 64 |                 '<div class="citation-box">' +
 65 |                     '<pre id="citation-text"></pre>' +
 66 |                 '</div>' +
 67 |                 '<button id="copy-citation-button" class="copy-button">Copy to Clipboard</button>' +
 68 |             '</div>';
 69 |         
 70 |         document.body.insertAdjacentHTML('beforeend', popupHTML);
 71 |         popup = document.getElementById('citation-popup');
 72 |         
 73 |         // Set up event handlers for the newly created popup
 74 |         var closeButton = document.querySelector('.close-popup');
 75 |         var copyButton = document.getElementById('copy-citation-button');
 76 |         
 77 |         if (closeButton) {
 78 |             closeButton.onclick = function() {
 79 |                 popup.style.display = 'none';
 80 |             };
 81 |         }
 82 |         
 83 |         if (copyButton) {
 84 |             copyButton.onclick = function() {
 85 |                 var text = document.getElementById('citation-text').innerText;
 86 |                 navigator.clipboard.writeText(text).then(function() {
 87 |                     copyButton.innerText = 'Copied!';
 88 |                     setTimeout(function() {
 89 |                         copyButton.innerText = 'Copy to Clipboard';
 90 |                     }, 1500);
 91 |                 });
 92 |             };
 93 |         }
 94 |     }
 95 |     
 96 |     // Set the citation text in the popup
 97 |     var citationTextElement = document.getElementById('citation-text');
 98 |     if (citationTextElement) {
 99 |         citationTextElement.innerText = citationText.innerText;
100 |     }
101 |     
102 |     // Position the popup near the mouse cursor instead of the button
103 |     var x = event.clientX;
104 |     var y = event.clientY;
105 |     
106 |     // Get dimensions
107 |     var viewportWidth = window.innerWidth || document.documentElement.clientWidth;
108 |     var viewportHeight = window.innerHeight || document.documentElement.clientHeight;
109 |     var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
110 |     var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
111 |     
112 |     // Show the popup temporarily to get its dimensions
113 |     popup.style.visibility = 'hidden';
114 |     popup.style.display = 'block';
115 |     var popupWidth = popup.offsetWidth;
116 |     var popupHeight = popup.offsetHeight;
117 |     
118 |     // Calculate position to ensure popup stays in viewport
119 |     // Add 10px padding from edges
120 |     var padding = 10;
121 |     
122 |     // Position horizontally
123 |     if (x + popupWidth + padding > viewportWidth) {
124 |         // If too far right, position to the left of cursor
125 |         x = Math.max(padding, x - popupWidth - padding);
126 |     } else {
127 |         // Otherwise position to the right of cursor with padding
128 |         x = x + padding;
129 |     }
130 |     
131 |     // Position vertically
132 |     if (y + popupHeight + padding > viewportHeight) {
133 |         // If too far down, position above cursor
134 |         y = Math.max(padding, y - popupHeight - padding);
135 |     } else {
136 |         // Otherwise position below cursor with padding
137 |         y = y + padding;
138 |     }
139 |     
140 |     // Apply the position (convert from viewport coordinates to document coordinates)
141 |     popup.style.left = (x + scrollLeft) + 'px';
142 |     popup.style.top = (y + scrollTop) + 'px';
143 |     
144 |     // Make the popup visible
145 |     popup.style.visibility = 'visible';
146 |     
147 |     // Prevent default action and event bubbling
148 |     event.preventDefault();
149 |     event.stopPropagation();
150 | }
151 | 
152 | // Initialize tabs and set up the citation popup
document.addEventListener('DOMContentLoaded', function() {
    // Activate the first tab by simulating a click on the first tab button
    var firstTabButton = document.querySelector('.tab-button');
    if (firstTabButton) {
        firstTabButton.click();
    }

    // Create the citation popup HTML if it doesn't exist
    if (!document.getElementById('citation-popup')) {
        var popupHTML =
            '<div class="citation-popup" id="citation-popup">' +
                '<div class="popup-header">' +
                    '<h3>Citation</h3>' +
                    '<span class="close-popup">&times;</span>' +
                '</div>' +
                '<div class="citation-box">' +
                    '<pre id="citation-text"></pre>' +
                '</div>' +
                '<button id="copy-citation-button" class="copy-button">Copy to Clipboard</button>' +
            '</div>';

        document.body.insertAdjacentHTML('beforeend', popupHTML);

        // Now set up the event handlers for the popup
        var popup = document.getElementById('citation-popup');
        var closeButton = document.querySelector('.close-popup');
        var copyButton = document.getElementById('copy-citation-button');

        // Close popup when clicking the close button
        if (closeButton) {
            closeButton.onclick = function() {
                popup.style.display = 'none';
            };
        }

        // When the user clicks the copy button
        if (copyButton) {
            // Idle label of the button, fixed up front. Reading the label inside
            // the click handler would capture 'Copied!' on a second click made
            // within the 1.5s feedback window and leave the button stuck on it.
            var idleLabel = 'Copy to Clipboard';

            copyButton.onclick = function() {
                var text = document.getElementById('citation-text').innerText;
                navigator.clipboard.writeText(text).then(function() {
                    // Change button text temporarily to provide feedback
                    copyButton.innerText = 'Copied!';
                    setTimeout(function() {
                        copyButton.innerText = idleLabel;
                    }, 1500);
                }).catch(function(err) {
                    console.error('Could not copy text: ', err);
                });
            };
        }

        // Close popup when clicking outside of it (clicks on a cite button are
        // ignored here because those are what open the popup)
        document.addEventListener('click', function(event) {
            if (popup &&
                !popup.contains(event.target) &&
                !event.target.classList.contains('cite-button') &&
                popup.style.display === 'block') {
                popup.style.display = 'none';
            }
        });
    }
});
216 | 
217 | 
document.addEventListener('DOMContentLoaded', function() {
    // Tag the first table inside the 'tab6-subtab1' container with the
    // 'simplified-scores' class; do nothing when either element is missing.
    const container = document.getElementById('tab6-subtab1');
    const scoresTable = container ? container.querySelector('table') : null;
    if (scoresTable) {
        scoresTable.classList.add('simplified-scores');
    }
});


--------------------------------------------------------------------------------
/docs/website/assets/videos/figure1v4.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/assets/videos/figure1v4.mp4


--------------------------------------------------------------------------------
/docs/website/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/docs/website/favicon.ico


--------------------------------------------------------------------------------
/print_results.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import glob
 3 | import os
 4 | from os.path import join as pjoin
 5 | 
 6 | import pandas as pd
 7 | 
 8 | 
 9 | def parse_args():
10 |     # fmt: off
11 |     parser = argparse.ArgumentParser()
12 |     parser.add_argument("--logs", metavar="path", nargs="+", default=["logs"],
13 |                         help="Paths within which to look for .jsonl files.")
14 |     return parser.parse_args()
15 |     # fmt: on
16 | 
17 | 
18 | def main():
19 |     args = parse_args()
20 | 
21 |     results = []
22 |     for logpath in args.logs:
23 |         for logfile in glob.glob(pjoin(logpath, "**", "*.jsonl"), recursive=True):
24 | 
25 |             path, _ = os.path.splitext(logfile)
26 |             _, agent, env_name, env_params = path.rsplit(os.path.sep, maxsplit=3)
27 |             admissible_command, game_seed = env_params.split("_")
28 |             admissible_command = bool(int(admissible_command[1]))
29 |             agent = agent.split("_", maxsplit=1)[1]
30 | 
31 |             data = pd.read_json(logfile, lines=True)
32 | 
33 |             results.append(
34 |                 {
35 |                     "agent": agent,
36 |                     "env_name": env_name,
37 |                     # "env_params": env_params,
38 |                     "admissible_command": admissible_command,
39 |                     "game_seed": game_seed,
40 |                     "total_tokens": data["Token Usage"].sum(),
41 |                     "avg_tokens_per_step": data["Token Usage"].mean(),
42 |                     "norm_score": data["Normalized Score"].max(),
43 |                     "nb_steps": data["Step"].max(),
44 |                     # TODO: add more metrics: duration, nb_resets, nb_wins/losts, nb_invalid_actions, in-game moves
45 |                 }
46 |             )
47 |     df = pd.DataFrame.from_records(results)
48 | 
49 |     group = df.groupby(["agent", "admissible_command", "env_name"])
50 |     columns = ["total_tokens", "avg_tokens_per_step", "norm_score", "nb_steps"]
51 |     print(group[columns].mean())
52 |     print()
53 | 
54 |     group = df.groupby(["agent", "admissible_command"])
55 |     aggregated_results = group.agg(
56 |         {
57 |             "total_tokens": "sum",
58 |             "avg_tokens_per_step": "mean",
59 |             "norm_score": ["mean", "std"],
60 |             "nb_steps": "mean",
61 |         }
62 |     )
63 |     print(aggregated_results)
64 | 
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "tale-suite"
 7 | version = "1.0.0rc1"
 8 | description = "TALES: Text-Adventure Learning Environment Suite"
 9 | readme = "README.md"
10 | requires-python = ">=3.12"
11 | dynamic = ["dependencies"]
12 | 
13 | classifiers = [
14 |     "Programming Language :: Python :: 3",
15 |     "License :: OSI Approved :: MIT License",
16 |     "Operating System :: OS Independent",
17 | ]
18 | 
19 | [tool.setuptools.dynamic]
20 | dependencies = {file = ["requirements.txt"]}
21 | 
22 | [tool.setuptools.packages.find]
23 | exclude = ["wandb/*", "logs/*", "website/*"]
24 | 
25 | 
26 | [project.optional-dependencies]
27 | dev = [
28 |     "pytest",
29 |     "pre-commit",
30 |     "black",
31 |     "isort",
32 | ]
33 | 
34 | [project.urls]
35 | "Homepage" = "https://github.com/microsoft/tale-suite"
36 | "Bug Tracker" = "https://github.com/microsoft/tale-suite/issues"
37 | 
38 | [tool.black]
39 | line-length = 88
40 | 
41 | [tool.isort]
42 | profile = "black"
43 | known_third_party = ["wandb"]
44 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | tatsu==5.8.3
 2 | gymnasium>=1.0.0
 3 | jericho>=3.3.0
 4 | textworld[pddl]>=1.6.2rc3
 5 | textworld-express>=1.1.0rc5
 6 | scienceworld>=1.2.2
 7 | discoveryworld
 8 | 
 9 | alfworld>=0.4.0
10 | 
11 | termcolor
12 | wandb
13 | numpy
14 | pandas
15 | 
16 | # llm>=0.18.0
17 | llm @ git+https://github.com/MarcCote/llm.git@add_extra_body_option
18 | llm-anthropic
19 | llm-gemini
20 | llm-azure-openai @ git+https://github.com/MarcCote/llm-azure-openai.git@generic_ad_auth
21 | anthropic
22 | google-genai
23 | tiktoken
24 | tenacity
25 | transformers
26 | 


--------------------------------------------------------------------------------
/scripts/example_script.sh:
--------------------------------------------------------------------------------
 1 | # This is an example script to show how to use a self-hosted model with vllm to run the twb
 2 | model=""
 3 | 
 4 | cat <<EOL >> .config/io.datasette.llm/extra-openai-models.yaml
 5 | 
 6 |   - model_id: $model
 7 |     model_name: $model
 8 |     api_base: "http://127.0.0.1:8002/v1"
 9 | EOL
10 | 
11 | export WANDB_API_KEY=''
12 | # Makes a log folder for vllm. This may error out if you already have a logs folder
13 | mkdir logs 
14 | 
15 | # Run the vllm server for the meta-llama/Llama-3.1-8B-Instruct model on port 8002. Make sure you have set your HF token 
16 | nohup bash -c 'until ! (python -m vllm.entrypoints.openai.api_server --model mistralai/Ministral-8B-Instruct-2410 --port 8002 --tensor-parallel-size 1 --trust-remote-code --host 0.0.0.0 > logs/vllm_1.log 2>&1); do sleep 120; done' &
17 | 
18 | # To make sure this doesn't run forever, we let it run for 300 seconds and check every 30 seconds
19 | echo "Waiting for VLLM server to start..."
20 | timeout=500 
21 | interval=30  
22 | elapsed=0
23 | 
24 | # Wait loop with timeout
25 | until curl -s -o /dev/null -w "%{http_code}" http://localhost:8002/v1/models | grep -q "200"; do
26 |     if [ $elapsed -ge $timeout ]; then
27 |         echo "Timeout reached! VLLM server did not start within 5 minutes."
28 |         exit 1
29 |     fi
30 |     sleep $interval
31 |     echo "Pinging vllm server..."
32 |     elapsed=$((elapsed + interval))
33 | done
34 | 
35 | # Send a test request to the API
36 | curl -X POST "http://localhost:8002/v1/completions" -H "Content-Type: application/json" -d '{"model": "mistralai/Ministral-8B-Instruct-2410", "prompt": "You want to play a (text) game?", "max_tokens": 10}'
37 | 
38 | # Run the text games benchmark with the model we just set up for xork1
39 | wandb login
40 | 
41 | set -ex
42 | 
43 | pids=""
44 | 
45 | for i in {1..5}; do
46 |     python benchmark.py --agent agents/llm.py zero-shot --conversation --llm mistralai/Ministral-8B-Instruct-2410 --envs JerichoEnvZork1 --context 100 --nb-steps 100 --conversation --seed "20241106$((i))"
47 |     pids="$pids $!"
48 |     sleep 60
49 | done
50 | 
51 | wait $pids


--------------------------------------------------------------------------------
/tales/__init__.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import os
 3 | import traceback
 4 | import warnings
 5 | from collections import defaultdict
 6 | 
 7 | from termcolor import colored
 8 | 
 9 | from tales.agent import Agent
10 | from tales.version import __version__
11 | 
# Directory of the `tales` package; every sub-directory is a candidate task.
root_dir = os.path.dirname(os.path.abspath(__file__))
# Names of the discovered task sub-packages.
tasks = []
# Names of all environments from the tasks that were imported successfully.
envs = []
# Mapping: task name -> list of its environment names.
envs_per_task = defaultdict(list)

# Sub-directories that are never considered as tasks.
_exclude_path = ["__pycache__", "tests"]

# Discover tasks: a task is a sub-directory that contains an `__init__.py`
# and no entry named `skip`.
for dirname in os.listdir(root_dir):
    if not os.path.isdir(os.path.join(root_dir, dirname)):
        continue

    if dirname in _exclude_path:
        continue

    # An entry named "skip" inside the directory disables that task.
    if "skip" in os.listdir(os.path.join(root_dir, dirname)):
        continue

    if "__init__.py" in os.listdir(os.path.join(root_dir, dirname)):
        tasks.append(dirname)


# Import each task and collect the environments it declares. Failures are
# reported as warnings so that one broken task does not prevent the import of
# `tales` itself.
for task in tasks:
    try:
        # Load environments
        module = importlib.import_module(f".{task}", package="tales")
        # `environments` is expected to be an iterable of (env_name, version) pairs.
        environments = getattr(module, "environments", None)
        if environments:
            for env_name, version in environments:
                envs.append(env_name)
                envs_per_task[task].append(env_name)
        else:
            # Attribute missing or empty: warn and move on to the next task.
            warnings.warn(
                "Failed to load `{}.environments`. Skipping the task.".format(task),
                UserWarning,
            )
            continue

    except Exception as e:
        # Emit the failure, the exception message and the traceback as three
        # separate warnings (the last two colored in red).
        warnings.warn(
            "Failed to import `{}`. Skipping the task.".format(task), UserWarning
        )
        warnings.warn(colored(f"{e}", "red"), UserWarning)
        warnings.warn(colored(f"{traceback.format_exc()}", "red"), UserWarning)
        continue

# Freeze into a plain dict so that looking up an unknown task raises KeyError
# instead of silently creating an empty entry.
envs_per_task = dict(envs_per_task)
# Reverse mapping: environment name -> task name. The comprehension's `envs`
# is local to the comprehension and does not rebind the module-level `envs`.
env2task = {env: task for task, envs in envs_per_task.items() for env in envs}

__all__ = ["Agent", "__version__", "envs", "envs_per_task", "tasks"]
61 | 


--------------------------------------------------------------------------------
/tales/agent.py:
--------------------------------------------------------------------------------
 1 | class Agent:
 2 | 
 3 |     def reset(self, obs, info, env):
 4 |         pass
 5 | 
 6 |     def act(self, obs, reward, done, info):
 7 |         raise NotImplementedError("Child class must implement this method.")
 8 | 
 9 |     @property
10 |     def uid(self):
11 |         """Unique identifier for this agent.
12 | 
13 |         Usually, this is a string that contains the class name and the values of the
14 |         parameters used to initialize the agent.
15 |         """
16 |         # return f"{self.__class__.__name__}_" + "_".join(
17 |         #     f"{k}:{v}" for k, v in self.kwargs.items()
18 |         # ).strip("_")
19 |         raise NotImplementedError("Child class must implement this property.")
20 | 
21 |     @property
22 |     def params(self):
23 |         """Parameters used to initialize the agent.
24 | 
25 |         Returns:
26 |             dict: Parameters used to initialize the agent.
27 |         """
28 |         # return self.kwargs
29 |         raise NotImplementedError("Child class must implement this property.")
30 | 
31 | 
# Registry for available agents to benchmark:
# name -> (description, class, add_arguments).
AGENTS = {}


def register(name: str, desc: str, klass: callable, add_arguments: callable) -> None:
    """Add a new kind of agent to the `AGENTS` registry.

    Arguments:
        name:
            Name of the agent (must be unique).
        desc:
            Brief description of how the agent works (for `benchmark.py --help`).
        klass:
            Class used to instantiate the agent.
        add_arguments:
            Function that should add the `argparse` arguments needed for this agent.
            The provided function should expect a `argparse.ArgumentParser` object.

    Raises:
        ValueError: if an agent is already registered under `name`.

    Example:

        >>> from tales.agent import register
        >>> from tales.agents import RandomAgent
        >>> def _add_arguments(parser):
        ...     parser.add_argument("--seed", required=True, type=int,
        ...                         help="Random seed to use.")
        ...
        >>> register(name="random",
        ...          desc="This agent randomly select actions.",
        ...          klass=RandomAgent,
        ...          add_arguments=_add_arguments)
    """
    already_registered = name in AGENTS
    if already_registered:
        raise ValueError(f"Agent '{name}' already registered.")

    entry = (desc, klass, add_arguments)
    AGENTS[name] = entry
67 | 


--------------------------------------------------------------------------------
/tales/alfworld/__init__.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | 
 3 | from .alfworld_data import TASK_TYPES, prepare_alfworld_data
 4 | from .alfworld_env import ALFWorldTask
 5 | 
# [env_name, version] pairs for every ALFWorld environment registered below.
environments = []

# Register one Gymnasium environment per (split, task type) combination.
for split in ["seen", "unseen"]:
    for task_type in TASK_TYPES:
        # snake_case -> CamelCase,
        # e.g. "pick_and_place_simple" -> "PickAndPlaceSimple".
        task_name = task_type.replace("_", " ").title().replace(" ", "")
        # e.g. "ALFWorldPickAndPlaceSimpleSeen"
        env_name = f"ALFWorld{task_name}{split.title()}"
        environments.append([env_name, "v0"])

        gym.register(
            id=f"tales/{env_name}-v0",
            entry_point="tales.alfworld:ALFWorldTask",
            kwargs={"task_type": task_type, "split": split},
        )
19 | 
20 | 
def download():
    """Prepare the ALFWorld data by delegating to `prepare_alfworld_data()`."""
    prepare_alfworld_data()
23 | 


--------------------------------------------------------------------------------
/tales/alfworld/alfworld_data.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import zipfile
 4 | from os.path import join as pjoin
 5 | 
 6 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD
 7 | from tales.utils import download
 8 | 
# ALFWorld task types. Each name is used as a directory-name prefix when
# globbing for game files in `get_alfworld_game`.
TASK_TYPES = [
    "pick_and_place_simple",
    "look_at_obj_in_light",
    "pick_clean_then_place_in_recep",
    "pick_heat_then_place_in_recep",
    "pick_cool_then_place_in_recep",
    "pick_two_obj_and_place",
]

# Archive downloaded by `prepare_alfworld_data`.
ALFWORLD_DATA_URL = "https://github.com/alfworld/alfworld/releases/download/0.4.2/json_2.1.3_tw-pddl.zip"
# Cache directory holding the downloaded archive and the extracted data.
TALES_CACHE_ALFWORLD = pjoin(TALES_CACHE_HOME, "alfworld")
# Expected location of the downloaded archive inside the cache directory.
TALES_CACHE_ALFWORLD_DATA_ZIP = pjoin(TALES_CACHE_ALFWORLD, "json_2.1.3_tw-pddl.zip")
# Locations of the extracted "seen" / "unseen" validation splits.
# NOTE(review): "json_2.1.1" differs from the archive name ("json_2.1.3") —
# presumably it is the top-level folder inside the archive; confirm.
TALES_CACHE_ALFWORLD_VALID_SEEN = pjoin(
    TALES_CACHE_ALFWORLD, "json_2.1.1", "valid_seen"
)
TALES_CACHE_ALFWORLD_VALID_UNSEEN = pjoin(
    TALES_CACHE_ALFWORLD, "json_2.1.1", "valid_unseen"
)
27 | 
28 | 
def prepare_alfworld_data(force=TALES_FORCE_DOWNLOAD):
    """Make sure the ALFWorld validation splits are available in the cache.

    Nothing is done when both split folders already exist, unless `force` is
    true. Otherwise the archive is downloaded (when missing or when `force`
    is true) and its `valid_seen` / `valid_unseen` members are extracted into
    the ALFWorld cache directory.

    Arguments:
        force:
            Re-download and re-extract even if the data is already present.
    """
    os.makedirs(TALES_CACHE_ALFWORLD, exist_ok=True)

    split_dirs = (TALES_CACHE_ALFWORLD_VALID_SEEN, TALES_CACHE_ALFWORLD_VALID_UNSEEN)
    if not force and all(os.path.exists(path) for path in split_dirs):
        return

    if force or not os.path.exists(TALES_CACHE_ALFWORLD_DATA_ZIP):
        download(
            ALFWORLD_DATA_URL,
            dst=TALES_CACHE_ALFWORLD,
            desc="Downloading ALFWorld data",
            force=force,
        )

    # Only the two validation splits are extracted from the archive.
    with zipfile.ZipFile(TALES_CACHE_ALFWORLD_DATA_ZIP, "r") as archive:
        wanted = [
            member
            for member in archive.namelist()
            if "valid_seen" in member or "valid_unseen" in member
        ]
        archive.extractall(TALES_CACHE_ALFWORLD, members=wanted)
51 | 
52 | 
def get_alfworld_game(task_type, split="seen"):
    """Return the sorted ``.tw-pddl`` game files of one task type.

    The ALFWorld data is prepared first (downloaded/extracted if needed).

    Args:
        task_type: Prefix of the task directories to search.
        split: Either ``"seen"`` or ``"unseen"``.

    Returns:
        Sorted list of matching game file paths (possibly empty).

    Raises:
        ValueError: If ``split`` is neither ``"seen"`` nor ``"unseen"``.
    """
    prepare_alfworld_data()  # make sure the data is ready

    if split not in ("seen", "unseen"):
        raise ValueError(f"Unknown split: {split}")
    root = (
        TALES_CACHE_ALFWORLD_VALID_SEEN
        if split == "seen"
        else TALES_CACHE_ALFWORLD_VALID_UNSEEN
    )

    # NOTE: glob is called without recursive=True, so "**" behaves like "*"
    # and matches exactly one directory level below the task directory.
    pattern = pjoin(root, f"{task_type}*", "**", "*.tw-pddl")
    return sorted(glob.glob(pattern))
65 | 


--------------------------------------------------------------------------------
/tales/alfworld/alfworld_env.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | import textworld
 3 | import textworld.gym
 4 | from alfworld.agents.environment.alfred_tw_env import AlfredDemangler
 5 | from textworld.envs.wrappers import Filter
 6 | 
 7 | from . import alfworld_data
 8 | 
 9 | 
class ALFWorldEnv(gym.Env):
    """Gymnasium environment playing a single ALFWorld game file.

    The underlying TextWorld environment is created lazily on the first
    `reset` call and released again by `close`.
    """

    def __init__(self, gamefile, admissible_commands=False, *args, **kwargs):
        """Store the game file and the infos to request from TextWorld.

        Args:
            gamefile: Path of the game file handed to ``textworld.start``.
            admissible_commands: Whether admissible commands are requested
                in the infos.
            *args: Accepted for signature compatibility; unused.
            **kwargs: Accepted for signature compatibility; unused.
        """
        self.infos = textworld.EnvInfos(
            score=True,
            max_score=True,
            won=True,
            lost=True,
            feedback=True,
            moves=True,
            admissible_commands=admissible_commands,
            extras=["walkthrough", "expert_plan"],
        )
        self.gamefile = gamefile
        self.env = None  # created on the first reset()

    def reset(self, *, seed=None, options=None):
        """Start a new episode and return ``(obs, info)``.

        ``info`` is augmented with ``feedback`` (the observation), a
        ``score`` of 0 and a ``max_score`` of 1.
        """
        super().reset(seed=seed, options=options)

        if self.env is None:
            self.env = textworld.start(
                self.gamefile, self.infos, wrappers=[Filter, AlfredDemangler()]
            )

        obs, info = self.env.reset()
        info["feedback"] = obs
        info["score"] = 0
        info["max_score"] = 1
        return obs, info

    def step(self, action):
        """Send ``action`` to the game and return the resulting 4-tuple.

        The tuple coming from the wrapped environment is passed through in
        its original order; ``info`` is augmented with ``feedback``,
        ``score`` and ``max_score``.
        """
        # NOTE(review): TextWorld documents step() as returning
        # (obs, score, done, infos). The locals are named accordingly; the
        # order of the returned tuple is unchanged — confirm that
        # info["score"] is meant to derive from the second element.
        obs, score, done, info = self.env.step(action)
        # if obs == "Nothing happens.":
        #     obs = "Invalid command or this command can't be used in this context. Type 'help' for a list of available commands."

        info["feedback"] = obs
        info["score"] = int(score)
        info["max_score"] = 1
        return obs, score, done, info

    def close(self):
        """Release the underlying TextWorld environment, if one was started.

        Safe to call repeatedly; a later `reset` starts a fresh environment.
        """
        if self.env is not None:
            self.env.close()
            self.env = None
49 | 
50 | 
class ALFWorldTask(ALFWorldEnv):
    """ALFWorld environment spanning every game of one task type and split.

    The seed passed to `reset` selects which game file is played.
    """

    def __init__(self, task_type, split, *args, **kwargs):
        """Collect the game files for ``task_type``/``split``.

        The first game file is used until a seeded `reset` picks another one.
        Indexing the first file raises ``IndexError`` when no game was found.
        """
        games = alfworld_data.get_alfworld_game(task_type, split)
        self.gamefiles = sorted(games)
        super().__init__(self.gamefiles[0], *args, **kwargs)

    def reset(self, *, seed=None, options=None):
        """Reset the episode; a non-None ``seed`` selects the game file.

        The game index is ``seed`` modulo the number of game files. Any
        running environment is closed so the parent `reset` starts the
        selected game from scratch.
        """
        if seed is not None:
            index = seed % len(self.gamefiles)
            self.gamefile = self.gamefiles[index]
            running = self.env
            if running is not None:
                running.close()
                self.env = None

        return super().reset(seed=seed, options=options)
65 | 


--------------------------------------------------------------------------------
/tales/config.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | DEFAULT_TALES_CACHE_HOME = os.path.expanduser("~/.cache/tales")
 4 | TALES_CACHE_HOME = os.getenv("TALES_CACHE_HOME", DEFAULT_TALES_CACHE_HOME)
 5 | os.environ["TALES_CACHE_HOME"] = (
 6 |     TALES_CACHE_HOME  # Set the environment variable, in case it wasn't.
 7 | )
 8 | os.makedirs(TALES_CACHE_HOME, exist_ok=True)
 9 | 
10 | # Check if cache is flag is set to force download
11 | TALES_FORCE_DOWNLOAD = os.getenv("TALES_FORCE_DOWNLOAD", "false").lower() in (
12 |     "yes",
13 |     "true",
14 |     "t",
15 |     "1",
16 | )
17 | 


--------------------------------------------------------------------------------
/tales/download.py:
--------------------------------------------------------------------------------
 1 | import importlib
 2 | import traceback
 3 | import warnings
 4 | 
 5 | from termcolor import colored
 6 | from tqdm import tqdm
 7 | 
 8 | from tales import tasks
 9 | 
10 | 
def download():
    """Download the data of every TALES task, one task after another.

    Each name in ``tasks`` is imported as the submodule ``tales.<task>`` and
    its ``download()`` function is called. A failing task is reported
    through three warnings (a summary naming the task, the error message,
    and the traceback) and does not prevent the remaining tasks from being
    processed.
    """
    for task in tqdm(tasks, desc="Downloading data for TALES"):
        try:
            module = importlib.import_module(f".{task}", package="tales")
            module.download()
        except Exception as e:
            # Deliberate best effort: report the failure and carry on with
            # the next task.
            warnings.warn(
                f"Failed to download data for `{task}`.",
                UserWarning,
            )
            warnings.warn(colored(f"{e}", "red"), UserWarning)
            warnings.warn(colored(f"{traceback.format_exc()}", "red"), UserWarning)


if __name__ == "__main__":
    download()
28 | 


--------------------------------------------------------------------------------
/tales/jericho/__init__.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | 
 3 | from .jericho_data import GAMES_INFOS, prepare_jericho_data
 4 | from .jericho_env import JerichoEnv
 5 | 
 6 | environments = []
 7 | 
 8 | for game, infos in GAMES_INFOS.items():
 9 |     env_name = f"JerichoEnv{game.title()}"
10 |     environments.append([env_name, "v0"])
11 | 
12 |     gym.register(
13 |         id=f"tales/{env_name}-v0",
14 |         entry_point="tales.jericho:JerichoEnv",
15 |         kwargs={"game": game},
16 |     )
17 | 
18 | 
def download():
    """Fetch the Jericho game files by delegating to `prepare_jericho_data`."""
    prepare_jericho_data()
21 | 


--------------------------------------------------------------------------------
/tales/jericho/games.json:
--------------------------------------------------------------------------------
  1 | {
  2 |   "905": {
  3 |     "filename": "905.z5",
  4 |     "info": "http://ifdb.tads.org/viewgame?id=qzftg3j8nh5f34i2",
  5 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/905.z5",
  6 |     "md5": "4c5067169b834d247a30bb08d1039896"
  7 |   },
  8 |   "acorncourt": {
  9 |     "filename": "acorncourt.z5",
 10 |     "info": "http://ifdb.tads.org/viewgame?id=tqvambr6vowym20v",
 11 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/acorncourt.z5",
 12 |     "md5": "a61400439aa76f8faba3b8f01edd4a72"
 13 |   },
 14 |   "advent": {
 15 |     "filename": "advent.z5",
 16 |     "info": "http://ifdb.tads.org/viewgame?id=fft6pu91j85y4acv",
 17 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Advent.z5",
 18 |     "md5": "ee2242e155fd8910921b0f8e04019a3a"
 19 |   },
 20 |   "adventureland": {
 21 |     "filename": "adventureland.z5",
 22 |     "info": "http://ifdb.tads.org/viewgame?id=dy4ok8sdlut6ddj7",
 23 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Adventureland.z5",
 24 |     "md5": "a42545bd17330ae5e6fed02270ccfb4a"
 25 |   },
 26 |   "afflicted": {
 27 |     "filename": "afflicted.z8",
 28 |     "info": "http://ifdb.tads.org/viewgame?id=epl4q2933rczoo9x",
 29 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition2008/zcode/afflicted/afflicted.z8",
 30 |     "md5": "064272be87de7106192b6fb743c4dfc4"
 31 |   },
 32 |   "anchor": {
 33 |     "filename": "anchor.z8",
 34 |     "info": "http://ifdb.tads.org/viewgame?id=op0uw1gn1tjqmjt7",
 35 |     "link": "http://ifarchive.org/if-archive/games/zcode/anchor.z8",
 36 |     "md5": "c043df8624e0e1e9fda92f1a74b6e402"
 37 |   },
 38 |   "awaken": {
 39 |     "filename": "awaken.z5",
 40 |     "info": "http://ifdb.tads.org/viewgame?id=rwseuddvj1gbo481",
 41 |     "link": "https://github.com/danielricks/textplayer/raw/master/games/awaken.z5",
 42 |     "md5": "9ba48c72d96ab3e7956a8570b12d34d6"
 43 |   },
 44 |   "balances": {
 45 |     "filename": "balances.z5",
 46 |     "info": "http://ifdb.tads.org/viewgame?id=x6ne0bbd2oqm6h3a",
 47 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Balances.z5",
 48 |     "md5": "f2cb8f94a7e8df3b850a758da26fa387"
 49 |   },
 50 |   "ballyhoo": {
 51 |     "filename": "ballyhoo.z3",
 52 |     "info": "http://ifdb.tads.org/viewgame?id=b0i6bx7g4rkrekgg",
 53 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FBallyhoo%20v97%20%281986%29%28Infocom%29.zip:BALLYHOO.DAT",
 54 |     "md5": "5d54e326815b0ed3aff8efb8ff02ef2f"
 55 |   },
 56 |   "curses": {
 57 |     "filename": "curses.z5",
 58 |     "info": "http://ifdb.tads.org/viewgame?id=plvzam05bmz3enh8",
 59 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/curses.z5",
 60 |     "md5": "f06a42a29a5a4e6aa70958c9ae4c37cd"
 61 |   },
 62 |   "cutthroat": {
 63 |     "filename": "cutthroat.z3",
 64 |     "info": "http://ifdb.tads.org/viewgame?id=4ao65o1u0xuvj8jf",
 65 |     "link": "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite/cutthroat.z3",
 66 |     "md5": "216eeeba1c8017a77343dc8482f6f185"
 67 |   },
 68 |   "deephome": {
 69 |     "filename": "deephome.z5",
 70 |     "info": "http://ifdb.tads.org/viewgame?id=x85otcikhwp8bwup",
 71 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/deephome.z5",
 72 |     "md5": "5e56a6e5cdeecded434a8fd8012fc2c6"
 73 |   },
 74 |   "detective": {
 75 |     "filename": "detective.z5",
 76 |     "info": "http://ifdb.tads.org/viewgame?id=1po9rgq2xssupefw",
 77 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/detective.z5",
 78 |     "md5": "822655c9be83e292e06d3d3b1d6a9734"
 79 |   },
 80 |   "dragon": {
 81 |     "filename": "dragon.z5",
 82 |     "info": "http://ifdb.tads.org/viewgame?id=sjiyffz8n5patu8l",
 83 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/dragon.zip:Dragon.z5",
 84 |     "md5": "96d314997e5d3a5a793c83845977d44d"
 85 |   },
 86 |   "enchanter": {
 87 |     "filename": "enchanter.z3",
 88 |     "info": "http://ifdb.tads.org/viewgame?id=vu4xhul3abknifcr",
 89 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FEnchanter%20v24%20%281984%29%28Infocom%29%5Bh%5D.zip:ench_24.z3",
 90 |     "md5": "ad3cdea88d81033fe29167688bd98c31"
 91 |   },
 92 |   "enter": {
 93 |     "filename": "enter.z5",
 94 |     "info": "http://ifdb.tads.org/viewgame?id=ld1f3t5epeagilfz",
 95 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/enter.z5",
 96 |     "md5": "4c48ba2c5523d78c5f7f9b7809d16b1d"
 97 |   },
 98 |   "gold": {
 99 |     "filename": "gold.z5",
100 |     "info": "http://ifdb.tads.org/viewgame?id=59ztsy9p01avd6wp",
101 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/gold.z5",
102 |     "md5": "f275ddf32ce8a9e744d53c3b99c5a658"
103 |   },
104 |   "hhgg": {
105 |     "filename": "hhgg.z3",
106 |     "info": "http://ifdb.tads.org/viewgame?id=ouv80gvsl32xlion",
107 |     "link": "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite/hhgg.z3",
108 |     "md5": "6666389f60e0c8e4ceb08242a263bb52"
109 |   },
110 |   "hollywood": {
111 |     "filename": "hollywood.z3",
112 |     "info": "http://ifdb.tads.org/viewgame?id=jnfkbgdgopwfqist",
113 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FHollywood%20Hijinx%20v235%20%281986%29%28Infocom%29%5Bh%5D%5B861118%5D.zip:hollywoo_235.z3",
114 |     "md5": "1ea91a064941a3f612b20833f0a47df7"
115 |   },
116 |   "huntdark": {
117 |     "filename": "huntdark.z5",
118 |     "info": "http://ifdb.tads.org/viewgame?id=mh1a6hizgwjdbeg7",
119 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition99/inform/huntdark/huntdark.z5",
120 |     "md5": "253b02c8012710577085b9fd3a155cb7"
121 |   },
122 |   "infidel": {
123 |     "filename": "infidel.z3",
124 |     "info": "http://ifdb.tads.org/viewgame?id=anu79a4n1jedg5mm",
125 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FInfidel%20v22%20%281983%29%28Infocom%29%5B830916%5D.zip:INFIDEL.DAT",
126 |     "md5": "2fe5b5693fa60b0cf8621402423994b1"
127 |   },
128 |   "inhumane": {
129 |     "filename": "inhumane.z5",
130 |     "info": "http://ifdb.tads.org/viewgame?id=wvs2vmbigm9unlpd",
131 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/inhumane.z5",
132 |     "md5": "84d3ce7ccfafb873736490811a0cc78c"
133 |   },
134 |   "jewel": {
135 |     "filename": "jewel.z5",
136 |     "info": "http://ifdb.tads.org/viewgame?id=hu60gp1bgkhlo5yx",
137 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/jewel.z5",
138 |     "md5": "1eef9c0fa009ca4adf4872cfc5249d45"
139 |   },
140 |   "karn": {
141 |     "filename": "karn.z5",
142 |     "info": "http://ifdb.tads.org/viewgame?id=bx8118ggp6j7nslo",
143 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/karn.z5",
144 |     "md5": "ec55791be814db3663ad1aec0d6b7690"
145 |   },
146 |   "library": {
147 |     "filename": "library.z5",
148 |     "info": "http://ifdb.tads.org/viewgame?id=400zakqderzjnu1i",
149 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition95/library.z5",
150 |     "md5": "389acf3b617a40dc4848da3bda62ce06"
151 |   },
152 |   "loose": {
153 |     "filename": "loose.z5",
154 |     "info": "http://ifdb.tads.org/viewgame?id=4wd3lyaxi4thp8qi",
155 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/loose.z5",
156 |     "md5": "31a0c1e360dce94aa5bece5240691d17"
157 |   },
158 |   "lostpig": {
159 |     "filename": "lostpig.z8",
160 |     "info": "http://ifdb.tads.org/viewgame?id=mohwfk47yjzii14w",
161 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/LostPig.z8",
162 |     "md5": "aaf0b90fbb31717481c02832bf412070"
163 |   },
164 |   "ludicorp": {
165 |     "filename": "ludicorp.z5",
166 |     "info": "http://ifdb.tads.org/viewgame?id=r6g7pflngn3uxbam",
167 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/ludicorp.z5",
168 |     "md5": "646a63307f77dcdcd011f330277ae262"
169 |   },
170 |   "lurking": {
171 |     "filename": "lurking.z3",
172 |     "info": "http://ifdb.tads.org/viewgame?id=jhbd0kja1t57uop",
173 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FLurking%20Horror%2C%20The%20v219%20%281987%29%28Infocom%29%5B870912%5D.zip:Lurking.z3",
174 |     "md5": "5f42ff092a2f30471ae98150ef4da2e1"
175 |   },
176 |   "moonlit": {
177 |     "filename": "moonlit.z5",
178 |     "info": "http://ifdb.tads.org/viewgame?id=10387w68qlwehbyq",
179 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition2002/zcode/moonlit/Moonlit.z5",
180 |     "md5": "bf75b9651cff0e2d04302f19c443588e"
181 |   },
182 |   "murdac": {
183 |     "filename": "murdac.z5",
184 |     "info": "http://ifdb.tads.org/viewgame?id=q36lh5np0q9nak28",
185 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/Murdac.z5",
186 |     "md5": "570179d4f21b2f600862dbffbb5afc3e"
187 |   },
188 |   "night": {
189 |     "filename": "night.z5",
190 |     "info": "http://ifdb.tads.org/viewgame?id=ydhwa11st460g9u3",
191 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/night.z5",
192 |     "md5": "72125f159cccd581786ac16a2828d4e3"
193 |   },
194 |   "omniquest": {
195 |     "filename": "omniquest.z5",
196 |     "info": "http://ifdb.tads.org/viewgame?id=mygqz9tzxqvryead",
197 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/omniquest.z5",
198 |     "md5": "80ea198bca425b6d819c74bfa854236e"
199 |   },
200 |   "partyfoul": {
201 |     "filename": "partyfoul.z8",
202 |     "info": "http://ifdb.tads.org/viewgame?id=cqwq699i9qiqdju",
203 |     "link": "http://mirror.ifarchive.org/if-archive/games/mini-comps/cgdc7/PartyFoul.zblorb",
204 |     "md5": "d221daa82708c4e54447f1a884c239ef"
205 |   },
206 |   "pentari": {
207 |     "filename": "pentari.z5",
208 |     "info": "http://ifdb.tads.org/viewgame?id=llchvog0ukwrphih",
209 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/pentari.z5",
210 |     "md5": "f24c6863468823b744e910ccfe997c6d"
211 |   },
212 |   "planetfall": {
213 |     "filename": "planetfall.z3",
214 |     "info": "http://ifdb.tads.org/viewgame?id=xe6kb3cuqwie2q38",
215 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FPlanetfall%20v29%20%281983%29%28Infocom%29%5B840118%5D.zip:planetfa.z3",
216 |     "md5": "6487dc814b280f5603c53155de378d27"
217 |   },
218 |   "plundered": {
219 |     "filename": "plundered.z3",
220 |     "info": "http://ifdb.tads.org/viewgame?id=ddagftras22bnz8h",
221 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FPlundered%20Hearts%20v26%20%281987%29%28Infocom%29%5B870730%5D.zip:PLUNDERE.DAT",
222 |     "md5": "29fc7b270af2fbd406a0548a8298da7f"
223 |   },
224 |   "reverb": {
225 |     "filename": "reverb.z5",
226 |     "info": "http://ifdb.tads.org/viewgame?id=dop7nbjl90r5zmf9",
227 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition96/reverb/reverb.z5",
228 |     "md5": "80d286fbfe624c621266b568c0076717"
229 |   },
230 |   "seastalker": {
231 |     "filename": "seastalker.z3",
232 |     "info": "http://ifdb.tads.org/viewgame?id=56wb8hflec2isvzm",
233 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSeastalker%20v86%20%281984%29%28Infocom%29%28beta%29%5B840320%5D.zip:SEASTALK.z3",
234 |     "md5": "ee339dbdbb0792f67e20bd71bafe0ea5"
235 |   },
236 |   "sherlock": {
237 |     "filename": "sherlock.z5",
238 |     "info": "http://ifdb.tads.org/viewgame?id=j8lmspy4iz73mx26",
239 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSherlock%20-%20The%20Riddle%20of%20the%20Crown%20Jewels%20v21%20%281987%29%28Infocom%29%5B871214%5D.zip:SHER.z5",
240 |     "md5": "35240654d83f9e7073973d338f9657b8"
241 |   },
242 |   "snacktime": {
243 |     "filename": "snacktime.z8",
244 |     "info": "http://ifdb.tads.org/viewgame?id=yr3y8s9k8e40hl5q",
245 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition2008/zcode/snack/snacktime.z8",
246 |     "md5": "0ff228d12d7cb470dc1a8e9a5151769b"
247 |   },
248 |   "sorcerer": {
249 |     "filename": "sorcerer.z3",
250 |     "info": "http://ifdb.tads.org/viewgame?id=lidg5nx9ig0bwk55",
251 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSorcerer%20v18%20%281984%29%28Infocom%29%5Bh%5D%5B860904%5D.zip:sorcerer_18.z3",
252 |     "md5": "20f1468a058d0a6de016ae70022e651c"
253 |   },
254 |   "spellbrkr": {
255 |     "filename": "spellbrkr.z3",
256 |     "info": "http://ifdb.tads.org/viewgame?id=wqsmrahzozosu3r",
257 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FSpellbreaker%20v63%20%281985%29%28Infocom%29%5B850916%5D.zip:spelbrkr.z3",
258 |     "md5": "7a92ce19a39bedd970d0f1e296981f71"
259 |   },
260 |   "spirit": {
261 |     "filename": "spirit.z5",
262 |     "info": "http://ifdb.tads.org/viewgame?id=tqpowvmdoemtooqf",
263 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/spirit.z5",
264 |     "md5": "808039c4e9554bdd15d7793539b3bd97"
265 |   },
266 |   "temple": {
267 |     "filename": "temple.z5",
268 |     "info": "http://ifdb.tads.org/viewgame?id=kq9qgjkf2k6xn1c0",
269 |     "link": "http://mirror.ifarchive.org/if-archive/games/competition2002/zcode/temple/temple.z5",
270 |     "md5": "047842c7b25c3d477b728cf3412e33de"
271 |   },
272 |   "theatre": {
273 |     "filename": "theatre.z5",
274 |     "info": "http://ifdb.tads.org/viewgame?id=bv8of8y9xeo7307g",
275 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/theatre.z5",
276 |     "md5": "33dcc5085acb290d1817e07653c13480"
277 |   },
278 |   "trinity": {
279 |     "filename": "trinity.z4",
280 |     "info": "http://ifdb.tads.org/viewgame?id=j18kjz80hxjtyayw",
281 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FTrinity%20v12%20%281986%29%28Infocom%29%5B860926%5D.zip:TRINITY.z4",
282 |     "md5": "3bf1a444a1fc2057130ecb9806117233"
283 |   },
284 |   "tryst205": {
285 |     "filename": "tryst205.z5",
286 |     "info": "http://ifdb.tads.org/viewgame?id=ic0ebhbi70bdmyc2",
287 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/tryst205.z5",
288 |     "md5": "fc65ad8d4588da92fd39871f6f7463db"
289 |   },
290 |   "weapon": {
291 |     "filename": "weapon.z5",
292 |     "info": "http://ifdb.tads.org/viewgame?id=tcebhl79rlxo3qrk",
293 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/weapon.zip:weapon.z5",
294 |     "md5": "c632204be3849d6c5bb6f4eb5aca3cc0"
295 |   },
296 |   "wishbringer": {
297 |     "filename": "wishbringer.z3",
298 |     "info": "http://ifdb.tads.org/viewgame?id=z02joykzh66wfhcl",
299 |     "link": "https://archive.org/download/Infocom_Z-Machine_TOSEC_2012_04_23/Infocom_Z-Machine_TOSEC_2012_04_23.zip/Infocom%20Z-Machine%20%5BTOSEC%5D%2FGames%2FWishbringer%20-%20The%20Magick%20Stone%20of%20Dreams%20v68%20%281985%29%28Infocom%29%5B850501%5D.zip:WISHBRIN.z3",
300 |     "md5": "87ed53d854f7e57c36106fca3b9cf5a6"
301 |   },
302 |   "yomomma": {
303 |     "filename": "yomomma.z8",
304 |     "info": "http://ifdb.tads.org/viewgame?id=1iqmpkn009h9gbug",
305 |     "link": "http://nitku.net/if/yomomma/yomomma.zblorb",
306 |     "md5": "5b10162a7a134e7b4c381ecedfb4bc44"
307 |   },
308 |   "zenon": {
309 |     "filename": "zenon.z5",
310 |     "info": "http://ifdb.tads.org/viewgame?id=rw7zv98mifbr3335",
311 |     "link": "http://mirror.ifarchive.org/if-archive/games/zcode/zenon.z5",
312 |     "md5": "631cc926b4251f5a5f646d3a6bdac8c6"
313 |   },
314 |   "zork1": {
315 |     "filename": "zork1.z5",
316 |     "info": "http://ifdb.tads.org/viewgame?id=0dbnusxunq7fw5ro",
317 |     "link": "http://www.batmantis.com/zorks/zork1.z5",
318 |     "md5": "b732a93a6244ddd92a9b9a3e3a46c687"
319 |   },
320 |   "zork2": {
321 |     "filename": "zork2.z5",
322 |     "info": "http://ifdb.tads.org/viewgame?id=yzzm4puxyjakk8c4",
323 |     "link": "http://www.batmantis.com/zorks/zork2.z5",
324 |     "md5": "5bcd91ee055e9bd42812617571be227b"
325 |   },
326 |   "zork3": {
327 |     "filename": "zork3.z5",
328 |     "info": "http://ifdb.tads.org/viewgame?id=vrsot1zgy1wfcdru",
329 |     "link": "http://www.batmantis.com/zorks/zork3.z5",
330 |     "md5": "ffda9ee2d428fa2fa8e75a1914ff6959"
331 |   },
332 |   "ztuu": {
333 |     "filename": "ztuu.z5",
334 |     "info": "http://ifdb.tads.org/viewgame?id=40hswtkhap88gzvn",
335 |     "link": "http://www.batmantis.com/zorks/ztuu.z5",
336 |     "md5": "d8e1578470cbc676e013e03d72c93141"
337 |   }
338 | }


--------------------------------------------------------------------------------
/tales/jericho/jericho_data.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | from os.path import join as pjoin
 4 | 
 5 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD
 6 | from tales.utils import download
 7 | 
 8 | GAMES_URLS = "https://github.com/BYU-PCCL/z-machine-games/raw/master/jericho-game-suite"
 9 | TALES_CACHE_JERICHO = pjoin(TALES_CACHE_HOME, "jericho")
10 | 
11 | 
12 | with open(pjoin(os.path.dirname(__file__), "games.json")) as f:
13 |     GAMES_INFOS = json.load(f)
14 | 
15 | # Remove known games that are not working.
16 | GAMES_INFOS.pop("hollywood", None)
17 | GAMES_INFOS.pop("theatre", None)
18 | 
19 | 
def prepare_jericho_data(force=TALES_FORCE_DOWNLOAD):
    """Download every Jericho game file that is missing from the cache.

    Files are fetched from ``GAMES_URLS`` using each game's ``filename``;
    the ``link`` entry of the game metadata is not used here.

    Args:
        force: When true, download every file again even if it is already
            cached. Defaults to ``TALES_FORCE_DOWNLOAD``.
    """
    os.makedirs(TALES_CACHE_JERICHO, exist_ok=True)

    for game_info in GAMES_INFOS.values():
        filename = game_info["filename"]
        cached = os.path.isfile(pjoin(TALES_CACHE_JERICHO, filename))
        if force or not cached:
            download(GAMES_URLS + "/" + filename, dst=TALES_CACHE_JERICHO, force=force)
32 | 
33 | 
def get_game(game):
    """Return the cached file path of ``game``, preparing the data first.

    Args:
        game: Key of the game in ``GAMES_INFOS``.

    Raises:
        KeyError: If ``game`` is not a key of ``GAMES_INFOS``.
    """
    prepare_jericho_data()  # make sure the data is ready
    return pjoin(TALES_CACHE_JERICHO, GAMES_INFOS[game]["filename"])
40 | 


--------------------------------------------------------------------------------
/tales/jericho/jericho_env.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | import textworld
 3 | from textworld.envs.wrappers import Filter
 4 | 
 5 | from . import jericho_data
 6 | 
 7 | 
 8 | class JerichoEnv(gym.Env):
 9 | 
10 |     def __init__(self, game, admissible_commands=False, *args, **kwargs):
11 |         gamefile = jericho_data.get_game(game)
12 |         self.infos = textworld.EnvInfos(
13 |             score=True,
14 |             max_score=True,
15 |             won=True,
16 |             lost=True,
17 |             feedback=True,
18 |             moves=True,
19 |             admissible_commands=admissible_commands,
20 |             extras=["walkthrough"],
21 |         )
22 |         self.env = textworld.start(gamefile, self.infos, wrappers=[Filter])
23 | 
24 |     def reset(self, *, seed=None, options=None):
25 |         self.env.seed(seed)
26 |         return self.env.reset()
27 | 
28 |     def step(self, action):
29 |         return self.env.step(action)
30 | 


--------------------------------------------------------------------------------
/tales/logger.py:
--------------------------------------------------------------------------------
 1 | import datetime
 2 | import logging
 3 | import os
 4 | import platform
 5 | import re
 6 | from os.path import join as pjoin
 7 | 
 8 | from tqdm import tqdm
 9 | 
# Logger shared by the package; `setup_logging` attaches its handlers.
log = logging.getLogger("tales")
11 | 
12 | 
class TqdmLoggingHandler(logging.Handler):
    """Logging handler that prints records through ``tqdm.write``."""

    def emit(self, record):
        """Format ``record`` and write it with ``tqdm.write``.

        Errors are delegated to ``handleError``. ``KeyboardInterrupt`` and
        ``SystemExit`` are not subclasses of ``Exception`` and therefore
        propagate to the caller.
        """
        try:
            tqdm.write(self.format(record))
            self.flush()
        except Exception:
            self.handleError(record)
26 | 
27 | 
class StripAnsiFormatter(logging.Formatter):
    """Formatter that removes ANSI escape sequences from the formatted text."""

    # Matches ESC followed by an introducer byte, parameter bytes,
    # intermediate bytes and a final byte.
    ansi_escape = re.compile(r"\x1B[@-_][0-?]*[ -/]*[@-~]")

    def format(self, record):
        """Return the standard formatting of ``record`` without ANSI escapes."""
        return self.ansi_escape.sub("", super().format(record))
34 | 
35 | 
def setup_logging(args):
    """Configure the ``tales`` logger with a log file and console output.

    A ``<YYYYmmdd_HHMMSS>.log`` file is opened in ``args.log_dir`` (the
    folder is created when missing) and receives every record at DEBUG level
    with ANSI escapes stripped. Console output goes through
    `TqdmLoggingHandler` at ``args.logging_level``. The helper
    ``add_new_file_handler`` is also attached to the logger object so that
    additional log files can be opened later.

    Args:
        args: Object exposing ``log_dir`` and ``logging_level``; its string
            form is written into the header of every log file.
    """
    log.setLevel(logging.DEBUG)

    def add_new_file_handler(logfile):
        """Attach a DEBUG-level file handler for ``logfile`` and return it."""
        fh = logging.FileHandler(logfile, mode="w")
        formatter = StripAnsiFormatter("%(asctime)s: %(message)s")
        log.addHandler(fh)
        fh.setLevel(logging.DEBUG)
        fh.setFormatter(formatter)

        # Log some system information at the top of the log file. The records
        # are emitted on this handler only, not through the logger.
        def _emit_msg(msg):
            fh.emit(
                logging.makeLogRecord(
                    {"name": log.name, "level": logging.DEBUG, "msg": msg}
                )
            )

        _emit_msg("System information:")
        _emit_msg(f"args = {args}")
        _emit_msg(f"system = {platform.system()}")
        _emit_msg(f"server = {platform.uname()[1]}")
        _emit_msg(f"working_dir = {os.getcwd()}")
        _emit_msg(f"datetime = {datetime.datetime.now()}")
        # Close the pipe explicitly instead of leaving it to the garbage
        # collector.
        with os.popen("git rev-parse HEAD") as pipe:
            git_commit = pipe.read().strip()
        _emit_msg(f"git_commit = {git_commit}")

        return fh

    log.add_new_file_handler = add_new_file_handler

    # FileHandler does not create missing folders. An empty log_dir means the
    # current directory, which needs no creation.
    if args.log_dir:
        os.makedirs(args.log_dir, exist_ok=True)

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    logfile = pjoin(args.log_dir, f"{timestamp}.log")
    log.add_new_file_handler(logfile)

    ch = TqdmLoggingHandler()
    formatter = logging.Formatter("%(message)s")
    ch.setLevel(args.logging_level)
    ch.setFormatter(formatter)
    log.addHandler(ch)
75 | 


--------------------------------------------------------------------------------
/tales/scienceworld/__init__.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | 
 3 | from .scienceworld_env import TASK_NAMES, ScienceWorldEnv
 4 | 
 5 | environments = []
 6 | 
 7 | for task_name in TASK_NAMES:
 8 |     env_name = f"ScienceWorld{task_name.title().replace('-', '')}"
 9 |     environments.append([env_name, "v0"])
10 | 
11 |     gym.register(
12 |         id=f"tales/{env_name}-v0",
13 |         entry_point="tales.scienceworld:ScienceWorldEnv",
14 |         kwargs={"task_name": task_name},
15 |     )
16 | 
17 | 
def download():
    """Do nothing; this module has no data to download."""
20 | 


--------------------------------------------------------------------------------
/tales/scienceworld/scienceworld_data.py:
--------------------------------------------------------------------------------
 1 | import scienceworld
 2 | 
 3 | 
 4 | def get_task_names():
 5 |     return scienceworld.ScienceWorldEnv().task_names
 6 | 
 7 | 
 8 | def get_variations(task_name, split, env=None):
 9 |     env = env or scienceworld.ScienceWorldEnv(task_name)
10 |     if split == "train":
11 |         return env.get_variations_train()
12 |     elif split == "valid":
13 |         return env.get_variations_dev()
14 |     elif split == "test":
15 |         return env.get_variations_test()
16 |     else:
17 |         raise NotImplementedError("Only plan to support train, dev, and test splits.")
18 | 


--------------------------------------------------------------------------------
/tales/scienceworld/scienceworld_env.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | import numpy as np
 3 | import scienceworld
 4 | 
 5 | from . import scienceworld_data
 6 | 
 7 | TASK_NAMES = scienceworld_data.get_task_names()
 8 | 
 9 | 
class ScienceWorldEnv(gym.Env):
    """Gymnasium environment for one ScienceWorld task.

    Episodes are played on the task's "test" variations; the seed passed to
    `reset` selects the variation.
    """

    def __init__(self, task_name, admissible_commands=False, *args, **kwargs):
        """Create the ScienceWorld environment and list its test variations.

        Args:
            task_name: Name of the ScienceWorld task.
            admissible_commands: Stored on the instance; not consulted
                elsewhere in this class.
            *args: Accepted for signature compatibility; unused.
            **kwargs: Accepted for signature compatibility; unused.
        """
        self.task_name = task_name
        self.admissible_commands = admissible_commands
        self.env = scienceworld.ScienceWorldEnv(self.task_name, envStepLimit=np.inf)
        self.variations = scienceworld_data.get_variations(
            self.task_name, split="test", env=self.env
        )
        self.variation = self.variations[0]

    def reset(self, *, seed=None, options=None):
        """Load the selected variation and return ``(obs, info)``.

        A non-None ``seed`` selects the variation (``seed`` modulo the number
        of variations); otherwise the current variation is kept. The task
        description is prepended to the first observation.
        """
        if seed is not None:
            self.variation = self.variations[seed % len(self.variations)]

        self.env.load(
            self.task_name, self.variation, simplificationStr="", generateGoldPath=True
        )
        obs, info = self.env.reset()

        # Add task description to the first observation.
        obs = info["taskDesc"] + "\n\n" + obs

        info.update(
            {
                "max_score": 100,
                "feedback": obs,
                "won": False,
                "lost": False,
                "admissible_commands": info["valid"],
                "extra.walkthrough": self.env.get_gold_action_sequence(),
            }
        )
        return obs, info

    def step(self, action):
        """Apply ``action`` and return ``(obs, reward, done, info)``.

        ``info`` is augmented with ``max_score``, ``feedback``, ``won``
        (score equal to 100), ``lost`` (negative score) and
        ``admissible_commands``.
        """
        obs, reward, done, info = self.env.step(action)
        info.update(
            {
                "max_score": 100,
                "feedback": obs,
                "won": info["score"] == 100,
                "lost": info["score"] < 0,
                "admissible_commands": info["valid"],
            }
        )
        return obs, reward, done, info

    def close(self):
        """Close the underlying ScienceWorld environment."""
        self.env.close()
52 | 


--------------------------------------------------------------------------------
/tales/textworld/__init__.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | 
 3 | from .textworld_data import prepare_twcooking_data
 4 | from .textworld_env import TextWorldEnv, TWCookingEnv
 5 | 
# [name, version] pairs, one per environment registered by the loop below.
environments = []

# TWCookingEnv
# Register one gym environment per difficulty level, 1 through 10 inclusive.
for difficulty in range(1, 10 + 1):
    env_name = f"TWCookingLevel{difficulty}"
    environments.append([env_name, "v0"])

    gym.register(
        id=f"tales/{env_name}-v0",
        entry_point="tales.textworld:TWCookingEnv",
        kwargs={"difficulty": difficulty},
    )
18 | 
19 | 
def download():
    """Prepare the TWCooking data by delegating to ``prepare_twcooking_data``."""
    prepare_twcooking_data()
22 | 


--------------------------------------------------------------------------------
/tales/textworld/textworld_data.py:
--------------------------------------------------------------------------------
 1 | import glob
 2 | import os
 3 | import zipfile
 4 | from os.path import join as pjoin
 5 | 
 6 | from tales.config import TALES_CACHE_HOME, TALES_FORCE_DOWNLOAD
 7 | from tales.utils import download
 8 | 
# Archive fetched by prepare_twcooking_data().
TW_COOKING_URL = (
    "https://github.com/xingdi-eric-yuan/GATA-public/releases/download/data/rl.0.2.zip"
)
# Cache layout: <TALES_CACHE_HOME>/textworld/tw-cooking/test
TALES_CACHE_TEXTWORLD = pjoin(TALES_CACHE_HOME, "textworld")
TALES_CACHE_TWCOOKING = pjoin(TALES_CACHE_TEXTWORLD, "tw-cooking")
TALES_CACHE_TWCOOKING_TEST = pjoin(TALES_CACHE_TWCOOKING, "test")
15 | 
16 | 
def prepare_twcooking_data(force=TALES_FORCE_DOWNLOAD):
    """Make sure the TWCooking test games are present in the cache.

    Nothing is done when the test folder already exists, unless ``force`` is
    set. Otherwise the archive is downloaded if it is missing (or ``force`` is
    set) and every archive member whose name contains ``"test"`` is extracted
    into the TWCooking cache folder.

    Args:
        force: Redo the download and extraction even if cached data exists.
    """
    os.makedirs(TALES_CACHE_TWCOOKING, exist_ok=True)
    if not force and os.path.exists(TALES_CACHE_TWCOOKING_TEST):
        return

    archive_path = pjoin(TALES_CACHE_TWCOOKING, "rl.0.2.zip")
    if force or not os.path.exists(archive_path):
        download(
            TW_COOKING_URL,
            dst=TALES_CACHE_TWCOOKING,
            desc="Downloading TWCooking",
            force=force,
        )

    # Only the members belonging to the test folder are unpacked.
    with zipfile.ZipFile(archive_path, "r") as archive:
        test_members = (name for name in archive.namelist() if "test" in name)
        for name in test_members:
            archive.extract(name, TALES_CACHE_TWCOOKING)
37 | 
38 | 
def get_cooking_game(difficulty):
    """Return the ``.z8`` game files cached for the given difficulty level.

    The TWCooking data is prepared first, so the lookup runs against a
    populated cache.

    Args:
        difficulty: Difficulty level used to build the
            ``difficulty_level_<difficulty>`` folder name.

    Returns:
        list: Paths matched by ``glob`` (in the order ``glob`` returns them).
    """
    prepare_twcooking_data()  # make sure the data is ready

    pattern = pjoin(
        TALES_CACHE_TWCOOKING_TEST, f"difficulty_level_{difficulty}", "*.z8"
    )
    return glob.glob(pattern)
45 | 


--------------------------------------------------------------------------------
/tales/textworld/textworld_env.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | import numpy as np
 3 | import textworld
 4 | from textworld.envs.wrappers import Filter
 5 | 
 6 | from . import textworld_data
 7 | 
 8 | 
 9 | class TextWorldEnv(gym.Env):
10 | 
11 |     def __init__(self, gamefile, admissible_commands=False, *args, **kwargs):
12 |         self.infos = textworld.EnvInfos(
13 |             score=True,
14 |             max_score=True,
15 |             won=True,
16 |             lost=True,
17 |             feedback=True,
18 |             moves=True,
19 |             admissible_commands=admissible_commands,
20 |             extras=["walkthrough"],
21 |         )
22 |         self.gamefile = gamefile
23 |         self.env = None
24 | 
25 |     def reset(self, *, seed=None, options=None):
26 |         super().reset(seed=seed, options=options)
27 | 
28 |         if self.env is None:
29 |             self.env = textworld.start(self.gamefile, self.infos, wrappers=[Filter])
30 | 
31 |         return self.env.reset()
32 | 
33 |     def step(self, action):
34 |         return self.env.step(action)
35 | 
36 | 
class TWCookingEnv(TextWorldEnv):
    """TextWorld wrapper that picks its game among the cooking games of one difficulty."""

    def __init__(self, difficulty, *args, **kwargs):
        """Collect the sorted game files for ``difficulty`` and start with the first one."""
        self.gamefiles = sorted(textworld_data.get_cooking_game(difficulty))
        first_game = self.gamefiles[0]
        super().__init__(first_game, *args, **kwargs)

    def reset(self, *, seed=None, options=None):
        """Reset the game; a given ``seed`` selects the game file.

        With a seed, the game file is chosen by wrapping the seed around the
        number of available games, and any running environment is closed so
        that the parent ``reset`` starts the newly selected game.
        """
        if seed is not None:
            game_index = seed % len(self.gamefiles)
            self.gamefile = self.gamefiles[game_index]

            running = self.env is not None
            if running:
                self.env.close()
                self.env = None

        return super().reset(seed=seed, options=options)
51 | 


--------------------------------------------------------------------------------
/tales/textworld_express/__init__.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | 
 3 | from .twx_env import TASKS, TextWorldExpressEnv
 4 | 
# [name, version] pairs, one per environment registered by the loop below.
environments = []

# Register one gym environment per (task_name, game_name, game_params) entry.
for task_name, game_name, game_params in TASKS:
    env_name = f"TWX{task_name}"
    environments.append([env_name, "v0"])

    gym.register(
        id=f"tales/{env_name}-v0",
        entry_point="tales.textworld_express:TextWorldExpressEnv",
        kwargs={"game_name": game_name, "game_params": game_params},
    )
16 | 
17 | 
def download():
    """Download hook for this package; it currently performs no work."""
    return None
20 | 


--------------------------------------------------------------------------------
/tales/textworld_express/twx_data.py:
--------------------------------------------------------------------------------
 1 | import textworld_express as twx
 2 | 
 3 | # TASK_NAMES = list(twx.GAME_NAMES)
 4 | 
 5 | TASKS = [
 6 |     (
 7 |         "CookingWorld",
 8 |         "cookingworld",
 9 |         "numLocations=1, numIngredients=2, numDistractorItems=5, includeDoors=0, limitInventorySize=0",
10 |     ),
11 |     (
12 |         "TextWorldCommonsense",
13 |         "twc",
14 |         "numLocations=1,numItemsToPutAway=1,includeDoors=0,limitInventorySize=0",
15 |     ),
16 |     (
17 |         "CoinCollector",
18 |         "coin",
19 |         "numLocations=1, numDistractorItems=5, limitInventorySize=0",
20 |     ),
21 |     ("Arithmetic", "arithmetic", ""),
22 |     (
23 |         "MapReader",
24 |         "mapreader",
25 |         "numLocations=2, maxDistanceApart=1, maxDistractorItemsPerLocation=2, includeDoors=0, limitInventorySize=0",
26 |     ),
27 |     ("Sorting", "sorting", ""),
28 |     ("SimonSays10", "simonsays", "gameLength=10, numDistractors=4, memorization=0"),
29 |     ("SimonSays50", "simonsays", "gameLength=50, numDistractors=4, memorization=0"),
30 |     ("SimonSays100", "simonsays", "gameLength=100, numDistractors=4, memorization=0"),
31 |     (
32 |         "SimonSaysWithMemory10",
33 |         "simonsays",
34 |         "gameLength=10, numDistractors=4, memorization=1, verbose=0",
35 |     ),
36 |     (
37 |         "SimonSaysWithMemory50",
38 |         "simonsays",
39 |         "gameLength=50, numDistractors=4, memorization=1, verbose=0",
40 |     ),
41 |     (
42 |         "SimonSaysWithMemory100",
43 |         "simonsays",
44 |         "gameLength=100, numDistractors=4, memorization=1, verbose=0",
45 |     ),
46 |     (
47 |         "SimonSaysWithMemory10Verbose",
48 |         "simonsays",
49 |         "gameLength=10, numDistractors=4, memorization=1, verbose=1",
50 |     ),
51 |     (
52 |         "SimonSaysWithMemory50Verbose",
53 |         "simonsays",
54 |         "gameLength=50, numDistractors=4, memorization=1, verbose=1",
55 |     ),
56 |     (
57 |         "SimonSaysWithMemory100Verbose",
58 |         "simonsays",
59 |         "gameLength=100, numDistractors=4, memorization=1, verbose=1",
60 |     ),
61 |     ("PeckingOrder", "peckingorder", ""),
62 | ]
63 | 
64 | 
def get_seeds(split, env=None):
    """Return the valid game seeds for the requested data split.

    Args:
        split: One of ``"train"``, ``"valid"`` or ``"test"``.
        env: Optional, already created environment to query. It is left open
            for the caller. When omitted (or falsy) a temporary environment is
            created for the query and closed before returning.

    Returns:
        Whatever the environment's ``getValidSeeds*`` method returns; when a
        temporary environment is used the result is copied into a ``list``
        before that environment is closed.

    Raises:
        NotImplementedError: If ``split`` is not a supported split name.
    """
    # Validate the split before any environment is created.
    if split == "train":
        getter_name = "getValidSeedsTrain"
    elif split == "valid":
        getter_name = "getValidSeedsDev"
    elif split == "test":
        getter_name = "getValidSeedsTest"
    else:
        raise NotImplementedError("Only plan to support train, valid, and test splits.")

    if env:
        return getattr(env, getter_name)()

    owned_env = twx.TextWorldExpressEnv()
    try:
        # Materialize the result while the environment is still alive.
        return list(getattr(owned_env, getter_name)())
    finally:
        owned_env.close()
75 | 


--------------------------------------------------------------------------------
/tales/textworld_express/twx_env.py:
--------------------------------------------------------------------------------
 1 | import gymnasium as gym
 2 | import numpy as np
 3 | import textworld_express as twx
 4 | 
 5 | from . import twx_data
 6 | 
# Re-export of the task table defined in twx_data.
TASKS = twx_data.TASKS
 8 | 
 9 | 
class TextWorldExpressEnv(gym.Env):
    """Gym-style wrapper around a ``textworld_express.TextWorldExpressEnv``.

    The wrapper plays one game (name plus parameter string) on the "test"
    seeds. ``reset`` and ``step`` add a common set of keys (``max_score``,
    ``feedback``, ``won``, ``lost``, ``moves``, ``admissible_commands``) to
    the ``info`` returned by the wrapped environment and rescale ``score``
    by 100.
    """

    def __init__(
        self, game_name, game_params, admissible_commands=False, *args, **kwargs
    ):
        """Create the wrapped environment and select the first test seed.

        Args:
            game_name: Name of the game passed to the wrapped ``reset``.
            game_params: Parameter string passed to the wrapped ``reset``.
            admissible_commands: Stored on the instance; not otherwise used by
                this class.
            *args, **kwargs: Accepted and ignored.
        """
        self.game_name = game_name
        self.game_params = game_params
        self.admissible_commands = admissible_commands
        # No step limit is imposed by the wrapped environment.
        self.env = twx.TextWorldExpressEnv(envStepLimit=np.inf)
        test_seeds = twx_data.get_seeds(split="test", env=self.env)
        self.seeds = test_seeds
        self.seed = self.seeds[0]

    def reset(self, *, seed=None, options=None):
        """Start a game on the current seed and return ``(obs, info)``.

        When ``seed`` is given it selects the game seed, wrapping around the
        number of available seeds; otherwise the previously selected seed is
        replayed. ``options`` is not used.
        """
        if seed is not None:
            seed_index = seed % len(self.seeds)
            self.seed = self.seeds[seed_index]

        first_obs, info = self.env.reset(
            seed=self.seed,
            gameFold="test",
            gameName=self.game_name,
            gameParams=self.game_params,
            generateGoldPath=True,
        )

        # Prepend the task description to the first observation.
        first_obs = info["taskDescription"] + "\n\n" + first_obs

        # NOTE(review): int() truncates, so a score such as 0.29 would be
        # reported as 28 rather than 29 -- confirm this is intended.
        info.update(
            {
                "max_score": 100,
                "feedback": first_obs,
                "won": False,
                "lost": False,
                "moves": 0,
                "score": int(info["score"] * 100),
                "admissible_commands": info["validActions"],
                "extra.walkthrough": self.env.getGoldActionSequence(),
            }
        )
        return first_obs, info

    def step(self, action):
        """Forward ``action`` and return ``(obs, reward, done, info)``."""
        observation, reward, done, info = self.env.step(action)
        info.update(
            {
                "max_score": 100,
                "feedback": observation,
                "won": info["tasksuccess"],
                "lost": info["taskfailure"],
                "moves": info["numMoves"],
                "score": int(info["score"] * 100),
                "admissible_commands": info["validActions"],
            }
        )
        return observation, reward, done, info

    def close(self):
        """Close the wrapped environment."""
        self.env.close()
60 | 


--------------------------------------------------------------------------------
/tales/token.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import Optional
  3 | 
  4 | import tiktoken
  5 | from llm import Model
  6 | 
  7 | # Suppress warnings from transformers
  8 | os.environ["TRANSFORMERS_NO_ADVISORY_WARNINGS"] = "True"
  9 | from transformers import AutoTokenizer
 10 | 
 11 | 
 12 | def get_token_counter(model: Optional[Model] = None):
 13 |     if model is None or model.model_id == "gpt-4o":
 14 |         return OpenAITokenCounter("gpt-4o")
 15 | 
 16 |     if "claude-" in model.model_id:
 17 |         return ClaudeTokenCounter(model)
 18 | 
 19 |     elif "gemini" in model.model_id or "gemma" in model.model_id:
 20 |         return GeminiTokenCounter(model)
 21 | 
 22 |     try:
 23 |         return OpenAITokenCounter(model.model_id)
 24 |     except KeyError:
 25 |         pass
 26 | 
 27 |     # Try to load from transformers.
 28 |     return HuggingFaceTokenCounter(model.model_id)
 29 | 
 30 | 
 31 | class TokenCounter:
 32 | 
 33 |     def __call__(self, *, messages=None, text=None):
 34 |         nb_tokens = 0
 35 |         if messages is not None:
 36 |             nb_tokens += sum(len(self.tokenize(msg["content"])) for msg in messages)
 37 | 
 38 |         if text is not None:
 39 |             nb_tokens += len(self.tokenize(text))
 40 | 
 41 |         return nb_tokens
 42 | 
 43 | 
 44 | class OpenAITokenCounter(TokenCounter):
 45 |     def __init__(self, model: str):
 46 |         self.model = model
 47 |         if self.model in tiktoken.model.MODEL_TO_ENCODING:
 48 |             self.tokenize = tiktoken.encoding_for_model(self.model).encode
 49 |         else:
 50 |             self.tokenize = tiktoken.encoding_for_model(self.model.split("_")[0]).encode
 51 | 
 52 | 
 53 | class HuggingFaceTokenCounter(TokenCounter):
 54 |     def __init__(self, model: str):
 55 |         self.model = model
 56 |         try:
 57 |             self.tokenize = AutoTokenizer.from_pretrained(self.model).tokenize
 58 |         except OSError:
 59 |             msg = (
 60 |                 f"Tokenizer not found for model {self.model},"
 61 |                 " make sure you have access to the model"
 62 |                 " (e.g., HuggingFace API key is correctly set)."
 63 |             )
 64 |             raise ValueError(msg)
 65 | 
 66 |     def __call__(self, *, messages=None, text=None):
 67 |         nb_tokens = 0
 68 |         if messages is not None:
 69 |             nb_tokens += sum(len(self.tokenize(msg["content"])) for msg in messages)
 70 | 
 71 |         if text is not None:
 72 |             nb_tokens += len(self.tokenize(text))
 73 | 
 74 |         return nb_tokens
 75 | 
 76 | 
 77 | class ClaudeTokenCounter(TokenCounter):
 78 | 
 79 |     def __init__(self, model: Model):
 80 |         from anthropic import Anthropic
 81 | 
 82 |         self.model = model.claude_model_id
 83 |         self.client = Anthropic(api_key=model.get_key())
 84 | 
 85 |     def __call__(self, *, messages=None, text=None):
 86 |         from anthropic import NOT_GIVEN
 87 | 
 88 |         messages = list(messages or [])
 89 |         if text is not None:
 90 |             messages += [{"role": "assistant", "content": text.strip()}]
 91 | 
 92 |         # Extract system messages, if any.
 93 |         system = NOT_GIVEN
 94 |         if messages and messages[0]["role"] == "system":
 95 |             system = messages[0]["content"]
 96 |             messages.pop(0)
 97 | 
 98 |         return self.client.beta.messages.count_tokens(
 99 |             model=self.model,
100 |             messages=messages,
101 |             system=system,
102 |         ).input_tokens
103 | 
104 | 
class GeminiTokenCounter(TokenCounter):
    """Token counter that asks the Google GenAI API to count tokens."""

    def __init__(self, model: Model):
        """Create a GenAI client using the model's id and key."""
        from google import genai

        self.model = model.model_id
        self.client = genai.Client(api_key=model.get_key())

    def __call__(self, *, messages=None, text=None):
        """Return the total token count reported for ``messages`` and/or ``text``.

        ``text``, when given, is stripped and appended as an assistant
        message. A leading system message is passed as the system
        instruction, and the ``assistant`` role is renamed to ``model`` in
        the chat history. The caller's list is not modified.
        """
        from google.genai import types

        conversation = list(messages or [])
        if text is not None:
            conversation.append({"role": "assistant", "content": text.strip()})

        system_instruction = None
        if conversation and conversation[0]["role"] == "system":
            system_instruction = [conversation[0]["content"]]
            del conversation[0]

        history = []
        for message in conversation:
            history.append(
                types.Content(
                    role=message["role"].replace("assistant", "model"),
                    parts=[types.Part(text=message["content"])],
                )
            )

        chat = self.client.chats.create(
            model=self.model,
            history=history,
            config=types.GenerateContentConfig(system_instruction=system_instruction),
        )

        counted = self.client.models.count_tokens(
            model=self.model,
            contents=chat.get_history(),
        )
        return counted.total_tokens
141 | 


--------------------------------------------------------------------------------
/tales/utils.py:
--------------------------------------------------------------------------------
  1 | import asyncio
  2 | import json
  3 | import logging
  4 | import os
  5 | import shutil
  6 | import tempfile
  7 | from os.path import join as pjoin
  8 | 
  9 | import numpy as np
 10 | import requests
 11 | from llm import AsyncResponse, Conversation, Prompt, Response
 12 | from tqdm import tqdm
 13 | 
 14 | from tales.logger import log
 15 | 
 16 | 
 17 | def mkdirs(dirpath: str) -> str:
 18 |     """Create a directory and all its parents.
 19 | 
 20 |     If the folder already exists, its path is returned without raising any exceptions.
 21 | 
 22 |     Arguments:
 23 |         dirpath: Path where a folder need to be created.
 24 | 
 25 |     Returns:
 26 |         Path to the (created) folder.
 27 |     """
 28 |     try:
 29 |         os.makedirs(dirpath)
 30 |     except FileExistsError:
 31 |         pass
 32 | 
 33 |     return dirpath
 34 | 
 35 | 
 36 | def download(url, dst, desc=None, force=False):
 37 |     """Download a remote file using HTTP get request.
 38 | 
 39 |     Args:
 40 |         url (str): URL where to get the file.
 41 |         dst (str): Destination folder where to save the file.
 42 |         force (bool, optional):
 43 |             Download again if it exists]. Defaults to False.
 44 | 
 45 |     Returns:
 46 |         str: Path to the downloaded file.
 47 | 
 48 |     Notes:
 49 |         This code is inspired by
 50 |         https://github.com/huggingface/transformers/blob/v4.0.0/src/transformers/file_utils.py#L1069
 51 |     """
 52 |     filename = url.split("/")[-1]
 53 |     path = pjoin(mkdirs(dst), filename)
 54 | 
 55 |     if os.path.isfile(path) and not force:
 56 |         return path
 57 | 
 58 |     # Download to a temp folder first to avoid corrupting the cache
 59 |     # with incomplete downloads.
 60 |     temp_dir = mkdirs(pjoin(tempfile.gettempdir(), "tales"))
 61 |     temp_path = pjoin(temp_dir, filename)
 62 |     with open(temp_path, "ab") as temp_file:
 63 |         headers = {}
 64 |         resume_size = temp_file.tell()
 65 |         if resume_size:
 66 |             headers["Range"] = f"bytes={resume_size}-"
 67 |             headers["x-ms-version"] = "2020-04-08"  # Needed for Range support.
 68 | 
 69 |         r = requests.get(url, stream=True, headers=headers)
 70 |         if r.headers.get("x-ms-error-code") == "InvalidRange" and r.headers[
 71 |             "Content-Range"
 72 |         ].rsplit("/", 1)[-1] == str(resume_size):
 73 |             shutil.move(temp_path, path)
 74 |             return path
 75 | 
 76 |         r.raise_for_status()  # Bad request.
 77 |         content_length = r.headers.get("Content-Length")
 78 |         total = resume_size + int(content_length)
 79 |         pbar = tqdm(
 80 |             unit="B",
 81 |             initial=resume_size,
 82 |             unit_scale=True,
 83 |             total=total,
 84 |             desc=desc or "Downloading {}".format(filename),
 85 |             leave=False,
 86 |         )
 87 | 
 88 |         for chunk in r.iter_content(chunk_size=1024):
 89 |             if chunk:  # filter out keep-alive new chunks
 90 |                 pbar.update(len(chunk))
 91 |                 temp_file.write(chunk)
 92 | 
 93 |     shutil.move(temp_path, path)
 94 | 
 95 |     pbar.close()
 96 |     return path
 97 | 
 98 | 
 99 | def merge_messages(messages):
100 |     """Merge messages from the same role into a single message."""
101 |     messages_out = [dict(messages[0])]
102 |     for message in messages[1:]:
103 |         if message["role"] == messages_out[-1]["role"]:
104 |             messages_out[-1]["content"] += "\n\n" + message["content"]
105 |         else:
106 |             messages_out.append(dict(message))
107 | 
108 |     return messages_out
109 | 
110 | 
def messages2conversation(model, messages):
    """Rebuild a ``Conversation`` from a list of chat messages.

    Messages are merged per role first. Each assistant message then becomes
    an already completed ``Response`` whose prompt is the user message (and
    system message, if any) seen since the previous assistant message. User
    or system messages that are not followed by an assistant message produce
    no response.

    Args:
        model: Model object attached to the prompts, responses and conversation.
        messages: List of mappings with ``"role"`` and ``"content"`` keys.

    Returns:
        Conversation: Conversation holding the reconstructed responses.
    """
    messages = merge_messages(messages)  # Just in case.
    responses = []

    system = None
    # Start unset so an assistant message that precedes any user message
    # yields a response with no prompt rather than an unbound-variable error.
    prompt = None
    for message in messages:
        if message["role"] == "system":
            system = message["content"]
            continue

        if message["role"] == "user":
            prompt = message["content"]
            continue

        if message["role"] == "assistant":
            # Make a fake response object.
            response = Response(
                model=model,
                prompt=Prompt(
                    prompt,
                    system=system,
                    model=model,
                ),
                stream=False,
            )
            # Mark the response as finished and give it its text directly.
            response._done = True
            response._chunks = [message["content"]]
            responses.append(response)

            # The next response starts from a clean prompt.
            system = None
            prompt = None

    return Conversation(model, responses=responses)
144 | 
145 | 
def format_messages_to_markdown(messages):
    """Render messages as one markdown string.

    Each message becomes a level-4 heading with the capitalized role,
    followed by the content inside a fenced code block.
    """
    sections = (
        f"#### {message['role'].capitalize()}\n\n```\n{message['content']}\n```\n\n"
        for message in messages
    )
    return "".join(sections)
154 | 
155 | 
def is_recoverable_error(exception):
    """Tell whether ``exception`` is one of the errors that can be retried.

    The exception is identified by ``<module>.<class name>`` and compared
    against a fixed list of names. The name and the exception itself are
    logged as warnings on every call.

    Returns:
        bool: True when the exception's full name is in the list.
    """
    # Exceptions thrown by various libraries that can be retried.
    recoverable_errors = [
        "openai.APIStatusError",
        "openai.APITimeoutError",
        "openai.error.Timeout",
        "openai.error.RateLimitError",
        "openai.error.ServiceUnavailableError",
        "openai.Timeout",
        "openai.APIError",
        "openai.APIConnectionError",
        "openai.RateLimitError",
        "openai.InternalServerError",
        "anthropic.error.RateLimitError",
        "anthropic.InternalServerError",
        "anthropic.OverloadedError",
        "anthropic.APIStatusError",
        "anthropic._exceptions.OverloadedError",
        "llm.errors.ModelError",  # Gemini
        # Add more as needed
    ]

    exc_class = exception.__class__
    exception_full_name = f"{exc_class.__module__}.{exc_class.__name__}"

    log.warning(f"Exception_full_name: {exception_full_name}")
    log.warning(f"Exception: {exception}")

    is_recoverable = exception_full_name in recoverable_errors
    return is_recoverable
183 | 
184 | 
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also serializes NumPy scalars and arrays."""

    def default(self, obj):
        """Convert NumPy values to plain Python; defer everything else.

        NumPy integer and floating scalars become Python numbers, arrays
        become (nested) lists. Any other object is passed to the parent
        implementation.
        """
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, (np.integer, np.floating)):
            return obj.item()
        return super().default(obj)
192 | 


--------------------------------------------------------------------------------
/tales/version.py:
--------------------------------------------------------------------------------
# Package version string.
__version__ = "0.0.1"
2 | 


--------------------------------------------------------------------------------
/website/Gemfile.lock:
--------------------------------------------------------------------------------
  1 | GEM
  2 |   remote: https://rubygems.org/
  3 |   specs:
  4 |     activesupport (7.2.2.1)
  5 |       base64
  6 |       benchmark (>= 0.3)
  7 |       bigdecimal
  8 |       concurrent-ruby (~> 1.0, >= 1.3.1)
  9 |       connection_pool (>= 2.2.5)
 10 |       drb
 11 |       i18n (>= 1.6, < 2)
 12 |       logger (>= 1.4.2)
 13 |       minitest (>= 5.1)
 14 |       securerandom (>= 0.3)
 15 |       tzinfo (~> 2.0, >= 2.0.5)
 16 |     addressable (2.8.7)
 17 |       public_suffix (>= 2.0.2, < 7.0)
 18 |     base64 (0.2.0)
 19 |     benchmark (0.4.0)
 20 |     bigdecimal (3.1.9)
 21 |     coffee-script (2.4.1)
 22 |       coffee-script-source
 23 |       execjs
 24 |     coffee-script-source (1.12.2)
 25 |     colorator (1.1.0)
 26 |     commonmarker (0.23.11)
 27 |     concurrent-ruby (1.3.5)
 28 |     connection_pool (2.5.0)
 29 |     csv (3.3.4)
 30 |     dnsruby (1.72.4)
 31 |       base64 (~> 0.2.0)
 32 |       logger (~> 1.6.5)
 33 |       simpleidn (~> 0.2.1)
 34 |     drb (2.2.1)
 35 |     em-websocket (0.5.3)
 36 |       eventmachine (>= 0.12.9)
 37 |       http_parser.rb (~> 0)
 38 |     ethon (0.16.0)
 39 |       ffi (>= 1.15.0)
 40 |     eventmachine (1.2.7)
 41 |     execjs (2.10.0)
 42 |     faraday (2.13.0)
 43 |       faraday-net_http (>= 2.0, < 3.5)
 44 |       json
 45 |       logger
 46 |     faraday-net_http (3.4.0)
 47 |       net-http (>= 0.5.0)
 48 |     ffi (1.17.1)
 49 |     ffi (1.17.1-arm64-darwin)
 50 |     ffi (1.17.1-x86_64-darwin)
 51 |     forwardable-extended (2.6.0)
 52 |     gemoji (4.1.0)
 53 |     github-pages (232)
 54 |       github-pages-health-check (= 1.18.2)
 55 |       jekyll (= 3.10.0)
 56 |       jekyll-avatar (= 0.8.0)
 57 |       jekyll-coffeescript (= 1.2.2)
 58 |       jekyll-commonmark-ghpages (= 0.5.1)
 59 |       jekyll-default-layout (= 0.1.5)
 60 |       jekyll-feed (= 0.17.0)
 61 |       jekyll-gist (= 1.5.0)
 62 |       jekyll-github-metadata (= 2.16.1)
 63 |       jekyll-include-cache (= 0.2.1)
 64 |       jekyll-mentions (= 1.6.0)
 65 |       jekyll-optional-front-matter (= 0.3.2)
 66 |       jekyll-paginate (= 1.1.0)
 67 |       jekyll-readme-index (= 0.3.0)
 68 |       jekyll-redirect-from (= 0.16.0)
 69 |       jekyll-relative-links (= 0.6.1)
 70 |       jekyll-remote-theme (= 0.4.3)
 71 |       jekyll-sass-converter (= 1.5.2)
 72 |       jekyll-seo-tag (= 2.8.0)
 73 |       jekyll-sitemap (= 1.4.0)
 74 |       jekyll-swiss (= 1.0.0)
 75 |       jekyll-theme-architect (= 0.2.0)
 76 |       jekyll-theme-cayman (= 0.2.0)
 77 |       jekyll-theme-dinky (= 0.2.0)
 78 |       jekyll-theme-hacker (= 0.2.0)
 79 |       jekyll-theme-leap-day (= 0.2.0)
 80 |       jekyll-theme-merlot (= 0.2.0)
 81 |       jekyll-theme-midnight (= 0.2.0)
 82 |       jekyll-theme-minimal (= 0.2.0)
 83 |       jekyll-theme-modernist (= 0.2.0)
 84 |       jekyll-theme-primer (= 0.6.0)
 85 |       jekyll-theme-slate (= 0.2.0)
 86 |       jekyll-theme-tactile (= 0.2.0)
 87 |       jekyll-theme-time-machine (= 0.2.0)
 88 |       jekyll-titles-from-headings (= 0.5.3)
 89 |       jemoji (= 0.13.0)
 90 |       kramdown (= 2.4.0)
 91 |       kramdown-parser-gfm (= 1.1.0)
 92 |       liquid (= 4.0.4)
 93 |       mercenary (~> 0.3)
 94 |       minima (= 2.5.1)
 95 |       nokogiri (>= 1.16.2, < 2.0)
 96 |       rouge (= 3.30.0)
 97 |       terminal-table (~> 1.4)
 98 |       webrick (~> 1.8)
 99 |     github-pages-health-check (1.18.2)
100 |       addressable (~> 2.3)
101 |       dnsruby (~> 1.60)
102 |       octokit (>= 4, < 8)
103 |       public_suffix (>= 3.0, < 6.0)
104 |       typhoeus (~> 1.3)
105 |     html-pipeline (2.14.3)
106 |       activesupport (>= 2)
107 |       nokogiri (>= 1.4)
108 |     http_parser.rb (0.8.0)
109 |     i18n (1.14.7)
110 |       concurrent-ruby (~> 1.0)
111 |     jekyll (3.10.0)
112 |       addressable (~> 2.4)
113 |       colorator (~> 1.0)
114 |       csv (~> 3.0)
115 |       em-websocket (~> 0.5)
116 |       i18n (>= 0.7, < 2)
117 |       jekyll-sass-converter (~> 1.0)
118 |       jekyll-watch (~> 2.0)
119 |       kramdown (>= 1.17, < 3)
120 |       liquid (~> 4.0)
121 |       mercenary (~> 0.3.3)
122 |       pathutil (~> 0.9)
123 |       rouge (>= 1.7, < 4)
124 |       safe_yaml (~> 1.0)
125 |       webrick (>= 1.0)
126 |     jekyll-avatar (0.8.0)
127 |       jekyll (>= 3.0, < 5.0)
128 |     jekyll-coffeescript (1.2.2)
129 |       coffee-script (~> 2.2)
130 |       coffee-script-source (~> 1.12)
131 |     jekyll-commonmark (1.4.0)
132 |       commonmarker (~> 0.22)
133 |     jekyll-commonmark-ghpages (0.5.1)
134 |       commonmarker (>= 0.23.7, < 1.1.0)
135 |       jekyll (>= 3.9, < 4.0)
136 |       jekyll-commonmark (~> 1.4.0)
137 |       rouge (>= 2.0, < 5.0)
138 |     jekyll-default-layout (0.1.5)
139 |       jekyll (>= 3.0, < 5.0)
140 |     jekyll-feed (0.17.0)
141 |       jekyll (>= 3.7, < 5.0)
142 |     jekyll-gist (1.5.0)
143 |       octokit (~> 4.2)
144 |     jekyll-github-metadata (2.16.1)
145 |       jekyll (>= 3.4, < 5.0)
146 |       octokit (>= 4, < 7, != 4.4.0)
147 |     jekyll-include-cache (0.2.1)
148 |       jekyll (>= 3.7, < 5.0)
149 |     jekyll-mentions (1.6.0)
150 |       html-pipeline (~> 2.3)
151 |       jekyll (>= 3.7, < 5.0)
152 |     jekyll-optional-front-matter (0.3.2)
153 |       jekyll (>= 3.0, < 5.0)
154 |     jekyll-paginate (1.1.0)
155 |     jekyll-readme-index (0.3.0)
156 |       jekyll (>= 3.0, < 5.0)
157 |     jekyll-redirect-from (0.16.0)
158 |       jekyll (>= 3.3, < 5.0)
159 |     jekyll-relative-links (0.6.1)
160 |       jekyll (>= 3.3, < 5.0)
161 |     jekyll-remote-theme (0.4.3)
162 |       addressable (~> 2.0)
163 |       jekyll (>= 3.5, < 5.0)
164 |       jekyll-sass-converter (>= 1.0, <= 3.0.0, != 2.0.0)
165 |       rubyzip (>= 1.3.0, < 3.0)
166 |     jekyll-sass-converter (1.5.2)
167 |       sass (~> 3.4)
168 |     jekyll-seo-tag (2.8.0)
169 |       jekyll (>= 3.8, < 5.0)
170 |     jekyll-sitemap (1.4.0)
171 |       jekyll (>= 3.7, < 5.0)
172 |     jekyll-swiss (1.0.0)
173 |     jekyll-theme-architect (0.2.0)
174 |       jekyll (> 3.5, < 5.0)
175 |       jekyll-seo-tag (~> 2.0)
176 |     jekyll-theme-cayman (0.2.0)
177 |       jekyll (> 3.5, < 5.0)
178 |       jekyll-seo-tag (~> 2.0)
179 |     jekyll-theme-dinky (0.2.0)
180 |       jekyll (> 3.5, < 5.0)
181 |       jekyll-seo-tag (~> 2.0)
182 |     jekyll-theme-hacker (0.2.0)
183 |       jekyll (> 3.5, < 5.0)
184 |       jekyll-seo-tag (~> 2.0)
185 |     jekyll-theme-leap-day (0.2.0)
186 |       jekyll (> 3.5, < 5.0)
187 |       jekyll-seo-tag (~> 2.0)
188 |     jekyll-theme-merlot (0.2.0)
189 |       jekyll (> 3.5, < 5.0)
190 |       jekyll-seo-tag (~> 2.0)
191 |     jekyll-theme-midnight (0.2.0)
192 |       jekyll (> 3.5, < 5.0)
193 |       jekyll-seo-tag (~> 2.0)
194 |     jekyll-theme-minimal (0.2.0)
195 |       jekyll (> 3.5, < 5.0)
196 |       jekyll-seo-tag (~> 2.0)
197 |     jekyll-theme-modernist (0.2.0)
198 |       jekyll (> 3.5, < 5.0)
199 |       jekyll-seo-tag (~> 2.0)
200 |     jekyll-theme-primer (0.6.0)
201 |       jekyll (> 3.5, < 5.0)
202 |       jekyll-github-metadata (~> 2.9)
203 |       jekyll-seo-tag (~> 2.0)
204 |     jekyll-theme-slate (0.2.0)
205 |       jekyll (> 3.5, < 5.0)
206 |       jekyll-seo-tag (~> 2.0)
207 |     jekyll-theme-tactile (0.2.0)
208 |       jekyll (> 3.5, < 5.0)
209 |       jekyll-seo-tag (~> 2.0)
210 |     jekyll-theme-time-machine (0.2.0)
211 |       jekyll (> 3.5, < 5.0)
212 |       jekyll-seo-tag (~> 2.0)
213 |     jekyll-titles-from-headings (0.5.3)
214 |       jekyll (>= 3.3, < 5.0)
215 |     jekyll-watch (2.2.1)
216 |       listen (~> 3.0)
217 |     jemoji (0.13.0)
218 |       gemoji (>= 3, < 5)
219 |       html-pipeline (~> 2.2)
220 |       jekyll (>= 3.0, < 5.0)
221 |     json (2.10.2)
222 |     kramdown (2.4.0)
223 |       rexml
224 |     kramdown-parser-gfm (1.1.0)
225 |       kramdown (~> 2.0)
226 |     liquid (4.0.4)
227 |     listen (3.9.0)
228 |       rb-fsevent (~> 0.10, >= 0.10.3)
229 |       rb-inotify (~> 0.9, >= 0.9.10)
230 |     logger (1.6.6)
231 |     mercenary (0.3.6)
232 |     mini_portile2 (2.8.8)
233 |     minima (2.5.1)
234 |       jekyll (>= 3.5, < 5.0)
235 |       jekyll-feed (~> 0.9)
236 |       jekyll-seo-tag (~> 2.1)
237 |     minitest (5.25.5)
238 |     net-http (0.6.0)
239 |       uri
240 |     nokogiri (1.18.7)
241 |       mini_portile2 (~> 2.8.2)
242 |       racc (~> 1.4)
243 |     nokogiri (1.18.7-arm64-darwin)
244 |       racc (~> 1.4)
245 |     nokogiri (1.18.7-x86_64-darwin)
246 |       racc (~> 1.4)
247 |     octokit (4.25.1)
248 |       faraday (>= 1, < 3)
249 |       sawyer (~> 0.9)
250 |     pathutil (0.16.2)
251 |       forwardable-extended (~> 2.6)
252 |     public_suffix (5.1.1)
253 |     racc (1.8.1)
254 |     rb-fsevent (0.11.2)
255 |     rb-inotify (0.11.1)
256 |       ffi (~> 1.0)
257 |     rexml (3.4.1)
258 |     rouge (3.30.0)
259 |     rubyzip (2.4.1)
260 |     safe_yaml (1.0.5)
261 |     sass (3.7.4)
262 |       sass-listen (~> 4.0.0)
263 |     sass-listen (4.0.0)
264 |       rb-fsevent (~> 0.9, >= 0.9.4)
265 |       rb-inotify (~> 0.9, >= 0.9.7)
266 |     sawyer (0.9.2)
267 |       addressable (>= 2.3.5)
268 |       faraday (>= 0.17.3, < 3)
269 |     securerandom (0.4.1)
270 |     simpleidn (0.2.3)
271 |     terminal-table (1.8.0)
272 |       unicode-display_width (~> 1.1, >= 1.1.1)
273 |     typhoeus (1.4.1)
274 |       ethon (>= 0.9.0)
275 |     tzinfo (2.0.6)
276 |       concurrent-ruby (~> 1.0)
277 |     unicode-display_width (1.8.0)
278 |     uri (1.0.3)
279 |     webrick (1.9.1)
280 | 
281 | PLATFORMS
282 |   arm64-darwin
283 |   ruby
284 |   x86_64-darwin
285 |   x86_64-linux
286 | 
287 | DEPENDENCIES
288 |   github-pages
289 | 
290 | BUNDLED WITH
291 |    2.6.8
292 | 


--------------------------------------------------------------------------------
/website/_site/assets/figs/alfworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/alfworld_all_games.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/all_framework_scores.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/all_framework_scores.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/jericho_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/jericho_all_games.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/radar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/radar_chart.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/radar_chart_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/radar_chart_zoom.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/scienceworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/scienceworld_all_games.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/text-benchmark_bar_chart.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_bar_chart.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/text-benchmark_radar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_radar.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/text-benchmark_radar_zoom.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/text-benchmark_radar_zoom.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/textworld_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/textworld_all_games.png


--------------------------------------------------------------------------------
/website/_site/assets/figs/textworld_express_all_games.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/assets/figs/textworld_express_all_games.png


--------------------------------------------------------------------------------
/website/_site/assets/js/tabs.js:
--------------------------------------------------------------------------------
 1 | function openTab(evt, tabName) {
 2 |     var i, tabcontent, tabbuttons;
 3 |     
 4 |     // Hide all tab content
 5 |     tabcontent = document.getElementsByClassName("tab-content");
 6 |     for (i = 0; i < tabcontent.length; i++) {
 7 |         tabcontent[i].style.display = "none";
 8 |     }
 9 |     
10 |     // Remove "active" class from all tab buttons
11 |     tabbuttons = document.getElementsByClassName("tab-button");
12 |     for (i = 0; i < tabbuttons.length; i++) {
13 |         tabbuttons[i].className = tabbuttons[i].className.replace(" active", "");
14 |     }
15 |     
16 |     // Show the current tab and add "active" class to the button
17 |     document.getElementById(tabName).style.display = "block";
18 |     evt.currentTarget.className += " active";
19 | }
20 | 
21 | // Nested tab functionality
22 | function openNestedTab(evt, tabName) {
23 |     var i, tabcontent, tabbuttons;
24 |     
25 |     // Hide all nested tab content within the parent tab
26 |     var parentTab = evt.currentTarget.closest('.tab-content');
27 |     tabcontent = parentTab.getElementsByClassName("nested-tab-content");
28 |     for (i = 0; i < tabcontent.length; i++) {
29 |         tabcontent[i].style.display = "none";
30 |     }
31 |     
32 |     // Remove "active" class from all nested tab buttons
33 |     tabbuttons = parentTab.getElementsByClassName("nested-tab-button");
34 |     for (i = 0; i < tabbuttons.length; i++) {
35 |         tabbuttons[i].className = tabbuttons[i].className.replace(" active", "");
36 |     }
37 |     
38 |     // Show the current nested tab and add "active" class to the button
39 |     document.getElementById(tabName).style.display = "block";
40 |     evt.currentTarget.className += " active";
41 | }
42 | 
43 | // Initialize tabs
44 | document.addEventListener('DOMContentLoaded', function() {
45 |     // Make sure the first tab and its first nested tab are active by default
46 |     document.querySelector('.tab-button').click();
47 | });


--------------------------------------------------------------------------------
/website/_site/favicon.ico:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/tale-suite/07cb43d7713d7bd7a0e7d9ef9fce1ef84ac8aeac/website/_site/favicon.ico


--------------------------------------------------------------------------------
/website/_site/index.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html lang="en-US">
  3 |   <head>
  4 |     <meta charset="UTF-8">
  5 | 
  6 | <!-- Begin Jekyll SEO tag v2.8.0 -->
  7 | <title>T A L E S | the Text Adventure Learning Environment Suite</title>
  8 | <meta name="generator" content="Jekyll v3.10.0" />
  9 | <meta property="og:title" content="T A L E S" />
 10 | <meta property="og:locale" content="en_US" />
 11 | <meta name="description" content="the Text Adventure Learning Environment Suite" />
 12 | <meta property="og:description" content="the Text Adventure Learning Environment Suite" />
 13 | <link rel="canonical" href="http://0.0.0.0:4000/" />
 14 | <meta property="og:url" content="http://0.0.0.0:4000/" />
 15 | <meta property="og:site_name" content="the Text Adventure Learning Environment Suite" />
 16 | <meta property="og:type" content="website" />
 17 | <meta name="twitter:card" content="summary" />
 18 | <meta property="twitter:title" content="T A L E S" />
 19 | <script type="application/ld+json">
 20 | {"@context":"https://schema.org","@type":"WebSite","description":"the Text Adventure Learning Environment Suite","headline":"T A L E S","name":"the Text Adventure Learning Environment Suite","url":"http://0.0.0.0:4000/"}</script>
 21 | <!-- End Jekyll SEO tag -->
 22 | 
 23 |     <link rel="preconnect" href="https://fonts.gstatic.com">
 24 |     <link rel="preload" href="https://fonts.googleapis.com/css?family=Open+Sans:400,700&display=swap" as="style" type="text/css" crossorigin>
 25 |     <meta name="viewport" content="width=device-width, initial-scale=1">
 26 |     <meta name="theme-color" content="#157878">
 27 |     <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
 28 |     <link rel="stylesheet" href="/assets/css/style.css?v=550595e169aa16debc23fdc0e17e3b4eabc2188d">
 29 |     <!-- start custom head snippets, customize with your own _includes/head-custom.html file -->
 30 | 
 31 | <!-- Setup Analytics -->
 32 | 
 33 | <!-- You can set your favicon here -->
 34 | <!-- link rel="shortcut icon" type="image/x-icon" href="/favicon.ico" -->
 35 | 
 36 | <!-- Add your custom head content here -->
 37 | <script src="/assets/js/tabs.js"></script>
 38 | 
 39 | <!-- end custom head snippets -->
 40 | 
 41 |   </head>
 42 |   <body>
 43 |     <a id="skip-to-content" href="#content">Skip to the content.</a>
 44 | 
 45 |     <header class="page-header" role="banner">
 46 |       <h1 class="project-name"><strong><em>T A L E S</em></strong></h1>
 47 |       <h2 class="project-tagline"><em>the</em> <strong><em>T</em></strong><em>ext</em> <strong><em>A</em></strong><em>dventure</em> <strong><em>L</em></strong><em>earning</em> <strong><em>E</em></strong><em>nvironment</em> <strong><em>S</em></strong><em>uite</em></h2>
 48 |       
 49 |         <a href="https://github.com/microsoft/text-games-benchmark" class="btn">View on GitHub</a>
 50 |       
 51 |       
 52 |     </header>
 53 | 
 54 |     <main id="content" class="main-content" role="main">
 55 |       <div class="tab-container">
 56 |     <div class="tabs">
 57 |         <button class="tab-button active" onclick="openTab(event, 'tab1')">Overview</button>
 58 |         <button class="tab-button" onclick="openTab(event, 'tab4')">Environments</button>
 59 |         <button class="tab-button" onclick="openTab(event, 'tab3')">Scores By Framework</button>
 60 |         <button class="tab-button" onclick="openTab(event, 'tab2')">Scores By Game</button>
 61 |         <button class="tab-button" onclick="openTab(event, 'tab5')">Bloopers</button>
 62 |     </div>
 63 |     
 64 |     <div id="tab1" class="tab-content active">
 65 |         <!-- Nested tabs for tab1 -->
 66 |         <div class="nested-tabs">
 67 |             <button class="nested-tab-button active" onclick="openNestedTab(event, 'tab1-subtab1')">Overview</button>
 68 |             <button class="nested-tab-button" onclick="openNestedTab(event, 'tab1-subtab2')">Environments</button>
 69 |         </div>
 70 |         
 71 |         <div id="tab1-subtab1" class="nested-tab-content active">
 72 |             <h2 id="overview">Overview</h2>
 73 |             Insert overview description here.
 74 |         </div>
 75 |         
 76 |         <div id="tab1-subtab2" class="nested-tab-content">
 77 |             <h2 id="environment_description">Environments</h2>
 78 |             
 79 |         </div>
 80 |     </div>
 81 |     
 82 |     <div id="tab2" class="tab-content">
 83 |         <!-- Nested tabs for tab2 -->
 84 |         <div class="nested-tabs">
 85 |             <button class="nested-tab-button active" onclick="openNestedTab(event, 'tab2-subtab1')">Textworld</button>
 86 |             <button class="nested-tab-button" onclick="openNestedTab(event, 'tab2-subtab2')">Textworld Express</button>
 87 |             <button class="nested-tab-button" onclick="openNestedTab(event, 'tab3-subtab3')">Alfworld</button>
 88 |             <button class="nested-tab-button" onclick="openNestedTab(event, 'tab4-subtab4')">Scienceworld</button>
 89 |             <button class="nested-tab-button" onclick="openNestedTab(event, 'tab5-subtab5')">Jericho</button>
 90 |         </div>
 91 |         
 92 |         <div id="tab2-subtab1" class="nested-tab-content active">
 93 |             <h2 id="tw_all_games">Scores for all Textworld games for Top 9 models</h2>
 94 |             <p><img src="assets/figs/textworld_all_games.png" alt="tw_allgames chart" /></p>
 95 |         </div>
 96 | 
 97 |         <div id="tab2-subtab2" class="nested-tab-content active">
 98 |             <h2 id="twx_all_games">Scores for all Textworld Express games for Top 9 models</h2>
 99 |             <p><img src="assets/figs/textworld_express_all_games.png" alt="twx_allgames chart" /></p>
100 |         </div>
101 | 
102 |         <div id="tab3-subtab3" class="nested-tab-content active">
103 |             <h2 id="alfworld_all_games">Scores for all Alfworld games for Top 9 models</h2>
104 |             <p><img src="assets/figs/alfworld_all_games.png" alt="alfw_allgames chart" /></p>
105 |         </div>
106 | 
107 |         <div id="tab4-subtab4" class="nested-tab-content active">
108 |             <h2 id="scienceworld_all_games">Scores for all Scienceworld games for Top 9 models</h2>
109 |             <p><img src="assets/figs/scienceworld_all_games.png" alt="sciencew_allgames chart" /></p>
110 |         </div>
111 | 
112 |         <div id="tab5-subtab5" class="nested-tab-content active">
113 |             <h2 id="jericho_all_games">Scores for all Jericho games for Top 9 models</h2>
114 |             <p><img src="assets/figs/jericho_all_games.png" alt="jericho_allgames chart" /></p>
115 |         </div>
116 |         
117 |         <div id="tab1-subtab1" class="nested-tab-content active">
118 |             <h2 id="jericho_all_games">Scores for all Jericho games for Top 9 models</h2>
119 |             <p><img src="assets/figs/jericho_all_games.png" alt="jerichoallgames chart" /></p>
120 |         </div>
121 |     </div>
122 |     
123 |     <div id="tab3" class="tab-content">
124 |         <!-- Insert Tab 3 content here -->
125 |         <h2>Breakdown of scores per framework</h2>
126 |         <p><img src="assets/figs/all_framework_scores.png" alt="fws chart" /></p>
127 |     </div>
128 |     
129 |     <div id="tab4" class="tab-content">
130 |         <!-- Insert Tab 4 content here -->
131 |         <h2>Tab 4 Content</h2>
132 |         <p>This is where you'll put the content for Tab 4.</p>
133 |     </div>
134 |     
135 |     <div id="tab5" class="tab-content">
136 |         <!-- Insert Tab 5 content here -->
137 |         <h2>Tab 5 Content</h2>
138 |         <p>This is where you'll put the content for Tab 5.</p>
139 |     </div>
140 | </div>
141 | <footer class="site-footer" style="text-align: center;">
142 |     <span class="site-footer-credits">
143 |         <a href="https://go.microsoft.com/fwlink/?LinkId=521839">Privacy</a>
144 |         |
145 |         <a href="https://go.microsoft.com/fwlink/?LinkId=2259814">Consumer Health Privacy</a>
146 |         |
147 |         <a id="cookiesManager" onClick="manageConsent();">Cookies</a>
148 |         |
149 |         <a href="https://go.microsoft.com/fwlink/?LinkID=206977">Terms of Use</a>
150 |         |
151 |         <a href="https://www.microsoft.com/trademarks">Trademarks</a>
152 |         |
153 |         <a href="https://www.microsoft.com" id="copyright">©️ 2024 Microsoft</a>
154 |     </span>
155 |   </footer></main>
156 |   </body>
157 | </html>
158 | 


--------------------------------------------------------------------------------