├── src └── inspect_ai │ ├── py.typed │ ├── _view │ ├── www │ │ ├── .gitignore │ │ ├── preact │ │ │ ├── htm │ │ │ │ ├── preact.mjs │ │ │ │ ├── preact.js │ │ │ │ └── htm.mjs │ │ │ └── preact-hooks.mjs │ │ ├── src │ │ │ ├── utils │ │ │ │ ├── Type.mjs │ │ │ │ ├── sleep.mjs │ │ │ │ ├── Git.mjs │ │ │ │ ├── Path.mjs │ │ │ │ └── events.mjs │ │ │ ├── Register.mjs │ │ │ ├── api │ │ │ │ ├── index.mjs │ │ │ │ ├── api-vscode.mjs │ │ │ │ └── api-browser.mjs │ │ │ ├── log-reader │ │ │ │ ├── Native-Log-Reader.mjs │ │ │ │ └── Log-Reader.mjs │ │ │ ├── components │ │ │ │ ├── ToolButton.mjs │ │ │ │ ├── EmptyPanel.mjs │ │ │ │ ├── CopyButton.mjs │ │ │ │ ├── LabeledValue.mjs │ │ │ │ ├── LoadingScreen.mjs │ │ │ │ ├── AppErrorBoundary.mjs │ │ │ │ ├── AnsiDisplay.css │ │ │ │ ├── MorePopOver.mjs │ │ │ │ ├── ErrorPanel.mjs │ │ │ │ ├── MarkdownDiv.mjs │ │ │ │ ├── Dialog.mjs │ │ │ │ └── MessageContent.mjs │ │ │ ├── workspace │ │ │ │ └── TaskErrorPanel.mjs │ │ │ ├── samples │ │ │ │ ├── tools │ │ │ │ │ └── EpochFilter.mjs │ │ │ │ ├── SamplesTools.mjs │ │ │ │ └── SampleDialog.mjs │ │ │ └── usage │ │ │ │ ├── ModelTokenTable.mjs │ │ │ │ └── UsageCard.mjs │ │ ├── libs │ │ │ ├── bootstrap │ │ │ │ └── css │ │ │ │ │ └── fonts │ │ │ │ │ ├── bootstrap-icons.woff │ │ │ │ │ └── bootstrap-icons.woff2 │ │ │ └── prism │ │ │ │ └── prism.min.css │ │ └── favicon.svg │ └── schema.py │ ├── __main__.py │ ├── _eval │ └── task │ │ ├── constants.py │ │ └── util.py │ ├── _util │ ├── dev.py │ ├── pattern.py │ ├── datetime.py │ ├── _async.py │ ├── samples.py │ ├── docstring.py │ ├── appdirs.py │ ├── text.py │ ├── constants.py │ ├── version.py │ ├── url.py │ ├── error.py │ ├── git.py │ ├── images.py │ ├── json.py │ ├── platform.py │ └── retry.py │ ├── _display │ ├── __init__.py │ └── _display.py │ ├── scorer │ ├── _metrics │ │ ├── __init__.py │ │ ├── mean.py │ │ ├── accuracy.py │ │ └── std.py │ ├── __init__.py │ ├── _multi.py │ └── _match.py │ ├── util │ ├── __init__.py │ └── _context │ │ ├── __init__.py │ │ └── logger.py │ ├── solver │ ├── _util.py │ ├── __init__.py │ └── _tool │ │ └── use_tools.py │ ├── _cli │ ├── util.py │ ├── view.py │ ├── main.py │ └── common.py │ ├── dataset │ ├── __init__.py │ ├── _examples │ │ └── biology_qa.jsonl │ └── _sources │ │ └── example.py │ ├── model │ ├── _providers │ │ ├── ollama.py │ │ └── util.py │ ├── __init__.py │ └── _tool.py │ ├── __init__.py │ └── log │ └── __init__.py ├── tests ├── test_helpers │ └── __init__.py ├── test_task_list │ ├── __init__.py │ ├── multiple_dir │ │ ├── bar.py │ │ ├── foo.py │ │ ├── _decoy2.py │ │ └── _decoy │ │ │ └── testit.py │ ├── recurse │ │ ├── folder1 │ │ │ ├── _decoy.py │ │ │ └── theta.py │ │ ├── folder2 │ │ │ ├── another.py │ │ │ ├── first.py │ │ │ └── .folder3 │ │ │ │ └── epsilon.py │ │ └── .folder3 │ │ │ └── epsilon.py │ ├── multiple.py │ └── attribs.ipynb ├── test_package │ ├── inspect_package │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── inspect_ai.py │ │ └── modelapi │ │ │ └── custom.py │ ├── .gitignore │ └── pyproject.toml ├── test_eval_log │ ├── log_invalid.txt │ ├── log_version_2.txt │ └── log_with_nan.txt ├── test_dataset │ ├── samples.csv │ ├── samples.jsonl │ └── samples.json ├── conftest.py ├── test_examples.py ├── test_cloudflare.py ├── test_model_package.py ├── test_anthropic.py ├── test_registry.py ├── test_openai.py ├── test_num_choices.py ├── test_list_task.py ├── test_images.py ├── test_retry.py ├── test_hf.py ├── test_plan.py ├── test_stop_reason.py ├── scorer │ └── test_answer.py ├── test_subprocess.py ├── test_collapse_user_message.py ├── 
test_collapse_assistant_message.py ├── test_logprobs.py └── test_eval_log.py ├── docs ├── .gitignore ├── _variables.yml ├── images │ ├── inspect.png │ ├── aisi-logo.png │ ├── eval-log.png │ ├── popularity.png │ ├── rate-limit.png │ ├── running-theory.png │ ├── inspect-view-home.png │ ├── inspect-view-info.png │ ├── inspect-view-main.png │ ├── inspect-view-sort.png │ ├── inspect-view-answers.png │ ├── inspect-view-filter.png │ ├── inspect-view-history.png │ ├── inspect-view-logging.png │ ├── inspect-view-scoring.png │ ├── inspect-view-splash.png │ ├── inspect-view-messages.png │ ├── inspect-view-metadata.png │ ├── inspect-vscode-config.png │ ├── inspect-vscode-install.png │ ├── inspect-vscode-logview.png │ ├── inspect-vscode-run-task.png │ ├── inspect-view-logging-console.png │ └── inspect-vscode-output-channel.png ├── _format │ └── pre-render.sh ├── _examples │ └── footer.qmd └── theme.scss ├── tools └── vscode │ ├── .yarnrc │ ├── tools │ └── ts-to-mjs │ │ ├── .gitignore │ │ ├── src │ │ └── index.ts │ │ ├── rollup.config.js │ │ └── package.json │ ├── src │ ├── providers │ │ ├── activity-bar │ │ │ ├── webview │ │ │ │ ├── task-config-webview.css │ │ │ │ └── env-config-webview.css │ │ │ └── task-config-commands.ts │ │ ├── inspect │ │ │ ├── inspect-constants.ts │ │ │ └── inspect-eval-commands.ts │ │ ├── logview │ │ │ ├── logview.ts │ │ │ ├── commands.ts │ │ │ ├── logview-manager.ts │ │ │ └── logview-link-provider.ts │ │ ├── workspace │ │ │ ├── workspace-init.ts │ │ │ └── workspace-env-commands.ts │ │ ├── active-task │ │ │ └── active-task-command.ts │ │ └── settings │ │ │ ├── user-settings.ts │ │ │ └── inspect-settings.ts │ ├── core │ │ ├── text.ts │ │ ├── python │ │ │ ├── index.ts │ │ │ └── code.ts │ │ ├── wait.ts │ │ ├── log.ts │ │ ├── string.ts │ │ ├── nonce.ts │ │ ├── workspace.ts │ │ ├── dispose.ts │ │ ├── command.ts │ │ ├── random.ts │ │ ├── path.ts │ │ ├── git.ts │ │ └── process.ts │ ├── inspect │ │ ├── index.ts │ │ ├── version.ts │ │ ├── list.ts │ │ └── logs.ts │ ├── test │ │ └── extension.test.ts │ └── components │ │ ├── error.ts │ │ ├── focus.ts │ │ ├── symbol.ts │ │ ├── templates.ts │ │ └── document.ts │ ├── .gitignore │ ├── assets │ ├── logo │ │ ├── inspect.png │ │ └── inspect.svg │ ├── www │ │ ├── codicon │ │ │ └── codicon.ttf │ │ └── view │ │ │ └── view-overrides.css │ └── templates │ │ └── task.py.template │ ├── .vscode-test.mjs │ ├── .vscodeignore │ ├── .vscode │ ├── extensions.json │ ├── launch.json │ ├── settings.json │ └── tasks.json │ ├── tsconfig.json │ ├── LICENSE │ ├── .eslintrc.json │ ├── CHANGELOG.md │ └── README.md ├── examples ├── agents │ └── langchain │ │ ├── .gitignore │ │ ├── .env.example │ │ ├── requirements.txt │ │ ├── wikipedia.jsonl │ │ └── README.md ├── theory_of_mind.py ├── biology_qa.py ├── security_guide.py └── popularity.py ├── .gitattributes ├── .vscode ├── extensions.json └── settings.json ├── benchmarks ├── datasets │ ├── mmlu.csv │ └── math_test.csv ├── hellaswag.py ├── boolq.py ├── piqa.py ├── arc.py └── gpqa.py ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── docs.yml │ ├── vscode.yml │ ├── pypi.yml │ └── build.yml ├── requirements.txt ├── .pre-commit-config.yaml ├── LICENSE └── README.md /src/inspect_ai/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/test_task_list/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | /.quarto/ 2 | /_book/ 3 | -------------------------------------------------------------------------------- /tests/test_package/inspect_package/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/vscode/.yarnrc: -------------------------------------------------------------------------------- 1 | --ignore-engines true -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /tests/test_package/inspect_package/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/.gitignore: -------------------------------------------------------------------------------- 1 | .yarn/ -------------------------------------------------------------------------------- /examples/agents/langchain/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv/ 3 | -------------------------------------------------------------------------------- /tests/test_eval_log/log_invalid.txt: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, "status": -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | benchmarks/datasets/** filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/src/index.ts: -------------------------------------------------------------------------------- 1 | 2 | export * from './jsonrpc'; 3 | -------------------------------------------------------------------------------- /examples/agents/langchain/.env.example: -------------------------------------------------------------------------------- 1 | TAVILY_API_KEY=your-tavily-api-key 2 | 3 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/activity-bar/webview/task-config-webview.css: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /tools/vscode/.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | dist 3 | node_modules 4 | .vscode-test/ 5 | *.vsix 6 | -------------------------------------------------------------------------------- /docs/_variables.yml: -------------------------------------------------------------------------------- 1 | 2 | examples-url: https://UKGovernmentBEIS.github.io/inspect_ai/examples.html 3 | -------------------------------------------------------------------------------- 
/docs/images/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect.png -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/htm/preact.mjs: -------------------------------------------------------------------------------- 1 | 2 | import "./htm.mjs"; 3 | export * from "./preact.js" -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/preact-hooks.mjs: -------------------------------------------------------------------------------- 1 | import "./preact.mjs"; 2 | export * from "./hooks.js"; -------------------------------------------------------------------------------- /docs/images/aisi-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/aisi-logo.png -------------------------------------------------------------------------------- /docs/images/eval-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/eval-log.png -------------------------------------------------------------------------------- /docs/images/popularity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/popularity.png -------------------------------------------------------------------------------- /docs/images/rate-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/rate-limit.png -------------------------------------------------------------------------------- /src/inspect_ai/__main__.py: -------------------------------------------------------------------------------- 1 | from ._cli.main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /docs/images/running-theory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/running-theory.png -------------------------------------------------------------------------------- /src/inspect_ai/_eval/task/constants.py: -------------------------------------------------------------------------------- 1 | TASK_FILE_ATTR = "__task_file__" 2 | TASK_RUN_DIR_ATTR = "__task_run_dir__" 3 | -------------------------------------------------------------------------------- /tests/test_dataset/samples.csv: -------------------------------------------------------------------------------- 1 | input,target,label,extra 2 | "Say 'Hello, World'","Hello, World","Hello, World","data" -------------------------------------------------------------------------------- /docs/images/inspect-view-home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-home.png -------------------------------------------------------------------------------- /docs/images/inspect-view-info.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-info.png -------------------------------------------------------------------------------- /docs/images/inspect-view-main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-main.png -------------------------------------------------------------------------------- /docs/images/inspect-view-sort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-sort.png -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.join(os.path.dirname(__file__), "helpers")) 5 | -------------------------------------------------------------------------------- /tools/vscode/src/core/text.ts: -------------------------------------------------------------------------------- 1 | export function lines(text: string): string[] { 2 | return text.split(/\r?\n/); 3 | } 4 | -------------------------------------------------------------------------------- /docs/images/inspect-view-answers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-answers.png -------------------------------------------------------------------------------- /docs/images/inspect-view-filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-filter.png -------------------------------------------------------------------------------- /docs/images/inspect-view-history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-history.png -------------------------------------------------------------------------------- /docs/images/inspect-view-logging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-logging.png -------------------------------------------------------------------------------- /docs/images/inspect-view-scoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-scoring.png -------------------------------------------------------------------------------- /docs/images/inspect-view-splash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-splash.png -------------------------------------------------------------------------------- /tools/vscode/assets/logo/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/tools/vscode/assets/logo/inspect.png -------------------------------------------------------------------------------- /tools/vscode/src/core/python/index.ts: -------------------------------------------------------------------------------- 1 | 2 | 
export * from './code'; 3 | export * from './exec'; 4 | export * from './interpreter'; -------------------------------------------------------------------------------- /docs/images/inspect-view-messages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-messages.png -------------------------------------------------------------------------------- /docs/images/inspect-view-metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-metadata.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-config.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-install.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-logview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-logview.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-run-task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-run-task.png -------------------------------------------------------------------------------- /tests/test_dataset/samples.jsonl: -------------------------------------------------------------------------------- 1 | { "input": "Say 'Hello, World'", "target": "Hello, World", "label": "Hello, World", "extra": "data" } 2 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/bar.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def foo(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/foo.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def foo(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/Type.mjs: -------------------------------------------------------------------------------- 1 | export const isNumeric = (n) => { 2 | return !isNaN(parseFloat(n)) && isFinite(n); 3 | }; 4 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/_decoy2.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def decoy(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tools/vscode/assets/www/codicon/codicon.ttf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/tools/vscode/assets/www/codicon/codicon.ttf -------------------------------------------------------------------------------- /tools/vscode/src/core/wait.ts: -------------------------------------------------------------------------------- 1 | export function sleep(ms: number) { 2 | return new Promise((resolve) => setTimeout(resolve, ms)); 3 | } 4 | -------------------------------------------------------------------------------- /docs/images/inspect-view-logging-console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-logging-console.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-output-channel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-output-channel.png -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/_decoy/testit.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def foo(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder1/_decoy.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def decoy(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder1/theta.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def theta(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder2/another.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def beta(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder2/first.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def alpha(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/dev.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_dev_mode() -> bool: 5 | return os.environ.get("INSPECT_DEV_MODE", None) is not None 6 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/.folder3/epsilon.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def epsilon(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/sleep.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | export function sleep(ms) { 4 | return new 
Promise((resolve) => setTimeout(resolve, ms)); 5 | } 6 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder2/.folder3/epsilon.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def epsilon(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tools/vscode/src/core/log.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | import { window } from "vscode"; 4 | 5 | export const log = window.createOutputChannel("Inspect", { log: true }); 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.python", 4 | "charliermarsh.ruff", 5 | "ms-python.mypy-type-checker" 6 | ] 7 | } -------------------------------------------------------------------------------- /examples/agents/langchain/requirements.txt: -------------------------------------------------------------------------------- 1 | inspect_ai 2 | openai 3 | langchain==0.2.1 4 | langchainhub==0.1.16 5 | langchain-community==0.2.1 6 | wikipedia==1.4.0 7 | -------------------------------------------------------------------------------- /tools/vscode/.vscode-test.mjs: -------------------------------------------------------------------------------- 1 | import { defineConfig } from '@vscode/test-cli'; 2 | 3 | export default defineConfig({ 4 | files: 'out/test/**/*.test.js', 5 | }); 6 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/index.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | export { initInspectProps, inspectVersion } from './props'; 4 | 5 | export { inspectListTasks } from './list'; 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/datasets/mmlu.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:15b6785d49e0012602e089558a7a0dfb916baf97e9295aa25b48062f13c6afbb 3 | size 6667575 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/Register.mjs: -------------------------------------------------------------------------------- 1 | // Register file readers 2 | import "./log-reader/Open-AI-Log-Reader.mjs"; 3 | import "./log-reader/Native-Log-Reader.mjs"; 4 | -------------------------------------------------------------------------------- /benchmarks/datasets/math_test.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1835505d451a6f4b8bfdfe11b90bbd6676f382d2aa269acf8d3e4155947fe451 3 | size 1031861 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_display/__init__.py: -------------------------------------------------------------------------------- 1 | from ._display import Display 2 | from .rich import rich_display 3 | 4 | 5 | def display() -> Display: 6 | return rich_display() 7 | -------------------------------------------------------------------------------- /tests/test_package/.gitignore: 
-------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | venv 6 | .eggs 7 | .pytest_cache 8 | *.egg-info 9 | .DS_Store 10 | dist 11 | build 12 | -------------------------------------------------------------------------------- /tests/test_dataset/samples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "Say 'Hello, World'", 4 | "target": "Hello, World", 5 | "label": "Hello, World", 6 | "extra": "data" 7 | } 8 | ] -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff2 -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/Git.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | export const ghCommitUrl = (origin, commit) => { 4 | const baseUrl = origin.replace(/\.git$/, ""); 5 | return `${baseUrl}/commit/${commit}`; 6 | } -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import accuracy 2 | from .mean import mean, var 3 | from .std import bootstrap_std 4 | 5 | __all__ = ["accuracy", "mean", "var", "bootstrap_std"] 6 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/pattern.py: -------------------------------------------------------------------------------- 1 | ANSWER_PATTERN_LETTER = r"(?i)ANSWER\s*:\s*([A-Za-z])(?:[^\w]|\n|$)" 2 | ANSWER_PATTERN_WORD = r"(?i)ANSWER\s*:\s*(\w+)(?:\n|$)" 3 | ANSWER_PATTERN_LINE = r"(?i)ANSWER\s*:\s*([^\n]+)" 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/api/index.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import browserApi from './api-browser.mjs' 5 | import vscodeApi from './api-vscode.mjs' 6 | 7 | export default window.acquireVsCodeApi ? 
vscodeApi : browserApi 8 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def first(): 6 | return Task([]) 7 | 8 | 9 | @task(name="second_task") 10 | def second(): 11 | return Task([]) 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "13:00" 8 | groups: 9 | python-packages: 10 | patterns: 11 | - "*" 12 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | from test_helpers.utils import run_example, skip_if_no_openai 2 | 3 | 4 | @skip_if_no_openai 5 | def test_examples(): 6 | run_example("security_guide.py", "openai/gpt-4") 7 | run_example("popularity.py", "openai/gpt-4") 8 | -------------------------------------------------------------------------------- /tools/vscode/src/core/string.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // quotes values which have a space 5 | export function shQuote(value: string): string { 6 | if (/\s/g.test(value)) { 7 | return `"${value}"`; 8 | } else { 9 | return value; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tools/vscode/.vscodeignore: -------------------------------------------------------------------------------- 1 | .vscode/** 2 | .vscode-test/** 3 | node_modules/** 4 | src/** 5 | .gitignore 6 | .yarnrc 7 | webpack.config.js 8 | vsc-extension-quickstart.md 9 | **/tsconfig.json 10 | **/.eslintrc.json 11 | **/*.map 12 | **/*.ts 13 | **/.vscode-test.* 14 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | // See http://go.microsoft.com/fwlink/?LinkId=827846 3 | // for the documentation about the extensions.json format 4 | "recommendations": ["dbaeumer.vscode-eslint", "amodio.tsl-problem-matcher", "ms-vscode.extension-test-runner"] 5 | } 6 | -------------------------------------------------------------------------------- /tools/vscode/src/test/extension.test.ts: -------------------------------------------------------------------------------- 1 | import * as assert from 'assert'; 2 | 3 | suite('Extension Test Suite', () => { 4 | 5 | test('Sample test', () => { 6 | assert.strictEqual(-1, [1, 2, 3].indexOf(5)); 7 | assert.strictEqual(-1, [1, 2, 3].indexOf(0)); 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/htm/preact.js: -------------------------------------------------------------------------------- 1 | /* esm.sh - esbuild bundle(htm@3.1.1/preact) es2022 production */ 2 | import { h as r } from "preact"; import { h as d, render as f, Component as h } from "preact"; import o from "./htm.mjs"; var p = o.bind(r); export { h as Component, d as h, p as html, f as render }; 3 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/Path.mjs: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | export const filename = (path) => { 5 | const pathparts = path.split("/"); 6 | const basename = pathparts.slice(-1)[0]; 7 | const match = basename.match(/(.*)\.\S+$/); 8 | if (match) { 9 | return match[1]; 10 | } else { 11 | return path; 12 | } 13 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/datetime.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Literal 3 | 4 | 5 | def iso_now( 6 | timespec: Literal[ 7 | "auto", "hours", "minutes", "seconds", "milliseconds", "microseconds" 8 | ] = "seconds", 9 | ) -> str: 10 | return datetime.now().isoformat(timespec=timespec) 11 | -------------------------------------------------------------------------------- /src/inspect_ai/util/__init__.py: -------------------------------------------------------------------------------- 1 | from ._context.concurrency import concurrency 2 | from ._context.resource import resource 3 | from ._context.subprocess import ( 4 | ProcessResult, 5 | subprocess, 6 | ) 7 | 8 | __all__ = [ 9 | "ProcessResult", 10 | "concurrency", 11 | "resource", 12 | "subprocess", 13 | ] 14 | -------------------------------------------------------------------------------- /tools/vscode/src/components/error.ts: -------------------------------------------------------------------------------- 1 | import { 2 | window, 3 | } from "vscode"; 4 | 5 | 6 | 7 | export async function showError(msg: string, error?: Error) { 8 | const message = [msg]; 9 | if (error) { 10 | message.push(error.message); 11 | } 12 | await window.showErrorMessage(message.join("\n"), "Ok"); 13 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | 4 | 5 | def is_callable_coroutine(func_or_cls: Any) -> bool: 6 | if asyncio.iscoroutinefunction(func_or_cls): 7 | return True 8 | elif callable(func_or_cls): 9 | return asyncio.iscoroutinefunction(func_or_cls.__call__) 10 | return False 11 | -------------------------------------------------------------------------------- /tools/vscode/src/core/nonce.ts: -------------------------------------------------------------------------------- 1 | 2 | export function getNonce() { 3 | let text = ""; 4 | const possible = 5 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 6 | for (let i = 0; i < 64; i++) { 7 | text += possible.charAt(Math.floor(Math.random() * possible.length)); 8 | } 9 | return text; 10 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | debugpy 3 | fsspec 4 | httpx 5 | numpy 6 | platformdirs 7 | python-dotenv 8 | jsonlines 9 | json-stream 10 | nest_asyncio 11 | pydantic>=2 12 | s3fs>=2023 13 | semver 14 | shortuuid 15 | tenacity 16 | beautifulsoup4 17 | docstring-parser 18 | typing_extensions 19 | pyyaml 20 | rich 21 | psutil 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/inspect_ai/util/_context/__init__.py: -------------------------------------------------------------------------------- 1 | from .concurrency import init_concurrency 2 | from .logger import
init_logger_records 3 | from .subprocess import init_subprocess 4 | 5 | 6 | def init_async_context(max_subprocesses: int | None = None) -> None: 7 | init_concurrency() 8 | init_subprocess(max_subprocesses) 9 | init_logger_records() 10 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs: -------------------------------------------------------------------------------- 1 | import { register } from "./Log-Reader.mjs"; 2 | 3 | export const rawFileReader = { 4 | name: "RawFileReader", 5 | canRead: (_filename) => { 6 | return true; 7 | }, 8 | read: (contents) => { 9 | return JSON.parse(contents); 10 | }, 11 | }; 12 | 13 | register(rawFileReader); 14 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/samples.py: -------------------------------------------------------------------------------- 1 | def parse_samples_limit(limit: str | None) -> int | tuple[int, int] | None: 2 | if limit is not None: 3 | if "," not in limit: 4 | return int(limit) 5 | else: 6 | limit_split = [int(r) for r in limit.split(",")] 7 | return (limit_split[0] - 1, limit_split[1]) 8 | else: 9 | return None 10 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/version.ts: -------------------------------------------------------------------------------- 1 | import { inspectVersion } from "./props"; 2 | 3 | 4 | export const withMinimumInspectVersion = (version: string, hasVersion: () => void, doesntHaveVersion: () => void) => { 5 | const activeVersion = inspectVersion(); 6 | if (activeVersion && activeVersion.compare(version) >= 0) { 7 | hasVersion(); 8 | } else { 9 | doesntHaveVersion(); 10 | } 11 | }; -------------------------------------------------------------------------------- /src/inspect_ai/_util/docstring.py: -------------------------------------------------------------------------------- 1 | from docstring_parser import Docstring, parse 2 | 3 | 4 | def parse_docstring( 5 | docstring: str | None, 6 | ) -> Docstring: 7 | if docstring is None: 8 | return Docstring() 9 | parsed_docstring = parse(docstring) 10 | if parsed_docstring.short_description is None: 11 | raise ValueError("Docstring must have a short description") 12 | return parsed_docstring 13 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/appdirs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from platformdirs import user_runtime_dir 4 | 5 | from inspect_ai._util.constants import PKG_NAME 6 | 7 | 8 | def inspect_runtime_dir(subdir: str | None) -> Path: 9 | runtime_dir = Path(user_runtime_dir(PKG_NAME)) 10 | if subdir: 11 | runtime_dir = runtime_dir / subdir 12 | runtime_dir.mkdir(parents=True, exist_ok=True) 13 | return runtime_dir 14 | -------------------------------------------------------------------------------- /tools/vscode/src/core/workspace.ts: -------------------------------------------------------------------------------- 1 | import { workspace, WorkspaceFolder } from "vscode"; 2 | 3 | export function activeWorkspaceFolder(): WorkspaceFolder { 4 | const workspaceFolder = workspace.workspaceFolders![0]; 5 | return workspaceFolder; 6 | } 7 | 8 | 9 | export function checkActiveWorkspaceFolder(): WorkspaceFolder | undefined { 10 | const workspaceFolder = workspace.workspaceFolders?.[0]; 11 | return workspaceFolder; 12 | } 13 | 
-------------------------------------------------------------------------------- /docs/_format/pre-render.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | 4 | if [ -n "${QUARTO_PROJECT_RENDER_ALL}" ]; then 5 | cd _examples 6 | cp index.qmd ../examples.qmd 7 | (echo; echo) >> ../examples.qmd 8 | for f in security_guide.qmd hellaswag.qmd theory_of_mind.qmd mathematics.qmd biology_qa.qmd arc.qmd tool_use.qmd gsm8k.qmd footer.qmd; do (cat "${f}"; echo; echo; echo) >> ../examples.qmd; done 9 | cd .. 10 | fi 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/test_cloudflare.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_cloudflare 3 | 4 | from inspect_ai.model import get_model 5 | 6 | 7 | @pytest.mark.asyncio 8 | @skip_if_no_cloudflare 9 | async def test_cloudflare_api() -> None: 10 | model = get_model("cf/meta/llama-2-7b-chat-fp16") 11 | message = "This is a test string. What are you?" 12 | response = await model.generate(input=message) 13 | assert len(response.completion) >= 1 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## This PR contains: 2 | - [ ] New features 3 | - [ ] Changes to dev-tools e.g. CI config / github tooling 4 | - [ ] Docs 5 | - [ ] Bug fixes 6 | - [ ] Code refactor 7 | 8 | ### What is the current behavior? (You can also link to an open issue here) 9 | 10 | ### What is the new behavior? 11 | 12 | ### Does this PR introduce a breaking change? (What changes might users need to make in their application due to this PR?) 13 | 14 | ### Other information: 15 | -------------------------------------------------------------------------------- /tests/test_package/inspect_package/inspect_ai.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import modelapi 2 | 3 | 4 | @modelapi(name="custom") 5 | def custom(): 6 | # delayed import allows us to only resolve the imports in 7 | # .modelapi.custom when the modelapi is referenced (helpful 8 | # if the modelapi provider has dependencies we don't want to 9 | # require unless the provider is actually used) 10 | from .modelapi.custom import CustomModelAPI 11 | 12 | return CustomModelAPI 13 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | 5 | def strip_punctuation(s: str) -> str: 6 | return s.strip(string.whitespace + string.punctuation) 7 | 8 | 9 | def strip_numeric_punctuation(s: str) -> str: 10 | # strip $, €, £, and , 11 | stripped = re.sub(r"[$,£,€]", "", s) 12 | # strip .
if it's followed by a space, the end of the string, 13 | # or a non-digit character 14 | stripped = re.sub(r"\.(?=\s|$|\D)", "", stripped) 15 | return stripped 16 | -------------------------------------------------------------------------------- /tools/vscode/src/components/focus.ts: -------------------------------------------------------------------------------- 1 | import { commands, window } from "vscode"; 2 | 3 | export function scheduleReturnFocus(command: string) { 4 | setTimeout(() => { 5 | void commands.executeCommand(command); 6 | }, 200); 7 | } 8 | 9 | export function scheduleFocusActiveEditor() { 10 | setTimeout(() => { 11 | const editor = window.activeTextEditor; 12 | if (editor) { 13 | void window.showTextDocument(editor.document, editor.viewColumn, false); 14 | } 15 | }, 200); 16 | } 17 | -------------------------------------------------------------------------------- /tools/vscode/assets/templates/task.py.template: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import Sample 3 | from inspect_ai.scorer import match 4 | from inspect_ai.solver import generate 5 | 6 | 7 | @task 8 | def {{}}(): 9 | 10 | # TODO: load dataset 11 | dataset = [Sample(input = "Say hello", target = "Hello.")] 12 | 13 | return Task( 14 | dataset=dataset, 15 | plan=[ 16 | generate(), 17 | ], 18 | scorer=match(), 19 | ) 20 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/inspect/inspect-constants.ts: -------------------------------------------------------------------------------- 1 | 2 | export const kInspectEnvValues = { 3 | providerModel: "INSPECT_EVAL_MODEL", 4 | logLevel: "INSPECT_LOG_LEVEL", 5 | logDir: "INSPECT_LOG_DIR", 6 | connections: "INSPECT_EVAL_MAX_CONNECTIONS", 7 | retries: "INSPECT_EVAL_MAX_RETRIES", 8 | timeout: "INSPECT_EVAL_TIMEOUT", 9 | modelBaseUrl: "INSPECT_MODE_BASE_URL", 10 | }; 11 | 12 | export const kLogLevelEnv = "INSPECT_EVAL_MODEL"; 13 | 14 | export const kInspectChangeEvalSignalVersion = "0.3.10"; 15 | -------------------------------------------------------------------------------- /src/inspect_ai/solver/_util.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import ChatMessage, ChatMessageSystem 2 | 3 | 4 | def append_system_message( 5 | messages: list[ChatMessage], message: ChatMessageSystem 6 | ) -> None: 7 | # find last index of any existing system message 8 | lastIndex = -1 9 | for i in list(reversed(range(0, len(messages)))): 10 | if isinstance(messages[i], ChatMessageSystem): 11 | lastIndex = i 12 | break 13 | 14 | # insert it 15 | messages.insert(lastIndex + 1, message) 16 | -------------------------------------------------------------------------------- /tests/test_task_list/attribs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from inspect_ai import Task, task\n", 10 | "\n", 11 | "\n", 12 | "@task(light=True, type=\"bio\")\n", 13 | "def attribs():\n", 14 | " return Task([])\n" 15 | ] 16 | } 17 | ], 18 | "metadata": { 19 | "language_info": { 20 | "name": "python" 21 | } 22 | }, 23 | "nbformat": 4, 24 | "nbformat_minor": 2 25 | } 26 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/constants.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | PKG_AUTHOR = "UK AI Safety Institute" 4 | PKG_AUTHOR_DIR = "UK-AISI" 5 | PKG_NAME = Path(__file__).parent.parent.stem 6 | PKG_PATH = Path(__file__).parent.parent 7 | DEFAULT_EPOCHS = 1 8 | DEFAULT_MAX_RETRIES = 5 9 | DEFAULT_TIMEOUT = 120 10 | DEFAULT_MAX_CONNECTIONS = 10 11 | DEFAULT_MAX_TOKENS = 1024 12 | DEFAULT_VIEW_PORT = 7575 13 | DEFAULT_SERVER_HOST = "127.0.0.1" 14 | HTTP = 15 15 | HTTP_LOG_LEVEL = "HTTP" 16 | DEFAULT_LOG_LEVEL = "warning" 17 | SCORED_SUFFIX = "-scored" 18 | -------------------------------------------------------------------------------- /tests/test_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "inspect_package" 3 | version = "0.1" 4 | description = "Inspect Extension Package" 5 | requires-python = ">=3.10" 6 | authors = [{name = "UK AISI"}] 7 | license = {text = "MIT"} 8 | classifiers = [ 9 | "License :: OSI Approved :: MIT License", 10 | ] 11 | dependencies = [ 12 | "inspect_ai" 13 | ] 14 | 15 | [build-system] 16 | requires = ["setuptools"] 17 | build-backend = "setuptools.build_meta" 18 | 19 | [project.entry-points.inspect_ai] 20 | inspect_package = "inspect_package.inspect_ai" 21 | 22 | -------------------------------------------------------------------------------- /tools/vscode/src/components/symbol.ts: -------------------------------------------------------------------------------- 1 | import { DocumentSymbol, Range, SymbolKind, TextDocument } from "vscode"; 2 | 3 | export const symbolIsTask = (document: TextDocument, pred: DocumentSymbol) => { 4 | if (pred.kind === SymbolKind.Function) { 5 | const textRange = new Range(pred.range.start, pred.range.end); 6 | const textBeforeFunction = document.getText(textRange); 7 | 8 | // Check if the text contains the `@task` decorator 9 | if (textBeforeFunction && textBeforeFunction.startsWith('@task')) { 10 | return true; 11 | } 12 | } 13 | }; -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "mypy-type-checker.importStrategy": "fromEnvironment", 4 | "[json]": { 5 | "editor.wordWrap": "on" 6 | }, 7 | "[markdown]": { 8 | "editor.formatOnSave": false 9 | }, 10 | "[quarto]": { 11 | "editor.formatOnSave": false 12 | }, 13 | "search.exclude": { 14 | "logs/**": true 15 | }, 16 | "python.testing.pytestArgs": [ 17 | "tests" 18 | ], 19 | "python.testing.unittestEnabled": false, 20 | "python.testing.pytestEnabled": true, 21 | "quarto.render.renderOnSave": true 22 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/version.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | import semver 4 | 5 | from .error import module_version_error 6 | 7 | 8 | def verify_required_version(feature: str, package: str, version: str) -> None: 9 | if not has_required_version(package, version): 10 | raise module_version_error(feature, package, version) 11 | 12 | 13 | def has_required_version(package: str, required_version: str) -> bool: 14 | if semver.Version.parse(version(package)).compare(required_version) >= 0: 15 | return True 16 | else: 17 | return False 18 | 
-------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/ToolButton.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { h } from "preact"; 3 | 4 | export const ToolButton = ({ name, classes, icon, onclick, ...rest }) => { 5 | // Create the component (dynamically to forward attributes) 6 | const attr = { 7 | type: "button", 8 | class: `btn btn-tools ${classes || ""}`, 9 | onclick, 10 | ...rest, 11 | }; 12 | const iconEl = icon 13 | ? html`` 14 | : ""; 15 | return h("button", attr, html`${iconEl}${name}`); 16 | }; 17 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/rollup.config.js: -------------------------------------------------------------------------------- 1 | // rollup.config.js 2 | import typescript from '@rollup/plugin-typescript'; 3 | import terser from '@rollup/plugin-terser'; 4 | 5 | export default { 6 | input: 'src/index.ts', 7 | output: { 8 | dir: 'dist', 9 | format: 'esm', 10 | entryFileNames: '[name].mjs' 11 | }, 12 | plugins: [ 13 | typescript(), 14 | terser({ 15 | format: { 16 | comments: 'some', 17 | beautify: true, 18 | ecma: '2022', 19 | }, 20 | compress: false, 21 | mangle: false, 22 | module: true, 23 | }), 24 | ] 25 | }; 26 | -------------------------------------------------------------------------------- /examples/theory_of_mind.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import example_dataset 3 | from inspect_ai.scorer import model_graded_fact 4 | from inspect_ai.solver import chain_of_thought, generate, self_critique 5 | 6 | 7 | @task 8 | def theory_of_mind(critique = False): 9 | 10 | # use self_critique if requested 11 | plan = [chain_of_thought(), generate()] 12 | if critique: 13 | plan.append(self_critique()) 14 | 15 | return Task( 16 | dataset=example_dataset("theory_of_mind"), 17 | plan=plan, 18 | scorer=model_graded_fact(), 19 | ) 20 | 21 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/EmptyPanel.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const EmptyPanel = ({ id, classes, height, style, children }) => { 4 | const emptyStyle = { 5 | display: "flex", 6 | textAlign: "center", 7 | flex: "0 0 content", 8 | alignItems: "center", 9 | justifyContent: "center", 10 | height: height ? height : "10rem", 11 | }; 12 | return html` 13 |
<div id=${id} class="${classes || ""}" style=${{ ...emptyStyle, ...style }}> 14 | ${children || ""} 15 | </div>
16 | `; 17 | }; 18 | -------------------------------------------------------------------------------- /examples/biology_qa.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import FieldSpec, example_dataset 3 | from inspect_ai.scorer import model_graded_qa 4 | from inspect_ai.solver import generate, use_tools, web_search 5 | 6 | 7 | @task 8 | def biology_qa() -> Task: 9 | return Task( 10 | dataset=example_dataset( 11 | name="biology_qa", 12 | sample_fields=FieldSpec( 13 | input="question", 14 | target="answer" 15 | ), 16 | ), 17 | plan=[use_tools(web_search()), generate()], 18 | scorer=model_graded_qa(), 19 | ) 20 | 21 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tstojs", 3 | "version": "0.1.11", 4 | "packageManager": "yarn@3.3.1", 5 | "main": "dist/index.mjs", 6 | "type": "module", 7 | "types": "dist/index.d.ts", 8 | "files": [ 9 | "/dist" 10 | ], 11 | "scripts": { 12 | "build": "rollup -c" 13 | }, 14 | "devDependencies": { 15 | "@rollup/plugin-terser": "^0.2.1", 16 | "@rollup/plugin-typescript": "^10.0.1", 17 | "@types/jest": "^29.2.4", 18 | "jest": "^29.3.1", 19 | "rollup": "^3.9.0", 20 | "ts-jest": "^29.0.3", 21 | "tslib": "^2.4.1", 22 | "typescript": "^4.9.4" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { icons } from "../Constants.mjs" 3 | 4 | import { Card, CardHeader, CardBody } from "../components/Card.mjs" 5 | import { ANSIDisplay } from "../components/AnsiDisplay.mjs" 6 | 7 | 8 | export const TaskErrorCard = ({ evalError }) => { 9 | return html` 10 | <${Card}> 11 | <${CardHeader} icon=${icons.error} label="Task Failed: ${evalError.message}"> 12 | <${CardBody} style=${{fontSize: "0.8em"}}> 13 | <${ANSIDisplay} output=${evalError.traceback_ansi}/> 14 | 15 | 16 | `; 17 | }; 18 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import yaml 4 | 5 | 6 | def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]: 7 | params: dict[str, Any] = dict() 8 | if args: 9 | for arg in list(args): 10 | parts = arg.split("=") 11 | if len(parts) > 1: 12 | key = parts[0].replace("-", "_") 13 | value = yaml.safe_load("=".join(parts[1:])) 14 | if isinstance(value, str): 15 | value = value.split(",") 16 | value = value if len(value) > 1 else value[0] 17 | params[key] = value 18 | return params 19 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | 4 | name: Quarto Publish 5 | 6 | jobs: 7 | build-deploy: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: write 11 | steps: 12 | - name: Check out repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Quarto 16 | uses: quarto-dev/quarto-actions/setup@v2 17 | with: 18 | tinytex: true 19 | 20 | - name: Render and Publish 21 | uses: 
quarto-dev/quarto-actions/publish@v2 22 | with: 23 | target: gh-pages 24 | path: docs 25 | env: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | -------------------------------------------------------------------------------- /src/inspect_ai/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F403 F405 2 | 3 | from ._dataset import ( 4 | Dataset, 5 | FieldSpec, 6 | MemoryDataset, 7 | RecordToSample, 8 | Sample, 9 | ) 10 | from ._sources.csv import csv_dataset 11 | from ._sources.example import example_dataset 12 | from ._sources.file import file_dataset 13 | from ._sources.hf import hf_dataset 14 | from ._sources.json import json_dataset 15 | 16 | __all__ = [ 17 | "Dataset", 18 | "Sample", 19 | "FieldSpec", 20 | "RecordToSample", 21 | "MemoryDataset", 22 | "file_dataset", 23 | "csv_dataset", 24 | "hf_dataset", 25 | "json_dataset", 26 | "example_dataset", 27 | ] 28 | -------------------------------------------------------------------------------- /src/inspect_ai/model/_providers/ollama.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model._providers.util import model_base_url 2 | 3 | from .._model import GenerateConfig 4 | from .openai import OpenAIAPI 5 | 6 | 7 | class OllamaAPI(OpenAIAPI): 8 | def __init__( 9 | self, 10 | model_name: str, 11 | base_url: str | None = None, 12 | config: GenerateConfig = GenerateConfig(), 13 | ) -> None: 14 | base_url = model_base_url(base_url, "OLLAMA_BASE_URL") 15 | base_url = base_url if base_url else "http://localhost:11434/v1" 16 | super().__init__( 17 | model_name=model_name, base_url=base_url, config=config, api_key="ollama" 18 | ) 19 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/log-reader/Log-Reader.mjs: -------------------------------------------------------------------------------- 1 | const adapters = []; 2 | 3 | export const register = (adapter) => { 4 | adapters.push(adapter); 5 | }; 6 | 7 | export const readLogFile = (filename, text) => { 8 | const adapter = adapters.find((adapter) => { 9 | return adapter.canRead(filename); 10 | }); 11 | 12 | // TODO Exception handling 13 | if (!adapter) { 14 | throw new Error(`The file ${filename} is not recognized as a valid log file`); 15 | } 16 | try { 17 | return adapter.read(text); 18 | } catch(e) { 19 | throw new Error(`Failed to parse the file ${filename}. 
Please check the file exists and that the format is valid.`, e); 20 | } 21 | }; 22 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/url.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def is_http_url(url: str) -> bool: 5 | return url.startswith("http://") or url.startswith("https://") 6 | 7 | 8 | def is_data_uri(url: str) -> bool: 9 | return url.startswith("data:") 10 | 11 | 12 | def data_uri_mime_type(data_url: str) -> str | None: 13 | pattern = r"^data:([^;]+);.*" 14 | match = re.match(pattern, data_url) 15 | if match: 16 | mime_type = match.group(1) 17 | return mime_type 18 | else: 19 | return None 20 | 21 | 22 | def data_uri_to_base64(data_uri: str) -> str: 23 | pattern = r"^data:[^,]+," 24 | stripped_uri = re.sub(pattern, "", data_uri) 25 | return stripped_uri 26 | -------------------------------------------------------------------------------- /tests/test_model_package.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | import subprocess 4 | import sys 5 | 6 | import pytest 7 | 8 | from inspect_ai.model import get_model 9 | 10 | 11 | @pytest.mark.asyncio 12 | async def test_model_package(): 13 | # ensure the package is installed 14 | try: 15 | import inspect_package # noqa: F401 16 | except ImportError: 17 | subprocess.check_call( 18 | [sys.executable, "-m", "pip", "install", "tests/test_package"] 19 | ) 20 | 21 | # call the model 22 | mdl = get_model("custom/gpt7") 23 | result = await mdl.generate({"role": "user", "content": "hello"}, [], "none", {}) 24 | assert result.completion == "Hello from gpt7" 25 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/CopyButton.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { icons } from "../Constants.mjs"; 3 | 4 | export const CopyButton = ({ value }) => { 5 | return html``; 22 | }; 23 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | // A launch configuration that compiles the extension and then opens it inside a new window 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | { 6 | "version": "0.2.0", 7 | "configurations": [ 8 | { 9 | "name": "Run Extension", 10 | "type": "extensionHost", 11 | "request": "launch", 12 | "args": [ 13 | "--extensionDevelopmentPath=${workspaceFolder}" 14 | ], 15 | "outFiles": [ 16 | "${workspaceFolder}/dist/**/*.js" 17 | ], 18 | "preLaunchTask": "${defaultBuildTask}" 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tools/vscode/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "target": "ES2022", 5 | "lib": ["ES2022", "DOM"], 6 | "sourceMap": true, 7 | "rootDir": "src", 8 | "strict": true, /* enable all strict type-checking options */ 9 | "esModuleInterop": true, 10 | /* Additional Checks */ 11 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. 
*/ 12 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 13 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 14 | }, 15 | "exclude": ["node_modules", ".vscode-test", "src/providers/activity-bar/webview", "tools"] 16 | } 17 | -------------------------------------------------------------------------------- /examples/security_guide.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import example_dataset 3 | from inspect_ai.scorer import model_graded_fact 4 | from inspect_ai.solver import generate, system_message 5 | 6 | SYSTEM_MESSAGE = """ 7 | You are a computer security expert and you are tasked with providing concise responses to the following questions. Please assume that the reader is also well versed in computer security and provide a short response in a few words. 8 | """ 9 | 10 | 11 | @task 12 | def security_guide(): 13 | return Task( 14 | dataset=example_dataset("security_guide"), 15 | plan=[system_message(SYSTEM_MESSAGE), generate()], 16 | scorer=model_graded_fact(), 17 | ) 18 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .._metric import Metric, Score, metric 4 | 5 | 6 | @metric 7 | def mean() -> Metric: 8 | """Compute mean of all scores. 9 | 10 | Returns: 11 | mean metric 12 | """ 13 | 14 | def metric(scores: list[Score]) -> float: 15 | return np.mean([score.as_float() for score in scores]).item() 16 | 17 | return metric 18 | 19 | 20 | @metric 21 | def var() -> Metric: 22 | """Compute variance over all scores. 23 | 24 | Returns: 25 | var metric 26 | """ 27 | 28 | def metric(scores: list[Score]) -> float: 29 | return np.var([score.as_float() for score in scores]).item() 30 | 31 | return metric 32 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/LabeledValue.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const LabeledValue = ({ label, style, valueStyle, layout="column", children }) => { 4 | const flexDirection = layout === "column" ? "column" : "row"; 5 | 6 | return html`
13 |     <div style=${{ display: "flex", flexDirection, ...style }}>
14 |       <div>
22 |         ${label}
23 |       </div>
24 |       <div style=${valueStyle}>
            ${children}
25 |       </div>
          </div>
`; 26 | }; 27 | -------------------------------------------------------------------------------- /tests/test_anthropic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_anthropic 3 | 4 | from inspect_ai.model import GenerateConfig, get_model 5 | 6 | 7 | @pytest.mark.asyncio 8 | @skip_if_no_anthropic 9 | async def test_anthropic_api() -> None: 10 | model = get_model( 11 | "anthropic/claude-2.1", 12 | config=GenerateConfig( 13 | frequency_penalty=0.0, 14 | stop_seqs=None, 15 | max_tokens=50, 16 | presence_penalty=0.0, 17 | seed=None, 18 | temperature=0.0, 19 | top_p=1.0, 20 | ), 21 | ) 22 | 23 | message = "This is a test string. What are you?" 24 | response = await model.generate(input=message) 25 | assert len(response.completion) >= 1 26 | -------------------------------------------------------------------------------- /tools/vscode/assets/logo/inspect.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | i 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | # This should be the _latest_ version of python supported by us 4 | default_language_version: 5 | python: python3.11 6 | repos: 7 | - repo: https://github.com/astral-sh/ruff-pre-commit 8 | rev: v0.1.6 9 | hooks: 10 | # Run the linter. 11 | - id: ruff 12 | args: [ --fix ] 13 | # Run the formatter. 14 | - id: ruff-format 15 | - repo: https://github.com/pre-commit/pre-commit-hooks 16 | rev: v4.5.0 17 | hooks: 18 | - id: check-added-large-files 19 | - id: check-json 20 | - id: check-yaml 21 | - id: debug-statements 22 | - id: detect-private-key 23 | - id: end-of-file-fixer 24 | - id: requirements-txt-fixer 25 | -------------------------------------------------------------------------------- /src/inspect_ai/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 F403 F405 2 | 3 | from importlib.metadata import version as importlib_version 4 | 5 | from inspect_ai._eval.eval import eval, eval_async, eval_retry, eval_retry_async 6 | from inspect_ai._eval.list import list_tasks 7 | from inspect_ai._eval.registry import task 8 | from inspect_ai._eval.score import score, score_async 9 | from inspect_ai._eval.types import Task, TaskInfo, Tasks 10 | from inspect_ai._util.constants import PKG_NAME 11 | 12 | __version__ = importlib_version(PKG_NAME) 13 | 14 | 15 | __all__ = [ 16 | "__version__", 17 | "eval", 18 | "eval_async", 19 | "eval_retry", 20 | "eval_retry_async", 21 | "score", 22 | "score_async", 23 | "Task", 24 | "TaskInfo", 25 | "Tasks", 26 | "task", 27 | "list_tasks", 28 | ] 29 | -------------------------------------------------------------------------------- /tests/test_registry.py: -------------------------------------------------------------------------------- 1 | from inspect_ai._util.constants import PKG_NAME 2 | from inspect_ai._util.registry import registry_info, registry_lookup 3 | from inspect_ai.scorer import Metric, Score, metric 4 | 5 | 6 | def test_registry_namespaces() -> None: 7 | # define a local metric which we can lookup by simple name 8 | @metric(name="local_accuracy") 9 | def accuracy1(correct: str = "C") -> Metric: 10 | def metric(scores: list[Score]) 
-> int | float: 11 | return 1 12 | 13 | return metric 14 | 15 | assert registry_lookup("metric", "local_accuracy") 16 | 17 | # confirm that inspect_ai builtins have their namespace auto-appended 18 | info = registry_info(registry_lookup("metric", f"{PKG_NAME}/accuracy")) 19 | assert info 20 | assert info.name == f"{PKG_NAME}/accuracy" 21 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/list.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | import { AbsolutePath } from "../core/path"; 4 | import { runProcess } from "../core/process"; 5 | import { inspectBinPath } from "./props"; 6 | 7 | export interface TaskDescriptor { 8 | file: string, 9 | name: string, 10 | attribs: Record<string, unknown>; 11 | } 12 | 13 | export const inspectListTasks = (cwd: AbsolutePath): TaskDescriptor[] => { 14 | return inspectListCmd<TaskDescriptor>(cwd, "tasks"); 15 | }; 16 | 17 | function inspectListCmd<T>(cwd: AbsolutePath, type: string, args?: string[]): T[] { 18 | const inspectBin = inspectBinPath(); 19 | if (inspectBin) { 20 | const cmdArgs = ["list", type, "--json", ...(args || [])]; 21 | const output = runProcess(inspectBin, cmdArgs, cwd); 22 | return JSON.parse(output) as T[]; 23 | } else { 24 | return []; 25 | } 26 | } -------------------------------------------------------------------------------- /src/inspect_ai/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from ._critique import self_critique 2 | from ._multiple_choice import multiple_choice 3 | from ._plan import Plan, plan 4 | from ._prompt import ( 5 | chain_of_thought, 6 | prompt_template, 7 | system_message, 8 | ) 9 | from ._solver import Generate, Solver, TaskState, generate, solver 10 | from ._tool.tool import Tool, tool 11 | from ._tool.use_tools import use_tools 12 | from ._tool.web_search import web_search 13 | 14 | __all__ = [ 15 | "generate", 16 | "prompt_template", 17 | "chain_of_thought", 18 | "multiple_choice", 19 | "system_message", 20 | "self_critique", 21 | "tool", 22 | "use_tools", 23 | "web_search", 24 | "plan", 25 | "Plan", 26 | "Solver", 27 | "solver", 28 | "TaskState", 29 | "Tool", 30 | "Generate", 31 | ] 32 | -------------------------------------------------------------------------------- /src/inspect_ai/util/_context/logger.py: -------------------------------------------------------------------------------- 1 | from logging import INFO, LogRecord 2 | 3 | _logger_records: list[LogRecord] = [] 4 | _rate_limit_records: list[LogRecord] = [] 5 | 6 | 7 | def init_logger_records() -> None: 8 | _logger_records.clear() 9 | _rate_limit_records.clear() 10 | 11 | 12 | def notify_logger_record(record: LogRecord, write: bool) -> None: 13 | if write: 14 | _logger_records.append(record) 15 | if record.levelno <= INFO and "429" in record.getMessage(): 16 | _rate_limit_records.append(record) 17 | 18 | 19 | def logger_http_rate_limit_count() -> int: 20 | return len(_rate_limit_records) 21 | 22 | 23 | def collect_logger_records() -> list[LogRecord]: 24 | records = _logger_records.copy() 25 | _logger_records.clear() 26 | _rate_limit_records.clear() 27 | return records 28 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | // Place your settings in this file to overwrite default and user settings. 
2 | { 3 | "files.exclude": { 4 | "out": false, // set this to true to hide the "out" folder with the compiled JS files 5 | "dist": false // set this to true to hide the "dist" folder with the compiled JS files 6 | }, 7 | "search.exclude": { 8 | "out": true, // set this to false to include "out" folder in search results 9 | "dist": true // set this to false to include "dist" folder in search results 10 | }, 11 | // Turn off tsc task auto detection since we have the necessary tasks as npm scripts 12 | "typescript.tsc.autoDetect": "off", 13 | "editor.tabSize": 2, 14 | "editor.formatOnSave": true, 15 | "editor.defaultFormatter": "vscode.typescript-language-features" 16 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/error.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | 4 | def pip_dependency_error(feature: str, dependencies: list[str]) -> Exception: 5 | return ModuleNotFoundError( 6 | f"ERROR: {feature} requires optional dependencies. " 7 | f"Install with:\n\npip install {' '.join(dependencies)}\n" 8 | ) 9 | 10 | 11 | def module_version_error( 12 | feature: str, package: str, required_version: str 13 | ) -> Exception: 14 | return ModuleNotFoundError( 15 | f"ERROR: {feature} requires at least version {required_version} of package {package} " 16 | f"(you have version {version(package)} installed).\n\n" 17 | f"Upgrade with:\n\npip install --upgrade {package}\n" 18 | ) 19 | 20 | 21 | def exception_message(ex: BaseException) -> str: 22 | return getattr(ex, "message", repr(ex)) 23 | -------------------------------------------------------------------------------- /tests/test_package/inspect_package/modelapi/custom.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import ( 2 | ChatMessage, 3 | GenerateConfig, 4 | ModelAPI, 5 | ModelOutput, 6 | ToolChoice, 7 | ToolInfo, 8 | ) 9 | 10 | 11 | class CustomModelAPI(ModelAPI): 12 | def __init__( 13 | self, 14 | model_name: str, 15 | base_url: str | None = None, 16 | config: GenerateConfig = GenerateConfig(), 17 | ) -> None: 18 | super().__init__(model_name, base_url, config) 19 | 20 | async def generate( 21 | self, 22 | input: list[ChatMessage], 23 | tools: list[ToolInfo], 24 | tool_choice: ToolChoice, 25 | config: GenerateConfig, 26 | ) -> ModelOutput: 27 | return ModelOutput.from_content( 28 | self.model_name, f"Hello from {self.model_name}" 29 | ) 30 | -------------------------------------------------------------------------------- /tools/vscode/src/core/dispose.ts: -------------------------------------------------------------------------------- 1 | import * as vscode from "vscode"; 2 | 3 | export function disposeAll(disposables: vscode.Disposable[]) { 4 | while (disposables.length) { 5 | const item = disposables.pop(); 6 | item?.dispose(); 7 | } 8 | } 9 | 10 | export abstract class Disposable { 11 | private _isDisposed = false; 12 | 13 | protected _disposables: vscode.Disposable[] = []; 14 | 15 | public dispose(): unknown { 16 | if (this._isDisposed) { 17 | return; 18 | } 19 | this._isDisposed = true; 20 | disposeAll(this._disposables); 21 | } 22 | 23 | protected _register<T extends vscode.Disposable>(value: T): T { 24 | if (this._isDisposed) { 25 | value.dispose(); 26 | } else { 27 | this._disposables.push(value); 28 | } 29 | return value; 30 | } 31 | 32 | protected get isDisposed() { 33 | return this._isDisposed; 34 | } 35 | } 
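// Usage sketch (illustrative only; "StatusProvider" and the listener shown
// here are hypothetical, not part of this codebase): a provider can extend
// Disposable so that every resource passed to _register() is disposed
// together when dispose() is called on the provider itself.
//
//   class StatusProvider extends Disposable {
//     constructor() {
//       super();
//       // event subscriptions return a vscode.Disposable, so they can be
//       // registered for automatic cleanup
//       this._register(vscode.window.onDidChangeActiveTextEditor(() => {}));
//     }
//   }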
-------------------------------------------------------------------------------- /tests/test_openai.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_openai 3 | 4 | from inspect_ai.model import ( 5 | ChatMessageUser, 6 | GenerateConfig, 7 | get_model, 8 | ) 9 | 10 | 11 | @pytest.mark.asyncio 12 | @skip_if_no_openai 13 | async def test_openai_api() -> None: 14 | model = get_model( 15 | "openai/gpt-3.5-turbo", 16 | config=GenerateConfig( 17 | frequency_penalty=0.0, 18 | stop_seqs=None, 19 | max_tokens=50, 20 | presence_penalty=0.0, 21 | logit_bias=dict([(42, 10), (43, -10)]), 22 | seed=None, 23 | temperature=0.0, 24 | top_p=1.0, 25 | ), 26 | ) 27 | 28 | message = ChatMessageUser(content="This is a test string. What are you?") 29 | response = await model.generate(input=[message]) 30 | assert len(response.completion) >= 1 31 | -------------------------------------------------------------------------------- /tools/vscode/src/core/python/code.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | export function isValidPythonFnName(input: string) { 5 | if (!kFnCharsRegex.test(input)) { 6 | return false; 7 | } 8 | if (kReservedWords.includes(input)) { 9 | return false; 10 | } 11 | return true; 12 | } 13 | const kFnCharsRegex = /^[A-Za-z_][A-Za-z0-9_]*$/; 14 | const kReservedWords = [ 15 | "False", 16 | "None", 17 | "True", 18 | "and", 19 | "as", 20 | "assert", 21 | "async", 22 | "await", 23 | "break", 24 | "class", 25 | "continue", 26 | "def", 27 | "del", 28 | "elif", 29 | "else", 30 | "except", 31 | "finally", 32 | "for", 33 | "from", 34 | "global", 35 | "if", 36 | "import", 37 | "in", 38 | "is", 39 | "lambda", 40 | "nonlocal", 41 | "not", 42 | "or", 43 | "pass", 44 | "raise", 45 | "return", 46 | "try", 47 | "while", 48 | "with", 49 | "yield", 50 | ]; 51 | 52 | 53 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | // See https://go.microsoft.com/fwlink/?LinkId=733558 2 | // for the documentation about the tasks.json format 3 | { 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "type": "npm", 8 | "script": "watch", 9 | "problemMatcher": "$ts-webpack-watch", 10 | "isBackground": true, 11 | "presentation": { 12 | "reveal": "never", 13 | "group": "watchers" 14 | }, 15 | "group": { 16 | "kind": "build", 17 | "isDefault": true 18 | } 19 | }, 20 | { 21 | "type": "npm", 22 | "script": "watch-tests", 23 | "problemMatcher": "$tsc-watch", 24 | "isBackground": true, 25 | "presentation": { 26 | "reveal": "never", 27 | "group": "watchers" 28 | }, 29 | "group": "build" 30 | }, 31 | { 32 | "label": "tasks: watch-tests", 33 | "dependsOn": [ 34 | "npm: watch", 35 | "npm: watch-tests" 36 | ], 37 | "problemMatcher": [] 38 | } 39 | ] 40 | } 41 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const EpochFilter = ({ epochs, epoch, setEpoch }) => { 4 | const options = ["all"]; 5 | for (let i = 1; i <= epochs; i++) { 6 | options.push(i + ""); 7 | } 8 | return html` 9 |
10 |   <div>Epochs:
15 |     <select value=${epoch} onChange=${(e) => { setEpoch(e.target.value); }}>
16 |       ${options.map((option) => html`<option value="${option}">${option}</option>`)}
17 |     </select>
27 |   </div>
28 | `; 29 | }; 30 | -------------------------------------------------------------------------------- /src/inspect_ai/log/__init__.py: -------------------------------------------------------------------------------- 1 | from ._file import ( 2 | EvalLogInfo, 3 | list_eval_logs, 4 | read_eval_log, 5 | write_eval_log, 6 | ) 7 | from ._log import ( 8 | EvalConfig, 9 | EvalDataset, 10 | EvalError, 11 | EvalLog, 12 | EvalMetric, 13 | EvalPlan, 14 | EvalPlanStep, 15 | EvalResults, 16 | EvalRevision, 17 | EvalSample, 18 | EvalScorer, 19 | EvalSpec, 20 | EvalStats, 21 | LoggingLevel, 22 | LoggingMessage, 23 | ) 24 | 25 | __all__ = [ 26 | "EvalConfig", 27 | "EvalError", 28 | "EvalDataset", 29 | "EvalLog", 30 | "EvalMetric", 31 | "EvalPlan", 32 | "EvalPlanStep", 33 | "EvalResults", 34 | "EvalRevision", 35 | "EvalSample", 36 | "EvalScorer", 37 | "EvalSpec", 38 | "EvalStats", 39 | "EvalLogInfo", 40 | "LoggingLevel", 41 | "LoggingMessage", 42 | "list_eval_logs", 43 | "read_eval_log", 44 | "write_eval_log", 45 | ] 46 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/logview.ts: -------------------------------------------------------------------------------- 1 | import { ExtensionContext } from "vscode"; 2 | 3 | import { Command } from "../../core/command"; 4 | import { logviewCommands } from "./commands"; 5 | import { InspectLogviewWebviewManager } from "./logview-webview"; 6 | import { InspectLogviewManager } from "./logview-manager"; 7 | import { InspectSettingsManager } from "../settings/inspect-settings"; 8 | import { InspectManager } from "../inspect/inspect-manager"; 9 | 10 | export function activateLogview( 11 | inspectManager: InspectManager, 12 | settingsMgr: InspectSettingsManager, 13 | context: ExtensionContext 14 | ): [Command[], InspectLogviewManager] { 15 | 16 | // initialize manager 17 | const logviewWebManager = new InspectLogviewWebviewManager(inspectManager, context); 18 | const logviewManager = new InspectLogviewManager(logviewWebManager, settingsMgr); 19 | 20 | // logview commands 21 | return [logviewCommands(logviewManager), logviewManager]; 22 | } 23 | -------------------------------------------------------------------------------- /tools/vscode/src/core/command.ts: -------------------------------------------------------------------------------- 1 | import { Disposable, commands } from "vscode"; 2 | 3 | export interface Command { 4 | readonly id: string; 5 | 6 | execute(...args: unknown[]): void; 7 | } 8 | 9 | export class CommandManager { 10 | private readonly commands = new Map<string, Disposable>(); 11 | 12 | public dispose() { 13 | for (const registration of this.commands.values()) { 14 | registration.dispose(); 15 | } 16 | this.commands.clear(); 17 | } 18 | 19 | public register<T extends Command>(command: T): T { 20 | // eslint-disable-next-line @typescript-eslint/unbound-method 21 | this.registerCommand(command.id, command.execute, command); 22 | return command; 23 | } 24 | 25 | private registerCommand( 26 | id: string, 27 | impl: (...args: unknown[]) => void, 28 | thisArg?: unknown 29 | ) { 30 | if (this.commands.has(id)) { 31 | return; 32 | } 33 | 34 | this.commands.set(id, commands.registerCommand(id, impl, thisArg)); 35 | } 36 | } -------------------------------------------------------------------------------- /tools/vscode/src/core/random.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | export function randomInt(min: number, max: number) { 5 | min = Math.ceil(min); 6 | max = Math.floor(max); 
7 | return Math.floor(cryptoRandom() * (max - min) + min); 8 | } 9 | 10 | 11 | // version of Math.random() that uses web crypto 12 | // https://stackoverflow.com/questions/13694626/generating-random-numbers-0-to-1-with-crypto-generatevalues 13 | export function cryptoRandom() { 14 | // eslint-disable-next-line @typescript-eslint/no-var-requires 15 | const crypto = require('crypto') as { getRandomValues: (arr: Uint32Array) => void; }; 16 | 17 | const arr = new Uint32Array(2); 18 | crypto.getRandomValues(arr); 19 | 20 | // keep all 32 bits of the first, top 20 of the second for 52 random bits 21 | const mantissa = (arr[0] * Math.pow(2, 20)) + (arr[1] >>> 12); 22 | 23 | // shift all 52 bits to the right of the decimal point 24 | const result = mantissa * Math.pow(2, -52); 25 | return result; 26 | } 27 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/LoadingScreen.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const LoadingScreen = ({ id, classes, message }) => { 4 | const fullScreenStyle = { 5 | position: "absolute", 6 | top: "0", 7 | bottom: "0", 8 | right: "0", 9 | left: "0", 10 | display: "flex", 11 | justifyContent: "center", 12 | alignItems: "center", 13 | zIndex:1000 14 | } 15 | 16 | const emptyStyle = { 17 | display: "flex", 18 | textAlign: "center", 19 | flex: "0 0 content", 20 | alignItems: "center", 21 | justifyContent: "center", 22 | }; 23 | return html` 24 | 
25 |   <div id=${id} class=${classes} style=${fullScreenStyle}>
26 |     <div style=${emptyStyle}>
31 |
32 |       ${message || "Loading..."}
33 |     </div>
34 |   </div>
35 | `; 36 | }; 37 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/workspace/workspace-init.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { WorkspaceStateManager } from "./workspace-state-provider"; 3 | import { ensureGitignore } from "../../core/git"; 4 | import { 5 | activeWorkspacePath, 6 | } from "../../core/path"; 7 | 8 | 9 | const kGitInitKey = "gitInit"; 10 | 11 | export async function initializeWorkspace( 12 | state: WorkspaceStateManager 13 | ): Promise<[Command[]]> { 14 | const hasInitializedGit = state.getState(kGitInitKey); 15 | if (hasInitializedGit !== "true") { 16 | const path = activeWorkspacePath(); 17 | 18 | // If we're in a workspace, initialize 19 | ensureGitignore(path, ignorePaths()); 20 | 21 | await state.setState(kGitInitKey, "true"); 22 | 23 | } 24 | return [[]]; 25 | } 26 | 27 | // TODO: Extract this for use when adding additional paths (e.g. if the env is modified with a log dir) 28 | 29 | function ignorePaths() { 30 | const ignores: string[] = [".env", "logs/", "__pycache__/"]; 31 | return ignores; 32 | } 33 | -------------------------------------------------------------------------------- /docs/_examples/footer.qmd: -------------------------------------------------------------------------------- 1 | ::: {.content-hidden when-format="html"} 2 | ## Additional Examples 3 | 4 | See the following additional examples in the online version of the Inspect documentation: 5 | 6 | | Example | Demonstrates | 7 | |----------------------------|--------------------------------------------| 8 | | [MATH]({{< var examples-url >}}#sec-mathematics) | Custom scorer that uses a model to judge equivalence. | 9 | | [Biology QA]({{< var examples-url >}}#sec-biology-qa) | Built-in web search tool; Custom model grading template. | 10 | | [ARC]({{< var examples-url >}}#sec-arc) | Defining multiple tasks in a file; Multiple choice questions. | 11 | | [Tool Use]({{< var examples-url >}}#sec-tool-use) | Tool usage and creating custom tools; Launching subprocesses. | 12 | | [GSM8K]({{< var examples-url >}}#sec-gsm8k) | Using fewshot examples; Scoring numeric output. 
| 13 | 14 | : {tbl-colwidths="\[30,70\]"} 15 | ::: -------------------------------------------------------------------------------- /docs/theme.scss: -------------------------------------------------------------------------------- 1 | /*-- scss:rules --*/ 2 | 3 | .sidebar>.sidebar-menu-container>.list-unstyled>.sidebar-item { 4 | margin-bottom: 1em; 5 | } 6 | 7 | .sidebar-header-item>p { 8 | margin-bottom: 0; 9 | } 10 | 11 | .sidebar-tools-main .quarto-navigation-tool[title="Source Code"] { 12 | padding-top: 2.5px; 13 | } 14 | 15 | .code-tabset { 16 | margin-bottom: 1em; 17 | } 18 | 19 | .code-tabset .tab-content { 20 | padding: 0; 21 | margin-bottom: 0; 22 | } 23 | 24 | .code-tabset div.sourceCode { 25 | border: none; 26 | margin: 0; 27 | } 28 | 29 | .code-tabset .nav-tabs .nav-link.active, 30 | .nav-tabs .nav-item.show .nav-link { 31 | border-bottom-color: $border-color; 32 | } 33 | 34 | .quarto-layout-panel .sourceCode { 35 | margin-top: 0; 36 | margin-bottom: 0.5em; 37 | } 38 | 39 | .splash ul { 40 | padding-inline-start: 1rem; 41 | } 42 | 43 | @media(max-width: 991.98px) { 44 | .sidebar-header-item .img-fluid { 45 | max-width: 195px; 46 | } 47 | } 48 | 49 | .blockquote { 50 | color: #505a62; 51 | } 52 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/git.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | 4 | from pydantic import BaseModel 5 | 6 | from .path import chdir 7 | 8 | 9 | class GitContext(BaseModel): 10 | origin: str 11 | commit: str 12 | 13 | 14 | def git_context(dir: str) -> GitContext | None: 15 | with chdir(dir): 16 | # check for git 17 | git = shutil.which("git") 18 | if not git: 19 | return None 20 | 21 | # check for a git revision in this directory 22 | commit_result = subprocess.run( 23 | [git, "rev-parse", "--short", "HEAD"], capture_output=True, text=True 24 | ) 25 | if commit_result.returncode != 0: 26 | return None 27 | 28 | # check for git origin (if any) 29 | origin = subprocess.run( 30 | [git, "remote", "get-url", "origin"], 31 | capture_output=True, 32 | text=True, 33 | ).stdout.strip() 34 | 35 | # return context 36 | return GitContext(origin=origin, commit=commit_result.stdout.strip()) 37 | -------------------------------------------------------------------------------- /tools/vscode/src/components/templates.ts: -------------------------------------------------------------------------------- 1 | 2 | import { ExtensionContext, Uri, workspace } from "vscode"; 3 | 4 | export interface Template { 5 | name: string 6 | } 7 | 8 | export const templates = { 9 | "python_task": { 10 | name: "task.py.template" 11 | } 12 | }; 13 | 14 | export const readTemplate = async (template: Template, context: ExtensionContext, variables: Record<string, string> = {}) => { 15 | // Compute the template path 16 | const extensionUri = context.extensionUri; 17 | const templateUri = Uri.joinPath(extensionUri, "assets", "templates", template.name); 18 | 19 | // Read and decode the text file 20 | const templateRaw = await workspace.fs.readFile(templateUri); 21 | const textDecoder = new TextDecoder('utf-8'); 22 | let templateContent = textDecoder.decode(templateRaw); 23 | 24 | // Replace variables 25 | Object.keys(variables).forEach((key) => { 26 | templateContent = templateContent.replaceAll(`{{<${key}>}}`, variables[key]); 27 | }); 28 | 29 | return templateContent; 30 | }; -------------------------------------------------------------------------------- 
/src/inspect_ai/_view/www/src/components/AppErrorBoundary.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { Component } from "preact"; 3 | 4 | import { ErrorPanel } from "./ErrorPanel.mjs"; 5 | 6 | export class AppErrorBoundary extends Component { 7 | constructor(props) { 8 | super(props); 9 | this.state = { hasError: false }; 10 | } 11 | 12 | static getDerivedStateFromError(error) { 13 | // Update state so the next render will show the fallback UI. 14 | return { hasError: true , error: error}; 15 | } 16 | 17 | componentDidCatch(error, errorInfo) { 18 | // You can also log the error to an error reporting service 19 | console.error(error, errorInfo); 20 | } 21 | 22 | render() { 23 | if (this.state.hasError) { 24 | console.log({e: this.state.error}); 25 | // You can render any custom fallback UI 26 | return html`<${ErrorPanel} 27 | title="An unexpected error occurred." 28 | error="${this.state.error}" 29 | />`; 30 | } 31 | 32 | return this.props.children; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /.github/workflows/vscode.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | tags: 4 | - "v[0-9]*" 5 | paths: 6 | - "tools/vscode/**" 7 | - ".github/workflows/vscode.yml" 8 | branches: 9 | - "main" 10 | pull_request: 11 | branches: 12 | - "main" 13 | paths: 14 | - "tools/vscode/**" 15 | - ".github/workflows/vscode.yml" 16 | workflow_dispatch: 17 | 18 | name: Build VS Code Ext 19 | jobs: 20 | deploy: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | - uses: actions/setup-node@v4 25 | with: 26 | node-version: "18.x" 27 | - run: | 28 | pushd tools/vscode 29 | yarn install --immutable --immutable-cache --check-cache 30 | 31 | - name: Build Extension 32 | run: | 33 | pushd tools/vscode 34 | yarn vsce package 35 | 36 | - name: Upload extension to Actions Artifact 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: inspect-vscode 40 | path: "tools/vscode/inspect*.vsix" 41 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/samples/SamplesTools.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | import { EpochFilter } from "./tools/EpochFilter.mjs"; 4 | import { 5 | SortFilter, 6 | } from "./tools/SortFilter.mjs"; 7 | import { SampleFilter } from "./tools/SampleFilter.mjs"; 8 | 9 | export const SampleTools = (props) => { 10 | const { epoch, setEpoch, filter, filterChanged, sort, setSort, epochs, sampleDescriptor } = props; 11 | 12 | const hasEpochs = epochs > 1; 13 | const tools = []; 14 | if (hasEpochs) { 15 | tools.push( 16 | html`<${EpochFilter} 17 | epoch=${epoch} 18 | setEpoch="${setEpoch}" 19 | epochs=${epochs} 20 | />` 21 | ); 22 | } 23 | 24 | tools.push( 25 | html`<${SampleFilter} 26 | filter=${filter} 27 | filterChanged=${filterChanged} 28 | descriptor=${sampleDescriptor} 29 | />` 30 | ); 31 | 32 | tools.push( 33 | html`<${SortFilter} 34 | sort=${sort} 35 | setSort=${setSort} 36 | epochs=${hasEpochs} 37 | />` 38 | ); 39 | 40 | return tools; 41 | }; 42 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/workspace/workspace-env-commands.ts: -------------------------------------------------------------------------------- 1 | import { existsSync, writeFileSync } 
from "fs"; 2 | import { Command } from "../../core/command"; 3 | import { workspacePath } from "../../core/path"; 4 | import { window, workspace } from "vscode"; 5 | 6 | 7 | export function workspaceEnvCommands() { 8 | return [new EditEnvFileCommand()]; 9 | } 10 | 11 | export class EditEnvFileCommand implements Command { 12 | constructor() { } 13 | async execute(): Promise<void> { 14 | 15 | // The path to the env file 16 | const absPath = workspacePath(`.env`); 17 | 18 | 19 | // Ensure env file actually exists 20 | if (!existsSync(absPath.path)) { 21 | writeFileSync(absPath.path, 22 | "", 23 | { encoding: "utf-8" } 24 | ); 25 | } 26 | 27 | // Open the env file 28 | const document = await workspace.openTextDocument(absPath.path); 29 | await window.showTextDocument(document); 30 | 31 | } 32 | 33 | private static readonly id = "inspect.editEnvFile"; 34 | public readonly id = EditEnvFileCommand.id; 35 | } 36 | -------------------------------------------------------------------------------- /tests/test_num_choices.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_openai, skip_if_no_together 3 | 4 | from inspect_ai.model import GenerateConfig, get_model 5 | 6 | 7 | async def generate(model_name): 8 | model = get_model(model_name) 9 | return await model.generate(input="Hello.", config=GenerateConfig(num_choices=3)) 10 | 11 | 12 | async def check_num_choices(model_name): 13 | model = get_model(model_name) 14 | response = await model.generate( 15 | input="Hello.", config=GenerateConfig(num_choices=3) 16 | ) 17 | assert len(response.choices) == 3 18 | 19 | 20 | @pytest.mark.asyncio 21 | @skip_if_no_openai 22 | async def test_openai_num_choices() -> None: 23 | await check_num_choices("openai/gpt-3.5-turbo") 24 | 25 | 26 | @pytest.mark.asyncio 27 | @skip_if_no_together 28 | async def test_together_num_choices() -> None: 29 | await check_num_choices("together/google/gemma-2b-it") 30 | 31 | 32 | # @pytest.mark.asyncio 33 | # @skip_if_no_azureai 34 | # async def test_azureai_num_choices() -> None: 35 | # await check_num_choices(None) 36 | -------------------------------------------------------------------------------- /examples/agents/langchain/wikipedia.jsonl: -------------------------------------------------------------------------------- 1 | {"input":[{"role":"user","content":"What's the difference between tennis and pickleball?"}],"target":"While they are similar sports, tennis and pickleball have various differences. First, the court size for pickleball is about half the size of a tennis court. Second, pickleball is played with a ball that resembles a whiffle ball. Third, pickleball is played with paddles as opposed to rackets. 
Finally, the scoring system is quite different as you play for points which can only be scored when you or your team are serving."} 2 | {"input":[{"role":"user","content":"Which types of fish contain the lowest levels of mercury?"}],"target":"The following types of fish contain low levels of mercury: salmon, flounder, Atlantic mackerel, anchovies, pollock, catfish, and shellfish (e.g., clams, scallops, mussels)."} 3 | {"input":[{"role":"user","content":"List the ten episode titles from the sixth season of \"Game of Thrones\" in broadcast order."}],"target":"The Red Woman, Home, Oathbreaker, Book of the Stranger, The Door, Blood of My Blood, The Broken Man, No One, Battle of the Bastards, The Winds of Winter"} -------------------------------------------------------------------------------- /tools/vscode/src/providers/activity-bar/webview/env-config-webview.css: -------------------------------------------------------------------------------- 1 | .dropdown-container #provider { 2 | flex-grow: 1; 3 | } 4 | 5 | #model-help { 6 | float:right; 7 | } 8 | 9 | #model-help .codicon::before { 10 | margin-top: 2px; 11 | margin-bottom: -2px; 12 | } 13 | 14 | #model-display { 15 | margin-top: 0.3em; 16 | margin-bottom: 0.6em; 17 | } 18 | 19 | #model-container vscode-text-field, 20 | #model-container { 21 | width: 100%; 22 | } 23 | 24 | #log-level { 25 | margin-top: 3px; 26 | } 27 | 28 | #limit { 29 | flex-basis: 1; 30 | flex-grow: 1; 31 | } 32 | 33 | #epochs { 34 | flex-basis: 1; 35 | flex-grow: 1; 36 | } 37 | 38 | #log-dir { 39 | flex-grow: 1; 40 | } 41 | 42 | #provider-label { 43 | width: 100%; 44 | } 45 | 46 | #show-base-url-container { 47 | width: 100%; 48 | } 49 | 50 | #show-base-url-container vscode-link { 51 | float: right; 52 | margin-right: 0.5em; 53 | margin-top: -10px; 54 | height: 10px; 55 | color: var(--vscode-foreground); 56 | } 57 | 58 | #show-base-url-container vscode-link i:before { 59 | height: 4px; 60 | line-height: 4px; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/view.py: -------------------------------------------------------------------------------- 1 | import click 2 | from typing_extensions import Unpack 3 | 4 | from inspect_ai._util.constants import DEFAULT_SERVER_HOST, DEFAULT_VIEW_PORT 5 | from inspect_ai._view.view import view 6 | 7 | from .common import CommonOptions, common_options, resolve_common_options 8 | 9 | 10 | @click.command("view") 11 | @click.option( 12 | "--recursive", 13 | type=bool, 14 | is_flag=True, 15 | default=True, 16 | help="Include all logs in log_dir recursively.", 17 | ) 18 | @click.option( 19 | "--host", 20 | default=DEFAULT_SERVER_HOST, 21 | help="Tcp/Ip host", 22 | ) 23 | @click.option("--port", default=DEFAULT_VIEW_PORT, help="TCP/IP port") 24 | @common_options 25 | def view_command( 26 | recursive: bool, 27 | host: str, 28 | port: int, 29 | **kwargs: Unpack[CommonOptions], 30 | ) -> None: 31 | """View evaluation logs.""" 32 | # read common options 33 | (log_dir, log_level) = resolve_common_options(kwargs) 34 | 35 | # run the viewer 36 | view( 37 | log_dir=log_dir, recursive=recursive, host=host, port=port, log_level=log_level 38 | ) 39 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from .._metric import ( 4 | Metric, 5 | Score, 6 | ValueToFloat, 7 | metric, 8 | 
value_to_float, 9 | ) 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | @metric 15 | def accuracy(to_float: ValueToFloat = value_to_float()) -> Metric: 16 | r"""Compute proportion of total answers which are correct. 17 | 18 | Args: 19 | to_float (ValueToFloat): Function for mapping 20 | Value to float for computing metrics. The default 21 | `value_to_float()` maps CORRECT ("C") to 1.0, 22 | INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and 23 | NOANSWER ("N") to 0, casts numeric values to 24 | float directly, and prints a warning and returns 25 | 0 if the Value is a complex object (list or dict). 26 | 27 | Returns: 28 | Accuracy metric 29 | """ 30 | 31 | def metric(scores: list[Score]) -> float: 32 | total = 0.0 33 | for item in scores: 34 | total += to_float(item.value) 35 | return total / float(len(scores)) 36 | 37 | return metric 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 UK AI Safety Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/commands.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { InspectLogviewManager } from "./logview-manager"; 3 | import { showError } from "../../components/error"; 4 | 5 | export interface LogviewState { 6 | url?: string; 7 | } 8 | 9 | export interface LogviewOptions { 10 | state?: LogviewState; 11 | activate?: boolean; 12 | } 13 | 14 | export function logviewCommands( 15 | manager: InspectLogviewManager, 16 | ): Command[] { 17 | return [new ShowLogviewCommand(manager)]; 18 | } 19 | 20 | class ShowLogviewCommand implements Command { 21 | constructor(private readonly manager_: InspectLogviewManager) { } 22 | async execute(): Promise<void> { 23 | // ensure logview is visible 24 | try { 25 | this.manager_.showInspectView(); 26 | } catch (err: unknown) { 27 | await showError( 28 | "An error occurred while attempting to start Inspect View", 29 | err instanceof Error ? 
err : Error(String(err)) 30 | ); 31 | } 32 | 33 | } 34 | 35 | private static readonly id = "inspect.showLogview"; 36 | public readonly id = ShowLogviewCommand.id; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tools/vscode/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 UK AI Safety Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/logview-manager.ts: -------------------------------------------------------------------------------- 1 | import { Uri, ViewColumn, window, workspace } from "vscode"; 2 | import { InspectLogviewWebviewManager } from "./logview-webview"; 3 | import { InspectSettingsManager } from "../settings/inspect-settings"; 4 | 5 | export class InspectLogviewManager { 6 | constructor( 7 | private readonly webViewManager_: InspectLogviewWebviewManager, 8 | private readonly settingsMgr_: InspectSettingsManager, 9 | ) { } 10 | 11 | public async showLogFile(logFile: Uri) { 12 | if (this.settingsMgr_.getSettings().logViewType === "text" && logFile.scheme === "file") { 13 | await workspace.openTextDocument(logFile).then(async (doc) => { 14 | await window.showTextDocument(doc, { 15 | preserveFocus: true, 16 | viewColumn: ViewColumn.Two, 17 | }); 18 | }); 19 | } else { 20 | 21 | // Show the log file 22 | this.webViewManager_.showLogFile(logFile); 23 | } 24 | } 25 | 26 | public showInspectView() { 27 | this.webViewManager_.showLogview(); 28 | } 29 | 30 | public viewColumn() { 31 | return this.webViewManager_.viewColumn(); 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /examples/popularity.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import FieldSpec, example_dataset 3 | from inspect_ai.model import GenerateConfig 4 | from inspect_ai.scorer import match 5 | from inspect_ai.solver import generate, system_message 6 | 7 | SYSTEM_MESSAGE = r""" 8 | Please note: In the following EXERCISE, it is important that you only respond with exactly "Yes" or "No" 9 | """ 10 | 11 | 12 | @task 13 | def popularity(model): 14 | dataset = example_dataset( 15 | name="popularity", 16 | sample_fields=FieldSpec( 17 | input="question", 18 | 
target="answer_matching_behavior", 19 | metadata=["label_confidence"], 20 | ), 21 | ) 22 | 23 | config = GenerateConfig() 24 | match model: 25 | case "gpt" | "gemini": 26 | config.temperature = 0.9 27 | case "claude": 28 | config.temperature = 0.8 29 | case _: 30 | config.temperature = 1.0 31 | 32 | return Task( 33 | dataset=dataset, 34 | plan=[system_message(SYSTEM_MESSAGE), generate()], 35 | scorer=match(), 36 | config=config, 37 | ) 38 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/logs.ts: -------------------------------------------------------------------------------- 1 | import { AbsolutePath } from "../core/path"; 2 | import { runProcess } from "../core/process"; 3 | import { inspectBinPath } from "./props"; 4 | 5 | 6 | 7 | export function inspectEvalLogs(cwd: AbsolutePath): string | undefined { 8 | const inspectBin = inspectBinPath(); 9 | if (inspectBin) { 10 | const cmdArgs = ["list", "logs", "--json"]; 11 | const output = runProcess(inspectBin, cmdArgs, cwd); 12 | return output; 13 | } 14 | } 15 | 16 | export function inspectEvalLog(cwd: AbsolutePath, log: string, headerOnly: boolean): string | undefined { 17 | const inspectBin = inspectBinPath(); 18 | if (inspectBin) { 19 | const cmdArgs = ["info", "log-file", log]; 20 | if (headerOnly) { 21 | cmdArgs.push("--header-only"); 22 | } 23 | const output = runProcess(inspectBin, cmdArgs, cwd); 24 | return output; 25 | } 26 | } 27 | 28 | export function inspectEvalLogHeaders(cwd: AbsolutePath, logs: string[]): string | undefined { 29 | const inspectBin = inspectBinPath(); 30 | if (inspectBin) { 31 | const cmdArgs = ["info", "log-file-headers", ...logs]; 32 | const output = runProcess(inspectBin, cmdArgs, cwd); 33 | return output; 34 | } 35 | } -------------------------------------------------------------------------------- /benchmarks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | HellaSwag: Can a Machine Really Finish Your Sentence? 3 | 4 | Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, Yejin Choi 5 | https://arxiv.org/abs/1905.07830 6 | """ 7 | 8 | from inspect_ai import Task, task 9 | from inspect_ai.dataset import Sample, hf_dataset 10 | from inspect_ai.scorer import answer 11 | from inspect_ai.solver import multiple_choice, system_message 12 | 13 | SYSTEM_MESSAGE = """ 14 | Choose the most plausible continuation for the story. 
15 | """ 16 | 17 | 18 | def record_to_sample(record): 19 | return Sample( 20 | input=record["ctx"], 21 | target=chr(ord("A") + int(record["label"])), 22 | choices=record["endings"], 23 | metadata=dict(source_id=record["source_id"]), 24 | ) 25 | 26 | 27 | @task 28 | def hellaswag(): 29 | # dataset 30 | dataset = hf_dataset( 31 | path="hellaswag", 32 | split="validation", 33 | sample_fields=record_to_sample, 34 | trust=True, 35 | shuffle=True, 36 | ) 37 | 38 | # define task 39 | return Task( 40 | dataset=dataset, 41 | plan=[system_message(SYSTEM_MESSAGE), multiple_choice()], 42 | scorer=answer("letter"), 43 | ) 44 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/AnsiDisplay.css: -------------------------------------------------------------------------------- 1 | 2 | .ansi-display { 3 | font-family: monospace; 4 | white-space: pre-wrap; 5 | --ansiBlack: #000000; 6 | --ansiRed: #cd3131; 7 | --ansiGreen: #00BC00; 8 | --ansiYellow: #949800; 9 | --ansiBlue: #0451a5; 10 | --ansiMagenta: #bc05bc; 11 | --ansiCyan: #0598bc; 12 | --ansiWhite: #555555; 13 | --ansiBrightBlack: #666666; 14 | --ansiBrightRed: #cd3131; 15 | --ansiBrightGreen: #14CE14; 16 | --ansiBrightYellow: #b5ba00; 17 | --ansiBrightBlue: #0451a5; 18 | --ansiBrightMagenta: #bc05bc; 19 | --ansiBrightCyan: #0598bc; 20 | --ansiBrightWhite: #a5a5a5; 21 | } 22 | 23 | .dark-mode .ansi-display { 24 | --ansiBlack: #000000; 25 | --ansiRed: #cd3131; 26 | --ansiGreen: #0DBC79; 27 | --ansiYellow: #e5e510; 28 | --ansiBlue: #2472c8; 29 | --ansiMagenta: #bc3fbc; 30 | --ansiCyan: #11a8cd; 31 | --ansiWhite: #e5e5e5; 32 | --ansiBrightBlack: #666666; 33 | --ansiBrightRed: #f14c4c; 34 | --ansiBrightGreen: #23d18b; 35 | --ansiBrightYellow: #f5f543; 36 | --ansiBrightBlue: #3b8eea; 37 | --ansiBrightMagenta: #d670d6; 38 | --ansiBrightCyan: #29b8db; 39 | --ansiBrightWhite: #e5e5e5; 40 | } 41 | 42 | @keyframes ansi-displaly-run-blink { 43 | 50% { 44 | opacity: 0; 45 | } 46 | } 47 | 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://www.gov.uk/government/organisations/ai-safety-institute) 2 | 3 | Welcome to Inspect, a framework for large language model evaluations created by the [UK AI Safety Institute](https://www.gov.uk/government/organisations/ai-safety-institute). 4 | 5 | Inspect provides many built-in components, including facilities for prompt engineering, tool usage, multi-turn dialog, and model graded evaluations. Extensions to Inspect (e.g. to support new elicitation and scoring techniques) can be provided by other Python packages. 6 | 7 | To get started with Inspect, please see the documentation at . 8 | 9 | *** 10 | 11 | #### Development 12 | 13 | To work on development of Inspect, clone the repository and install with the `-e` flag and `[dev]` optional dependencies: 14 | 15 | ``` 16 | $ git clone https://github.com/UKGovernmentBEIS/inspect_ai.git 17 | $ cd inspect_ai 18 | $ pip install -e ".[dev]" 19 | ``` 20 | 21 | If you use VS Code, you should be sure to have installed the recommended extensions (Python, Ruff, and MyPy). Note that you'll be prompted to install these when you open the project in VS Code. 
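
To verify your development install end to end, you can run one of the example evaluations (for example, assuming you have OpenAI credentials configured):

```
$ inspect eval examples/security_guide.py --model openai/gpt-4
```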
22 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from inspect_ai._util.dotenv import init_dotenv 4 | 5 | from .. import __version__ 6 | from .eval import eval_command 7 | from .info import info_command 8 | from .list import list_command 9 | from .score import score_command 10 | from .view import view_command 11 | 12 | 13 | @click.group(invoke_without_command=True) 14 | @click.option( 15 | "--version", 16 | type=bool, 17 | is_flag=True, 18 | default=False, 19 | help="Print the Inspect version.", 20 | ) 21 | @click.pass_context 22 | def inspect(ctx: click.Context, version: bool) -> None: 23 | # if this was a subcommand then allow it to execute 24 | if ctx.invoked_subcommand is not None: 25 | return 26 | 27 | if version: 28 | print(__version__) 29 | ctx.exit() 30 | else: 31 | click.echo(ctx.get_help()) 32 | ctx.exit() 33 | 34 | 35 | inspect.add_command(eval_command) 36 | inspect.add_command(score_command) 37 | inspect.add_command(view_command) 38 | inspect.add_command(list_command) 39 | inspect.add_command(info_command) 40 | 41 | 42 | def main() -> None: 43 | init_dotenv() 44 | inspect(auto_envvar_prefix="INSPECT") 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /tests/test_list_task.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Callable 3 | 4 | from inspect_ai import TaskInfo, list_tasks 5 | 6 | TEST_TASKS_DIR = Path("tests/test_task_list") 7 | 8 | 9 | def list_test_tasks_dir( 10 | globs: list[str], filter: Callable[[TaskInfo], bool] | None = None 11 | ): 12 | return list_tasks(globs, filter=filter, root_dir=TEST_TASKS_DIR) 13 | 14 | 15 | def test_task_list_multiple_file(): 16 | tasks = list_test_tasks_dir(["multiple.py"]) 17 | assert len(tasks) == 2 18 | names = [task.name for task in tasks] 19 | assert "first" in names 20 | assert "second_task" in names 21 | 22 | 23 | def test_task_list_multiple_dir(): 24 | tasks = list_test_tasks_dir(["multiple_dir"]) 25 | assert len(tasks) == 2 26 | 27 | 28 | def test_task_list_attribs(): 29 | tasks = list_test_tasks_dir(["attribs.ipynb"]) 30 | assert tasks[0].attribs.get("light") is True 31 | assert tasks[0].attribs.get("type") == "bio" 32 | 33 | 34 | def test_task_list_filter(): 35 | tasks = list_test_tasks_dir(["*"], filter=lambda t: t.attribs.get("type") == "bio") 36 | assert len(tasks) == 1 37 | 38 | 39 | def test_task_list_recurse(): 40 | tasks = list_test_tasks_dir(["recurse"]) 41 | assert len(tasks) == 3 42 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/__init__.py: -------------------------------------------------------------------------------- 1 | from ._answer import AnswerPattern, answer 2 | from ._match import includes, match 3 | from ._metric import ( 4 | CORRECT, 5 | INCORRECT, 6 | NOANSWER, 7 | PARTIAL, 8 | Metric, 9 | Score, 10 | Value, 11 | ValueToFloat, 12 | metric, 13 | value_to_float, 14 | ) 15 | from ._metrics.accuracy import accuracy 16 | from ._metrics.mean import mean 17 | from ._metrics.std import bootstrap_std 18 | from ._model import model_graded_fact, model_graded_qa 19 | from ._multi import ScoreReducer, majority_vote, multi_scorer 20 | from ._pattern import pattern 21 | from ._scorer import ( 22 | Scorer, 23 | 
Target, 24 | scorer, 25 | ) 26 | 27 | __all__ = [ 28 | "includes", 29 | "match", 30 | "model_graded_qa", 31 | "model_graded_fact", 32 | "answer", 33 | "pattern", 34 | "AnswerPattern", 35 | "Scorer", 36 | "Target", 37 | "scorer", 38 | "accuracy", 39 | "bootstrap_std", 40 | "mean", 41 | "Metric", 42 | "metric", 43 | "Score", 44 | "Value", 45 | "ValueToFloat", 46 | "value_to_float", 47 | "CORRECT", 48 | "INCORRECT", 49 | "PARTIAL", 50 | "NOANSWER", 51 | "multi_scorer", 52 | "majority_vote", 53 | "ScoreReducer", 54 | ] 55 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/htm/htm.mjs: -------------------------------------------------------------------------------- 1 | /* esm.sh - esbuild bundle(htm@3.1.1) es2022 production */ 2 | var a=function(p,f,c,n){var l;f[0]=0;for(var u=1;u=5&&((g||!v&&u===5)&&(i.push(u,0,g,l),u=6),v&&(i.push(u,v,0,l),u=6)),g=""},t=0;t"?(u=1,g=""):g=n+g[0]:o?n===o?o="":g+=n:n==='"'||n==="'"?o=n:n===">"?(s(),u=1):u&&(n==="="?(u=5,l=g,g=""):n==="/"&&(u<5||c[t][w+1]===">")?(s(),u===3&&(i=i[0]),u=i,(i=i[0]).push(2,0,u),u=0):n===" "||n===" "||n===` 3 | `||n==="\r"?(s(),u=2):g+=n),u===3&&g==="!--"&&(u=4,i=i[0])}return s(),i}(p)),f),arguments,[])).length>1?f:f[0]}export{b as default}; 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/api/api-vscode.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | import { webViewJsonRpcClient, kMethodEvalLog, kMethodEvalLogs, kMethodEvalLogHeaders } from "./jsonrpc.mjs"; 4 | 5 | const vscodeApi = window.acquireVsCodeApi ? window.acquireVsCodeApi() : undefined; 6 | 7 | const vscodeClient = webViewJsonRpcClient(vscodeApi) 8 | 9 | 10 | async function client_events() { 11 | return []; 12 | } 13 | 14 | async function eval_logs() { 15 | const response = await vscodeClient(kMethodEvalLogs, []); 16 | if (response) { 17 | return { 18 | log_dir: "", 19 | files: JSON5.parse(response) 20 | } 21 | } else { 22 | return undefined; 23 | } 24 | 25 | } 26 | 27 | async function eval_log(file, headerOnly) { 28 | const response = await vscodeClient(kMethodEvalLog, [file, headerOnly]); 29 | if (response) { 30 | return JSON5.parse(response); 31 | } else { 32 | return undefined; 33 | } 34 | } 35 | 36 | async function eval_log_headers(files) { 37 | const response = await vscodeClient(kMethodEvalLogHeaders, [files]); 38 | if (response) { 39 | return JSON5.parse(response); 40 | } else { 41 | return undefined; 42 | } 43 | } 44 | 45 | 46 | export default { 47 | client_events, 48 | eval_logs, 49 | eval_log, 50 | eval_log_headers 51 | } 52 | 53 | -------------------------------------------------------------------------------- /benchmarks/boolq.py: -------------------------------------------------------------------------------- 1 | """ 2 | BoolQ 3 | 4 | Exploring the Surprising Difficulty of Natural Yes/No Questions 5 | Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, 6 | Kristina Toutanova 7 | https://arxiv.org/abs/1905.10044 8 | 9 | # Run against validations boolq dataset 10 | inspect eval boolq.py 11 | """ 12 | 13 | from inspect_ai import Task, task 14 | from inspect_ai.dataset import Sample, hf_dataset 15 | from inspect_ai.scorer import pattern 16 | from inspect_ai.solver import generate, prompt_template 17 | 18 | TEMPLATE = r""" 19 | Answer the following question with either Yes or No. Include nothing else in your response. 
20 | 21 | Question: {prompt} 22 | """ 23 | 24 | 25 | def record_to_sample(record): 26 | if record["answer"]: 27 | target = "Yes" 28 | else: 29 | target = "No" 30 | 31 | return Sample(input=record["question"], target=target) 32 | 33 | 34 | @task 35 | def boolq(): 36 | dataset = hf_dataset( 37 | path="boolq", 38 | sample_fields=record_to_sample, 39 | split="validation", 40 | shuffle=True, 41 | ) 42 | 43 | return Task( 44 | dataset=dataset, 45 | plan=[prompt_template(template=TEMPLATE), generate()], 46 | scorer=pattern(r"(Yes|No).?\Z"), 47 | ) 48 | -------------------------------------------------------------------------------- /tools/vscode/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "parser": "@typescript-eslint/parser", 4 | "parserOptions": { 5 | "ecmaVersion": 2020, 6 | "sourceType": "module", 7 | "project": "./tsconfig.json" 8 | }, 9 | "plugins": [ 10 | "@typescript-eslint" 11 | ], 12 | "extends": [ 13 | "eslint:recommended", 14 | "plugin:@typescript-eslint/recommended", 15 | "plugin:@typescript-eslint/recommended-requiring-type-checking" 16 | ], 17 | "rules": { 18 | "@typescript-eslint/naming-convention": [ 19 | "warn", 20 | { 21 | "selector": "import", 22 | "format": [ "camelCase", "PascalCase" ] 23 | } 24 | ], 25 | "@typescript-eslint/semi": "warn", 26 | "curly": "warn", 27 | "eqeqeq": "warn", 28 | "no-throw-literal": "warn", 29 | "semi": "off" 30 | }, 31 | "ignorePatterns": [ 32 | "out", 33 | "dist", 34 | "**/*.d.ts", 35 | "src/providers/activity-bar/webview/env-config-webview.ts", 36 | "src/providers/activity-bar/webview/task-config-webview.ts", 37 | "src/providers/activity-bar/webview/webview-utils.ts", 38 | "tools/**" 39 | ] 40 | } -------------------------------------------------------------------------------- /tests/test_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from test_helpers.utils import ( 4 | skip_if_no_anthropic, 5 | skip_if_no_google, 6 | skip_if_no_openai, 7 | ) 8 | 9 | from inspect_ai import Task, eval, task 10 | from inspect_ai.dataset import json_dataset 11 | from inspect_ai.scorer import match 12 | from inspect_ai.solver import generate, system_message 13 | 14 | SYSTEM_MESSAGE = """ 15 | For the following exercise, it is important that you answer with only a single word or numeric value in brackets. For example, [22] or [house]. Do not include any discussion, narrative, or rationale, just a single value in brackets. 
16 | """ 17 | 18 | 19 | @task 20 | def images(): 21 | return Task( 22 | dataset=json_dataset(os.path.join("tests", "test_images", "images.jsonl")), 23 | plan=[system_message(SYSTEM_MESSAGE), generate()], 24 | scorer=match(), 25 | ) 26 | 27 | 28 | def check_images(model): 29 | eval(images, model) 30 | 31 | 32 | @skip_if_no_google 33 | def test_google_images(): 34 | check_images("google/gemini-pro-vision") 35 | 36 | 37 | @skip_if_no_openai 38 | def test_openai_images(): 39 | check_images("openai/gpt-4") 40 | 41 | 42 | @skip_if_no_anthropic 43 | def test_anthropic_images(): 44 | check_images("anthropic/claude-3-sonnet-20240229") 45 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | publish-release: 7 | description: "Production Release" 8 | required: false 9 | type: boolean 10 | default: false 11 | 12 | jobs: 13 | publish: 14 | name: Publish 15 | runs-on: ubuntu-latest 16 | environment: pypi 17 | strategy: 18 | fail-fast: false 19 | permissions: 20 | id-token: write 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: "3.x" 30 | - name: Install pypa/build 31 | run: >- 32 | python3 -m 33 | pip install 34 | build 35 | --user 36 | - name: Build 37 | run: python -m build 38 | - name: Publish package to TestPyPI 39 | uses: pypa/gh-action-pypi-publish@release/v1 40 | if: ${{ ! inputs.publish-release }} 41 | with: 42 | repository-url: https://test.pypi.org/legacy/ 43 | - name: Publish package to PyPI 44 | uses: pypa/gh-action-pypi-publish@release/v1 45 | if: ${{ inputs.publish-release }} 46 | -------------------------------------------------------------------------------- /tests/test_retry.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | 3 | from test_helpers.utils import skip_if_no_openai 4 | 5 | from inspect_ai import Task, eval, eval_retry, task 6 | from inspect_ai.dataset import Sample 7 | from inspect_ai.scorer import match 8 | from inspect_ai.solver import Generate, TaskState, generate, solver 9 | 10 | 11 | @solver 12 | def failing_solver(): 13 | async def solve(state: TaskState, generate: Generate): 14 | if random() > 0.33: 15 | raise ValueError("Eval failed!") 16 | 17 | return state 18 | 19 | return solve 20 | 21 | 22 | @task 23 | def failing_task(): 24 | return Task( 25 | dataset=[Sample(input="Say hello", target="hello")], 26 | plan=[failing_solver(), generate()], 27 | scorer=match(), 28 | ) 29 | 30 | 31 | @skip_if_no_openai 32 | def test_eval_retry(): 33 | # run eval with a solver that fails 2/3 times 34 | failing_eval = f"{__file__}@failing_task" 35 | log = eval(failing_eval, limit=1)[0] 36 | 37 | # note the task id so we can be certain it remains the same 38 | task_id = log.eval.task_id 39 | 40 | # retry until we succeed (confirming the task_id is stable) 41 | while log.status != "success": 42 | log = eval_retry(log)[0] 43 | assert log.eval.task_id == task_id 44 | -------------------------------------------------------------------------------- /src/inspect_ai/_eval/task/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import cast 4 | 5 | from inspect_ai._util.path import 
cwd_relative_path 6 | from inspect_ai.dataset import Sample 7 | from inspect_ai.model import ChatMessage, ChatMessageUser 8 | from inspect_ai.solver import TaskState 9 | 10 | from ..types import Task 11 | from .constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR 12 | 13 | 14 | def sample_messages(sample: Sample) -> list[ChatMessage]: 15 | if isinstance(sample.input, str): 16 | return [ChatMessageUser(content=sample.input, source="input")] 17 | else: 18 | messages = deepcopy(sample.input) 19 | for message in messages: 20 | message.source = "input" 21 | return messages 22 | 23 | 24 | def has_max_messages(state: TaskState, max_messages: int | None) -> bool: 25 | return max_messages is not None and (len(state.messages) >= max_messages) 26 | 27 | 28 | def task_run_dir(task: Task) -> str: 29 | return getattr(task, TASK_RUN_DIR_ATTR, os.getcwd()) 30 | 31 | 32 | def task_file(task: Task, relative: bool = False) -> str | None: 33 | file = cast(str | None, getattr(task, TASK_FILE_ATTR, None)) 34 | if file: 35 | if relative: 36 | return cwd_relative_path(file) 37 | else: 38 | return file 39 | else: 40 | return None 41 | -------------------------------------------------------------------------------- /src/inspect_ai/model/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 F403 F405 2 | 3 | from ._model import ( 4 | ChatCompletionChoice, 5 | ChatMessage, 6 | ChatMessageAssistant, 7 | ChatMessageSystem, 8 | ChatMessageTool, 9 | ChatMessageUser, 10 | Content, 11 | ContentImage, 12 | ContentText, 13 | GenerateConfig, 14 | GenerateConfigArgs, 15 | Logprob, 16 | Logprobs, 17 | Model, 18 | ModelAPI, 19 | ModelName, 20 | ModelOutput, 21 | ModelUsage, 22 | StopReason, 23 | TopLogprob, 24 | get_model, 25 | ) 26 | from ._providers.providers import * 27 | from ._registry import modelapi 28 | from ._tool import ToolCall, ToolChoice, ToolFunction, ToolInfo, ToolParam 29 | 30 | __all__ = [ 31 | "GenerateConfig", 32 | "GenerateConfigArgs", 33 | "ContentText", 34 | "ContentImage", 35 | "Content", 36 | "ChatMessage", 37 | "ChatMessageSystem", 38 | "ChatMessageUser", 39 | "ChatMessageAssistant", 40 | "ChatMessageTool", 41 | "ChatCompletionChoice", 42 | "ModelOutput", 43 | "Logprobs", 44 | "Logprob", 45 | "TopLogprob", 46 | "Model", 47 | "ModelAPI", 48 | "ModelName", 49 | "ModelUsage", 50 | "StopReason", 51 | "ToolCall", 52 | "ToolChoice", 53 | "ToolFunction", 54 | "ToolInfo", 55 | "ToolParam", 56 | "ToolType", 57 | "get_model", 58 | "modelapi", 59 | ] 60 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/images.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import mimetypes 3 | 4 | import httpx 5 | 6 | from .file import file 7 | from .url import ( 8 | data_uri_mime_type, 9 | data_uri_to_base64, 10 | is_data_uri, 11 | is_http_url, 12 | ) 13 | 14 | 15 | async def image_as_data(image: str) -> tuple[bytes, str]: 16 | if is_data_uri(image): 17 | # resolve mime type and base64 content 18 | mime_type = data_uri_mime_type(image) or "image/png" 19 | image_base64 = data_uri_to_base64(image) 20 | image_bytes = base64.b64decode(image_base64) 21 | else: 22 | # guess mime type 23 | type, _ = mimetypes.guess_type(image) 24 | if type: 25 | mime_type = type 26 | else: 27 | mime_type = "image/png" 28 | 29 | # handle url or file 30 | if is_http_url(image): 31 | client = httpx.AsyncClient() 32 | image_bytes = (await client.get(image)).content 33 | else: 34 | with 
file(image, "rb") as f: 35 | image_bytes = f.read() 36 | 37 | # return bytes and type 38 | return image_bytes, mime_type 39 | 40 | 41 | async def image_as_data_uri(image: str) -> str: 42 | bytes, mime_type = await image_as_data(image) 43 | base64_image = base64.b64encode(bytes).decode("utf-8") 44 | image = f"data:{mime_type};base64,{base64_image}" 45 | return image 46 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/inspect/inspect-eval-commands.ts: -------------------------------------------------------------------------------- 1 | import { Uri } from "vscode"; 2 | import { Command } from "../../core/command"; 3 | import { InspectEvalManager } from "./inspect-eval"; 4 | import { toAbsolutePath } from "../../core/path"; 5 | import { scheduleFocusActiveEditor } from "../../components/focus"; 6 | 7 | export function inspectEvalCommands(manager: InspectEvalManager): Command[] { 8 | return [new RunEvalCommand(manager), new DebugEvalCommand(manager)]; 9 | } 10 | 11 | export class RunEvalCommand implements Command { 12 | constructor(private readonly manager_: InspectEvalManager) { } 13 | async execute(documentUri: Uri, fnName: string): Promise { 14 | const cwd = toAbsolutePath(documentUri.fsPath); 15 | 16 | const evalPromise = this.manager_.startEval(cwd, fnName, false); 17 | scheduleFocusActiveEditor(); 18 | await evalPromise; 19 | } 20 | private static readonly id = "inspect.runTask"; 21 | public readonly id = RunEvalCommand.id; 22 | } 23 | 24 | export class DebugEvalCommand implements Command { 25 | constructor(private readonly manager_: InspectEvalManager) { } 26 | async execute(documentUri: Uri, fnName: string): Promise { 27 | const cwd = toAbsolutePath(documentUri.fsPath); 28 | await this.manager_.startEval(cwd, fnName, true); 29 | } 30 | private static readonly id = "inspect.debugTask"; 31 | public readonly id = DebugEvalCommand.id; 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/std.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import cast 3 | 4 | import numpy as np 5 | 6 | from .._metric import ( 7 | Metric, 8 | Score, 9 | ValueToFloat, 10 | metric, 11 | value_to_float, 12 | ) 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | @metric 18 | def bootstrap_std( 19 | num_samples: int = 1000, to_float: ValueToFloat = value_to_float() 20 | ) -> Metric: 21 | """Standard deviation of a bootstrapped estimate of the mean. 22 | 23 | Args: 24 | num_samples (int): Number of bootstrap samples to take. 25 | to_float (ValueToFloat): Function for mapping 26 | Value to float for computing metrics. The default 27 | `value_to_float()` maps CORRECT ("C") to 1.0, 28 | INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and 29 | NOANSWER ("N") to 0, casts numeric values to 30 | float directly, and prints a warning and returns 31 | 0 if the Value is a complex object (list or dict). 
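            For example, under the default mapping a list of scores with
            values ["C", "I", "C", "C"] becomes [1.0, 0.0, 1.0, 1.0]
            before the bootstrap resampling is applied.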
32 | 33 | Returns: 34 | bootstrap_std metric 35 | """ 36 | 37 | def metric(scores: list[Score]) -> float: 38 | values = [to_float(score.value) for score in scores] 39 | std = np.std( 40 | [ 41 | np.mean(np.random.choice(values, len(values), replace=True)) 42 | for _ in range(num_samples) 43 | ] 44 | ) 45 | return cast(float, std.item()) 46 | 47 | return metric 48 | -------------------------------------------------------------------------------- /tests/test_hf.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_github_action 3 | from transformers import PreTrainedModel # type: ignore 4 | 5 | from inspect_ai.model import ( 6 | ChatMessageUser, 7 | GenerateConfig, 8 | get_model, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def model() -> PreTrainedModel: 14 | return get_model( 15 | "hf/EleutherAI/pythia-70m", 16 | config=GenerateConfig( 17 | max_tokens=1, 18 | seed=42, 19 | temperature=0.01, 20 | ), 21 | # this allows us to run base models with the chat message scaffolding: 22 | chat_template="{% for message in messages %}{{ message.content }}{% endfor %}", 23 | ) 24 | 25 | 26 | @pytest.mark.asyncio 27 | @skip_if_github_action 28 | async def test_hf_api(model: PreTrainedModel) -> None: 29 | message = ChatMessageUser(content="Lorem ipsum dolor") 30 | response = await model.generate(input=[message]) 31 | assert len(response.completion) >= 1 32 | 33 | 34 | @pytest.mark.asyncio 35 | @skip_if_github_action 36 | async def test_hf_api_fails(model: PreTrainedModel) -> None: 37 | temp_before = model.config.temperature 38 | try: 39 | model.config.temperature = 0.0 40 | 41 | message = ChatMessageUser(content="Lorem ipsum dolor") 42 | with pytest.raises(Exception): 43 | await model.generate(input=[message]) 44 | finally: 45 | model.config.temperature = temp_before 46 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/events.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // Returns a function, that, when invoked, will only be triggered at most once 5 | // during a given window of time. Normally, the throttled function will run 6 | // as much as it can, without ever going more than once per `wait` duration; 7 | // but if you'd like to disable the execution on the leading edge, pass 8 | // `{leading: false}`. To disable execution on the trailing edge, ditto. 9 | export function throttle(func, wait, options) { 10 | var context, args, result; 11 | var timeout = null; 12 | var previous = 0; 13 | if (!options) options = {}; 14 | var later = function() { 15 | previous = options.leading === false ? 
0 : Date.now(); 16 | timeout = null; 17 | result = func.apply(context, args); 18 | if (!timeout) context = args = null; 19 | }; 20 | return function() { 21 | var now = Date.now(); 22 | if (!previous && options.leading === false) previous = now; 23 | var remaining = wait - (now - previous); 24 | context = this; 25 | args = arguments; 26 | if (remaining <= 0 || remaining > wait) { 27 | if (timeout) { 28 | clearTimeout(timeout); 29 | timeout = null; 30 | } 31 | previous = now; 32 | result = func.apply(context, args); 33 | if (!timeout) context = args = null; 34 | } else if (!timeout && options.trailing !== false) { 35 | timeout = setTimeout(later, remaining); 36 | } 37 | return result; 38 | }; 39 | }; -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/MorePopOver.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { useEffect, useRef } from "preact/hooks"; 3 | 4 | import { icons, sharedStyles } from "../Constants.mjs"; 5 | 6 | export const MorePopOver = ({ title, customClass, children }) => { 7 | const popoverRef = useRef(); 8 | const contentRef = useRef(); 9 | 10 | // Initialize the popover 11 | useEffect(() => { 12 | const contentEl = contentRef.current; 13 | const popOverContent = document.createElement("div"); 14 | contentEl.childNodes.forEach((child) => 15 | popOverContent.appendChild(child.cloneNode(true)) 16 | ); 17 | new bootstrap.Popover(popoverRef.current, { 18 | content: popOverContent, 19 | title, 20 | html: true, 21 | customClass: customClass, 22 | trigger: "focus", 23 | }); 24 | }, [popoverRef, contentRef]); 25 | 26 | const popoverElements = []; 27 | 28 | // The popover display button 29 | popoverElements.push(html` 30 | 40 | `); 41 | 42 | // A container to hold the popover contents 43 | popoverElements.push(html`
44 | ${children} 45 |
`); 46 | 47 | return popoverElements; 48 | }; -------------------------------------------------------------------------------- /tools/vscode/src/components/document.ts: -------------------------------------------------------------------------------- 1 | import { Position, Selection, TextDocument, Uri, workspace } from "vscode"; 2 | import { readTaskData } from "./task"; 3 | 4 | 5 | // Provides a Selection for a task with a document 6 | export const taskRangeForDocument = async (task: string, documentUri: Uri) => { 7 | const taskDatas = await tasksForDocument(documentUri); 8 | 9 | // Find the task that matches the name (or just select the first task) 10 | const taskData = taskDatas.find((data) => { 11 | return data.name === task; 12 | }); 13 | 14 | // If the task is within this document, find its position 15 | if (taskData) { 16 | const position = new Position(taskData.line + 1, 0); 17 | return new Selection(position, position); 18 | } 19 | }; 20 | 21 | export const firstTaskRangeForDocument = async (documentUri: Uri) => { 22 | 23 | const taskDatas = await tasksForDocument(documentUri); 24 | if (taskDatas.length > 0) { 25 | const position = new Position(taskDatas[0].line + 1, 0); 26 | return new Selection(position, position); 27 | } 28 | }; 29 | 30 | // Provides a list of task DocumentSymbols for a document 31 | const tasksForDocument = async (documentUri: Uri) => { 32 | const document = await workspace.openTextDocument(documentUri); 33 | const tasks = readTaskData(document); 34 | return tasks; 35 | }; 36 | 37 | 38 | export const documentHasTasks = (document: TextDocument) => { 39 | const tasks = readTaskData(document); 40 | return tasks.length > 0; 41 | }; 42 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/ErrorPanel.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { icons } from "../Constants.mjs"; 3 | 4 | export const ErrorPanel = ({ id, classes, title, error }) => { 5 | const emptyStyle = { 6 | display: "flex", 7 | flex: "0 0 content", 8 | alignItems: "center", 9 | justifyContent: "center", 10 | }; 11 | const message = error.message; 12 | const stack = error.stack; 13 | return html` 14 |
19 |
20 |
21 | 22 |
23 |
${title || ""}
24 |
25 |
36 |
37 | Error: ${message || ""} 38 |
39 |             
40 |               at ${stack}
41 |             
42 |           
43 |
44 |
45 |
46 | `; 47 | }; 48 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/active-task/active-task-command.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { toAbsolutePath } from "../../core/path"; 3 | import { InspectEvalManager } from "../inspect/inspect-eval"; 4 | import { ActiveTaskManager } from "./active-task-provider"; 5 | 6 | 7 | 8 | export class RunActiveTaskCommand implements Command { 9 | constructor(private readonly manager_: ActiveTaskManager, 10 | private readonly inspectMgr_: InspectEvalManager 11 | ) { } 12 | async execute(): Promise { 13 | const taskInfo = this.manager_.getActiveTaskInfo(); 14 | if (taskInfo) { 15 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 16 | await this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, false); 17 | } 18 | } 19 | 20 | private static readonly id = "inspect.runActiveTask"; 21 | public readonly id = RunActiveTaskCommand.id; 22 | } 23 | 24 | export class DebugActiveTaskCommand implements Command { 25 | constructor(private readonly manager_: ActiveTaskManager, 26 | private readonly inspectMgr_: InspectEvalManager 27 | ) { } 28 | async execute(): Promise { 29 | const taskInfo = this.manager_.getActiveTaskInfo(); 30 | if (taskInfo) { 31 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 32 | await this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, true); 33 | } 34 | } 35 | 36 | private static readonly id = "inspect.debugActiveTask"; 37 | public readonly id = DebugActiveTaskCommand.id; 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/MarkdownDiv.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | showdown.setOption('simpleLineBreaks', true); 4 | showdown.setOption('literalMidWordUnderscores', true); 5 | const converter = new showdown.Converter(); 6 | 7 | 8 | export const MarkdownDiv = (props) => { 9 | const { markdown, style } = props; 10 | 11 | // Escape all HTML tags 12 | const escaped = DOMPurify.sanitize(markdown, { ALLOWED_TAGS: []}); 13 | 14 | // Pre-render any text that isn't handled by markdown 15 | const preRendered = preRenderText(escaped); 16 | const renderedHtml = converter.makeHtml(preRendered); 17 | 18 | // Return the rendered markdown 19 | const markup = { __html: renderedHtml }; 20 | return html`
`; 21 | }; 22 | 23 | 24 | const kLetterListPattern = /^([a-zA-Z][\)\.]\s.*?)$/gm; 25 | const kCommonmarkReferenceLinkPattern = /\[(.*)\]\:( +.+)/g; 26 | 27 | 28 | const preRenderText = (txt) => { 29 | // Special handling for ordered lists that look like 30 | // multiple choice (e.g. a), b), c), d) etc..) 31 | const rendered = txt.replaceAll(kLetterListPattern, "

$1

"); 32 | 33 | // Special handling for commonmark like reference links which might 34 | // look like: 35 | // [alias]: http://www.google.com 36 | // but text like: 37 | // [expert]: answer 38 | // Also fools this 39 | return rendered.replaceAll(kCommonmarkReferenceLinkPattern, "\[$1\]:$2"); 40 | }; -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/usage/ModelTokenTable.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const ModelTokenTable = ({ model_usage }) => { 4 | return html` 5 | <${TokenTable}> 6 | <${TokenHeader}/> 7 | 8 | ${Object.keys(model_usage).map((key) => { 9 | const vals = Object.values(model_usage[key]); 10 | return html`<${TokenRow} model=${key} values=${vals} />`; 11 | })} 12 | 13 | 14 | `; 15 | }; 16 | 17 | const TokenTable = ({ children }) => { 18 | return html` 22 | ${children} 23 |
`; 24 | }; 25 | 26 | const thStyle = {padding: 0, fontSize: "0.7rem", fontWeight: 400, textTransform: "uppercase"} 27 | 28 | const TokenHeader = () => { 29 | return html` 30 | 31 | 32 | 38 | Tokens 39 | 40 | 41 | 42 | Model 43 | Input 44 | Output 45 | Total 46 | 47 | `; 48 | }; 49 | 50 | const TokenRow = ({ model, values }) => { 51 | return html` 52 | ${model} 53 | ${values.map((val) => { 54 | return html`${val.toLocaleString()}`; 55 | })} 56 | `; 57 | }; 58 | -------------------------------------------------------------------------------- /tests/test_plan.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_openai 3 | 4 | from inspect_ai import Task, eval_async 5 | from inspect_ai._util.registry import registry_info 6 | from inspect_ai.dataset import Sample 7 | from inspect_ai.solver import ( 8 | Generate, 9 | Plan, 10 | TaskState, 11 | chain_of_thought, 12 | generate, 13 | plan, 14 | solver, 15 | ) 16 | 17 | 18 | @plan(fancy=True) 19 | def my_plan() -> Plan: 20 | return Plan(steps=[chain_of_thought(), generate()]) 21 | 22 | 23 | @skip_if_no_openai 24 | @pytest.mark.asyncio 25 | async def test_plan_cleanup(): 26 | @solver 27 | def failing_solver(): 28 | async def solve(state: TaskState, generate: Generate): 29 | raise ValueError("Eval failed!") 30 | 31 | return solve 32 | 33 | cleaned_up = False 34 | 35 | def cleanup(state): 36 | nonlocal cleaned_up 37 | cleaned_up = True 38 | 39 | task = Task( 40 | dataset=[Sample(input="Say hello.", target="Hello")], 41 | plan=Plan( 42 | steps=[chain_of_thought(), failing_solver(), generate()], cleanup=cleanup 43 | ), 44 | ) 45 | 46 | result = await eval_async(task, model="openai/gpt-4") 47 | 48 | assert result[0].status == "error" 49 | assert cleaned_up 50 | 51 | 52 | def test_plan_registration(): 53 | plan = my_plan() 54 | assert registry_info(plan).name == "my_plan" 55 | 56 | 57 | def test_plan_attribs(): 58 | plan = my_plan() 59 | assert registry_info(plan).metadata["attribs"]["fancy"] is True 60 | -------------------------------------------------------------------------------- /src/inspect_ai/model/_tool.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import ( 3 | Any, 4 | Literal, 5 | Union, 6 | ) 7 | 8 | from inspect_ai._util.json import JSONType 9 | 10 | 11 | @dataclass 12 | class ToolParam: 13 | name: str 14 | """Parameter name.""" 15 | 16 | type: JSONType 17 | """JSON type of parameter.""" 18 | 19 | description: str 20 | """Description of parameter.""" 21 | 22 | optional: bool 23 | """Is the parameter optional""" 24 | 25 | 26 | @dataclass 27 | class ToolInfo: 28 | name: str 29 | """Tool name.""" 30 | 31 | description: str 32 | """Tool description.""" 33 | 34 | params: list[ToolParam] 35 | """Tool parameters""" 36 | 37 | 38 | @dataclass 39 | class ToolCall: 40 | id: str 41 | """Unique identifier for tool call.""" 42 | 43 | function: str 44 | """Function called.""" 45 | 46 | arguments: dict[str, Any] 47 | """Arguments to function.""" 48 | 49 | type: Literal["function"] 50 | """Type of tool call (currently only 'function')""" 51 | 52 | parse_error: str | None = field(default=None) 53 | """Error which occurred parsing tool call.""" 54 | 55 | 56 | @dataclass 57 | class ToolFunction: 58 | name: str 59 | """The name of the function to call.""" 60 | 61 | 62 | ToolChoice = Union[Literal["auto", "any", "none"], ToolFunction] 63 | """Specify which tool to call. 
64 | 65 | "auto" means the model decides; "any" means use at least one tool, 66 | "none" means never call a tool; ToolFunction instructs the model 67 | to call a specific function. 68 | """ 69 | -------------------------------------------------------------------------------- /tests/test_stop_reason.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import ( 3 | skip_if_no_anthropic, 4 | skip_if_no_mistral, 5 | skip_if_no_openai, 6 | skip_if_no_together, 7 | ) 8 | 9 | from inspect_ai.model import GenerateConfig, ModelOutput, get_model 10 | 11 | 12 | async def generate(model_name) -> ModelOutput: 13 | model = get_model(model_name) 14 | return await model.generate(input="Hello.") 15 | 16 | 17 | async def generate_token_limit(model_name) -> ModelOutput: 18 | model = get_model(model_name) 19 | return await model.generate( 20 | input="Tell me a story.", config=GenerateConfig(max_tokens=2) 21 | ) 22 | 23 | 24 | async def check_stop_reason(model_name): 25 | response = await generate(model_name) 26 | assert response.choices[0].stop_reason == "stop" 27 | 28 | response = await generate_token_limit(model_name) 29 | assert response.choices[0].stop_reason == "length" 30 | 31 | 32 | @pytest.mark.asyncio 33 | @skip_if_no_openai 34 | async def test_openai_stop_reason() -> None: 35 | await check_stop_reason("openai/gpt-3.5-turbo") 36 | 37 | 38 | @pytest.mark.asyncio 39 | @skip_if_no_anthropic 40 | async def test_anthropic_stop_reason() -> None: 41 | await check_stop_reason("anthropic/claude-3-haiku-20240307") 42 | 43 | 44 | @pytest.mark.asyncio 45 | @skip_if_no_mistral 46 | async def test_mistral_stop_reason() -> None: 47 | await check_stop_reason("mistral/mistral-medium-latest") 48 | 49 | 50 | @pytest.mark.asyncio 51 | @skip_if_no_together 52 | async def test_together_stop_reason() -> None: 53 | await check_stop_reason("together/google/gemma-2b-it") 54 | -------------------------------------------------------------------------------- /benchmarks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PIQA (Physical Interaction: Question Answering) 3 | 4 | Reasoning about Physical Commonsense in Natural Language 5 | Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, Yejin Choi 6 | https://arxiv.org/abs/1911.11641 7 | 8 | # eval piqa validation set 9 | inspect eval piq.py 10 | """ 11 | 12 | from inspect_ai import Task, task 13 | from inspect_ai.dataset import Sample, hf_dataset 14 | from inspect_ai.scorer import answer 15 | from inspect_ai.solver import multiple_choice 16 | 17 | 18 | def record_to_sample(record): 19 | return Sample( 20 | input=record["goal"], 21 | target="A" if record["label"] == 0 else "B", 22 | choices=[record["sol1"], record["sol2"]], 23 | ) 24 | 25 | 26 | TEMPLATE = r""" 27 | The entire content of your response should be of the following format: 'ANSWER: 28 | $LETTER' (without quotes) where LETTER is one of {letters}. 29 | 30 | Given either a question or a statement followed by two possible solutions 31 | labelled A and B, choose the most appropriate solution. If a question is given, 32 | the solutions answer the question. If a statement is given, the solutions 33 | explain how to achieve the statement. 
34 | 35 | {question} 36 | 37 | {choices} 38 | """.strip() 39 | 40 | 41 | @task 42 | def piqa(): 43 | dataset = hf_dataset( 44 | path="piqa", 45 | sample_fields=record_to_sample, 46 | trust=True, 47 | split="validation", 48 | shuffle=True, 49 | ) 50 | 51 | return Task( 52 | dataset=dataset, 53 | plan=[multiple_choice(template=TEMPLATE)], 54 | scorer=answer("letter"), 55 | ) 56 | -------------------------------------------------------------------------------- /benchmarks/arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge 3 | 4 | Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, Oyvind Tafjord 5 | https://arxiv.org/abs/1803.05457 6 | 7 | # run all subsets 8 | inspect eval arc.py 9 | 10 | # run specific subsets 11 | inspect eval arc.py@arc_easy 12 | inspect eval arc.py@arc_challenge 13 | """ 14 | 15 | from inspect_ai import Task, task 16 | from inspect_ai.dataset import Sample, hf_dataset 17 | from inspect_ai.scorer import answer 18 | from inspect_ai.solver import multiple_choice 19 | 20 | 21 | def record_to_sample(record): 22 | # read the labels and text 23 | choices = record["choices"] 24 | choices = dict(zip(choices["label"], choices["text"])) 25 | 26 | # determine the target then normalize to letter 27 | answerKey = record["answerKey"] 28 | target = list(choices.keys()).index(answerKey) 29 | target = chr(ord("A") + int(target)) 30 | 31 | # return sample 32 | return Sample( 33 | input=record["question"], choices=list(choices.values()), target=target 34 | ) 35 | 36 | 37 | def arc_task(dataset_name): 38 | return Task( 39 | dataset=hf_dataset( 40 | path="allenai/ai2_arc", 41 | name=dataset_name, 42 | split="test", 43 | sample_fields=record_to_sample, 44 | ), 45 | plan=multiple_choice(), 46 | scorer=answer("letter"), 47 | ) 48 | 49 | 50 | @task 51 | def arc_easy(): 52 | return arc_task("ARC-Easy") 53 | 54 | 55 | @task 56 | def arc_challenge(): 57 | return arc_task("ARC-Challenge") 58 | -------------------------------------------------------------------------------- /src/inspect_ai/_display/_display.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import contextlib 3 | from dataclasses import dataclass 4 | from types import TracebackType 5 | from typing import Any, Iterator, Type 6 | 7 | from inspect_ai.log import EvalConfig, EvalError, EvalResults, EvalStats 8 | from inspect_ai.model import GenerateConfig, ModelName 9 | 10 | 11 | class Progress(abc.ABC): 12 | @abc.abstractmethod 13 | def update(self, n: float = 1) -> None: ... 14 | 15 | 16 | class TaskDisplay(abc.ABC): 17 | @abc.abstractmethod 18 | @contextlib.contextmanager 19 | def progress(self, total: int) -> Iterator[Progress]: ... 20 | 21 | @abc.abstractmethod 22 | def summary(self, results: EvalResults, stats: EvalStats) -> None: ... 23 | 24 | @abc.abstractmethod 25 | def error( 26 | self, 27 | error: EvalError, 28 | exc_type: Type[Any], 29 | exc_value: BaseException, 30 | traceback: TracebackType | None, 31 | ) -> None: ... 
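# Illustrative sketch only (not part of this module's API): the simplest
# conforming Progress just accumulates update() calls; real implementations
# typically forward them to an on-screen progress bar.
class _CountingProgress(Progress):
    def __init__(self) -> None:
        self.completed: float = 0.0

    def update(self, n: float = 1) -> None:
        self.completed += n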
32 | 33 | 34 | @dataclass 35 | class TaskProfile: 36 | name: str 37 | sequence: tuple[int, int] 38 | model: ModelName 39 | dataset: str 40 | scorer: str 41 | samples: int 42 | eval_config: EvalConfig 43 | task_args: dict[str, Any] 44 | generate_config: GenerateConfig 45 | log_location: str 46 | 47 | 48 | class Display(abc.ABC): 49 | @abc.abstractmethod 50 | def print(self, message: str) -> None: ... 51 | 52 | @abc.abstractmethod 53 | @contextlib.contextmanager 54 | def progress(self, total: int) -> Iterator[Progress]: ... 55 | 56 | @abc.abstractmethod 57 | @contextlib.contextmanager 58 | def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]: ... 59 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/json.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | JSONType = Literal["string", "integer", "number", "boolean", "array", "object", "null"] 4 | 5 | PythonType = Literal["str", "int", "float", "bool", "list", "dict", "None"] 6 | 7 | 8 | def python_type_to_json_type(python_type: str | None) -> JSONType: 9 | match python_type: 10 | case "str": 11 | return "string" 12 | case "int": 13 | return "integer" 14 | case "float": 15 | return "number" 16 | case "bool": 17 | return "boolean" 18 | case "list": 19 | return "array" 20 | case "dict": 21 | return "object" 22 | case "None": 23 | return "null" 24 | # treat 'unknown' as string as anything can be converted to string 25 | case None: 26 | return "string" 27 | case _: 28 | raise ValueError( 29 | f"Unsupported type: {python_type} for Python to JSON conversion." 30 | ) 31 | 32 | 33 | def json_type_to_python_type(json_type: str) -> PythonType: 34 | match json_type: 35 | case "string": 36 | return "str" 37 | case "integer": 38 | return "int" 39 | case "number": 40 | return "float" 41 | case "boolean": 42 | return "bool" 43 | case "array": 44 | return "list" 45 | case "object": 46 | return "dict" 47 | case "null": 48 | return "None" 49 | case _: 50 | raise ValueError( 51 | f"Unsupported type: {json_type} for JSON to Python conversion." 52 | ) 53 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_multi.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import Counter 3 | from typing import ( 4 | Protocol, 5 | runtime_checkable, 6 | ) 7 | 8 | from inspect_ai.solver import TaskState 9 | 10 | from ._metric import Score 11 | from ._scorer import Scorer, Target 12 | 13 | 14 | @runtime_checkable 15 | class ScoreReducer(Protocol): 16 | def __call__(self, scores: list[Score]) -> Score: ... 17 | 18 | 19 | def multi_scorer(scorers: list[Scorer], reducer: ScoreReducer) -> Scorer: 20 | r"""Returns a Scorer that runs multiple Scorers in parallel and aggregates their results into a single Score using the provided reducer function. 21 | 22 | Args: 23 | scorers: a list of Scorers. 24 | reducer: a function which takes in a list of Scores and returns a single Score. 25 | """ 26 | 27 | async def score(state: TaskState, target: Target) -> Score: 28 | scores = await asyncio.gather(*[_scorer(state, target) for _scorer in scorers]) 29 | return reducer(scores) 30 | 31 | return score 32 | 33 | 34 | def majority_vote(scores: list[Score]) -> Score: 35 | r"""A utility function for taking a majority vote over a list of scores. 36 | 37 | Args: 38 | scores: a list of Scores. 
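
    For example, three scores with values "C", "I", "C" reduce to a single
    score with value "C" (two votes to one).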
39 | """ 40 | counts: Counter[str | int | float | bool] = Counter() 41 | for score in scores: 42 | counts[score._as_scalar()] += 1 43 | return Score( 44 | value=counts.most_common(1)[0][0], 45 | answer=scores[0].answer, 46 | explanation=scores[0].explanation, 47 | metadata={ 48 | "individual_scores": scores 49 | }, # TODO: massage into format better for display 50 | ) 51 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/schema.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from inspect_ai.log import EvalLog 8 | 9 | WWW_DIR = os.path.abspath((Path(__file__).parent / "www").as_posix()) 10 | 11 | 12 | def sync_view_schema() -> None: 13 | """Generate a JSON schema and Typescript types for EvalLog. 14 | 15 | This is useful for keeping log file viewer JS development 16 | in sync w/ Python development 17 | """ 18 | # export schema file 19 | schema_path = Path(WWW_DIR, "log-schema.json") 20 | types_path = Path(WWW_DIR, "log.d.ts") 21 | with open(schema_path, "w", encoding="utf-8") as f: 22 | # make everything required 23 | schema = EvalLog.model_json_schema() 24 | defs: dict[str, Any] = schema["$defs"] 25 | for key in defs.keys(): 26 | defs[key] = schema_to_strict(defs[key]) 27 | f.write(json.dumps(schema, indent=2)) 28 | 29 | # generate types w/ json-schema-to-typescript 30 | subprocess.run( 31 | [ 32 | "json2ts", 33 | "--input", 34 | schema_path, 35 | "--output", 36 | types_path, 37 | "--additionalProperties", 38 | "false", 39 | ] 40 | ) 41 | 42 | 43 | def schema_to_strict(schema: dict[str, Any]) -> dict[str, Any]: 44 | properties = schema.get("properties", None) 45 | if properties: 46 | schema["required"] = list(properties.keys()) 47 | schema["additionalProperties"] = False 48 | 49 | return schema 50 | 51 | 52 | if __name__ == "__main__": 53 | sync_view_schema() 54 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/settings/user-settings.ts: -------------------------------------------------------------------------------- 1 | import { ConfigurationTarget, workspace } from "vscode"; 2 | 3 | const kPackageIndexDepthsSetting = "packageIndexDepths"; 4 | 5 | export const initializeGlobalSettings = async () => { 6 | const pythonAnalysis = workspace.getConfiguration("python.analysis") || []; 7 | const pkgIndexDepths = 8 | pythonAnalysis.get>( 9 | kPackageIndexDepthsSetting 10 | ) || []; 11 | 12 | try { 13 | kInspectPackageIndexDepth.forEach((pkgDep) => { 14 | if ( 15 | !pkgIndexDepths.find((p) => { 16 | return pkgDep.name === p.name; 17 | }) 18 | ) { 19 | pkgIndexDepths.push(pkgDep); 20 | } 21 | }); 22 | await pythonAnalysis.update( 23 | kPackageIndexDepthsSetting, 24 | pkgIndexDepths, 25 | ConfigurationTarget.Global 26 | ); 27 | } catch { 28 | // This can happen if the user disables the Pylance extension 29 | // in that case, since this is a Pylance setting, we're safe to just 30 | // ignore it 31 | // 32 | // Don't log since this is an allowed state (we don't require Pylance) 33 | // and continue for any exception since we shouldn't allow this setting 34 | // to block extension init 35 | } 36 | 37 | const config = workspace.getConfiguration("editor", { languageId: "json" }); 38 | await config.update("wordWrap", "on", true); 39 | }; 40 | 41 | const kInspectPackageIndexDepth = [ 42 | { 43 | name: "inspect_ai", 44 | depth: 2, 45 | }, 46 | 
]; 47 | -------------------------------------------------------------------------------- /tools/vscode/src/core/path.ts: -------------------------------------------------------------------------------- 1 | import path, { basename, dirname, join } from "path"; 2 | import { activeWorkspaceFolder } from "./workspace"; 3 | import { existsSync } from "fs"; 4 | 5 | export type UnknownPath = string; 6 | 7 | export type AbsolutePath = { 8 | path: string; 9 | dirname: () => AbsolutePath; 10 | filename: () => string; 11 | child: (file: string) => AbsolutePath; 12 | }; 13 | 14 | export const activeWorkspacePath = (): AbsolutePath => { 15 | const root = activeWorkspaceFolder(); 16 | return toAbsolutePath(root.uri.fsPath); 17 | }; 18 | 19 | // Resolves a workspace relative path into an absolute path 20 | export const workspacePath = (unknownPath: UnknownPath) => { 21 | if (path.isAbsolute(unknownPath)) { 22 | return toAbsolutePath(unknownPath); 23 | } else { 24 | const workspaceRoot = activeWorkspaceFolder().uri; 25 | const absolutePath = path.resolve(workspaceRoot.fsPath, unknownPath); 26 | return toAbsolutePath(absolutePath); 27 | } 28 | }; 29 | 30 | export const workspaceRelativePath = (absPath: AbsolutePath) => { 31 | const workspaceRoot = activeWorkspaceFolder(); 32 | return path.relative(workspaceRoot.uri.fsPath, absPath.path); 33 | }; 34 | 35 | export const toAbsolutePath = (path: string): AbsolutePath => { 36 | return { 37 | path, 38 | dirname: () => { 39 | return toAbsolutePath(dirname(path)); 40 | }, 41 | filename: () => { 42 | return basename(path); 43 | }, 44 | child: (file: string) => { 45 | return toAbsolutePath(join(path, file)); 46 | } 47 | }; 48 | }; 49 | 50 | export const pathExists = (path: string) => { 51 | const wsPath = workspacePath(path); 52 | return existsSync(wsPath.path); 53 | }; 54 | -------------------------------------------------------------------------------- /tools/vscode/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.3.19 4 | 5 | - Fix an issue showing the log viewer when an evaluation completes (specific to Inspect 0.3.10 or later) 6 | 7 | ## 0.3.18 8 | 9 | - Fix issues with task params when type hints are provided 10 | - Improve metric appearance in `inspect view` 11 | 12 | ## 0.3.17 13 | 14 | - Improve `inspect view` title bar treatment 15 | 16 | ## 0.3.16 17 | 18 | - Fix an issue that prevented the extension from loading when the `Pylance` extension was disabled or uninstalled. 19 | - Don't send task params that have been removed from tasks 20 | - Ensure that debugger breakpoints are available outside of user code 21 | - Ensure that evaluations are run from the workspace directory 22 | - Only show the logview in the VS Code window that started an eval 23 | 24 | ## 0.3.14 25 | 26 | - Fix issue where the run/debug task option would be disabled for the task configuration pane if a file containing no tasks was being edited. 27 | - Improve Inspect binary detection on Linux platforms 28 | 29 | ## 0.3.13 30 | 31 | - Ensure that the inspect CLI is in the path for terminals using a global Python environment 32 | - Add 'Show Logs' command to the environment panel.
33 | - Improve models in the environment panel 34 | - Display literal provider names (rather than pretty names) 35 | - Remember the last used model for each provider 36 | - Allow free-form provider in the model input 37 | - Add autocomplete for Ollama 38 | - Fix 'Restart' when debugging to properly restart the Inspect debugging session 39 | - Improve performance loading task tree, selecting tasks within outline, and navigating to tasks 40 | - Improve task selection behavior when the activity bar is first shown -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/Dialog.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const DialogButton = ({ id, btnType, classes, style, children }) => { 4 | return html``; 13 | }; 14 | 15 | export const DialogAfterBody = ({ 16 | id, 17 | title, 18 | classes, 19 | scrollable, 20 | centered, 21 | styles, 22 | children, 23 | }) => { 24 | return html` 25 | 60 | `; 61 | }; -------------------------------------------------------------------------------- /tools/vscode/src/providers/settings/inspect-settings.ts: -------------------------------------------------------------------------------- 1 | import { workspace } from "vscode"; 2 | 3 | // Inspect Settings 4 | export interface InspectSettings { 5 | logViewAuto: boolean; 6 | logViewType: InspectLogViewStyle; 7 | } 8 | export type InspectLogViewStyle = "html" | "text"; 9 | 10 | // Settings namespace and constants 11 | const kInspectConfigSection = "inspect_ai"; 12 | const kInspectConfigLogViewAuto = "logViewAuto"; 13 | const kInspectConfigLogViewType = "logViewType"; 14 | 15 | // Manages the settings for the inspect extension 16 | export class InspectSettingsManager { 17 | constructor(private readonly onChanged_: (() => void) | undefined) { 18 | workspace.onDidChangeConfiguration((event) => { 19 | if (event.affectsConfiguration(kInspectConfigSection)) { 20 | // Configuration for the section has changed 21 | this.settings_ = undefined; 22 | if (this.onChanged_) { 23 | this.onChanged_(); 24 | } 25 | } 26 | }); 27 | } 28 | private settings_ : InspectSettings | undefined; 29 | 30 | // get the current settings values 31 | getSettings(): InspectSettings { 32 | if (!this.settings_) { 33 | this.settings_ = this.readSettings(); 34 | } 35 | return this.settings_; 36 | } 37 | 38 | // Read settings values directly from VS Code 39 | private readSettings() { 40 | const configuration = workspace.getConfiguration(kInspectConfigSection); 41 | const logViewType = 42 | configuration.get(kInspectConfigLogViewType) || "html"; 43 | const logViewAuto = configuration.get(kInspectConfigLogViewAuto); 44 | return { 45 | logViewType, 46 | logViewAuto: logViewAuto !== undefined ?
logViewAuto : true, 47 | }; 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /tests/scorer/test_answer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import simple_task_state 3 | 4 | from inspect_ai.scorer import CORRECT, INCORRECT, Target, answer 5 | 6 | 7 | @pytest.mark.asyncio 8 | async def test_letter_success(): 9 | scorer = answer("letter") 10 | state = simple_task_state("ANSWER: B") 11 | result = await scorer(state, Target(["B"])) 12 | 13 | assert result.text == CORRECT 14 | 15 | 16 | @pytest.mark.asyncio 17 | async def test_letter_failure(): 18 | scorer = answer("letter") 19 | state = simple_task_state("ANSWER: B") 20 | result = await scorer(state, Target(["C"])) 21 | 22 | assert result.text == INCORRECT 23 | 24 | 25 | @pytest.mark.asyncio 26 | async def test_word_success(): 27 | scorer = answer("word") 28 | state = simple_task_state("ANSWER: Yes") 29 | result = await scorer(state, Target(["Yes"])) 30 | 31 | assert result.text == CORRECT 32 | 33 | 34 | @pytest.mark.asyncio 35 | async def test_word_failure(): 36 | scorer = answer("letter") 37 | state = simple_task_state("ANSWER: Yes") 38 | result = await scorer(state, Target(["No"])) 39 | 40 | assert result.text == INCORRECT 41 | 42 | 43 | @pytest.mark.asyncio 44 | async def test_line_success(): 45 | scorer = answer("line") 46 | state = simple_task_state("ANSWER:\nThis is a whole new line") 47 | result = await scorer(state, Target(["This is a whole new line"])) 48 | 49 | assert result.text == CORRECT 50 | 51 | 52 | @pytest.mark.asyncio 53 | async def test_line_failure(): 54 | scorer = answer("line") 55 | state = simple_task_state("ANSWER:\nThis is a whole new line") 56 | result = await scorer(state, Target(["This doesn't match does it?"])) 57 | 58 | assert result.text == INCORRECT 59 | -------------------------------------------------------------------------------- /tools/vscode/assets/www/view/view-overrides.css: -------------------------------------------------------------------------------- 1 | /* custom title block treatment for vscode */ 2 | 3 | body[class^="vscode-"] .workspace { 4 | margin-top: 0 !important; 5 | } 6 | 7 | body[class^="vscode-"] .workspace > div:first-of-type { 8 | padding-top: 0 !important; 9 | padding-bottom: 0 !important; 10 | } 11 | 12 | body[class^="vscode-"] .font-title { 13 | font-size: 0.9em; 14 | } 15 | 16 | body[class^="vscode-"] .font-subtitle { 17 | font-size: 0.8em; 18 | } 19 | 20 | body[class^="vscode-"] .font-title > span:last-of-type { 21 | font-size: 0.8em !important; 22 | } 23 | 24 | body[class^="vscode-"] .workspace > div > div > div:last-child > div > div > div:last-child { 25 | font-size: 1rem !important; 26 | } 27 | 28 | body[class^="vscode-"] #title-plan-summary > div > div > div:last-child, 29 | body[class^="vscode-"] #title-plan-summary > div > div > div:last-child > div { 30 | font-size: 0.7rem !important; 31 | } 32 | 33 | body[class^="vscode-"] #title-hyperparameters > div > div:last-child { 34 | font-size: 0.7rem !important; 35 | } 36 | 37 | /* custom sidebar treatment for vscode */ 38 | body[class^="vscode-"] .sidebar .list-group .list-group-item { 39 | font-size: 0.6rem !important; 40 | } 41 | 42 | body[class^="vscode-"] #sidebarOffCanvas > div > span { 43 | font-size: 0.8rem !important; 44 | } 45 | 46 | body[class^="vscode-"] code:not(.sourceCode) { 47 | color: var(--bs-body-color); 48 | } 49 | 50 | /* temporary hack to improve the appearance of 
metrics in the navbar 51 | to truly fix, remove 'navbar-brand' from metrics div and use `navbar-metrics` 52 | to properly style it */ 53 | body[class^="vscode-"] .navbar > div > .navbar-text:not(.navbar-brand) > div > div > div:last-of-type { 54 | margin-top: -10px; 55 | transform: scale(0.7); 56 | } -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/logview-link-provider.ts: -------------------------------------------------------------------------------- 1 | import { Uri } from "vscode"; 2 | 3 | import { InspectLogviewManager } from "./logview-manager"; 4 | import { workspacePath } from "../../core/path"; 5 | import { showError } from "../../components/error"; 6 | import { TerminalLink, TerminalLinkContext } from "vscode"; 7 | 8 | const kLogFilePattern = /^.*Log: (\S*?\.json)\s*/g; 9 | 10 | interface LogViewTerminalLink extends TerminalLink { 11 | data: string; 12 | } 13 | 14 | export const logviewTerminalLinkProvider = (manager: InspectLogviewManager) => { 15 | return { 16 | provideTerminalLinks: ( 17 | context: TerminalLinkContext, 18 | ) => { 19 | // Find the log file result, if present 20 | const matches = [...context.line.matchAll(kLogFilePattern)]; 21 | if (matches.length === 0) { 22 | return []; 23 | } 24 | 25 | // Forward matches 26 | const result = matches.map((match) => { 27 | // The path from the terminal. 28 | const path = match[1]; 29 | 30 | // Sort out the decoration range for the link 31 | const line = context.line; 32 | const startIndex = line.indexOf(path); 33 | return { 34 | startIndex, 35 | length: path.length, 36 | tooltip: "View Log", 37 | data: path, 38 | } as LogViewTerminalLink; 39 | }); 40 | return result; 41 | }, 42 | handleTerminalLink: (link: LogViewTerminalLink) => { 43 | 44 | const logFile = /^[a-z0-9]+:\/\//.test(link.data) ? 
Uri.parse(link.data) : Uri.file(workspacePath(link.data).path); 45 | 46 | 47 | manager.showLogFile(logFile).catch(async (err: Error) => { 48 | await showError("Failed to preview log file - failed to start Inspect View", err); 49 | }); 50 | }, 51 | }; 52 | }; 53 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/activity-bar/task-config-commands.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { toAbsolutePath } from "../../core/path"; 3 | import { InspectEvalManager } from "../inspect/inspect-eval"; 4 | import { ActiveTaskManager } from "../active-task/active-task-provider"; 5 | import { scheduleReturnFocus } from "../../components/focus"; 6 | 7 | export class RunConfigTaskCommand implements Command { 8 | constructor(private readonly manager_: ActiveTaskManager, 9 | private readonly inspectMgr_: InspectEvalManager 10 | ) { } 11 | async execute(): Promise { 12 | const taskInfo = this.manager_.getActiveTaskInfo(); 13 | if (taskInfo) { 14 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 15 | const evalPromise = this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, false); 16 | scheduleReturnFocus("inspect_ai.task-configuration.focus"); 17 | await evalPromise; 18 | } 19 | } 20 | 21 | private static readonly id = "inspect.runConfigTask"; 22 | public readonly id = RunConfigTaskCommand.id; 23 | } 24 | 25 | export class DebugConfigTaskCommand implements Command { 26 | constructor(private readonly manager_: ActiveTaskManager, 27 | private readonly inspectMgr_: InspectEvalManager 28 | ) { } 29 | async execute(): Promise { 30 | const taskInfo = this.manager_.getActiveTaskInfo(); 31 | if (taskInfo) { 32 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 33 | const evalPromise = this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, true); 34 | scheduleReturnFocus("inspect_ai.task-configuration.focus"); 35 | await evalPromise; 36 | } 37 | } 38 | 39 | private static readonly id = "inspect.debugConfigTask"; 40 | public readonly id = DebugConfigTaskCommand.id; 41 | } 42 | -------------------------------------------------------------------------------- /tests/test_eval_log/log_version_2.txt: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "status": "success", 4 | "eval": { 5 | "task": "wikipedia", 6 | "task_version": 0, 7 | "task_file": "examples/agents/langchain/wikipedia.py", 8 | "task_id": "YAdbKczyeSb6mEgPd3R9Qs", 9 | "run_id": "i5LyrzaUdD9K4EW5WTAd5t", 10 | "created": "2024-05-05T07:59:35", 11 | "dataset": { 12 | "name": "wikipedia", 13 | "location": "wikipedia.jsonl" 14 | }, 15 | "model": "openai/gpt-4", 16 | "task_attribs": {}, 17 | "task_args": {}, 18 | "model_args": {}, 19 | "config": { 20 | "limit": 20 21 | } 22 | }, 23 | "plan": { 24 | "name": "plan", 25 | "steps": [ 26 | { 27 | "solver": "wikipedia_search", 28 | "params": {} 29 | } 30 | ], 31 | "config": {} 32 | }, 33 | "results": { 34 | "scorer": { 35 | "name": "model_graded_fact", 36 | "params": {} 37 | }, 38 | "metrics": { 39 | "accuracy": { 40 | "name": "accuracy", 41 | "value": 1, 42 | "options": {} 43 | }, 44 | "bootstrap_std": { 45 | "name": "bootstrap_std", 46 | "value": 0.0, 47 | "options": {} 48 | } 49 | } 50 | }, 51 | "stats": { 52 | "started_at": "2024-05-05T07:59:35", 53 | "completed_at": "2024-05-05T08:00:03", 54 | "model_usage": { 55 | "openai/gpt-4": { 56 | "input_tokens": 8868,
57 | "output_tokens": 1351, 58 | "total_tokens": 10219 59 | } 60 | } 61 | }, 62 | "logging": [] 63 | } -------------------------------------------------------------------------------- /tests/test_eval_log/log_with_nan.txt: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "status": "success", 4 | "eval": { 5 | "task": "wikipedia", 6 | "task_version": 0, 7 | "task_file": "examples/agents/langchain/wikipedia.py", 8 | "task_id": "YAdbKczyeSb6mEgPd3R9Qs", 9 | "run_id": "i5LyrzaUdD9K4EW5WTAd5t", 10 | "created": "2024-05-05T07:59:35", 11 | "dataset": { 12 | "name": "wikipedia", 13 | "location": "wikipedia.jsonl" 14 | }, 15 | "model": "openai/gpt-4", 16 | "task_attribs": {}, 17 | "task_args": {}, 18 | "model_args": {}, 19 | "config": { 20 | "limit": 20 21 | } 22 | }, 23 | "plan": { 24 | "name": "plan", 25 | "steps": [ 26 | { 27 | "solver": "wikipedia_search", 28 | "params": {} 29 | } 30 | ], 31 | "config": {} 32 | }, 33 | "results": { 34 | "scorer": { 35 | "name": "model_graded_fact", 36 | "params": {} 37 | }, 38 | "metrics": { 39 | "accuracy": { 40 | "name": "accuracy", 41 | "value": NaN, 42 | "options": {} 43 | }, 44 | "bootstrap_std": { 45 | "name": "bootstrap_std", 46 | "value": 0.0, 47 | "options": {} 48 | } 49 | } 50 | }, 51 | "stats": { 52 | "started_at": "2024-05-05T07:59:35", 53 | "completed_at": "2024-05-05T08:00:03", 54 | "model_usage": { 55 | "openai/gpt-4": { 56 | "input_tokens": 8868, 57 | "output_tokens": 1351, 58 | "total_tokens": 10219 59 | } 60 | } 61 | }, 62 | "logging": [] 63 | } -------------------------------------------------------------------------------- /src/inspect_ai/model/_providers/util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from logging import getLogger 4 | from typing import Any 5 | 6 | from .._model import StopReason 7 | from .._tool import ToolCall 8 | 9 | logger = getLogger(__name__) 10 | 11 | 12 | def as_stop_reason(reason: str | None) -> StopReason: 13 | """Encode common reason strings into standard StopReason.""" 14 | match reason: 15 | case "stop" | "eos": 16 | return "stop" 17 | case "length" | "content_filter": 18 | return reason 19 | case "model_length": 20 | return "length" 21 | case "tool_calls" | "function_call": 22 | return "tool_calls" 23 | case _: 24 | return "unknown" 25 | 26 | 27 | def model_base_url(base_url: str | None, env_vars: str | list[str]) -> str | None: 28 | if base_url: 29 | return base_url 30 | 31 | if isinstance(env_vars, str): 32 | env_vars = [env_vars] 33 | 34 | for env_var in env_vars: 35 | base_url = os.getenv(env_var, None) 36 | if base_url: 37 | return base_url 38 | 39 | return os.getenv("INSPECT_EVAL_MODEL_BASE_URL", None) 40 | 41 | 42 | def parse_tool_call(id: str, function: str, arguments: str) -> ToolCall: 43 | error: str | None = None 44 | arguments_dict: dict[str, Any] = {} 45 | try: 46 | arguments_dict = json.loads(arguments) 47 | except json.JSONDecodeError as ex: 48 | # define and log error 49 | error = f"Error parsing the following tool call arguments:\n{arguments}\nError details: {ex}" 50 | logger.warning(error) 51 | 52 | # return ToolCall with error payload 53 | return ToolCall( 54 | id=id, 55 | function=function, 56 | arguments=arguments_dict, 57 | type="function", 58 | parse_error=error, 59 | ) 60 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/libs/prism/prism.min.css: 
-------------------------------------------------------------------------------- 1 | /* PrismJS 1.29.0 2 | https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript */ 3 | code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shadow:0 1px #fff;font-family:Consolas,Monaco,'Andale Mono','Ubuntu Mono',monospace;font-size:1em;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;-ms-hyphens:none;hyphens:none}code[class*=language-] ::-moz-selection,code[class*=language-]::-moz-selection,pre[class*=language-] ::-moz-selection,pre[class*=language-]::-moz-selection{text-shadow:none;background:#b3d4fc}code[class*=language-] ::selection,code[class*=language-]::selection,pre[class*=language-] ::selection,pre[class*=language-]::selection{text-shadow:none;background:#b3d4fc}@media print{code[class*=language-],pre[class*=language-]{text-shadow:none}}pre[class*=language-]{padding:1em;margin:.5em 0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#f5f2f0}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.cdata,.token.comment,.token.doctype,.token.prolog{color:#708090}.token.punctuation{color:#999}.token.namespace{opacity:.7}.token.boolean,.token.constant,.token.deleted,.token.number,.token.property,.token.symbol,.token.tag{color:#905}.token.attr-name,.token.builtin,.token.char,.token.inserted,.token.selector,.token.string{color:#690}.language-css .token.string,.style .token.string,.token.entity,.token.operator,.token.url{color:#9a6e3a;background:hsla(0,0%,100%,.5)}.token.atrule,.token.attr-value,.token.keyword{color:#07a}.token.class-name,.token.function{color:#dd4a68}.token.important,.token.regex,.token.variable{color:#e90}.token.bold,.token.important{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help} 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/MessageContent.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { MarkdownDiv } from "./MarkdownDiv.mjs"; 3 | 4 | export const MessageContent = (props) => { 5 |   const { contents } = props; 6 |   if (Array.isArray(contents)) { 7 |     return contents.map((content, index) => { 8 |       if (typeof content === "string") { 9 |         return messageRenderers["text"].render({ type: "text", text: content }, index === contents.length - 1); 10 |       } else { 11 |         const renderer = messageRenderers[content.type]; 12 |         if (renderer) { 13 |           return renderer.render(content, index === contents.length - 1); 14 |         } else { 15 |           console.error(`Unknown message content type '${content.type}'`); 16 |         } 17 |       } 18 |     }); 19 |   } else { 20 |     // This is a simple string 21 |     return messageRenderers["text"].render({ text: contents }); 22 |   } 23 | }; 24 | 25 | const messageRenderers = { 26 |   text: { 27 |     render: (content, isLast) => { 28 |       return html`<${MarkdownDiv} 29 |         markdown=${content.text} 30 |         class=${isLast ? "no-last-para-padding" : ""} 31 |       />`; 32 |     }, 33 |   }, 34 |   image: { 35 |     render: (content, isLast) => { 36 |       return html`<img 37 |         src="${content.image}" 38 |         style=${{ 39 |           maxWidth: "400px", 40 |           border: "solid var(--bs-border-color) 1px", 41 |         }} 42 |       />`; 43 |     }, 44 |   }, 45 |   tool: { 46 |     render: (content, isLast) => { 47 |       return html`<pre class=${isLast ? "no-last-para-padding" : ""}> 48 |       <code class="sourceCode"> 49 |       ${content.text} 50 |       </code> 51 |       </pre>`; 52 |     }, 53 |   }, 54 | }; 55 | -------------------------------------------------------------------------------- /benchmarks/gpqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPQA: A Graduate-Level Google-Proof Q&A Benchmark 3 | 4 | David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard 5 | Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman 6 | https://arxiv.org/abs/2311.12022 7 | 8 | Based on: https://github.com/openai/simple-evals/blob/main/gpqa_eval.py 9 | 10 | # eval for default epochs (4) 11 | inspect eval gpqa.py 12 | 13 | # eval with 1 epoch 14 | inspect eval gpqa.py --epochs 1 15 | 16 | # without chain of thought 17 | inspect eval gpqa.py -T cot=false 18 | """ 19 | 20 | 21 | from inspect_ai import Task, task 22 | from inspect_ai.dataset import Sample, csv_dataset 23 | from inspect_ai.model import GenerateConfig 24 | from inspect_ai.scorer import answer 25 | from inspect_ai.solver import multiple_choice 26 | 27 | # default epochs to run eval for 28 | DEFAULT_EPOCHS = 4 29 | 30 | 31 | # map records to inspect samples (note that target is always "A" in the 32 | # dataset; we will shuffle the presentation of options to mitigate this) 33 | def record_to_sample(record): 34 |     return Sample( 35 |         input=record["Question"], 36 |         choices=[ 37 |             str(record["Correct Answer"]), 38 |             str(record["Incorrect Answer 1"]), 39 |             str(record["Incorrect Answer 2"]), 40 |             str(record["Incorrect Answer 3"]), 41 |         ], 42 |         target="A", 43 |         id=record["Record ID"], 44 |     ) 45 | 46 | 47 | @task 48 | def gpqa_diamond(cot=True): 49 |     return Task( 50 |         dataset=csv_dataset( 51 |             csv_file="https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv", 52 |             sample_fields=record_to_sample, 53 |         ), 54 |         plan=[ 55 |             multiple_choice(cot=cot, shuffle=True), 56 |         ], 57 |         scorer=answer("letter"), 58 |         config=GenerateConfig(temperature=0.5), 59 |         epochs=DEFAULT_EPOCHS, 60 |     ) 61 | -------------------------------------------------------------------------------- /src/inspect_ai/solver/_tool/use_tools.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import ( 2 |     ChatMessageSystem, 3 |     ToolChoice, 4 | ) 5 | 6 | from .._solver import Generate, Solver, TaskState, solver 7 | from .._util import append_system_message 8 | from .tool import Tool 9 | from .tool_def import tool_defs 10 | 11 | 12 | @solver 13 | def use_tools( 14 |     tools: Tool | list[Tool] | None = None, tool_choice: ToolChoice = "auto" 15 | ) -> Solver: 16 |     """ 17 |     Solver that injects tools into the task state to be used in generate(). 18 | 19 |     Args: 20 |         tools (Tool | list[Tool] | None): one or more tools to inject into the task state. 21 |         tool_choice (ToolChoice): Directive indicating which 22 |           tools the model should use. 23 | 24 |     Returns: 25 |         A solver that injects the tools and tool_choice into the task state. 26 |     """ 27 |     # create tool defs 28 |     tools = tools if isinstance(tools, list) else [tools] if tools else None 29 |     tdefs = tool_defs(tools) if tools else None 30 | 31 |     async def solve(state: TaskState, generate: Generate) -> TaskState: 32 |         # register the tools 33 |         if tools and tdefs: 34 |             state.tools.extend(tools) 35 | 36 |         # append the tools system prompts.
mark the 'source' of messages 37 |         # as tool so they can be removed if tool_choice == "none" 38 |         for tool in tdefs: 39 |             if tool.prompt: 40 |                 append_system_message( 41 |                     state.messages, 42 |                     ChatMessageSystem(content=tool.prompt, tool=tool.name), 43 |                 ) 44 | 45 |         # set tool choice (note you can call this function w/o tools 46 |         # for just the side effect of enabling/disabling tool usage) 47 |         state.tool_choice = tool_choice 48 | 49 |         # return state 50 |         return state 51 | 52 |     return solve 53 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_match.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from ._common import match_str, str_match_scorer 4 | from ._metrics import accuracy, bootstrap_std 5 | from ._scorer import Scorer, scorer 6 | 7 | 8 | @scorer(metrics=[accuracy(), bootstrap_std()]) 9 | def match( 10 |     location: Literal["begin", "end", "any", "exact"] = "end", 11 |     *, 12 |     ignore_case: bool = True, 13 |     numeric: bool = False, 14 | ) -> Scorer: 15 |     """Scorer which matches text or a number. 16 | 17 |     Args: 18 |        location (Literal["begin", "end", "any", "exact"]): 19 |           Location to match at. "any" matches anywhere in the 20 |           output; "exact" requires the output be exactly 21 |           equal to the target (modulo whitespace, etc.) 22 |        ignore_case (bool): Do case insensitive comparison. 23 |        numeric (bool): Is this a numeric match? (in this 24 |           case different punctuation removal rules are 25 |           used and numbers are normalized before comparison). 26 |     """ 27 | 28 |     def check(value: str, target: str) -> tuple[str, bool]: 29 |         return match_str( 30 |             value=value, 31 |             target=target, 32 |             location=location, 33 |             ignore_case=ignore_case, 34 |             numeric=numeric, 35 |         ) 36 | 37 |     return str_match_scorer(check) 38 | 39 | 40 | @scorer(metrics=[accuracy(), bootstrap_std()]) 41 | def includes(ignore_case: bool = True) -> Scorer: 42 |     """Check whether the specified text is included in the model output. 43 | 44 |     Args: 45 |        ignore_case (bool): Use a case insensitive comparison.
46 | 47 |     """ 48 | 49 |     def check(value: str, target: str) -> tuple[str, bool]: 50 |         if ignore_case: 51 |             idx = value.lower().rfind(target.lower()) 52 |         else: 53 |             idx = value.rfind(target) 54 |         return value, idx != -1 55 | 56 |     return str_match_scorer(check) 57 | -------------------------------------------------------------------------------- /tools/vscode/src/core/git.ts: -------------------------------------------------------------------------------- 1 | import { existsSync, readFileSync, writeFileSync } from "fs"; 2 | import path from "path"; 3 | import { lines } from "./text"; 4 | import { runProcess } from "./process"; 5 | import { AbsolutePath } from "./path"; 6 | import { platform } from "os"; 7 | 8 | export function ensureGitignore( 9 |   dir: AbsolutePath, 10 |   entries: string[] 11 | ): boolean { 12 |   // if .gitignore exists, then ensure it has the requisite entries 13 |   const gitignorePath = path.join(dir.path, ".gitignore"); 14 |   if (existsSync(gitignorePath)) { 15 |     const gitignore = lines( 16 |       readFileSync(gitignorePath, { 17 |         encoding: "utf-8", 18 |       }) 19 |     ).map((line) => line.trim()); 20 |     const requiredEntries: string[] = []; 21 |     for (const requiredEntry of entries) { 22 |       if (!gitignore.includes(requiredEntry)) { 23 |         requiredEntries.push(requiredEntry); 24 |       } 25 |     } 26 |     if (requiredEntries.length > 0) { 27 |       writeGitignore(dir.path, gitignore.concat(requiredEntries)); 28 |       return true; 29 |     } else { 30 |       return false; 31 |     } 32 |   } else { 33 |     // if it doesn't exist then auto-create it if we are inside a git repo. 34 |     // `git rev-parse` succeeds silently when run inside a repo; outside of 35 |     // one it exits non-zero, which runProcess surfaces by throwing, so 36 |     // reaching the createGitignore call means we are in a repo 37 |     try { 38 |       runProcess("git", ["rev-parse"], dir); 39 |       createGitignore(dir.path, entries); 40 |       return true; 41 |     } catch { 42 |       // not inside a git repo -- nothing to do 43 |       return false; 44 |     } 45 |   } 46 | } 47 | 48 | export function createGitignore(dir: string, entries: string[]) { 49 |   writeGitignore(dir, entries); 50 | } 51 | 52 | function writeGitignore(dir: string, lines: string[]) { 53 |   const lineEnding = platform() === "win32" ?
"\r\n" : "\n"; 54 | writeFileSync( 55 | path.join(dir, ".gitignore"), 56 | lines.join(lineEnding) + lineEnding, 57 | { encoding: "utf-8" } 58 | ); 59 | } 60 | 61 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | - "release/**" 11 | 12 | jobs: 13 | ruff: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11"] 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Lint and format with Ruff 21 | uses: chartboost/ruff-action@v1 22 | 23 | mypy: 24 | runs-on: ubuntu-latest 25 | strategy: 26 | matrix: 27 | python-version: ["3.10", "3.11"] 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | python -m pip install .[dev] 38 | - name: Run mypy 39 | run: | 40 | mypy --exclude tests/test_package src tests 41 | 42 | test: 43 | runs-on: ubuntu-latest 44 | strategy: 45 | matrix: 46 | python-version: ["3.10", "3.11"] 47 | 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v4 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | python -m pip install --upgrade pip 57 | python -m pip install .[dev] 58 | - name: Test with pytest 59 | run: | 60 | pytest -rA --doctest-modules --color=yes --cov=inspect_ai 61 | 62 | package: 63 | name: Build & inspect the package. 
64 | runs-on: ubuntu-latest 65 | 66 | steps: 67 | - uses: actions/checkout@v4 68 | - uses: hynek/build-and-inspect-python-package@v1 69 | -------------------------------------------------------------------------------- /tests/test_subprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from inspect_ai.util import subprocess 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_subprocess_execute(): 11 | result = await subprocess(["python3", "-c", "print('foo')"]) 12 | assert result.stdout.strip() == "foo" 13 | 14 | 15 | @pytest.mark.asyncio 16 | async def test_subprocess_fail(): 17 | result = await subprocess(["python4"]) 18 | assert result.success is False 19 | 20 | 21 | @pytest.mark.asyncio 22 | async def test_subprocess_stdin(): 23 | input = "tell me a story" 24 | result = await subprocess( 25 | ["python3", "-c", "import sys; print(sys.stdin.read())"], input=input 26 | ) 27 | assert result.stdout.strip() == input 28 | 29 | 30 | @pytest.mark.asyncio 31 | async def test_subprocess_binary(): 32 | input = "tell me a story".encode() 33 | result = await subprocess( 34 | ["python3", "-c", "import sys; print(sys.stdin.read())"], 35 | text=False, 36 | input=input, 37 | ) 38 | assert result.stdout.decode().strip() == input.decode() 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_subprocess_cwd(): 43 | parent_dir = Path(os.getcwd()).parent.as_posix() 44 | result = await subprocess( 45 | ["python3", "-c", "import os; print(os.getcwd())"], cwd=parent_dir 46 | ) 47 | assert result.stdout.strip() == parent_dir 48 | 49 | 50 | @pytest.mark.asyncio 51 | async def test_subprocess_env(): 52 | ENV_VAR = "TEST_SUBPROCESS_ENV" 53 | ENV_VALUE = "test value" 54 | result = await subprocess( 55 | ["python3", "-c", f"import os; print(os.getenv('{ENV_VAR}'))"], 56 | env={ENV_VAR: ENV_VALUE}, 57 | ) 58 | assert result.stdout.strip() == ENV_VALUE 59 | 60 | 61 | @pytest.mark.asyncio 62 | async def test_subprocess_timeout(): 63 | result = await subprocess(["sleep", "2"], timeout=1) 64 | assert result.returncode == 1 65 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/api/api-browser.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | const loaded_time = Date.now() 5 | let last_eval_time = 0 6 | 7 | 8 | async function client_events() { 9 | const params = new URLSearchParams() 10 | params.append("loaded_time", loaded_time.valueOf()) 11 | params.append("last_eval_time", last_eval_time.valueOf()) 12 | return api("GET", `/api/events?${params.toString()}`) 13 | } 14 | 15 | async function eval_logs() { 16 | const logs = await api("GET", `/api/logs`) 17 | last_eval_time = Date.now() 18 | return logs 19 | } 20 | 21 | async function eval_log(file, headerOnly) { 22 | if (headerOnly) { 23 | return api("GET", `/api/logs/${file}?header-only=true`) 24 | } else { 25 | return api("GET", `/api/logs/${file}`) 26 | } 27 | } 28 | 29 | async function eval_log_headers(files) { 30 | const params = new URLSearchParams(); 31 | for (const file of files) { 32 | params.append("file", file); 33 | } 34 | return api("GET", `/api/log-headers?${params.toString()}`) 35 | } 36 | 37 | async function api(method, path, body) { 38 | // build headers 39 | const headers = { 40 | Accept: "application/json", 41 | Pragma: "no-cache", 42 | Expires: "0", 43 | ['Cache-Control']: 'no-cache', 44 | } 45 | if (body) { 46 | 
headers["Content-Type"] = "application/json"; 47 |   } 48 | 49 |   // make request 50 |   const response = await fetch(`${path}`, { method, headers, body }); 51 |   if (response.ok) { 52 |     const text = await response.text(); 53 |     // parse with JSON5 (expected to be loaded globally by the host page) so 54 |     // that non-standard values such as NaN in eval logs survive parsing 55 |     return JSON5.parse(text); 56 |   } else { 57 |     const message = (await response.text()) || response.statusText; 58 |     throw new Error(`Error ${response.status}: ${message}`); 59 |   } 60 | 61 | } 62 | 63 | 64 | export default { 65 |   client_events, 66 |   eval_logs, 67 |   eval_log, 68 |   eval_log_headers 69 | } 70 | 71 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/samples/SampleDialog.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | import { icons } from "../Constants.mjs"; 4 | import { EmptyPanel } from "../components/EmptyPanel.mjs"; 5 | import { LargeModal } from "../components/LargeModal.mjs"; 6 | 7 | import { SampleDisplay } from "./SampleDisplay.mjs" 8 | 9 | export const SampleDialog = (props) => { 10 |   const { 11 |     id, 12 |     index, 13 |     task, 14 |     model, 15 |     title, 16 |     sample, 17 |     sampleDescriptor, 18 |     nextSample, 19 |     prevSample, 20 |     context, 21 |   } = props; 22 | 23 |   // If there is no sample, just show an empty panel 24 |   // This should never happen 25 |   if (!sample) { 26 |     return html`<${LargeModal} id=${id} title="No Sample"><${EmptyPanel}>No Sample Selected<//><//>`; 27 |   } 28 | 29 |   const nextTool = { 30 |     label: "Next Sample", 31 |     icon: icons.next, 32 |     onclick: nextSample, 33 |     enabled: !!nextSample, 34 |   }; 35 | 36 |   const prevTool = { 37 |     label: "Previous Sample", 38 |     icon: icons.previous, 39 |     onclick: prevSample, 40 |     enabled: !!prevSample, 41 |   }; 42 | 43 |   // Provide the dialog 44 |   return html` 45 |   <${LargeModal} 46 |     id=${id} 47 |     detail=${title} 48 |     detailTools=${{ 49 |       left: [prevTool], 50 |       right: [nextTool], 51 |     }} 52 |     onkeyup=${(e) => { 53 |       switch (e.key) { 54 |         case "ArrowRight": 55 |           if (nextSample) { 56 |             nextSample(); 57 |           } 58 |           break; 59 |         case "ArrowLeft": 60 |           if (prevSample) { 61 |             prevSample(); 62 |           } 63 |           break; 64 |       } 65 |     }} 66 |   > 67 |     <${SampleDisplay} 68 |       index=${index} 69 |       id=${id} 70 |       sample=${sample} 71 |       sampleDescriptor=${sampleDescriptor} 72 |       context=${context}/> 73 |   <//> 74 |   `; 75 | }; 76 | 77 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/common.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Any, Callable, Tuple, cast 3 | 4 | import click 5 | from typing_extensions import TypedDict 6 | 7 | from inspect_ai._util.constants import DEFAULT_LOG_LEVEL 8 | 9 | 10 | class CommonOptions(TypedDict): 11 |     log_level: str 12 |     log_dir: str 13 |     debug: bool 14 |     debug_port: int 15 | 16 | 17 | def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]: 18 |     @click.option( 19 |         "--log-level", 20 |         type=click.Choice( 21 |             ["debug", "http", "info", "warning", "error", "critical"], 22 |             case_sensitive=False, 23 |         ), 24 |         default=DEFAULT_LOG_LEVEL, 25 |         envvar="INSPECT_LOG_LEVEL", 26 |         help=f"Set the log level (defaults to '{DEFAULT_LOG_LEVEL}')", 27 |     ) 28 |     @click.option( 29 |         "--log-dir", 30 |         type=str, 31 |         default="./logs", 32 |         envvar="INSPECT_LOG_DIR", 33 |         help="Directory for log files.", 34 |     ) 35 |
@click.option( 36 | "--debug", is_flag=True, envvar="INSPECT_DEBUG", help="Wait to attach debugger" 37 | ) 38 | @click.option( 39 | "--debug-port", 40 | default=5678, 41 | envvar="INSPECT_DEBUG_PORT", 42 | help="Port number for debugger", 43 | ) 44 | @functools.wraps(func) 45 | def wrapper(*args: Any, **kwargs: Any) -> click.Context: 46 | return cast(click.Context, func(*args, **kwargs)) 47 | 48 | return wrapper 49 | 50 | 51 | def resolve_common_options(options: CommonOptions) -> Tuple[str, str]: 52 | # attach debugger if requested 53 | if options["debug"]: 54 | import debugpy # type: ignore 55 | 56 | debugpy.listen(options["debug_port"]) 57 | print("Waiting for debugger attach") 58 | debugpy.wait_for_client() 59 | print("Debugger attached") 60 | 61 | # return resolved options 62 | return (options["log_dir"], options["log_level"]) 63 | -------------------------------------------------------------------------------- /tests/test_collapse_user_message.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from inspect_ai.model import ( 4 | ChatMessageAssistant, 5 | ChatMessageUser, 6 | ContentImage, 7 | ContentText, 8 | ) 9 | from inspect_ai.model._model import collapse_consecutive_user_messages 10 | 11 | 12 | @pytest.fixture 13 | def user_message_str(): 14 | return ChatMessageUser(content="User message") 15 | 16 | 17 | @pytest.fixture 18 | def user_message_image_and_str(): 19 | return ChatMessageUser( 20 | content=[ContentImage(image="foo"), ContentText(text="Message")] 21 | ) 22 | 23 | 24 | @pytest.fixture 25 | def assistant_message(): 26 | return ChatMessageAssistant(content="Assistant message") 27 | 28 | 29 | @pytest.fixture 30 | def combined_user_message(): 31 | return ChatMessageUser( 32 | content=[ContentText(text="Message 1"), ContentText(text="Message 2")] 33 | ) 34 | 35 | 36 | def test_collapse_consecutive_user_messages_single_user_message(user_message_str): 37 | messages = [user_message_str] 38 | assert collapse_consecutive_user_messages(messages) == messages 39 | 40 | 41 | def test_collapse_consecutive_user_messages_alternating_messages( 42 | user_message_str, assistant_message 43 | ): 44 | messages = [user_message_str, assistant_message, user_message_str] 45 | assert collapse_consecutive_user_messages(messages) == messages 46 | 47 | 48 | def test_collapse_consecutive_user_messages_consecutive_user_messages(user_message_str): 49 | messages = [user_message_str, user_message_str, user_message_str] 50 | assert len(collapse_consecutive_user_messages(messages)) == 1 51 | 52 | 53 | def test_collapse_consecutive_user_messages_with_image_message( 54 | user_message_image_and_str, 55 | ): 56 | messages = [user_message_image_and_str, user_message_image_and_str] 57 | assert len(collapse_consecutive_user_messages(messages)) == 1 58 | assert isinstance( 59 | collapse_consecutive_user_messages(messages)[0].content[0], ContentImage 60 | ) 61 | -------------------------------------------------------------------------------- /tests/test_collapse_assistant_message.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from inspect_ai.model import ( 4 | ChatMessageAssistant, 5 | ChatMessageUser, 6 | ContentImage, 7 | ContentText, 8 | ) 9 | from inspect_ai.model._model import collapse_consecutive_assistant_messages 10 | 11 | 12 | @pytest.fixture 13 | def user_message_str(): 14 | return ChatMessageUser(content="User message") 15 | 16 | 17 | @pytest.fixture 18 | def 
user_message_image_and_str(): 19 |     return ChatMessageUser( 20 |         content=[ContentImage(image="foo"), ContentText(text="Message")] 21 |     ) 22 | 23 | 24 | @pytest.fixture 25 | def assistant_message_str(): 26 |     return ChatMessageAssistant(content="Assistant message") 27 | 28 | 29 | def test_collapse_consecutive_assistant_messages_single_assistant_message( 30 |     assistant_message_str, 31 | ): 32 |     messages = [assistant_message_str] 33 |     assert collapse_consecutive_assistant_messages(messages) == messages 34 | 35 | 36 | def test_collapse_consecutive_assistant_messages_alternating_messages( 37 |     user_message_str, user_message_image_and_str, assistant_message_str 38 | ): 39 |     messages = [user_message_str] 40 |     assert collapse_consecutive_assistant_messages(messages) == messages 41 | 42 |     messages = [user_message_str, assistant_message_str] 43 |     assert collapse_consecutive_assistant_messages(messages) == messages 44 | 45 |     messages = [user_message_str, assistant_message_str, user_message_str] 46 |     assert collapse_consecutive_assistant_messages(messages) == messages 47 | 48 |     messages = [user_message_str, assistant_message_str, user_message_image_and_str] 49 |     assert collapse_consecutive_assistant_messages(messages) == messages 50 | 51 | 52 | def test_collapse_consecutive_assistant_messages_consecutive_assistant_messages( 53 |     assistant_message_str, 54 | ): 55 |     messages = [assistant_message_str, assistant_message_str, assistant_message_str] 56 |     assert len(collapse_consecutive_assistant_messages(messages)) == 1 57 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/platform.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | 4 | 5 | def running_in_notebook() -> bool: 6 |     try: 7 |         from IPython import get_ipython  # type: ignore 8 | 9 |         if "IPKernelApp" not in get_ipython().config:  # type: ignore 10 |             return False 11 |     except ImportError: 12 |         return False 13 |     except AttributeError: 14 |         return False 15 |     return True 16 | 17 | 18 | def platform_init() -> None: 19 |     # if we are running in a notebook, confirm that we have ipywidgets 20 |     if running_in_notebook(): 21 |         # check for required packages 22 |         if not have_package("ipywidgets"): 23 |             raise ModuleNotFoundError( 24 |                 "To use inspect_ai within a notebook, please install ipywidgets with:\n\n" 25 |                 + "pip install ipywidgets\n" 26 |             ) 27 | 28 |         # activate nest_asyncio (required so we operate properly within 29 |         # the Jupyter async event loop) 30 |         import nest_asyncio  # type: ignore 31 | 32 |         nest_asyncio.apply() 33 | 34 | 35 | def have_package(package: str) -> bool: 36 |     return importlib.util.find_spec(package) is not None 37 | 38 | 39 | def is_running_in_jupyterlab() -> bool: 40 |     return os.getenv("JPY_SESSION_NAME", None) is not None 41 | 42 | 43 | def is_running_in_vscode() -> bool: 44 |     # Check if running in VS Code Jupyter notebook or interactive window 45 |     if ( 46 |         os.getenv("VSCODE_IPYTHON_KERNEL") is not None 47 |         or os.getenv("VSCODE_CLI_REQUIRE_TOKEN") is not None 48 |         or os.getenv("VSCODE_PID") is not None 49 |         or os.getenv("VSCODE_CWD") is not None 50 |     ): 51 |         return True 52 |     # Check if running in a VS Code terminal 53 |     if os.getenv("TERM_PROGRAM") == "vscode": 54 |         return True 55 | 56 |     # If none of the conditions are met, we assume it's not running in VS Code 57 |     return False 58 | 59 | 60 | def is_windows() -> bool: 61 |     return os.name == "nt" 62 |
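63 | 64 | # Illustrative usage (a hypothetical entry point; only platform_init() above is real): 65 | # 66 | #     from inspect_ai._util.platform import platform_init 67 | # 68 | #     platform_init()  # in a notebook this verifies ipywidgets is installed 69 | #                      # and applies nest_asyncio; elsewhere it is a no-op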
-------------------------------------------------------------------------------- /tests/test_logprobs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import ( 3 |     skip_if_github_action, 4 |     skip_if_no_openai, 5 |     skip_if_no_together, 6 | ) 7 | 8 | from inspect_ai.model import ChatMessageUser, GenerateConfig, ModelOutput, get_model 9 | 10 | 11 | async def generate_with_logprobs(model_name, **model_kwargs) -> ModelOutput: 12 |     model = get_model( 13 |         model_name, 14 |         config=GenerateConfig( 15 |             logprobs=True, top_logprobs=2, temperature=0.001, max_tokens=50 16 |         ), 17 |         **model_kwargs, 18 |     ) 19 | 20 |     message = ChatMessageUser(content="Hello.") 21 |     return await model.generate(input=[message]) 22 | 23 | 24 | @pytest.mark.asyncio 25 | @skip_if_no_openai 26 | async def test_openai_logprobs() -> None: 27 |     response = await generate_with_logprobs("openai/gpt-3.5-turbo") 28 |     assert response.choices[0].logprobs is not None 29 |     assert response.choices[0].logprobs.content[0].top_logprobs is not None 30 |     assert len(response.choices[0].logprobs.content[0].top_logprobs) == 2 31 | 32 | 33 | @pytest.mark.asyncio 34 | @skip_if_no_together 35 | async def test_together_logprobs() -> None: 36 |     response = await generate_with_logprobs("together/lmsys/vicuna-13b-v1.5") 37 |     assert ( 38 |         response.choices[0].logprobs is not None 39 |         and response.choices[0].logprobs.content[0].top_logprobs 40 |         is None  # together only ever returns top-1, so top_logprobs should always be None 41 |     ) 42 | 43 | 44 | @pytest.mark.asyncio 45 | @skip_if_github_action 46 | async def test_hf_logprobs() -> None: 47 |     response = await generate_with_logprobs( 48 |         "hf/EleutherAI/pythia-70m", 49 |         chat_template="{% for message in messages %}{{ message.content }}{% endfor %}", 50 |     ) 51 |     assert ( 52 |         response.choices[0].logprobs 53 |         and response.choices[0].logprobs.content[0].top_logprobs is not None 54 |     ) 55 |     assert len(response.choices[0].logprobs.content[0].top_logprobs) == 2 56 | -------------------------------------------------------------------------------- /tests/test_eval_log.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | 4 | import pytest 5 | from pydantic_core import PydanticSerializationError 6 | from test_helpers.utils import skip_if_no_openai 7 | 8 | from inspect_ai import Task, eval 9 | from inspect_ai.dataset import Sample 10 | from inspect_ai.log import read_eval_log 11 | from inspect_ai.solver import ( 12 |     Generate, 13 |     Plan, 14 |     TaskState, 15 |     generate, 16 |     solver, 17 | ) 18 | 19 | 20 | def log_path(file: str) -> str: 21 |     # use .txt extension so vscode linter doesn't complain about invalid json 22 |     return os.path.join("tests", "test_eval_log", f"{file}.txt") 23 | 24 | 25 | class NotSerializable: 26 |     name: str 27 | 28 | 29 | @skip_if_no_openai 30 | def test_ignore_unserializable(): 31 |     @solver 32 |     def inject_unserializable(): 33 |         async def solve(state: TaskState, generate: Generate): 34 |             state.metadata["not serializable"] = NotSerializable 35 |             return state 36 | 37 |         return solve 38 | 39 |     task = Task( 40 |         dataset=[Sample(input="Say hello.", target="Hello")], 41 |         plan=Plan(steps=[inject_unserializable(), generate()]), 42 |     ) 43 | 44 |     try: 45 |         eval(task, model="openai/gpt-4") 46 |     except PydanticSerializationError: 47 |         assert False, "Eval raised Pydantic serialization error."
48 | 49 | 50 | def test_read_nan(): 51 | def check_for_nan(log): 52 | assert math.isnan(log.results.metrics.get("accuracy").value) 53 | 54 | log_file = log_path("log_with_nan") 55 | check_for_nan(read_eval_log(log_file)) 56 | check_for_nan(read_eval_log(log_file, header_only=True)) 57 | 58 | 59 | def test_fail_invalid(): 60 | check_log_raises(log_path("log_invalid")) 61 | 62 | 63 | def test_fail_version(): 64 | check_log_raises(log_path("log_version_2")) 65 | 66 | 67 | def check_log_raises(log_file): 68 | with pytest.raises(ValueError): 69 | read_eval_log(log_file) 70 | with pytest.raises(ValueError): 71 | read_eval_log(log_file, header_only=True) 72 | -------------------------------------------------------------------------------- /src/inspect_ai/dataset/_examples/biology_qa.jsonl: -------------------------------------------------------------------------------- 1 | {"id": "q1", "question": "Hansen's disease is more commonly known by which name?", "answer": "Leprosy"} 2 | {"id": "q2", "question": "Botany is the study of what life form?", "answer": "Plants"} 3 | {"id": "q3", "question": "What is the human body's largest organ?", "answer": "Skin"} 4 | {"id": "q4", "question": "True or false: snails have teeth", "answer": "True"} 5 | {"id": "q5", "question": "What part of the human body is the Mandible?", "answer": "Lower Jawbone"} 6 | {"id": "q6", "question": "How many bones does an adult human have?", "answer": "206"} 7 | {"id": "q7", "question": "True or false: jellyfish have hearts", "answer": "False"} 8 | {"id": "q8", "question": "Which French microbiologist discovered the process of pasteurisation?", "answer": "Louis Pasteur"} 9 | {"id": "q9", "question": "What year was the first animal cloned?", "answer": "1996"} 10 | {"id": "q10", "question": "Who discovered penicillin?", "answer": "Alexander Fleming"} 11 | {"id": "q11", "question": "When was the Human Genome project completed?", "answer": "2003"} 12 | {"id": "q12", "question": "How many species are estimated to live on Earth?", "answer": "8.7 million"} 13 | {"id": "q13", "question": "A DNA molecule is described as being what shape?", "answer": "Double helix"} 14 | {"id": "q14", "question": "Heterochromia results in which change in physical appearance?", "answer": "Different coloured eyes"} 15 | {"id": "q15", "question": "Crohn's disease is part of which disease group?", "answer": "Inflammatory bowel disease"} 16 | {"id": "q16", "question": "How many neck vertebrae do giraffes have, compared to a human's seven?", "answer": "Seven"} 17 | {"id": "q17", "question": "Which food substance helps move waste through the body?", "answer": "Fibre"} 18 | {"id": "q18", "question": "The term 'renal' refers to which organs?", "answer": "Kidneys"} 19 | {"id": "q19", "question": "What is the name of the biggest part of the human brain?", "answer": "The cerebrum"} 20 | {"id": "q20", "question": "Can cell walls be found in plant cells, animal cells or both?", "answer": "Plant cells"} 21 | -------------------------------------------------------------------------------- /src/inspect_ai/dataset/_sources/example.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Literal 3 | 4 | from .._dataset import Dataset, FieldSpec, MemoryDataset, RecordToSample 5 | from .csv import csv_dataset 6 | from .json import json_dataset 7 | 8 | EXAMPLES_PATH = Path(__file__).parent.parent / "_examples" 9 | 10 | 11 | def example_dataset( 12 | name: Literal["security_guide", "theory_of_mind", 
"popularity", "biology_qa"], 13 | sample_fields: FieldSpec | RecordToSample | None = None, 14 | ) -> Dataset: 15 | """Read a dataset from inspect_ai package examples. 16 | 17 | This is primarily used for sharing runnable example 18 | snippets that don't need to read an external dataset. 19 | 20 | Args: 21 | name (Literal["security_guide", "theory_of_mind", "popularity", "biology_qa"]): 22 | Example dataset name. One of 'security_guide', 'theory_of_mind', 23 | 'popularity', or 'biology_qa' 24 | sample_fields (SampleFieldSpec | RecordToSample): Method of mapping underlying 25 | fields in the data source to `Sample` objects. Pass `None` if the data is already 26 | stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a 27 | `SampleFieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to 28 | handle mapping with a custom function. 29 | 30 | 31 | Returns: 32 | Dataset read from example file. 33 | """ 34 | json_file = (EXAMPLES_PATH / f"{name}.jsonl").as_posix() 35 | csv_file = (EXAMPLES_PATH / f"{name}.csv").as_posix() 36 | if not Path(json_file).exists() and Path(csv_file).exists(): 37 | raise ValueError(f"Sample dataset {name} not found.") 38 | 39 | if Path(json_file).exists(): 40 | dataset = json_dataset( 41 | json_file=json_file, 42 | sample_fields=sample_fields, 43 | ) 44 | else: 45 | dataset = csv_dataset( 46 | csv_file=csv_file, 47 | sample_fields=sample_fields, 48 | ) 49 | 50 | return MemoryDataset(samples=list(dataset), name=name, location=f"example://{name}") 51 | -------------------------------------------------------------------------------- /tools/vscode/README.md: -------------------------------------------------------------------------------- 1 | # inspect-vscode 2 | 3 | VS Code extension for the Inspect framework for large language model evaluations. This extension provides support for developing evaluations using Inspect, including: 4 | 5 | - Integrated viewer for evaluation log files 6 | - Panel to browse, run, and debug tasks in the workspace 7 | - Panel for editing Inspect `.env` file 8 | - Panel for configuring task CLI options and args 9 | - Commands and key-bindings for running tasks 10 | - Commands and key-bindings for debugging tasks 11 | 12 | ## Log Viewer 13 | 14 | The `inspect view` command is used to automatically display the log for tasks executed within the workspace (this behavior can be controlled with an option). 15 | 16 | ## Task Navigation 17 | 18 | The Tasks panel displays a listing of all the Inspect tasks within your workspace. Selecting the source file or task within the listing will open the task source code in the source editor (or Notebook viewer). You can display a tree of tasks including folders and hierarchy or a flat list of tasks sorted alphabetically. 19 | 20 | ## Configuration Panel 21 | 22 | Use the Configuration (.env) panel to edit common settings in your `.env.` file including the model provider and name, and the log directory and level. 23 | 24 | ## Task Panel 25 | 26 | Use the Task panel to edit CLI options for a task, set task args, and run or debug a task. Values will be saved for each task and used whenever the task is run or debugged from within the Inspect VS Code extension. 27 | 28 | ## Running and Debugging 29 | 30 | The Inspect VS Code extension includes commands and keyboard shortcuts for running or debugging tasks. After the task has been completed, `inspect view` is used behind the scenes to provide a results pane within VS Code alongside your source code. 
37 | 38 | Use the run or debug commands to execute the current task. You can alternatively use the Ctrl+Shift+U keyboard shortcut to run a task, or the Ctrl+Shift+T keyboard shortcut to debug a task. 39 | 40 | > Note that on the Mac you should use `Cmd` rather than `Ctrl` as the prefix for all Inspect keyboard shortcuts. 41 | 42 | 43 | -------------------------------------------------------------------------------- /examples/agents/langchain/README.md: -------------------------------------------------------------------------------- 1 | ## LangChain Agent 2 | 3 | This example demonstrates creating a custom solver that utilises a LangChain agent to perform Q and A using Wikipedia. The example includes the following source files: 4 | 5 | | File                   | Description                                                                                       | 6 | |------------------------|-------------------------------------------------------------------------------------------------| 7 | | `.gitignore`           | Ignore the `.venv` directory and the `.env` file containing environment variables for the eval.  | 8 | | `.env.example`         | Prototype of `.env` file (copy this to `.env` and provide your `TAVILY_API_KEY`).                 | 9 | | `inspect_langchain.py` | Utilities for creating inspect solvers that use LangChain agents.                                 | 10 | | `wikipedia.py`         | Evaluation task and custom solver that uses the search agent.                                     | 11 | | `wikipedia.jsonl`      | Dataset with questions and ideal answers.                                                         | 12 | 13 | To run this example, first be sure to provide a `.env` file that defines a `TAVILY_API_KEY` ([Tavily](https://tavily.com/) is a search API for LLM agents). Note that `.env` files should always be included in `.gitignore` as they often contain secrets! 14 | 15 | Next, create a virtual environment and install the required dependencies: 16 | 17 | ``` bash 18 | $ python3 -m venv .venv 19 | $ source .venv/bin/activate 20 | $ pip install -r requirements.txt 21 | ``` 22 | 23 | Now you should be able to run the example as follows: 24 | 25 | ``` bash 26 | $ inspect eval wikipedia.py --model openai/gpt-4 27 | ``` 28 | 29 | This example will run with any model provider that supports tool use (so Anthropic, Google Gemini, and Mistral will all work as well). 30 | 31 | If you want to run in verbose mode (to see the agent's queries printed out), pass the `verbose` task parameter: 32 | 33 | ``` bash 34 | $ inspect eval wikipedia.py --model openai/gpt-4 -T verbose=true --limit 1 35 | ``` 36 | 37 | Note that we specify `--limit 1` so that the verbose output from multiple samples is not intermixed. -------------------------------------------------------------------------------- /src/inspect_ai/_util/retry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable 3 | 4 | from httpx import ConnectError, ConnectTimeout, HTTPStatusError, ReadTimeout 5 | from tenacity import RetryCallState 6 | 7 | from inspect_ai._util.constants import HTTP 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def httpx_should_retry(ex: BaseException) -> bool: 13 |     """Check whether an exception raised from httpx should be retried.
14 | 15 | Implements the strategy described here: https://cloud.google.com/storage/docs/retry-strategy 16 | 17 | Args: 18 | ex (BaseException): Exception to examine for retry behavior 19 | 20 | Returns: 21 | True if a retry should occur 22 | """ 23 | # httpx status exception 24 | if isinstance(ex, HTTPStatusError): 25 | # request timeout 26 | if ex.response.status_code == 408: 27 | return True 28 | # lock timeout 29 | elif ex.response.status_code == 409: 30 | return True 31 | # rate limit 32 | elif ex.response.status_code == 429: 33 | return True 34 | # internal errors 35 | elif ex.response.status_code >= 500: 36 | return True 37 | else: 38 | return False 39 | 40 | # connection error 41 | elif is_httpx_connection_error(ex): 42 | return True 43 | 44 | # don't retry 45 | else: 46 | return False 47 | 48 | 49 | def log_rate_limit_retry(context: str, retry_state: RetryCallState) -> None: 50 | logger.log( 51 | HTTP, 52 | f"{context} rate limit retry {retry_state.attempt_number} after waiting for {retry_state.idle_for}", 53 | ) 54 | 55 | 56 | def log_retry_attempt(context: str) -> Callable[[RetryCallState], None]: 57 | def log_attempt(retry_state: RetryCallState) -> None: 58 | logger.log( 59 | HTTP, 60 | f"{context} connection retry {retry_state.attempt_number} after waiting for {retry_state.idle_for}", 61 | ) 62 | 63 | return log_attempt 64 | 65 | 66 | def is_httpx_connection_error(ex: BaseException) -> bool: 67 | return isinstance(ex, ConnectTimeout | ConnectError | ConnectionError | ReadTimeout) 68 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/usage/UsageCard.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | import { icons } from "../Constants.mjs"; 4 | import { formatTime } from "../utils/Format.mjs"; 5 | import { Card, CardHeader, CardBody } from "../components/Card.mjs"; 6 | import { MetaDataView } from "../components/MetaDataView.mjs"; 7 | import { ModelTokenTable } from "./ModelTokenTable.mjs" 8 | 9 | const kUsageCardBodyId = "usage-card-body"; 10 | 11 | export const UsageCard = ({ stats, context }) => { 12 | if (!stats) { 13 | return ""; 14 | } 15 | 16 | const totalDuration = duration(stats); 17 | 18 | const usageMetadataStyle = { 19 | fontSize: "0.8em", 20 | }; 21 | 22 | return html` 23 | 24 | <${Card}> 25 | <${CardHeader} icon=${icons.usage} label="Usage"/> 26 | <${CardBody} id=${kUsageCardBodyId} style=${{ 27 | paddingTop: "0", 28 | paddingBottom: "0", 29 | borderTop: "solid var(--bs-border-color) 1px", 30 | }}> 31 |
<div 32 |         style=${{ 33 |           paddingTop: "0.4rem", 34 |           paddingBottom: "0.4rem", 35 |           display: "grid", 36 |           gridTemplateColumns: "1fr 1fr", 37 |           columnGap: "1em", 38 |         }} 39 |       > 40 |       <div> 41 |         <div style=${{ fontWeight: "600" }}>Duration</div> 42 |         <${MetaDataView} 43 |           entries="${{ 44 |             ["Start"]: new Date(stats.started_at).toLocaleString(), 45 |             ["End"]: new Date(stats.completed_at).toLocaleString(), 46 |             ["Duration"]: totalDuration, 47 |           }}" 48 |           tableOptions="borderless,sm" 49 |           context=${context} 50 |           style=${usageMetadataStyle} 51 |         /> 52 |       </div> 53 |       <div> 54 |         <${ModelTokenTable} model_usage=${stats.model_usage}/> 55 |       </div> 56 |       </div> 57 |     <//> 58 |   <//> 59 |   `; 60 | }; 61 | 62 | const duration = (stats) => { 63 |   const start = new Date(stats.started_at); 64 |   const end = new Date(stats.completed_at); 65 |   const durationMs = end.getTime() - start.getTime(); 66 |   const durationSec = durationMs / 1000; 67 |   return formatTime(durationSec); 68 | }; 69 | 70 | -------------------------------------------------------------------------------- /tools/vscode/src/core/process.ts: -------------------------------------------------------------------------------- 1 | import { SpawnSyncOptionsWithStringEncoding, spawn, spawnSync } from "child_process"; 2 | import { AbsolutePath } from "./path"; 3 | 4 | 5 | export function runProcess( 6 |   cmd: string | AbsolutePath, 7 |   args: string[], 8 |   cwd?: AbsolutePath 9 | ) { 10 | 11 |   // Process options 12 |   const options: SpawnSyncOptionsWithStringEncoding = { 13 |     cwd: cwd?.path, 14 |     encoding: "utf-8", 15 |     windowsHide: true, 16 |     maxBuffer: 1000 * 1000 * 100 17 |   }; 18 | 19 |   cmd = typeof (cmd) === "string" ? cmd : cmd.path; 20 |   const result = spawnSync(cmd, args, options); 21 |   if (result.error) { 22 |     throw new Error( 23 |       `The process could not be started\n${result.error.message}` 24 |     ); 25 |   } else if (result.status === 0) { 26 |     return result.stdout; 27 |   } else { 28 |     throw new Error( 29 |       `Command failed with code ${result.status}: ${result.stderr}` 30 |     ); 31 |   } 32 | } 33 | 34 | 35 | export function spawnProcess( 36 |   cmd: string, 37 |   args: string[], 38 |   cwd: AbsolutePath, 39 |   io?: { 40 |     stdout?: (data: Buffer | string) => void; 41 |     stderr?: (data: Buffer | string) => void; 42 |   }, 43 |   lifecycle?: { 44 |     onError?: (error: Error) => void; 45 |     onClose?: (code: number) => void; 46 |   } 47 | ) { 48 |   // Process options 49 |   const options = { 50 |     cwd: cwd.path, 51 |     detached: true, 52 |   }; 53 | 54 |   // Start the actual process 55 |   const process = spawn(cmd, args, options); 56 | 57 |   // Capture stdout 58 |   if (process.stdout) { 59 |     if (io?.stdout) { 60 |       process.stdout.on("data", io.stdout); 61 |     } 62 |   } else { 63 |     throw new Error("Unexpectedly missing stdout from server"); 64 |   } 65 | 66 |   // Capture stderr 67 |   if (process.stderr) { 68 |     if (io?.stderr) { 69 |       process.stderr.on("data", io.stderr); 70 |     } 71 |   } else { 72 |     throw new Error("Unexpectedly missing stderr from server"); 73 |   } 74 | 75 |   // Note errors 76 |   if (lifecycle?.onError) { 77 |     process.on("error", lifecycle.onError); 78 |   } 79 | 80 |   if (lifecycle?.onClose) { 81 |     process.on("close", lifecycle?.onClose); 82 |   } 83 |   return process; 84 | } --------------------------------------------------------------------------------