├── src └── inspect_ai │ ├── py.typed │ ├── _view │ ├── www │ │ ├── .gitignore │ │ ├── preact │ │ │ ├── htm │ │ │ │ ├── preact.mjs │ │ │ │ ├── preact.js │ │ │ │ └── htm.mjs │ │ │ └── preact-hooks.mjs │ │ ├── src │ │ │ ├── utils │ │ │ │ ├── Type.mjs │ │ │ │ ├── sleep.mjs │ │ │ │ ├── Git.mjs │ │ │ │ ├── Path.mjs │ │ │ │ └── events.mjs │ │ │ ├── Register.mjs │ │ │ ├── api │ │ │ │ ├── index.mjs │ │ │ │ ├── api-vscode.mjs │ │ │ │ └── api-browser.mjs │ │ │ ├── log-reader │ │ │ │ ├── Native-Log-Reader.mjs │ │ │ │ └── Log-Reader.mjs │ │ │ ├── components │ │ │ │ ├── ToolButton.mjs │ │ │ │ ├── EmptyPanel.mjs │ │ │ │ ├── CopyButton.mjs │ │ │ │ ├── LabeledValue.mjs │ │ │ │ ├── LoadingScreen.mjs │ │ │ │ ├── AppErrorBoundary.mjs │ │ │ │ ├── AnsiDisplay.css │ │ │ │ ├── MorePopOver.mjs │ │ │ │ ├── ErrorPanel.mjs │ │ │ │ ├── MarkdownDiv.mjs │ │ │ │ ├── Dialog.mjs │ │ │ │ └── MessageContent.mjs │ │ │ ├── workspace │ │ │ │ └── TaskErrorPanel.mjs │ │ │ ├── samples │ │ │ │ ├── tools │ │ │ │ │ └── EpochFilter.mjs │ │ │ │ ├── SamplesTools.mjs │ │ │ │ └── SampleDialog.mjs │ │ │ └── usage │ │ │ │ ├── ModelTokenTable.mjs │ │ │ │ └── UsageCard.mjs │ │ ├── libs │ │ │ ├── bootstrap │ │ │ │ └── css │ │ │ │ │ └── fonts │ │ │ │ │ ├── bootstrap-icons.woff │ │ │ │ │ └── bootstrap-icons.woff2 │ │ │ └── prism │ │ │ │ └── prism.min.css │ │ └── favicon.svg │ └── schema.py │ ├── __main__.py │ ├── _eval │ └── task │ │ ├── constants.py │ │ └── util.py │ ├── _util │ ├── dev.py │ ├── pattern.py │ ├── datetime.py │ ├── _async.py │ ├── samples.py │ ├── docstring.py │ ├── appdirs.py │ ├── text.py │ ├── constants.py │ ├── version.py │ ├── url.py │ ├── error.py │ ├── git.py │ ├── images.py │ ├── json.py │ ├── platform.py │ └── retry.py │ ├── _display │ ├── __init__.py │ └── _display.py │ ├── scorer │ ├── _metrics │ │ ├── __init__.py │ │ ├── mean.py │ │ ├── accuracy.py │ │ └── std.py │ ├── __init__.py │ ├── _multi.py │ └── _match.py │ ├── util │ ├── __init__.py │ └── _context │ │ ├── __init__.py │ │ └── logger.py │ ├── solver │ ├── _util.py │ ├── __init__.py │ └── _tool │ │ └── use_tools.py │ ├── _cli │ ├── util.py │ ├── view.py │ ├── main.py │ └── common.py │ ├── dataset │ ├── __init__.py │ ├── _examples │ │ └── biology_qa.jsonl │ └── _sources │ │ └── example.py │ ├── model │ ├── _providers │ │ ├── ollama.py │ │ └── util.py │ ├── __init__.py │ └── _tool.py │ ├── __init__.py │ └── log │ └── __init__.py ├── tests ├── test_helpers │ └── __init__.py ├── test_task_list │ ├── __init__.py │ ├── multiple_dir │ │ ├── bar.py │ │ ├── foo.py │ │ ├── _decoy2.py │ │ └── _decoy │ │ │ └── testit.py │ ├── recurse │ │ ├── folder1 │ │ │ ├── _decoy.py │ │ │ └── theta.py │ │ ├── folder2 │ │ │ ├── another.py │ │ │ ├── first.py │ │ │ └── .folder3 │ │ │ │ └── epsilon.py │ │ └── .folder3 │ │ │ └── epsilon.py │ ├── multiple.py │ └── attribs.ipynb ├── test_package │ ├── inspect_package │ │ ├── py.typed │ │ ├── __init__.py │ │ ├── inspect_ai.py │ │ └── modelapi │ │ │ └── custom.py │ ├── .gitignore │ └── pyproject.toml ├── test_eval_log │ ├── log_invalid.txt │ ├── log_version_2.txt │ └── log_with_nan.txt ├── test_dataset │ ├── samples.csv │ ├── samples.jsonl │ └── samples.json ├── conftest.py ├── test_examples.py ├── test_cloudflare.py ├── test_model_package.py ├── test_anthropic.py ├── test_registry.py ├── test_openai.py ├── test_num_choices.py ├── test_list_task.py ├── test_images.py ├── test_retry.py ├── test_hf.py ├── test_plan.py ├── test_stop_reason.py ├── scorer │ └── test_answer.py ├── test_subprocess.py ├── test_collapse_user_message.py ├── 
test_collapse_assistant_message.py ├── test_logprobs.py └── test_eval_log.py ├── docs ├── .gitignore ├── _variables.yml ├── images │ ├── inspect.png │ ├── aisi-logo.png │ ├── eval-log.png │ ├── popularity.png │ ├── rate-limit.png │ ├── running-theory.png │ ├── inspect-view-home.png │ ├── inspect-view-info.png │ ├── inspect-view-main.png │ ├── inspect-view-sort.png │ ├── inspect-view-answers.png │ ├── inspect-view-filter.png │ ├── inspect-view-history.png │ ├── inspect-view-logging.png │ ├── inspect-view-scoring.png │ ├── inspect-view-splash.png │ ├── inspect-view-messages.png │ ├── inspect-view-metadata.png │ ├── inspect-vscode-config.png │ ├── inspect-vscode-install.png │ ├── inspect-vscode-logview.png │ ├── inspect-vscode-run-task.png │ ├── inspect-view-logging-console.png │ └── inspect-vscode-output-channel.png ├── _format │ └── pre-render.sh ├── _examples │ └── footer.qmd └── theme.scss ├── tools └── vscode │ ├── .yarnrc │ ├── tools │ └── ts-to-mjs │ │ ├── .gitignore │ │ ├── src │ │ └── index.ts │ │ ├── rollup.config.js │ │ └── package.json │ ├── src │ ├── providers │ │ ├── activity-bar │ │ │ ├── webview │ │ │ │ ├── task-config-webview.css │ │ │ │ └── env-config-webview.css │ │ │ └── task-config-commands.ts │ │ ├── inspect │ │ │ ├── inspect-constants.ts │ │ │ └── inspect-eval-commands.ts │ │ ├── logview │ │ │ ├── logview.ts │ │ │ ├── commands.ts │ │ │ ├── logview-manager.ts │ │ │ └── logview-link-provider.ts │ │ ├── workspace │ │ │ ├── workspace-init.ts │ │ │ └── workspace-env-commands.ts │ │ ├── active-task │ │ │ └── active-task-command.ts │ │ └── settings │ │ │ ├── user-settings.ts │ │ │ └── inspect-settings.ts │ ├── core │ │ ├── text.ts │ │ ├── python │ │ │ ├── index.ts │ │ │ └── code.ts │ │ ├── wait.ts │ │ ├── log.ts │ │ ├── string.ts │ │ ├── nonce.ts │ │ ├── workspace.ts │ │ ├── dispose.ts │ │ ├── command.ts │ │ ├── random.ts │ │ ├── path.ts │ │ ├── git.ts │ │ └── process.ts │ ├── inspect │ │ ├── index.ts │ │ ├── version.ts │ │ ├── list.ts │ │ └── logs.ts │ ├── test │ │ └── extension.test.ts │ └── components │ │ ├── error.ts │ │ ├── focus.ts │ │ ├── symbol.ts │ │ ├── templates.ts │ │ └── document.ts │ ├── .gitignore │ ├── assets │ ├── logo │ │ ├── inspect.png │ │ └── inspect.svg │ ├── www │ │ ├── codicon │ │ │ └── codicon.ttf │ │ └── view │ │ │ └── view-overrides.css │ └── templates │ │ └── task.py.template │ ├── .vscode-test.mjs │ ├── .vscodeignore │ ├── .vscode │ ├── extensions.json │ ├── launch.json │ ├── settings.json │ └── tasks.json │ ├── tsconfig.json │ ├── LICENSE │ ├── .eslintrc.json │ ├── CHANGELOG.md │ └── README.md ├── examples ├── agents │ └── langchain │ │ ├── .gitignore │ │ ├── .env.example │ │ ├── requirements.txt │ │ ├── wikipedia.jsonl │ │ └── README.md ├── theory_of_mind.py ├── biology_qa.py ├── security_guide.py └── popularity.py ├── .gitattributes ├── .vscode ├── extensions.json └── settings.json ├── benchmarks ├── datasets │ ├── mmlu.csv │ └── math_test.csv ├── hellaswag.py ├── boolq.py ├── piqa.py ├── arc.py └── gpqa.py ├── .github ├── dependabot.yml ├── pull_request_template.md └── workflows │ ├── docs.yml │ ├── vscode.yml │ ├── pypi.yml │ └── build.yml ├── requirements.txt ├── .pre-commit-config.yaml ├── LICENSE └── README.md /src/inspect_ai/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_helpers/__init__.py: -------------------------------------------------------------------------------- 1 | 
-------------------------------------------------------------------------------- /tests/test_task_list/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/.gitignore: -------------------------------------------------------------------------------- 1 | /.quarto/ 2 | /_book/ 3 | -------------------------------------------------------------------------------- /tests/test_package/inspect_package/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/vscode/.yarnrc: -------------------------------------------------------------------------------- 1 | --ignore-engines true -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/.gitignore: -------------------------------------------------------------------------------- 1 | node_modules/ -------------------------------------------------------------------------------- /tests/test_package/inspect_package/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/.gitignore: -------------------------------------------------------------------------------- 1 | .yarn/ -------------------------------------------------------------------------------- /examples/agents/langchain/.gitignore: -------------------------------------------------------------------------------- 1 | .env 2 | .venv/ 3 | -------------------------------------------------------------------------------- /tests/test_eval_log/log_invalid.txt: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, "status": -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | benchmarks/datasets/** filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/src/index.ts: -------------------------------------------------------------------------------- 1 | 2 | export * from './jsonrpc'; 3 | -------------------------------------------------------------------------------- /examples/agents/langchain/.env.example: -------------------------------------------------------------------------------- 1 | TAVILY_API_KEY=your-tavily-api-key 2 | 3 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/activity-bar/webview/task-config-webview.css: -------------------------------------------------------------------------------- 1 | 2 | 3 | -------------------------------------------------------------------------------- /tools/vscode/.gitignore: -------------------------------------------------------------------------------- 1 | out 2 | dist 3 | node_modules 4 | .vscode-test/ 5 | *.vsix 6 | -------------------------------------------------------------------------------- /docs/_variables.yml: -------------------------------------------------------------------------------- 1 | 2 | examples-url: https://UKGovernmentBEIS.github.io/inspect_ai/examples.html 3 | -------------------------------------------------------------------------------- 
/docs/images/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect.png -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/htm/preact.mjs: -------------------------------------------------------------------------------- 1 | 2 | import "./htm.mjs"; 3 | export * from "./preact.js" -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/preact-hooks.mjs: -------------------------------------------------------------------------------- 1 | import "./preact.mjs"; 2 | export * from "./hooks.js"; -------------------------------------------------------------------------------- /docs/images/aisi-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/aisi-logo.png -------------------------------------------------------------------------------- /docs/images/eval-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/eval-log.png -------------------------------------------------------------------------------- /docs/images/popularity.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/popularity.png -------------------------------------------------------------------------------- /docs/images/rate-limit.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/rate-limit.png -------------------------------------------------------------------------------- /src/inspect_ai/__main__.py: -------------------------------------------------------------------------------- 1 | from ._cli.main import main 2 | 3 | if __name__ == "__main__": 4 | main() 5 | -------------------------------------------------------------------------------- /docs/images/running-theory.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/running-theory.png -------------------------------------------------------------------------------- /src/inspect_ai/_eval/task/constants.py: -------------------------------------------------------------------------------- 1 | TASK_FILE_ATTR = "__task_file__" 2 | TASK_RUN_DIR_ATTR = "__task_run_dir__" 3 | -------------------------------------------------------------------------------- /tests/test_dataset/samples.csv: -------------------------------------------------------------------------------- 1 | input,target,label,extra 2 | "Say 'Hello, World'","Hello, World","Hello, World","data" -------------------------------------------------------------------------------- /docs/images/inspect-view-home.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-home.png -------------------------------------------------------------------------------- /docs/images/inspect-view-info.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-info.png -------------------------------------------------------------------------------- /docs/images/inspect-view-main.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-main.png -------------------------------------------------------------------------------- /docs/images/inspect-view-sort.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-sort.png -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | sys.path.append(os.path.join(os.path.dirname(__file__), "helpers")) 5 | -------------------------------------------------------------------------------- /tools/vscode/src/core/text.ts: -------------------------------------------------------------------------------- 1 | export function lines(text: string): string[] { 2 | return text.split(/\r?\n/); 3 | } 4 | -------------------------------------------------------------------------------- /docs/images/inspect-view-answers.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-answers.png -------------------------------------------------------------------------------- /docs/images/inspect-view-filter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-filter.png -------------------------------------------------------------------------------- /docs/images/inspect-view-history.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-history.png -------------------------------------------------------------------------------- /docs/images/inspect-view-logging.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-logging.png -------------------------------------------------------------------------------- /docs/images/inspect-view-scoring.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-scoring.png -------------------------------------------------------------------------------- /docs/images/inspect-view-splash.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-splash.png -------------------------------------------------------------------------------- /tools/vscode/assets/logo/inspect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/tools/vscode/assets/logo/inspect.png -------------------------------------------------------------------------------- /tools/vscode/src/core/python/index.ts: -------------------------------------------------------------------------------- 1 | 2 | 
export * from './code'; 3 | export * from './exec'; 4 | export * from './interpreter'; -------------------------------------------------------------------------------- /docs/images/inspect-view-messages.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-messages.png -------------------------------------------------------------------------------- /docs/images/inspect-view-metadata.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-metadata.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-config.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-config.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-install.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-install.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-logview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-logview.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-run-task.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-run-task.png -------------------------------------------------------------------------------- /tests/test_dataset/samples.jsonl: -------------------------------------------------------------------------------- 1 | { "input": "Say 'Hello, World'", "target": "Hello, World", "label": "Hello, World", "extra": "data" } 2 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/bar.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def foo(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/foo.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def foo(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/Type.mjs: -------------------------------------------------------------------------------- 1 | export const isNumeric = (n) => { 2 | return !isNaN(parseFloat(n)) && isFinite(n); 3 | }; 4 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/_decoy2.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def decoy(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tools/vscode/assets/www/codicon/codicon.ttf: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/tools/vscode/assets/www/codicon/codicon.ttf -------------------------------------------------------------------------------- /tools/vscode/src/core/wait.ts: -------------------------------------------------------------------------------- 1 | export function sleep(ms: number) { 2 | return new Promise((resolve) => setTimeout(resolve, ms)); 3 | } 4 | -------------------------------------------------------------------------------- /docs/images/inspect-view-logging-console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-view-logging-console.png -------------------------------------------------------------------------------- /docs/images/inspect-vscode-output-channel.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/docs/images/inspect-vscode-output-channel.png -------------------------------------------------------------------------------- /tests/test_task_list/multiple_dir/_decoy/testit.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def foo(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder1/_decoy.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def decoy(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder1/theta.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def theta(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder2/another.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def beta(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder2/first.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def alpha(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/dev.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | def is_dev_mode() -> bool: 5 | return os.environ.get("INSPECT_DEV_MODE", None) is not None 6 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/.folder3/epsilon.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def epsilon(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/sleep.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | export function sleep(ms) { 4 | return new 
Promise((resolve) => setTimeout(resolve, ms)); 5 | } 6 | -------------------------------------------------------------------------------- /tests/test_task_list/recurse/folder2/.folder3/epsilon.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def epsilon(): 6 | return Task([]) 7 | -------------------------------------------------------------------------------- /tools/vscode/src/core/log.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | import { window } from "vscode"; 4 | 5 | export const log = window.createOutputChannel("Inspect", { log: true }); 6 | -------------------------------------------------------------------------------- /.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | "recommendations": [ 3 | "ms-python.python", 4 | "charliermarsh.ruff", 5 | "ms-python.mypy-type-checker" 6 | ] 7 | } -------------------------------------------------------------------------------- /examples/agents/langchain/requirements.txt: -------------------------------------------------------------------------------- 1 | inspect_ai 2 | openai 3 | langchain==0.2.1 4 | langchainhub==0.1.16 5 | langchain-community==0.2.1 6 | wikipedia==1.4.0 7 | -------------------------------------------------------------------------------- /tools/vscode/.vscode-test.mjs: -------------------------------------------------------------------------------- 1 | import { defineConfig } from '@vscode/test-cli'; 2 | 3 | export default defineConfig({ 4 | files: 'out/test/**/*.test.js', 5 | }); 6 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/index.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | export { initInspectProps, inspectVersion } from './props'; 4 | 5 | export { inspectListTasks } from './list'; 6 | 7 | -------------------------------------------------------------------------------- /benchmarks/datasets/mmlu.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:15b6785d49e0012602e089558a7a0dfb916baf97e9295aa25b48062f13c6afbb 3 | size 6667575 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/Register.mjs: -------------------------------------------------------------------------------- 1 | // Register file readers 2 | import "./log-reader/Open-AI-Log-Reader.mjs"; 3 | import "./log-reader/Native-Log-Reader.mjs"; 4 | -------------------------------------------------------------------------------- /benchmarks/datasets/math_test.csv: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1835505d451a6f4b8bfdfe11b90bbd6676f382d2aa269acf8d3e4155947fe451 3 | size 1031861 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_display/__init__.py: -------------------------------------------------------------------------------- 1 | from ._display import Display 2 | from .rich import rich_display 3 | 4 | 5 | def display() -> Display: 6 | return rich_display() 7 | -------------------------------------------------------------------------------- /tests/test_package/.gitignore: 
-------------------------------------------------------------------------------- 1 | .venv 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | venv 6 | .eggs 7 | .pytest_cache 8 | *.egg-info 9 | .DS_Store 10 | dist 11 | build 12 | -------------------------------------------------------------------------------- /tests/test_dataset/samples.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "input": "Say 'Hello, World'", 4 | "target": "Hello, World", 5 | "label": "Hello, World", 6 | "extra": "data" 7 | } 8 | ] -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff2: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/canyon289/inspect_ai/main/src/inspect_ai/_view/www/libs/bootstrap/css/fonts/bootstrap-icons.woff2 -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/Git.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | export const ghCommitUrl = (origin, commit) => { 4 | const baseUrl = origin.replace(/\.git$/, ""); 5 | return `${baseUrl}/commit/${commit}`; 6 | } -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import accuracy 2 | from .mean import mean, var 3 | from .std import bootstrap_std 4 | 5 | __all__ = ["accuracy", "mean", "var", "bootstrap_std"] 6 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/pattern.py: -------------------------------------------------------------------------------- 1 | ANSWER_PATTERN_LETTER = r"(?i)ANSWER\s*:\s*([A-Za-z])(?:[^\w]|\n|$)" 2 | ANSWER_PATTERN_WORD = r"(?i)ANSWER\s*:\s*(\w+)(?:\n|$)" 3 | ANSWER_PATTERN_LINE = r"(?i)ANSWER\s*:\s*([^\n]+)" 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/api/index.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | import browserApi from './api-browser.mjs' 5 | import vscodeApi from './api-vscode.mjs' 6 | 7 | export default window.acquireVsCodeApi ? 
vscodeApi : browserApi 8 | -------------------------------------------------------------------------------- /tests/test_task_list/multiple.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | 3 | 4 | @task 5 | def first(): 6 | return Task([]) 7 | 8 | 9 | @task(name="second_task") 10 | def second(): 11 | return Task([]) 12 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: pip 4 | directory: "/" 5 | schedule: 6 | interval: daily 7 | time: "13:00" 8 | groups: 9 | python-packages: 10 | patterns: 11 | - "*" 12 | -------------------------------------------------------------------------------- /tests/test_examples.py: -------------------------------------------------------------------------------- 1 | from test_helpers.utils import run_example, skip_if_no_openai 2 | 3 | 4 | @skip_if_no_openai 5 | def test_examples(): 6 | run_example("security_guide.py", "openai/gpt-4") 7 | run_example("popularity.py", "openai/gpt-4") 8 | -------------------------------------------------------------------------------- /tools/vscode/src/core/string.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // quotes values which have a space 5 | export function shQuote(value: string): string { 6 | if (/\s/g.test(value)) { 7 | return `"${value}"`; 8 | } else { 9 | return value; 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /tools/vscode/.vscodeignore: -------------------------------------------------------------------------------- 1 | .vscode/** 2 | .vscode-test/** 3 | node_modules/** 4 | src/** 5 | .gitignore 6 | .yarnrc 7 | webpack.config.js 8 | vsc-extension-quickstart.md 9 | **/tsconfig.json 10 | **/.eslintrc.json 11 | **/*.map 12 | **/*.ts 13 | **/.vscode-test.* 14 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/extensions.json: -------------------------------------------------------------------------------- 1 | { 2 | // See http://go.microsoft.com/fwlink/?LinkId=827846 3 | // for the documentation about the extensions.json format 4 | "recommendations": ["dbaeumer.vscode-eslint", "amodio.tsl-problem-matcher", "ms-vscode.extension-test-runner"] 5 | } 6 | -------------------------------------------------------------------------------- /tools/vscode/src/test/extension.test.ts: -------------------------------------------------------------------------------- 1 | import * as assert from 'assert'; 2 | 3 | suite('Extension Test Suite', () => { 4 | 5 | test('Sample test', () => { 6 | assert.strictEqual(-1, [1, 2, 3].indexOf(5)); 7 | assert.strictEqual(-1, [1, 2, 3].indexOf(0)); 8 | }); 9 | }); 10 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/htm/preact.js: -------------------------------------------------------------------------------- 1 | /* esm.sh - esbuild bundle(htm@3.1.1/preact) es2022 production */ 2 | import { h as r } from "preact"; import { h as d, render as f, Component as h } from "preact"; import o from "./htm.mjs"; var p = o.bind(r); export { h as Component, d as h, p as html, f as render }; 3 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/Path.mjs: 
-------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | export const filename = (path) => { 5 | const pathparts = path.split("/"); 6 | const basename = pathparts.slice(-1)[0]; 7 | const match = basename.match(/(.*)\.\S+$/); 8 | if (match) { 9 | return match[1]; 10 | } else { 11 | return path; 12 | } 13 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/datetime.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from typing import Literal 3 | 4 | 5 | def iso_now( 6 | timespec: Literal[ 7 | "auto", "hours", "minutes", "seconds", "milliseconds", "microseconds" 8 | ] = "seconds", 9 | ) -> str: 10 | return datetime.now().isoformat(timespec=timespec) 11 | -------------------------------------------------------------------------------- /src/inspect_ai/util/__init__.py: -------------------------------------------------------------------------------- 1 | from ._context.concurrency import concurrency 2 | from ._context.resource import resource 3 | from ._context.subprocess import ( 4 | ProcessResult, 5 | subprocess, 6 | ) 7 | 8 | __all__ = [ 9 | "ProcessResult", 10 | "concurrency", 11 | "resource", 12 | "subprocess", 13 | ] 14 | -------------------------------------------------------------------------------- /tools/vscode/src/components/error.ts: -------------------------------------------------------------------------------- 1 | import { 2 | window, 3 | } from "vscode"; 4 | 5 | 6 | 7 | export async function showError(msg: string, error?: Error) { 8 | const message = [msg]; 9 | if (error) { 10 | message.push(error.message); 11 | } 12 | await window.showErrorMessage(message.join("\n"), "Ok"); 13 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/_async.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any 3 | 4 | 5 | def is_callable_coroutine(func_or_cls: Any) -> bool: 6 | if asyncio.iscoroutinefunction(func_or_cls): 7 | return True 8 | elif callable(func_or_cls): 9 | return asyncio.iscoroutinefunction(func_or_cls.__call__) 10 | return False 11 | -------------------------------------------------------------------------------- /tools/vscode/src/core/nonce.ts: -------------------------------------------------------------------------------- 1 | 2 | export function getNonce() { 3 | let text = ""; 4 | const possible = 5 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"; 6 | for (let i = 0; i < 64; i++) { 7 | text += possible.charAt(Math.floor(Math.random() * possible.length)); 8 | } 9 | return text; 10 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | click 2 | debugpy 3 | fsspec 4 | httpx 5 | numpy 6 | platformdirs 7 | python-dotenv 8 | jsonlines 9 | json-stream 10 | nest_asyncio 11 | pydantic>=2 12 | s3fs>=2023 13 | semver 14 | shortuuid 15 | tenacity 16 | beautifulsoup4 17 | docstring-parser 18 | typing_extensions 19 | pyyaml 20 | rich 21 | psutil 22 | 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/inspect_ai/util/_context/__init__.py: -------------------------------------------------------------------------------- 1 | from .concurrency import init_concurrency 2 | from .logger import
init_logger_records 3 | from .subprocess import init_subprocess 4 | 5 | 6 | def init_async_context(max_subprocesses: int | None = None) -> None: 7 | init_concurrency() 8 | init_subprocess(max_subprocesses) 9 | init_logger_records() 10 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/log-reader/Native-Log-Reader.mjs: -------------------------------------------------------------------------------- 1 | import { register } from "./Log-Reader.mjs"; 2 | 3 | export const rawFileReader = { 4 | name: "RawFileReader", 5 | canRead: (_filename) => { 6 | return true; 7 | }, 8 | read: (contents) => { 9 | return JSON.parse(contents); 10 | }, 11 | }; 12 | 13 | register(rawFileReader); 14 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/samples.py: -------------------------------------------------------------------------------- 1 | def parse_samples_limit(limit: str | None) -> int | tuple[int, int] | None: 2 | if limit is not None: 3 | if "," not in limit: 4 | return int(limit) 5 | else: 6 | limit_split = [int(r) for r in limit.split(",")] 7 | return (limit_split[0] - 1, limit_split[1]) 8 | else: 9 | return None 10 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/version.ts: -------------------------------------------------------------------------------- 1 | import { inspectVersion } from "./props"; 2 | 3 | 4 | export const withMinimumInspectVersion = (version: string, hasVersion: () => void, doesntHaveVersion: () => void) => { 5 | const activeVersion = inspectVersion(); 6 | if (activeVersion && activeVersion.compare(version) >= 0) { 7 | hasVersion(); 8 | } else { 9 | doesntHaveVersion(); 10 | } 11 | }; -------------------------------------------------------------------------------- /src/inspect_ai/_util/docstring.py: -------------------------------------------------------------------------------- 1 | from docstring_parser import Docstring, parse 2 | 3 | 4 | def parse_docstring( 5 | docstring: str | None, 6 | ) -> Docstring: 7 | if docstring is None: 8 | return Docstring() 9 | parsed_docstring = parse(docstring) 10 | if parsed_docstring.short_description is None: 11 | raise ValueError("Docstring must have a short description") 12 | return parsed_docstring 13 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/appdirs.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | from platformdirs import user_runtime_dir 4 | 5 | from inspect_ai._util.constants import PKG_NAME 6 | 7 | 8 | def inspect_runtime_dir(subdir: str | None) -> Path: 9 | runtime_dir = Path(user_runtime_dir(PKG_NAME)) 10 | if subdir: 11 | runtime_dir = runtime_dir / subdir 12 | runtime_dir.mkdir(parents=True, exist_ok=True) 13 | return runtime_dir 14 | -------------------------------------------------------------------------------- /tools/vscode/src/core/workspace.ts: -------------------------------------------------------------------------------- 1 | import { workspace, WorkspaceFolder } from "vscode"; 2 | 3 | export function activeWorkspaceFolder(): WorkspaceFolder { 4 | const workspaceFolder = workspace.workspaceFolders![0]; 5 | return workspaceFolder; 6 | } 7 | 8 | 9 | export function checkActiveWorkspaceFolder(): WorkspaceFolder | undefined { 10 | const workspaceFolder = workspace.workspaceFolders?.[0]; 11 | return workspaceFolder; 12 | } 13 | 
-------------------------------------------------------------------------------- /docs/_format/pre-render.sh: -------------------------------------------------------------------------------- 1 | 2 | #!/usr/bin/env bash 3 | 4 | if [ -n "${QUARTO_PROJECT_RENDER_ALL}" ]; then 5 | cd _examples 6 | cp index.qmd ../examples.qmd 7 | (echo; echo) >> ../examples.qmd 8 | for f in security_guide.qmd hellaswag.qmd theory_of_mind.qmd mathematics.qmd biology_qa.qmd arc.qmd tool_use.qmd gsm8k.qmd footer.qmd; do (cat "${f}"; echo; echo; echo) >> ../examples.qmd; done 9 | cd .. 10 | fi 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /tests/test_cloudflare.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_cloudflare 3 | 4 | from inspect_ai.model import get_model 5 | 6 | 7 | @pytest.mark.asyncio 8 | @skip_if_no_cloudflare 9 | async def test_cloudflare_api() -> None: 10 | model = get_model("cf/meta/llama-2-7b-chat-fp16") 11 | message = "This is a test string. What are you?" 12 | response = await model.generate(input=message) 13 | assert len(response.completion) >= 1 14 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## This PR contains: 2 | - [ ] New features 3 | - [ ] Changes to dev-tools e.g. CI config / github tooling 4 | - [ ] Docs 5 | - [ ] Bug fixes 6 | - [ ] Code refactor 7 | 8 | ### What is the current behavior? (You can also link to an open issue here) 9 | 10 | ### What is the new behavior? 11 | 12 | ### Does this PR introduce a breaking change? (What changes might users need to make in their application due to this PR?) 13 | 14 | ### Other information: 15 | -------------------------------------------------------------------------------- /tests/test_package/inspect_package/inspect_ai.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import modelapi 2 | 3 | 4 | @modelapi(name="custom") 5 | def custom(): 6 | # delayed import allows us to only resolve the imports in 7 | # .modelapi.custom when the modelapi is referenced (helpful 8 | # if the modelapi provider has dependencies we don't want to 9 | # require unless the provider is actually used) 10 | from .modelapi.custom import CustomModelAPI 11 | 12 | return CustomModelAPI 13 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/text.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | 4 | 5 | def strip_punctuation(s: str) -> str: 6 | return s.strip(string.whitespace + string.punctuation) 7 | 8 | 9 | def strip_numeric_punctuation(s: str) -> str: 10 | # strip $, €, £, and , 11 | stripped = re.sub(r"[$,£,€]", "", s) 12 | # strip .
if it's followed by a space, the end of the string, 13 | # or a non-digit character 14 | stripped = re.sub(r"\.(?=\s|$|\D)", "", stripped) 15 | return stripped 16 | -------------------------------------------------------------------------------- /tools/vscode/src/components/focus.ts: -------------------------------------------------------------------------------- 1 | import { commands, window } from "vscode"; 2 | 3 | export function scheduleReturnFocus(command: string) { 4 | setTimeout(() => { 5 | void commands.executeCommand(command); 6 | }, 200); 7 | } 8 | 9 | export function scheduleFocusActiveEditor() { 10 | setTimeout(() => { 11 | const editor = window.activeTextEditor; 12 | if (editor) { 13 | void window.showTextDocument(editor.document, editor.viewColumn, false); 14 | } 15 | }, 200); 16 | } 17 | -------------------------------------------------------------------------------- /tools/vscode/assets/templates/task.py.template: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import Sample 3 | from inspect_ai.scorer import match 4 | from inspect_ai.solver import generate 5 | 6 | 7 | @task 8 | def {{}}(): 9 | 10 | # TODO: load dataset 11 | dataset = [Sample(input = "Say hello", target = "Hello.")] 12 | 13 | return Task( 14 | dataset=dataset, 15 | plan=[ 16 | generate(), 17 | ], 18 | scorer=match(), 19 | ) 20 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/inspect/inspect-constants.ts: -------------------------------------------------------------------------------- 1 | 2 | export const kInspectEnvValues = { 3 | providerModel: "INSPECT_EVAL_MODEL", 4 | logLevel: "INSPECT_LOG_LEVEL", 5 | logDir: "INSPECT_LOG_DIR", 6 | connections: "INSPECT_EVAL_MAX_CONNECTIONS", 7 | retries: "INSPECT_EVAL_MAX_RETRIES", 8 | timeout: "INSPECT_EVAL_TIMEOUT", 9 | modelBaseUrl: "INSPECT_MODE_BASE_URL", 10 | }; 11 | 12 | export const kLogLevelEnv = "INSPECT_EVAL_MODEL"; 13 | 14 | export const kInspectChangeEvalSignalVersion = "0.3.10"; 15 | -------------------------------------------------------------------------------- /src/inspect_ai/solver/_util.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import ChatMessage, ChatMessageSystem 2 | 3 | 4 | def append_system_message( 5 | messages: list[ChatMessage], message: ChatMessageSystem 6 | ) -> None: 7 | # find last index of any existing system message 8 | lastIndex = -1 9 | for i in list(reversed(range(0, len(messages)))): 10 | if isinstance(messages[i], ChatMessageSystem): 11 | lastIndex = i 12 | break 13 | 14 | # insert it 15 | messages.insert(lastIndex + 1, message) 16 | -------------------------------------------------------------------------------- /tests/test_task_list/attribs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "from inspect_ai import Task, task\n", 10 | "\n", 11 | "\n", 12 | "@task(light=True, type=\"bio\")\n", 13 | "def attribs():\n", 14 | " return Task([])\n" 15 | ] 16 | } 17 | ], 18 | "metadata": { 19 | "language_info": { 20 | "name": "python" 21 | } 22 | }, 23 | "nbformat": 4, 24 | "nbformat_minor": 2 25 | } 26 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/constants.py: 
-------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | PKG_AUTHOR = "UK AI Safety Institute" 4 | PKG_AUTHOR_DIR = "UK-AISI" 5 | PKG_NAME = Path(__file__).parent.parent.stem 6 | PKG_PATH = Path(__file__).parent.parent 7 | DEFAULT_EPOCHS = 1 8 | DEFAULT_MAX_RETRIES = 5 9 | DEFAULT_TIMEOUT = 120 10 | DEFAULT_MAX_CONNECTIONS = 10 11 | DEFAULT_MAX_TOKENS = 1024 12 | DEFAULT_VIEW_PORT = 7575 13 | DEFAULT_SERVER_HOST = "127.0.0.1" 14 | HTTP = 15 15 | HTTP_LOG_LEVEL = "HTTP" 16 | DEFAULT_LOG_LEVEL = "warning" 17 | SCORED_SUFFIX = "-scored" 18 | -------------------------------------------------------------------------------- /tests/test_package/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "inspect_package" 3 | version = "0.1" 4 | description = "Inspect Extension Package" 5 | requires-python = ">=3.10" 6 | authors = [{name = "UK AISI"}] 7 | license = {text = "MIT"} 8 | classifiers = [ 9 | "License :: OSI Approved :: MIT License", 10 | ] 11 | dependencies = [ 12 | "inspect_ai" 13 | ] 14 | 15 | [build-system] 16 | requires = ["setuptools"] 17 | build-backend = "setuptools.build_meta" 18 | 19 | [project.entry-points.inspect_ai] 20 | inspect_package = "inspect_package.inspect_ai" 21 | 22 | -------------------------------------------------------------------------------- /tools/vscode/src/components/symbol.ts: -------------------------------------------------------------------------------- 1 | import { DocumentSymbol, Range, SymbolKind, TextDocument } from "vscode"; 2 | 3 | export const symbolIsTask = (document: TextDocument, pred: DocumentSymbol) => { 4 | if (pred.kind === SymbolKind.Function) { 5 | const textRange = new Range(pred.range.start, pred.range.end); 6 | const textBeforeFunction = document.getText(textRange); 7 | 8 | // Check if the text contains the `@task` decorator 9 | if (textBeforeFunction && textBeforeFunction.startsWith('@task')) { 10 | return true; 11 | } 12 | } 13 | }; -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "editor.formatOnSave": true, 3 | "mypy-type-checker.importStrategy": "fromEnvironment", 4 | "[json]": { 5 | "editor.wordWrap": "on" 6 | }, 7 | "[markdown]": { 8 | "editor.formatOnSave": false 9 | }, 10 | "[quarto]": { 11 | "editor.formatOnSave": false 12 | }, 13 | "search.exclude": { 14 | "logs/**": true 15 | }, 16 | "python.testing.pytestArgs": [ 17 | "tests" 18 | ], 19 | "python.testing.unittestEnabled": false, 20 | "python.testing.pytestEnabled": true, 21 | "quarto.render.renderOnSave": true 22 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/version.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | import semver 4 | 5 | from .error import module_version_error 6 | 7 | 8 | def verify_required_version(feature: str, package: str, version: str) -> None: 9 | if not has_required_version(package, version): 10 | raise module_version_error(feature, package, version) 11 | 12 | 13 | def has_required_version(package: str, required_version: str) -> bool: 14 | if semver.Version.parse(version(package)).compare(required_version) >= 0: 15 | return True 16 | else: 17 | return False 18 | 
-------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/ToolButton.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { h } from "preact"; 3 | 4 | export const ToolButton = ({ name, classes, icon, onclick, ...rest }) => { 5 | // Create the component (dynamically to forward attributes) 6 | const attr = { 7 | type: "button", 8 | class: `btn btn-tools ${classes || ""}`, 9 | onclick, 10 | ...rest, 11 | }; 12 | const iconEl = icon 13 | ? html`` 14 | : ""; 15 | return h("button", attr, html`${iconEl}${name}`); 16 | }; 17 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/rollup.config.js: -------------------------------------------------------------------------------- 1 | // rollup.config.js 2 | import typescript from '@rollup/plugin-typescript'; 3 | import terser from '@rollup/plugin-terser'; 4 | 5 | export default { 6 | input: 'src/index.ts', 7 | output: { 8 | dir: 'dist', 9 | format: 'esm', 10 | entryFileNames: '[name].mjs' 11 | }, 12 | plugins: [ 13 | typescript(), 14 | terser({ 15 | format: { 16 | comments: 'some', 17 | beautify: true, 18 | ecma: '2022', 19 | }, 20 | compress: false, 21 | mangle: false, 22 | module: true, 23 | }), 24 | ] 25 | }; 26 | -------------------------------------------------------------------------------- /examples/theory_of_mind.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import example_dataset 3 | from inspect_ai.scorer import model_graded_fact 4 | from inspect_ai.solver import chain_of_thought, generate, self_critique 5 | 6 | 7 | @task 8 | def theory_of_mind(critique = False): 9 | 10 | # use self_critique if requested 11 | plan = [chain_of_thought(), generate()] 12 | if critique: 13 | plan.append(self_critique()) 14 | 15 | return Task( 16 | dataset=example_dataset("theory_of_mind"), 17 | plan=plan, 18 | scorer=model_graded_fact(), 19 | ) 20 | 21 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/EmptyPanel.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const EmptyPanel = ({ id, classes, height, style, children }) => { 4 | const emptyStyle = { 5 | display: "flex", 6 | textAlign: "center", 7 | flex: "0 0 content", 8 | alignItems: "center", 9 | justifyContent: "center", 10 | height: height ? height : "10rem", 11 | }; 12 | return html` 13 |
<div id=${id} class="${classes || ""}" style=${{ ...emptyStyle, ...style }}> 14 | ${children || ""} 15 | </div>
16 | `; 17 | }; 18 | -------------------------------------------------------------------------------- /examples/biology_qa.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import FieldSpec, example_dataset 3 | from inspect_ai.scorer import model_graded_qa 4 | from inspect_ai.solver import generate, use_tools, web_search 5 | 6 | 7 | @task 8 | def biology_qa() -> Task: 9 | return Task( 10 | dataset=example_dataset( 11 | name="biology_qa", 12 | sample_fields=FieldSpec( 13 | input="question", 14 | target="answer" 15 | ), 16 | ), 17 | plan=[use_tools(web_search()), generate()], 18 | scorer=model_graded_qa(), 19 | ) 20 | 21 | -------------------------------------------------------------------------------- /tools/vscode/tools/ts-to-mjs/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "tstojs", 3 | "version": "0.1.11", 4 | "packageManager": "yarn@3.3.1", 5 | "main": "dist/index.mjs", 6 | "type": "module", 7 | "types": "dist/index.d.ts", 8 | "files": [ 9 | "/dist" 10 | ], 11 | "scripts": { 12 | "build": "rollup -c" 13 | }, 14 | "devDependencies": { 15 | "@rollup/plugin-terser": "^0.2.1", 16 | "@rollup/plugin-typescript": "^10.0.1", 17 | "@types/jest": "^29.2.4", 18 | "jest": "^29.3.1", 19 | "rollup": "^3.9.0", 20 | "ts-jest": "^29.0.3", 21 | "tslib": "^2.4.1", 22 | "typescript": "^4.9.4" 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/workspace/TaskErrorPanel.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { icons } from "../Constants.mjs" 3 | 4 | import { Card, CardHeader, CardBody } from "../components/Card.mjs" 5 | import { ANSIDisplay } from "../components/AnsiDisplay.mjs" 6 | 7 | 8 | export const TaskErrorCard = ({ evalError }) => { 9 | return html` 10 | <${Card}> 11 | <${CardHeader} icon=${icons.error} label="Task Failed: ${evalError.message}"> 12 | <${CardBody} style=${{fontSize: "0.8em"}}> 13 | <${ANSIDisplay} output=${evalError.traceback_ansi}/> 14 | 15 | 16 | `; 17 | }; 18 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/util.py: -------------------------------------------------------------------------------- 1 | from typing import Any 2 | 3 | import yaml 4 | 5 | 6 | def parse_cli_args(args: tuple[str] | list[str] | None) -> dict[str, Any]: 7 | params: dict[str, Any] = dict() 8 | if args: 9 | for arg in list(args): 10 | parts = arg.split("=") 11 | if len(parts) > 1: 12 | key = parts[0].replace("-", "_") 13 | value = yaml.safe_load("=".join(parts[1:])) 14 | if isinstance(value, str): 15 | value = value.split(",") 16 | value = value if len(value) > 1 else value[0] 17 | params[key] = value 18 | return params 19 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | on: 2 | workflow_dispatch: 3 | 4 | name: Quarto Publish 5 | 6 | jobs: 7 | build-deploy: 8 | runs-on: ubuntu-latest 9 | permissions: 10 | contents: write 11 | steps: 12 | - name: Check out repository 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Quarto 16 | uses: quarto-dev/quarto-actions/setup@v2 17 | with: 18 | tinytex: true 19 | 20 | - name: Render and Publish 21 | uses: 
quarto-dev/quarto-actions/publish@v2 22 | with: 23 | target: gh-pages 24 | path: docs 25 | env: 26 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 27 | -------------------------------------------------------------------------------- /src/inspect_ai/dataset/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F403 F405 2 | 3 | from ._dataset import ( 4 | Dataset, 5 | FieldSpec, 6 | MemoryDataset, 7 | RecordToSample, 8 | Sample, 9 | ) 10 | from ._sources.csv import csv_dataset 11 | from ._sources.example import example_dataset 12 | from ._sources.file import file_dataset 13 | from ._sources.hf import hf_dataset 14 | from ._sources.json import json_dataset 15 | 16 | __all__ = [ 17 | "Dataset", 18 | "Sample", 19 | "FieldSpec", 20 | "RecordToSample", 21 | "MemoryDataset", 22 | "file_dataset", 23 | "csv_dataset", 24 | "hf_dataset", 25 | "json_dataset", 26 | "example_dataset", 27 | ] 28 | -------------------------------------------------------------------------------- /src/inspect_ai/model/_providers/ollama.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model._providers.util import model_base_url 2 | 3 | from .._model import GenerateConfig 4 | from .openai import OpenAIAPI 5 | 6 | 7 | class OllamaAPI(OpenAIAPI): 8 | def __init__( 9 | self, 10 | model_name: str, 11 | base_url: str | None = None, 12 | config: GenerateConfig = GenerateConfig(), 13 | ) -> None: 14 | base_url = model_base_url(base_url, "OLLAMA_BASE_URL") 15 | base_url = base_url if base_url else "http://localhost:11434/v1" 16 | super().__init__( 17 | model_name=model_name, base_url=base_url, config=config, api_key="ollama" 18 | ) 19 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/log-reader/Log-Reader.mjs: -------------------------------------------------------------------------------- 1 | const adapters = []; 2 | 3 | export const register = (adapter) => { 4 | adapters.push(adapter); 5 | }; 6 | 7 | export const readLogFile = (filename, text) => { 8 | const adapter = adapters.find((adapter) => { 9 | return adapter.canRead(filename); 10 | }); 11 | 12 | // TODO Exception handling 13 | if (!adapter) { 14 | throw new Error(`The file ${filename} is not recognized as a valid log file`); 15 | } 16 | try { 17 | return adapter.read(text); 18 | } catch(e) { 19 | throw new Error(`Failed to parse the file ${filename}. 
Please check the file exists and that the format is valid.`, e); 20 | } 21 | }; 22 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/url.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | 4 | def is_http_url(url: str) -> bool: 5 | return url.startswith("http://") or url.startswith("https://") 6 | 7 | 8 | def is_data_uri(url: str) -> bool: 9 | return url.startswith("data:") 10 | 11 | 12 | def data_uri_mime_type(data_url: str) -> str | None: 13 | pattern = r"^data:([^;]+);.*" 14 | match = re.match(pattern, data_url) 15 | if match: 16 | mime_type = match.group(1) 17 | return mime_type 18 | else: 19 | return None 20 | 21 | 22 | def data_uri_to_base64(data_uri: str) -> str: 23 | pattern = r"^data:[^,]+," 24 | stripped_uri = re.sub(pattern, "", data_uri) 25 | return stripped_uri 26 | -------------------------------------------------------------------------------- /tests/test_model_package.py: -------------------------------------------------------------------------------- 1 | # type: ignore 2 | 3 | import subprocess 4 | import sys 5 | 6 | import pytest 7 | 8 | from inspect_ai.model import get_model 9 | 10 | 11 | @pytest.mark.asyncio 12 | async def test_model_package(): 13 | # ensure the package is installed 14 | try: 15 | import inspect_package # noqa: F401 16 | except ImportError: 17 | subprocess.check_call( 18 | [sys.executable, "-m", "pip", "install", "tests/test_package"] 19 | ) 20 | 21 | # call the model 22 | mdl = get_model("custom/gpt7") 23 | result = await mdl.generate({"role": "user", "content": "hello"}, [], "none", {}) 24 | assert result.completion == "Hello from gpt7" 25 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/CopyButton.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { icons } from "../Constants.mjs"; 3 | 4 | export const CopyButton = ({ value }) => { 5 | return html``; 22 | }; 23 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/launch.json: -------------------------------------------------------------------------------- 1 | // A launch configuration that compiles the extension and then opens it inside a new window 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | { 6 | "version": "0.2.0", 7 | "configurations": [ 8 | { 9 | "name": "Run Extension", 10 | "type": "extensionHost", 11 | "request": "launch", 12 | "args": [ 13 | "--extensionDevelopmentPath=${workspaceFolder}" 14 | ], 15 | "outFiles": [ 16 | "${workspaceFolder}/dist/**/*.js" 17 | ], 18 | "preLaunchTask": "${defaultBuildTask}" 19 | } 20 | ] 21 | } 22 | -------------------------------------------------------------------------------- /tools/vscode/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "module": "commonjs", 4 | "target": "ES2022", 5 | "lib": ["ES2022", "DOM"], 6 | "sourceMap": true, 7 | "rootDir": "src", 8 | "strict": true, /* enable all strict type-checking options */ 9 | "esModuleInterop": true, 10 | /* Additional Checks */ 11 | // "noImplicitReturns": true, /* Report error when not all code paths in function return a value. 
*/ 12 | // "noFallthroughCasesInSwitch": true, /* Report errors for fallthrough cases in switch statement. */ 13 | // "noUnusedParameters": true, /* Report errors on unused parameters. */ 14 | }, 15 | "exclude": ["node_modules", ".vscode-test", "src/providers/activity-bar/webview", "tools"] 16 | } 17 | -------------------------------------------------------------------------------- /examples/security_guide.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import example_dataset 3 | from inspect_ai.scorer import model_graded_fact 4 | from inspect_ai.solver import generate, system_message 5 | 6 | SYSTEM_MESSAGE = """ 7 | You are a computer security expert and you are tasked with providing concise responses to the following questions. Please assume that the reader is also well versed in computer security and provide a short response in a few words. 8 | """ 9 | 10 | 11 | @task 12 | def security_guide(): 13 | return Task( 14 | dataset=example_dataset("security_guide"), 15 | plan=[system_message(SYSTEM_MESSAGE), generate()], 16 | scorer=model_graded_fact(), 17 | ) 18 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/mean.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .._metric import Metric, Score, metric 4 | 5 | 6 | @metric 7 | def mean() -> Metric: 8 | """Compute mean of all scores. 9 | 10 | Returns: 11 | mean metric 12 | """ 13 | 14 | def metric(scores: list[Score]) -> float: 15 | return np.mean([score.as_float() for score in scores]).item() 16 | 17 | return metric 18 | 19 | 20 | @metric 21 | def var() -> Metric: 22 | """Compute variance over all scores. 23 | 24 | Returns: 25 | var metric 26 | """ 27 | 28 | def metric(scores: list[Score]) -> float: 29 | return np.var([score.as_float() for score in scores]).item() 30 | 31 | return metric 32 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/LabeledValue.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const LabeledValue = ({ label, style, valueStyle, layout="column", children }) => { 4 | const flexDirection = layout === "column" ? "column" : "row"; 5 | 6 | return html`
13 |     <div style=${{ display: "flex", flexDirection, ...style }}>
14 |       <div>
22 |         ${label}
23 |       </div>
24 |       <div style=${valueStyle}>
            ${children}
25 |       </div>
          </div>
`; 26 | }; 27 | -------------------------------------------------------------------------------- /tests/test_anthropic.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_anthropic 3 | 4 | from inspect_ai.model import GenerateConfig, get_model 5 | 6 | 7 | @pytest.mark.asyncio 8 | @skip_if_no_anthropic 9 | async def test_anthropic_api() -> None: 10 | model = get_model( 11 | "anthropic/claude-2.1", 12 | config=GenerateConfig( 13 | frequency_penalty=0.0, 14 | stop_seqs=None, 15 | max_tokens=50, 16 | presence_penalty=0.0, 17 | seed=None, 18 | temperature=0.0, 19 | top_p=1.0, 20 | ), 21 | ) 22 | 23 | message = "This is a test string. What are you?" 24 | response = await model.generate(input=message) 25 | assert len(response.completion) >= 1 26 | -------------------------------------------------------------------------------- /tools/vscode/assets/logo/inspect.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | i 8 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # See https://pre-commit.com for more information 2 | # See https://pre-commit.com/hooks.html for more hooks 3 | # This should be the _latest_ version of python supported by us 4 | default_language_version: 5 | python: python3.11 6 | repos: 7 | - repo: https://github.com/astral-sh/ruff-pre-commit 8 | rev: v0.1.6 9 | hooks: 10 | # Run the linter. 11 | - id: ruff 12 | args: [ --fix ] 13 | # Run the formatter. 14 | - id: ruff-format 15 | - repo: https://github.com/pre-commit/pre-commit-hooks 16 | rev: v4.5.0 17 | hooks: 18 | - id: check-added-large-files 19 | - id: check-json 20 | - id: check-yaml 21 | - id: debug-statements 22 | - id: detect-private-key 23 | - id: end-of-file-fixer 24 | - id: requirements-txt-fixer 25 | -------------------------------------------------------------------------------- /src/inspect_ai/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 F403 F405 2 | 3 | from importlib.metadata import version as importlib_version 4 | 5 | from inspect_ai._eval.eval import eval, eval_async, eval_retry, eval_retry_async 6 | from inspect_ai._eval.list import list_tasks 7 | from inspect_ai._eval.registry import task 8 | from inspect_ai._eval.score import score, score_async 9 | from inspect_ai._eval.types import Task, TaskInfo, Tasks 10 | from inspect_ai._util.constants import PKG_NAME 11 | 12 | __version__ = importlib_version(PKG_NAME) 13 | 14 | 15 | __all__ = [ 16 | "__version__", 17 | "eval", 18 | "eval_async", 19 | "eval_retry", 20 | "eval_retry_async", 21 | "score", 22 | "score_async", 23 | "Task", 24 | "TaskInfo", 25 | "Tasks", 26 | "task", 27 | "list_tasks", 28 | ] 29 | -------------------------------------------------------------------------------- /tests/test_registry.py: -------------------------------------------------------------------------------- 1 | from inspect_ai._util.constants import PKG_NAME 2 | from inspect_ai._util.registry import registry_info, registry_lookup 3 | from inspect_ai.scorer import Metric, Score, metric 4 | 5 | 6 | def test_registry_namespaces() -> None: 7 | # define a local metric which we can lookup by simple name 8 | @metric(name="local_accuracy") 9 | def accuracy1(correct: str = "C") -> Metric: 10 | def metric(scores: list[Score]) 
-> int | float: 11 | return 1 12 | 13 | return metric 14 | 15 | assert registry_lookup("metric", "local_accuracy") 16 | 17 | # confirm that inspect_ai builtins have their namespace auto-appended 18 | info = registry_info(registry_lookup("metric", f"{PKG_NAME}/accuracy")) 19 | assert info 20 | assert info.name == f"{PKG_NAME}/accuracy" 21 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/list.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | import { AbsolutePath } from "../core/path"; 4 | import { runProcess } from "../core/process"; 5 | import { inspectBinPath } from "./props"; 6 | 7 | export interface TaskDescriptor { 8 | file: string, 9 | name: string, 10 | attribs: Record<string, unknown>; 11 | } 12 | 13 | export const inspectListTasks = (cwd: AbsolutePath): TaskDescriptor[] => { 14 | return inspectListCmd<TaskDescriptor>(cwd, "tasks"); 15 | }; 16 | 17 | function inspectListCmd<T>(cwd: AbsolutePath, type: string, args?: string[]): T[] { 18 | const inspectBin = inspectBinPath(); 19 | if (inspectBin) { 20 | const cmdArgs = ["list", type, "--json", ...(args || [])]; 21 | const output = runProcess(inspectBin, cmdArgs, cwd); 22 | return JSON.parse(output) as T[]; 23 | } else { 24 | return []; 25 | } 26 | } -------------------------------------------------------------------------------- /src/inspect_ai/solver/__init__.py: -------------------------------------------------------------------------------- 1 | from ._critique import self_critique 2 | from ._multiple_choice import multiple_choice 3 | from ._plan import Plan, plan 4 | from ._prompt import ( 5 | chain_of_thought, 6 | prompt_template, 7 | system_message, 8 | ) 9 | from ._solver import Generate, Solver, TaskState, generate, solver 10 | from ._tool.tool import Tool, tool 11 | from ._tool.use_tools import use_tools 12 | from ._tool.web_search import web_search 13 | 14 | __all__ = [ 15 | "generate", 16 | "prompt_template", 17 | "chain_of_thought", 18 | "multiple_choice", 19 | "system_message", 20 | "self_critique", 21 | "tool", 22 | "use_tools", 23 | "web_search", 24 | "plan", 25 | "Plan", 26 | "Solver", 27 | "solver", 28 | "TaskState", 29 | "Tool", 30 | "Generate", 31 | ] 32 | -------------------------------------------------------------------------------- /src/inspect_ai/util/_context/logger.py: -------------------------------------------------------------------------------- 1 | from logging import INFO, LogRecord 2 | 3 | _logger_records: list[LogRecord] = [] 4 | _rate_limit_records: list[LogRecord] = [] 5 | 6 | 7 | def init_logger_records() -> None: 8 | _logger_records.clear() 9 | _rate_limit_records.clear() 10 | 11 | 12 | def notify_logger_record(record: LogRecord, write: bool) -> None: 13 | if write: 14 | _logger_records.append(record) 15 | if record.levelno <= INFO and "429" in record.getMessage(): 16 | _rate_limit_records.append(record) 17 | 18 | 19 | def logger_http_rate_limit_count() -> int: 20 | return len(_rate_limit_records) 21 | 22 | 23 | def collect_logger_records() -> list[LogRecord]: 24 | records = _logger_records.copy() 25 | _logger_records.clear() 26 | _rate_limit_records.clear() 27 | return records 28 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | // Place your settings in this file to overwrite default and user settings. 
2 | { 3 | "files.exclude": { 4 | "out": false, // set this to true to hide the "out" folder with the compiled JS files 5 | "dist": false // set this to true to hide the "dist" folder with the compiled JS files 6 | }, 7 | "search.exclude": { 8 | "out": true, // set this to false to include "out" folder in search results 9 | "dist": true // set this to false to include "dist" folder in search results 10 | }, 11 | // Turn off tsc task auto detection since we have the necessary tasks as npm scripts 12 | "typescript.tsc.autoDetect": "off", 13 | "editor.tabSize": 2, 14 | "editor.formatOnSave": true, 15 | "editor.defaultFormatter": "vscode.typescript-language-features" 16 | } -------------------------------------------------------------------------------- /src/inspect_ai/_util/error.py: -------------------------------------------------------------------------------- 1 | from importlib.metadata import version 2 | 3 | 4 | def pip_dependency_error(feature: str, dependencies: list[str]) -> Exception: 5 | return ModuleNotFoundError( 6 | f"ERROR: {feature} requires optional dependencies. " 7 | f"Install with:\n\npip install {' '.join(dependencies)}\n" 8 | ) 9 | 10 | 11 | def module_version_error( 12 | feature: str, package: str, required_version: str 13 | ) -> Exception: 14 | return ModuleNotFoundError( 15 | f"ERROR: {feature} requires at least version {required_version} of package {package} " 16 | f"(you have version {version(package)} installed).\n\n" 17 | f"Upgrade with:\n\npip install --upgrade {package}\n" 18 | ) 19 | 20 | 21 | def exception_message(ex: BaseException) -> str: 22 | return getattr(ex, "message", repr(ex)) 23 | -------------------------------------------------------------------------------- /tests/test_package/inspect_package/modelapi/custom.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import ( 2 | ChatMessage, 3 | GenerateConfig, 4 | ModelAPI, 5 | ModelOutput, 6 | ToolChoice, 7 | ToolInfo, 8 | ) 9 | 10 | 11 | class CustomModelAPI(ModelAPI): 12 | def __init__( 13 | self, 14 | model_name: str, 15 | base_url: str | None = None, 16 | config: GenerateConfig = GenerateConfig(), 17 | ) -> None: 18 | super().__init__(model_name, base_url, config) 19 | 20 | async def generate( 21 | self, 22 | input: list[ChatMessage], 23 | tools: list[ToolInfo], 24 | tool_choice: ToolChoice, 25 | config: GenerateConfig, 26 | ) -> ModelOutput: 27 | return ModelOutput.from_content( 28 | self.model_name, f"Hello from {self.model_name}" 29 | ) 30 | -------------------------------------------------------------------------------- /tools/vscode/src/core/dispose.ts: -------------------------------------------------------------------------------- 1 | import * as vscode from "vscode"; 2 | 3 | export function disposeAll(disposables: vscode.Disposable[]) { 4 | while (disposables.length) { 5 | const item = disposables.pop(); 6 | item?.dispose(); 7 | } 8 | } 9 | 10 | export abstract class Disposable { 11 | private _isDisposed = false; 12 | 13 | protected _disposables: vscode.Disposable[] = []; 14 | 15 | public dispose(): unknown { 16 | if (this._isDisposed) { 17 | return; 18 | } 19 | this._isDisposed = true; 20 | disposeAll(this._disposables); 21 | } 22 | 23 | protected _register<T extends vscode.Disposable>(value: T): T { 24 | if (this._isDisposed) { 25 | value.dispose(); 26 | } else { 27 | this._disposables.push(value); 28 | } 29 | return value; 30 | } 31 | 32 | protected get isDisposed() { 33 | return this._isDisposed; 34 | } 35 | } 
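// Usage sketch (illustrative only; "StatusProvider" and the listener shown
// here are hypothetical, not part of this codebase): a provider can extend
// Disposable so that every resource passed to _register() is disposed
// together when dispose() is called on the provider itself.
//
//   class StatusProvider extends Disposable {
//     constructor() {
//       super();
//       // event subscriptions return a vscode.Disposable, so they can be
//       // registered for automatic cleanup
//       this._register(vscode.window.onDidChangeActiveTextEditor(() => {}));
//     }
//   }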
-------------------------------------------------------------------------------- /tests/test_openai.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_openai 3 | 4 | from inspect_ai.model import ( 5 | ChatMessageUser, 6 | GenerateConfig, 7 | get_model, 8 | ) 9 | 10 | 11 | @pytest.mark.asyncio 12 | @skip_if_no_openai 13 | async def test_openai_api() -> None: 14 | model = get_model( 15 | "openai/gpt-3.5-turbo", 16 | config=GenerateConfig( 17 | frequency_penalty=0.0, 18 | stop_seqs=None, 19 | max_tokens=50, 20 | presence_penalty=0.0, 21 | logit_bias=dict([(42, 10), (43, -10)]), 22 | seed=None, 23 | temperature=0.0, 24 | top_p=1.0, 25 | ), 26 | ) 27 | 28 | message = ChatMessageUser(content="This is a test string. What are you?") 29 | response = await model.generate(input=[message]) 30 | assert len(response.completion) >= 1 31 | -------------------------------------------------------------------------------- /tools/vscode/src/core/python/code.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | export function isValidPythonFnName(input: string) { 5 | if (!kFnCharsRegex.test(input)) { 6 | return false; 7 | } 8 | if (kReservedWords.includes(input)) { 9 | return false; 10 | } 11 | return true; 12 | } 13 | const kFnCharsRegex = /^[A-Za-z_][A-Za-z0-9_]*$/; 14 | const kReservedWords = [ 15 | "False", 16 | "None", 17 | "True", 18 | "and", 19 | "as", 20 | "assert", 21 | "async", 22 | "await", 23 | "break", 24 | "class", 25 | "continue", 26 | "def", 27 | "del", 28 | "elif", 29 | "else", 30 | "except", 31 | "finally", 32 | "for", 33 | "from", 34 | "global", 35 | "if", 36 | "import", 37 | "in", 38 | "is", 39 | "lambda", 40 | "nonlocal", 41 | "not", 42 | "or", 43 | "pass", 44 | "raise", 45 | "return", 46 | "try", 47 | "while", 48 | "with", 49 | "yield", 50 | ]; 51 | 52 | 53 | -------------------------------------------------------------------------------- /tools/vscode/.vscode/tasks.json: -------------------------------------------------------------------------------- 1 | // See https://go.microsoft.com/fwlink/?LinkId=733558 2 | // for the documentation about the tasks.json format 3 | { 4 | "version": "2.0.0", 5 | "tasks": [ 6 | { 7 | "type": "npm", 8 | "script": "watch", 9 | "problemMatcher": "$ts-webpack-watch", 10 | "isBackground": true, 11 | "presentation": { 12 | "reveal": "never", 13 | "group": "watchers" 14 | }, 15 | "group": { 16 | "kind": "build", 17 | "isDefault": true 18 | } 19 | }, 20 | { 21 | "type": "npm", 22 | "script": "watch-tests", 23 | "problemMatcher": "$tsc-watch", 24 | "isBackground": true, 25 | "presentation": { 26 | "reveal": "never", 27 | "group": "watchers" 28 | }, 29 | "group": "build" 30 | }, 31 | { 32 | "label": "tasks: watch-tests", 33 | "dependsOn": [ 34 | "npm: watch", 35 | "npm: watch-tests" 36 | ], 37 | "problemMatcher": [] 38 | } 39 | ] 40 | } 41 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/samples/tools/EpochFilter.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const EpochFilter = ({ epochs, epoch, setEpoch }) => { 4 | const options = ["all"]; 5 | for (let i = 1; i <= epochs; i++) { 6 | options.push(i + ""); 7 | } 8 | return html` 9 |
10 |   <div>Epochs:
15 |     <select value=${epoch} onChange=${(e) => { setEpoch(e.target.value); }}>
16 |       ${options.map((option) => html`<option value="${option}">${option}</option>`)}
17 |     </select>
27 |   </div>
28 | `; 29 | }; 30 | -------------------------------------------------------------------------------- /src/inspect_ai/log/__init__.py: -------------------------------------------------------------------------------- 1 | from ._file import ( 2 | EvalLogInfo, 3 | list_eval_logs, 4 | read_eval_log, 5 | write_eval_log, 6 | ) 7 | from ._log import ( 8 | EvalConfig, 9 | EvalDataset, 10 | EvalError, 11 | EvalLog, 12 | EvalMetric, 13 | EvalPlan, 14 | EvalPlanStep, 15 | EvalResults, 16 | EvalRevision, 17 | EvalSample, 18 | EvalScorer, 19 | EvalSpec, 20 | EvalStats, 21 | LoggingLevel, 22 | LoggingMessage, 23 | ) 24 | 25 | __all__ = [ 26 | "EvalConfig", 27 | "EvalError", 28 | "EvalDataset", 29 | "EvalLog", 30 | "EvalMetric", 31 | "EvalPlan", 32 | "EvalPlanStep", 33 | "EvalResults", 34 | "EvalRevision", 35 | "EvalSample", 36 | "EvalScorer", 37 | "EvalSpec", 38 | "EvalStats", 39 | "EvalLogInfo", 40 | "LoggingLevel", 41 | "LoggingMessage", 42 | "list_eval_logs", 43 | "read_eval_log", 44 | "write_eval_log", 45 | ] 46 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/logview.ts: -------------------------------------------------------------------------------- 1 | import { ExtensionContext } from "vscode"; 2 | 3 | import { Command } from "../../core/command"; 4 | import { logviewCommands } from "./commands"; 5 | import { InspectLogviewWebviewManager } from "./logview-webview"; 6 | import { InspectLogviewManager } from "./logview-manager"; 7 | import { InspectSettingsManager } from "../settings/inspect-settings"; 8 | import { InspectManager } from "../inspect/inspect-manager"; 9 | 10 | export function activateLogview( 11 | inspectManager: InspectManager, 12 | settingsMgr: InspectSettingsManager, 13 | context: ExtensionContext 14 | ): [Command[], InspectLogviewManager] { 15 | 16 | // initialize manager 17 | const logviewWebManager = new InspectLogviewWebviewManager(inspectManager, context); 18 | const logviewManager = new InspectLogviewManager(logviewWebManager, settingsMgr); 19 | 20 | // logview commands 21 | return [logviewCommands(logviewManager), logviewManager]; 22 | } 23 | -------------------------------------------------------------------------------- /tools/vscode/src/core/command.ts: -------------------------------------------------------------------------------- 1 | import { Disposable, commands } from "vscode"; 2 | 3 | export interface Command { 4 | readonly id: string; 5 | 6 | execute(...args: unknown[]): void; 7 | } 8 | 9 | export class CommandManager { 10 | private readonly commands = new Map<string, Disposable>(); 11 | 12 | public dispose() { 13 | for (const registration of this.commands.values()) { 14 | registration.dispose(); 15 | } 16 | this.commands.clear(); 17 | } 18 | 19 | public register<T extends Command>(command: T): T { 20 | // eslint-disable-next-line @typescript-eslint/unbound-method 21 | this.registerCommand(command.id, command.execute, command); 22 | return command; 23 | } 24 | 25 | private registerCommand( 26 | id: string, 27 | impl: (...args: unknown[]) => void, 28 | thisArg?: unknown 29 | ) { 30 | if (this.commands.has(id)) { 31 | return; 32 | } 33 | 34 | this.commands.set(id, commands.registerCommand(id, impl, thisArg)); 35 | } 36 | } -------------------------------------------------------------------------------- /tools/vscode/src/core/random.ts: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | export function randomInt(min: number, max: number) { 5 | min = Math.ceil(min); 6 | max = Math.floor(max); 
7 | return Math.floor(cryptoRandom() * (max - min) + min); 8 | } 9 | 10 | 11 | // version of Math.random() that uses web crypto 12 | // https://stackoverflow.com/questions/13694626/generating-random-numbers-0-to-1-with-crypto-generatevalues 13 | export function cryptoRandom() { 14 | // eslint-disable-next-line @typescript-eslint/no-var-requires 15 | const crypto = require('crypto') as { getRandomValues: (arr: Uint32Array) => void; }; 16 | 17 | const arr = new Uint32Array(2); 18 | crypto.getRandomValues(arr); 19 | 20 | // keep all 32 bits of the first, top 20 of the second for 52 random bits 21 | const mantissa = (arr[0] * Math.pow(2, 20)) + (arr[1] >>> 12); 22 | 23 | // shift all 52 bits to the right of the decimal point 24 | const result = mantissa * Math.pow(2, -52); 25 | return result; 26 | } 27 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/LoadingScreen.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const LoadingScreen = ({ id, classes, message }) => { 4 | const fullScreenStyle = { 5 | position: "absolute", 6 | top: "0", 7 | bottom: "0", 8 | right: "0", 9 | left: "0", 10 | display: "flex", 11 | justifyContent: "center", 12 | alignItems: "center", 13 | zIndex:1000 14 | } 15 | 16 | const emptyStyle = { 17 | display: "flex", 18 | textAlign: "center", 19 | flex: "0 0 content", 20 | alignItems: "center", 21 | justifyContent: "center", 22 | }; 23 | return html` 24 | 
25 |   <div id=${id} class=${classes} style=${fullScreenStyle}>
26 |     <div style=${emptyStyle}>
31 |
32 |       ${message || "Loading..."}
33 |     </div>
34 |   </div>
35 | `; 36 | }; 37 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/workspace/workspace-init.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { WorkspaceStateManager } from "./workspace-state-provider"; 3 | import { ensureGitignore } from "../../core/git"; 4 | import { 5 | activeWorkspacePath, 6 | } from "../../core/path"; 7 | 8 | 9 | const kGitInitKey = "gitInit"; 10 | 11 | export async function initializeWorkspace( 12 | state: WorkspaceStateManager 13 | ): Promise<[Command[]]> { 14 | const hasInitializedGit = state.getState(kGitInitKey); 15 | if (hasInitializedGit !== "true") { 16 | const path = activeWorkspacePath(); 17 | 18 | // If we're in a workspace, initialize 19 | ensureGitignore(path, ignorePaths()); 20 | 21 | await state.setState(kGitInitKey, "true"); 22 | 23 | } 24 | return [[]]; 25 | } 26 | 27 | // TODO: Extract this for use when adding additional paths (e.g. if the env is modified with a log dir) 28 | 29 | function ignorePaths() { 30 | const ignores: string[] = [".env", "logs/", "__pycache__/"]; 31 | return ignores; 32 | } 33 | -------------------------------------------------------------------------------- /docs/_examples/footer.qmd: -------------------------------------------------------------------------------- 1 | ::: {.content-hidden when-format="html"} 2 | ## Additional Examples 3 | 4 | See the following additional examples in the online version of the Inspect documentation: 5 | 6 | | Example | Demonstrates | 7 | |----------------------------|--------------------------------------------| 8 | | [MATH]({{< var examples-url >}}#sec-mathematics) | Custom scorer that uses a model to judge equivalence. | 9 | | [Biology QA]({{< var examples-url >}}#sec-biology-qa) | Built-in web search tool; Custom model grading template. | 10 | | [ARC]({{< var examples-url >}}#sec-arc) | Defining multiple tasks in a file; Multiple choice questions. | 11 | | [Tool Use]({{< var examples-url >}}#sec-tool-use) | Tool usage and creating custom tools; Launching subprocesses. | 12 | | [GSM8K]({{< var examples-url >}}#sec-gsm8k) | Using fewshot examples; Scoring numeric output. 
| 13 | 14 | : {tbl-colwidths="\[30,70\]"} 15 | ::: -------------------------------------------------------------------------------- /docs/theme.scss: -------------------------------------------------------------------------------- 1 | /*-- scss:rules --*/ 2 | 3 | .sidebar>.sidebar-menu-container>.list-unstyled>.sidebar-item { 4 | margin-bottom: 1em; 5 | } 6 | 7 | .sidebar-header-item>p { 8 | margin-bottom: 0; 9 | } 10 | 11 | .sidebar-tools-main .quarto-navigation-tool[title="Source Code"] { 12 | padding-top: 2.5px; 13 | } 14 | 15 | .code-tabset { 16 | margin-bottom: 1em; 17 | } 18 | 19 | .code-tabset .tab-content { 20 | padding: 0; 21 | margin-bottom: 0; 22 | } 23 | 24 | .code-tabset div.sourceCode { 25 | border: none; 26 | margin: 0; 27 | } 28 | 29 | .code-tabset .nav-tabs .nav-link.active, 30 | .nav-tabs .nav-item.show .nav-link { 31 | border-bottom-color: $border-color; 32 | } 33 | 34 | .quarto-layout-panel .sourceCode { 35 | margin-top: 0; 36 | margin-bottom: 0.5em; 37 | } 38 | 39 | .splash ul { 40 | padding-inline-start: 1rem; 41 | } 42 | 43 | @media(max-width: 991.98px) { 44 | .sidebar-header-item .img-fluid { 45 | max-width: 195px; 46 | } 47 | } 48 | 49 | .blockquote { 50 | color: #505a62; 51 | } 52 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/git.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | 4 | from pydantic import BaseModel 5 | 6 | from .path import chdir 7 | 8 | 9 | class GitContext(BaseModel): 10 | origin: str 11 | commit: str 12 | 13 | 14 | def git_context(dir: str) -> GitContext | None: 15 | with chdir(dir): 16 | # check for git 17 | git = shutil.which("git") 18 | if not git: 19 | return None 20 | 21 | # check for a git revision in this directory 22 | commit_result = subprocess.run( 23 | [git, "rev-parse", "--short", "HEAD"], capture_output=True, text=True 24 | ) 25 | if commit_result.returncode != 0: 26 | return None 27 | 28 | # check for git origin (if any) 29 | origin = subprocess.run( 30 | [git, "remote", "get-url", "origin"], 31 | capture_output=True, 32 | text=True, 33 | ).stdout.strip() 34 | 35 | # return context 36 | return GitContext(origin=origin, commit=commit_result.stdout.strip()) 37 | -------------------------------------------------------------------------------- /tools/vscode/src/components/templates.ts: -------------------------------------------------------------------------------- 1 | 2 | import { ExtensionContext, Uri, workspace } from "vscode"; 3 | 4 | export interface Template { 5 | name: string 6 | } 7 | 8 | export const templates = { 9 | "python_task": { 10 | name: "task.py.template" 11 | } 12 | }; 13 | 14 | export const readTemplate = async (template: Template, context: ExtensionContext, variables: Record<string, string> = {}) => { 15 | // Compute the template path 16 | const extensionUri = context.extensionUri; 17 | const templateUri = Uri.joinPath(extensionUri, "assets", "templates", template.name); 18 | 19 | // Read and decode the text file 20 | const templateRaw = await workspace.fs.readFile(templateUri); 21 | const textDecoder = new TextDecoder('utf-8'); 22 | let templateContent = textDecoder.decode(templateRaw); 23 | 24 | // Replace variables 25 | Object.keys(variables).forEach((key) => { 26 | templateContent = templateContent.replaceAll(`{{<${key}>}}`, variables[key]); 27 | }); 28 | 29 | return templateContent; 30 | }; -------------------------------------------------------------------------------- 
/src/inspect_ai/_view/www/src/components/AppErrorBoundary.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { Component } from "preact"; 3 | 4 | import { ErrorPanel } from "./ErrorPanel.mjs"; 5 | 6 | export class AppErrorBoundary extends Component { 7 | constructor(props) { 8 | super(props); 9 | this.state = { hasError: false }; 10 | } 11 | 12 | static getDerivedStateFromError(error) { 13 | // Update state so the next render will show the fallback UI. 14 | return { hasError: true , error: error}; 15 | } 16 | 17 | componentDidCatch(error, errorInfo) { 18 | // You can also log the error to an error reporting service 19 | console.error(error, errorInfo); 20 | } 21 | 22 | render() { 23 | if (this.state.hasError) { 24 | console.log({e: this.state.error}); 25 | // You can render any custom fallback UI 26 | return html`<${ErrorPanel} 27 | title="An unexpected error occurred." 28 | error="${this.state.error}" 29 | />`; 30 | } 31 | 32 | return this.props.children; 33 | } 34 | } 35 | -------------------------------------------------------------------------------- /.github/workflows/vscode.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | tags: 4 | - "v[0-9]*" 5 | paths: 6 | - "tools/vscode/**" 7 | - ".github/workflows/vscode.yml" 8 | branches: 9 | - "main" 10 | pull_request: 11 | branches: 12 | - "main" 13 | paths: 14 | - "tools/vscode/**" 15 | - ".github/workflows/vscode.yml" 16 | workflow_dispatch: 17 | 18 | name: Build VS Code Ext 19 | jobs: 20 | deploy: 21 | runs-on: ubuntu-latest 22 | steps: 23 | - uses: actions/checkout@v4 24 | - uses: actions/setup-node@v4 25 | with: 26 | node-version: "18.x" 27 | - run: | 28 | pushd tools/vscode 29 | yarn install --immutable --immutable-cache --check-cache 30 | 31 | - name: Build Extension 32 | run: | 33 | pushd tools/vscode 34 | yarn vsce package 35 | 36 | - name: Upload extension to Actions Artifact 37 | uses: actions/upload-artifact@v4 38 | with: 39 | name: inspect-vscode 40 | path: "tools/vscode/inspect*.vsix" 41 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/samples/SamplesTools.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | import { EpochFilter } from "./tools/EpochFilter.mjs"; 4 | import { 5 | SortFilter, 6 | } from "./tools/SortFilter.mjs"; 7 | import { SampleFilter } from "./tools/SampleFilter.mjs"; 8 | 9 | export const SampleTools = (props) => { 10 | const { epoch, setEpoch, filter, filterChanged, sort, setSort, epochs, sampleDescriptor } = props; 11 | 12 | const hasEpochs = epochs > 1; 13 | const tools = []; 14 | if (hasEpochs) { 15 | tools.push( 16 | html`<${EpochFilter} 17 | epoch=${epoch} 18 | setEpoch="${setEpoch}" 19 | epochs=${epochs} 20 | />` 21 | ); 22 | } 23 | 24 | tools.push( 25 | html`<${SampleFilter} 26 | filter=${filter} 27 | filterChanged=${filterChanged} 28 | descriptor=${sampleDescriptor} 29 | />` 30 | ); 31 | 32 | tools.push( 33 | html`<${SortFilter} 34 | sort=${sort} 35 | setSort=${setSort} 36 | epochs=${hasEpochs} 37 | />` 38 | ); 39 | 40 | return tools; 41 | }; 42 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/workspace/workspace-env-commands.ts: -------------------------------------------------------------------------------- 1 | import { existsSync, writeFileSync } 
from "fs"; 2 | import { Command } from "../../core/command"; 3 | import { workspacePath } from "../../core/path"; 4 | import { window, workspace } from "vscode"; 5 | 6 | 7 | export function workspaceEnvCommands() { 8 | return [new EditEnvFileCommand()]; 9 | } 10 | 11 | export class EditEnvFileCommand implements Command { 12 | constructor() { } 13 | async execute(): Promise<void> { 14 | 15 | // The path to the env file 16 | const absPath = workspacePath(`.env`); 17 | 18 | 19 | // Ensure env file actually exists 20 | if (!existsSync(absPath.path)) { 21 | writeFileSync(absPath.path, 22 | "", 23 | { encoding: "utf-8" } 24 | ); 25 | } 26 | 27 | // Open the env file 28 | const document = await workspace.openTextDocument(absPath.path); 29 | await window.showTextDocument(document); 30 | 31 | } 32 | 33 | private static readonly id = "inspect.editEnvFile"; 34 | public readonly id = EditEnvFileCommand.id; 35 | } 36 | -------------------------------------------------------------------------------- /tests/test_num_choices.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_openai, skip_if_no_together 3 | 4 | from inspect_ai.model import GenerateConfig, get_model 5 | 6 | 7 | async def generate(model_name): 8 | model = get_model(model_name) 9 | return await model.generate(input="Hello.", config=GenerateConfig(num_choices=3)) 10 | 11 | 12 | async def check_num_choices(model_name): 13 | model = get_model(model_name) 14 | response = await model.generate( 15 | input="Hello.", config=GenerateConfig(num_choices=3) 16 | ) 17 | assert len(response.choices) == 3 18 | 19 | 20 | @pytest.mark.asyncio 21 | @skip_if_no_openai 22 | async def test_openai_num_choices() -> None: 23 | await check_num_choices("openai/gpt-3.5-turbo") 24 | 25 | 26 | @pytest.mark.asyncio 27 | @skip_if_no_together 28 | async def test_together_num_choices() -> None: 29 | await check_num_choices("together/google/gemma-2b-it") 30 | 31 | 32 | # @pytest.mark.asyncio 33 | # @skip_if_no_azureai 34 | # async def test_azureai_num_choices() -> None: 35 | # await check_num_choices(None) 36 | -------------------------------------------------------------------------------- /examples/agents/langchain/wikipedia.jsonl: -------------------------------------------------------------------------------- 1 | {"input":[{"role":"user","content":"What's the difference between tennis and pickleball?"}],"target":"While they are similar sports, tennis and pickleball have various differences. First, the court size for pickleball is about half the size of a tennis court. Second, pickleball is played with a ball that resembles a whiffle ball. Third, pickleball is played with paddles as opposed to rackets. 
Finally, the scoring system is quite different as you play for points which can only be scored when you or your team are serving."} 2 | {"input":[{"role":"user","content":"Which types of fish contain the lowest levels of mercury?"}],"target":"The following types of fish contain low levels of mercury: salmon, flounder, Atlantic mackerel, anchovies, pollock, catfish, and shellfish (e.g., clams, scallops, mussels)."} 3 | {"input":[{"role":"user","content":"List the ten episode titles from the sixth season of \"Game of Thrones\" in broadcast order."}],"target":"The Red Woman, Home, Oathbreaker, Book of the Stranger, The Door, Blood of My Blood, The Broken Man, No One, Battle of the Bastards, The Winds of Winter"} -------------------------------------------------------------------------------- /tools/vscode/src/providers/activity-bar/webview/env-config-webview.css: -------------------------------------------------------------------------------- 1 | .dropdown-container #provider { 2 | flex-grow: 1; 3 | } 4 | 5 | #model-help { 6 | float:right; 7 | } 8 | 9 | #model-help .codicon::before { 10 | margin-top: 2px; 11 | margin-bottom: -2px; 12 | } 13 | 14 | #model-display { 15 | margin-top: 0.3em; 16 | margin-bottom: 0.6em; 17 | } 18 | 19 | #model-container vscode-text-field, 20 | #model-container { 21 | width: 100%; 22 | } 23 | 24 | #log-level { 25 | margin-top: 3px; 26 | } 27 | 28 | #limit { 29 | flex-basis: 1; 30 | flex-grow: 1; 31 | } 32 | 33 | #epochs { 34 | flex-basis: 1; 35 | flex-grow: 1; 36 | } 37 | 38 | #log-dir { 39 | flex-grow: 1; 40 | } 41 | 42 | #provider-label { 43 | width: 100%; 44 | } 45 | 46 | #show-base-url-container { 47 | width: 100%; 48 | } 49 | 50 | #show-base-url-container vscode-link { 51 | float: right; 52 | margin-right: 0.5em; 53 | margin-top: -10px; 54 | height: 10px; 55 | color: var(--vscode-foreground); 56 | } 57 | 58 | #show-base-url-container vscode-link i:before { 59 | height: 4px; 60 | line-height: 4px; 61 | } 62 | 63 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/view.py: -------------------------------------------------------------------------------- 1 | import click 2 | from typing_extensions import Unpack 3 | 4 | from inspect_ai._util.constants import DEFAULT_SERVER_HOST, DEFAULT_VIEW_PORT 5 | from inspect_ai._view.view import view 6 | 7 | from .common import CommonOptions, common_options, resolve_common_options 8 | 9 | 10 | @click.command("view") 11 | @click.option( 12 | "--recursive", 13 | type=bool, 14 | is_flag=True, 15 | default=True, 16 | help="Include all logs in log_dir recursively.", 17 | ) 18 | @click.option( 19 | "--host", 20 | default=DEFAULT_SERVER_HOST, 21 | help="Tcp/Ip host", 22 | ) 23 | @click.option("--port", default=DEFAULT_VIEW_PORT, help="TCP/IP port") 24 | @common_options 25 | def view_command( 26 | recursive: bool, 27 | host: str, 28 | port: int, 29 | **kwargs: Unpack[CommonOptions], 30 | ) -> None: 31 | """View evaluation logs.""" 32 | # read common options 33 | (log_dir, log_level) = resolve_common_options(kwargs) 34 | 35 | # run the viewer 36 | view( 37 | log_dir=log_dir, recursive=recursive, host=host, port=port, log_level=log_level 38 | ) 39 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/accuracy.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | 3 | from .._metric import ( 4 | Metric, 5 | Score, 6 | ValueToFloat, 7 | metric, 8 | 
value_to_float, 9 | ) 10 | 11 | logger = getLogger(__name__) 12 | 13 | 14 | @metric 15 | def accuracy(to_float: ValueToFloat = value_to_float()) -> Metric: 16 | r"""Compute proportion of total answers which are correct. 17 | 18 | Args: 19 | to_float (ValueToFloat): Function for mapping 20 | Value to float for computing metrics. The default 21 | `value_to_float()` maps CORRECT ("C") to 1.0, 22 | INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and 23 | NOANSWER ("N") to 0, casts numeric values to 24 | float directly, and prints a warning and returns 25 | 0 if the Value is a complex object (list or dict). 26 | 27 | Returns: 28 | Accuracy metric 29 | """ 30 | 31 | def metric(scores: list[Score]) -> float: 32 | total = 0.0 33 | for item in scores: 34 | total += to_float(item.value) 35 | return total / float(len(scores)) 36 | 37 | return metric 38 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 UK AI Safety Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/commands.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { InspectLogviewManager } from "./logview-manager"; 3 | import { showError } from "../../components/error"; 4 | 5 | export interface LogviewState { 6 | url?: string; 7 | } 8 | 9 | export interface LogviewOptions { 10 | state?: LogviewState; 11 | activate?: boolean; 12 | } 13 | 14 | export function logviewCommands( 15 | manager: InspectLogviewManager, 16 | ): Command[] { 17 | return [new ShowLogviewCommand(manager)]; 18 | } 19 | 20 | class ShowLogviewCommand implements Command { 21 | constructor(private readonly manager_: InspectLogviewManager) { } 22 | async execute(): Promise<void> { 23 | // ensure logview is visible 24 | try { 25 | this.manager_.showInspectView(); 26 | } catch (err: unknown) { 27 | await showError( 28 | "An error occurred while attempting to start Inspect View", 29 | err instanceof Error ? 
err : Error(String(err)) 30 | ); 31 | } 32 | 33 | } 34 | 35 | private static readonly id = "inspect.showLogview"; 36 | public readonly id = ShowLogviewCommand.id; 37 | } 38 | 39 | -------------------------------------------------------------------------------- /tools/vscode/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 UK AI Safety Institute 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/logview-manager.ts: -------------------------------------------------------------------------------- 1 | import { Uri, ViewColumn, window, workspace } from "vscode"; 2 | import { InspectLogviewWebviewManager } from "./logview-webview"; 3 | import { InspectSettingsManager } from "../settings/inspect-settings"; 4 | 5 | export class InspectLogviewManager { 6 | constructor( 7 | private readonly webViewManager_: InspectLogviewWebviewManager, 8 | private readonly settingsMgr_: InspectSettingsManager, 9 | ) { } 10 | 11 | public async showLogFile(logFile: Uri) { 12 | if (this.settingsMgr_.getSettings().logViewType === "text" && logFile.scheme === "file") { 13 | await workspace.openTextDocument(logFile).then(async (doc) => { 14 | await window.showTextDocument(doc, { 15 | preserveFocus: true, 16 | viewColumn: ViewColumn.Two, 17 | }); 18 | }); 19 | } else { 20 | 21 | // Show the log file 22 | this.webViewManager_.showLogFile(logFile); 23 | } 24 | } 25 | 26 | public showInspectView() { 27 | this.webViewManager_.showLogview(); 28 | } 29 | 30 | public viewColumn() { 31 | return this.webViewManager_.viewColumn(); 32 | } 33 | } 34 | 35 | -------------------------------------------------------------------------------- /examples/popularity.py: -------------------------------------------------------------------------------- 1 | from inspect_ai import Task, task 2 | from inspect_ai.dataset import FieldSpec, example_dataset 3 | from inspect_ai.model import GenerateConfig 4 | from inspect_ai.scorer import match 5 | from inspect_ai.solver import generate, system_message 6 | 7 | SYSTEM_MESSAGE = r""" 8 | Please note: In the following EXERCISE, it is important that you only respond with exactly "Yes" or "No" 9 | """ 10 | 11 | 12 | @task 13 | def popularity(model): 14 | dataset = example_dataset( 15 | name="popularity", 16 | sample_fields=FieldSpec( 17 | input="question", 18 | 
target="answer_matching_behavior", 19 | metadata=["label_confidence"], 20 | ), 21 | ) 22 | 23 | config = GenerateConfig() 24 | match model: 25 | case "gpt" | "gemini": 26 | config.temperature = 0.9 27 | case "claude": 28 | config.temperature = 0.8 29 | case _: 30 | config.temperature = 1.0 31 | 32 | return Task( 33 | dataset=dataset, 34 | plan=[system_message(SYSTEM_MESSAGE), generate()], 35 | scorer=match(), 36 | config=config, 37 | ) 38 | -------------------------------------------------------------------------------- /tools/vscode/src/inspect/logs.ts: -------------------------------------------------------------------------------- 1 | import { AbsolutePath } from "../core/path"; 2 | import { runProcess } from "../core/process"; 3 | import { inspectBinPath } from "./props"; 4 | 5 | 6 | 7 | export function inspectEvalLogs(cwd: AbsolutePath): string | undefined { 8 | const inspectBin = inspectBinPath(); 9 | if (inspectBin) { 10 | const cmdArgs = ["list", "logs", "--json"]; 11 | const output = runProcess(inspectBin, cmdArgs, cwd); 12 | return output; 13 | } 14 | } 15 | 16 | export function inspectEvalLog(cwd: AbsolutePath, log: string, headerOnly: boolean): string | undefined { 17 | const inspectBin = inspectBinPath(); 18 | if (inspectBin) { 19 | const cmdArgs = ["info", "log-file", log]; 20 | if (headerOnly) { 21 | cmdArgs.push("--header-only"); 22 | } 23 | const output = runProcess(inspectBin, cmdArgs, cwd); 24 | return output; 25 | } 26 | } 27 | 28 | export function inspectEvalLogHeaders(cwd: AbsolutePath, logs: string[]): string | undefined { 29 | const inspectBin = inspectBinPath(); 30 | if (inspectBin) { 31 | const cmdArgs = ["info", "log-file-headers", ...logs]; 32 | const output = runProcess(inspectBin, cmdArgs, cwd); 33 | return output; 34 | } 35 | } -------------------------------------------------------------------------------- /benchmarks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | HellaSwag: Can a Machine Really Finish Your Sentence? 3 | 4 | Rowan Zellers, Ari Holtzman, Yonatan Bisk, Ali Farhadi, Yejin Choi 5 | https://arxiv.org/abs/1905.07830 6 | """ 7 | 8 | from inspect_ai import Task, task 9 | from inspect_ai.dataset import Sample, hf_dataset 10 | from inspect_ai.scorer import answer 11 | from inspect_ai.solver import multiple_choice, system_message 12 | 13 | SYSTEM_MESSAGE = """ 14 | Choose the most plausible continuation for the story. 
15 | """ 16 | 17 | 18 | def record_to_sample(record): 19 | return Sample( 20 | input=record["ctx"], 21 | target=chr(ord("A") + int(record["label"])), 22 | choices=record["endings"], 23 | metadata=dict(source_id=record["source_id"]), 24 | ) 25 | 26 | 27 | @task 28 | def hellaswag(): 29 | # dataset 30 | dataset = hf_dataset( 31 | path="hellaswag", 32 | split="validation", 33 | sample_fields=record_to_sample, 34 | trust=True, 35 | shuffle=True, 36 | ) 37 | 38 | # define task 39 | return Task( 40 | dataset=dataset, 41 | plan=[system_message(SYSTEM_MESSAGE), multiple_choice()], 42 | scorer=answer("letter"), 43 | ) 44 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/AnsiDisplay.css: -------------------------------------------------------------------------------- 1 | 2 | .ansi-display { 3 | font-family: monospace; 4 | white-space: pre-wrap; 5 | --ansiBlack: #000000; 6 | --ansiRed: #cd3131; 7 | --ansiGreen: #00BC00; 8 | --ansiYellow: #949800; 9 | --ansiBlue: #0451a5; 10 | --ansiMagenta: #bc05bc; 11 | --ansiCyan: #0598bc; 12 | --ansiWhite: #555555; 13 | --ansiBrightBlack: #666666; 14 | --ansiBrightRed: #cd3131; 15 | --ansiBrightGreen: #14CE14; 16 | --ansiBrightYellow: #b5ba00; 17 | --ansiBrightBlue: #0451a5; 18 | --ansiBrightMagenta: #bc05bc; 19 | --ansiBrightCyan: #0598bc; 20 | --ansiBrightWhite: #a5a5a5; 21 | } 22 | 23 | .dark-mode .ansi-display { 24 | --ansiBlack: #000000; 25 | --ansiRed: #cd3131; 26 | --ansiGreen: #0DBC79; 27 | --ansiYellow: #e5e510; 28 | --ansiBlue: #2472c8; 29 | --ansiMagenta: #bc3fbc; 30 | --ansiCyan: #11a8cd; 31 | --ansiWhite: #e5e5e5; 32 | --ansiBrightBlack: #666666; 33 | --ansiBrightRed: #f14c4c; 34 | --ansiBrightGreen: #23d18b; 35 | --ansiBrightYellow: #f5f543; 36 | --ansiBrightBlue: #3b8eea; 37 | --ansiBrightMagenta: #d670d6; 38 | --ansiBrightCyan: #29b8db; 39 | --ansiBrightWhite: #e5e5e5; 40 | } 41 | 42 | @keyframes ansi-displaly-run-blink { 43 | 50% { 44 | opacity: 0; 45 | } 46 | } 47 | 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [](https://www.gov.uk/government/organisations/ai-safety-institute) 2 | 3 | Welcome to Inspect, a framework for large language model evaluations created by the [UK AI Safety Institute](https://www.gov.uk/government/organisations/ai-safety-institute). 4 | 5 | Inspect provides many built-in components, including facilities for prompt engineering, tool usage, multi-turn dialog, and model graded evaluations. Extensions to Inspect (e.g. to support new elicitation and scoring techniques) can be provided by other Python packages. 6 | 7 | To get started with Inspect, please see the documentation at . 8 | 9 | *** 10 | 11 | #### Development 12 | 13 | To work on development of Inspect, clone the repository and install with the `-e` flag and `[dev]` optional dependencies: 14 | 15 | ``` 16 | $ git clone https://github.com/UKGovernmentBEIS/inspect_ai.git 17 | $ cd inspect_ai 18 | $ pip install -e ".[dev]" 19 | ``` 20 | 21 | If you use VS Code, you should be sure to have installed the recommended extensions (Python, Ruff, and MyPy). Note that you'll be prompted to install these when you open the project in VS Code. 
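
To verify your development install end to end, you can run one of the example evaluations (for example, assuming you have OpenAI credentials configured):

```
$ inspect eval examples/security_guide.py --model openai/gpt-4
```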
22 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/main.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from inspect_ai._util.dotenv import init_dotenv 4 | 5 | from .. import __version__ 6 | from .eval import eval_command 7 | from .info import info_command 8 | from .list import list_command 9 | from .score import score_command 10 | from .view import view_command 11 | 12 | 13 | @click.group(invoke_without_command=True) 14 | @click.option( 15 | "--version", 16 | type=bool, 17 | is_flag=True, 18 | default=False, 19 | help="Print the Inspect version.", 20 | ) 21 | @click.pass_context 22 | def inspect(ctx: click.Context, version: bool) -> None: 23 | # if this was a subcommand then allow it to execute 24 | if ctx.invoked_subcommand is not None: 25 | return 26 | 27 | if version: 28 | print(__version__) 29 | ctx.exit() 30 | else: 31 | click.echo(ctx.get_help()) 32 | ctx.exit() 33 | 34 | 35 | inspect.add_command(eval_command) 36 | inspect.add_command(score_command) 37 | inspect.add_command(view_command) 38 | inspect.add_command(list_command) 39 | inspect.add_command(info_command) 40 | 41 | 42 | def main() -> None: 43 | init_dotenv() 44 | inspect(auto_envvar_prefix="INSPECT") 45 | 46 | 47 | if __name__ == "__main__": 48 | main() 49 | -------------------------------------------------------------------------------- /tests/test_list_task.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Callable 3 | 4 | from inspect_ai import TaskInfo, list_tasks 5 | 6 | TEST_TASKS_DIR = Path("tests/test_task_list") 7 | 8 | 9 | def list_test_tasks_dir( 10 | globs: list[str], filter: Callable[[TaskInfo], bool] | None = None 11 | ): 12 | return list_tasks(globs, filter=filter, root_dir=TEST_TASKS_DIR) 13 | 14 | 15 | def test_task_list_multiple_file(): 16 | tasks = list_test_tasks_dir(["multiple.py"]) 17 | assert len(tasks) == 2 18 | names = [task.name for task in tasks] 19 | assert "first" in names 20 | assert "second_task" in names 21 | 22 | 23 | def test_task_list_multiple_dir(): 24 | tasks = list_test_tasks_dir(["multiple_dir"]) 25 | assert len(tasks) == 2 26 | 27 | 28 | def test_task_list_attribs(): 29 | tasks = list_test_tasks_dir(["attribs.ipynb"]) 30 | assert tasks[0].attribs.get("light") is True 31 | assert tasks[0].attribs.get("type") == "bio" 32 | 33 | 34 | def test_task_list_filter(): 35 | tasks = list_test_tasks_dir(["*"], filter=lambda t: t.attribs.get("type") == "bio") 36 | assert len(tasks) == 1 37 | 38 | 39 | def test_task_list_recurse(): 40 | tasks = list_test_tasks_dir(["recurse"]) 41 | assert len(tasks) == 3 42 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/__init__.py: -------------------------------------------------------------------------------- 1 | from ._answer import AnswerPattern, answer 2 | from ._match import includes, match 3 | from ._metric import ( 4 | CORRECT, 5 | INCORRECT, 6 | NOANSWER, 7 | PARTIAL, 8 | Metric, 9 | Score, 10 | Value, 11 | ValueToFloat, 12 | metric, 13 | value_to_float, 14 | ) 15 | from ._metrics.accuracy import accuracy 16 | from ._metrics.mean import mean 17 | from ._metrics.std import bootstrap_std 18 | from ._model import model_graded_fact, model_graded_qa 19 | from ._multi import ScoreReducer, majority_vote, multi_scorer 20 | from ._pattern import pattern 21 | from ._scorer import ( 22 | Scorer, 23 | 
Target, 24 | scorer, 25 | ) 26 | 27 | __all__ = [ 28 | "includes", 29 | "match", 30 | "model_graded_qa", 31 | "model_graded_fact", 32 | "answer", 33 | "pattern", 34 | "AnswerPattern", 35 | "Scorer", 36 | "Target", 37 | "scorer", 38 | "accuracy", 39 | "bootstrap_std", 40 | "mean", 41 | "Metric", 42 | "metric", 43 | "Score", 44 | "Value", 45 | "ValueToFloat", 46 | "value_to_float", 47 | "CORRECT", 48 | "INCORRECT", 49 | "PARTIAL", 50 | "NOANSWER", 51 | "multi_scorer", 52 | "majority_vote", 53 | "ScoreReducer", 54 | ] 55 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/preact/htm/htm.mjs: -------------------------------------------------------------------------------- 1 | /* esm.sh - esbuild bundle(htm@3.1.1) es2022 production */ 2 | var a=function(p,f,c,n){var l;f[0]=0;for(var u=1;u=5&&((g||!v&&u===5)&&(i.push(u,0,g,l),u=6),v&&(i.push(u,v,0,l),u=6)),g=""},t=0;t"?(u=1,g=""):g=n+g[0]:o?n===o?o="":g+=n:n==='"'||n==="'"?o=n:n===">"?(s(),u=1):u&&(n==="="?(u=5,l=g,g=""):n==="/"&&(u<5||c[t][w+1]===">")?(s(),u===3&&(i=i[0]),u=i,(i=i[0]).push(2,0,u),u=0):n===" "||n===" "||n===` 3 | `||n==="\r"?(s(),u=2):g+=n),u===3&&g==="!--"&&(u=4,i=i[0])}return s(),i}(p)),f),arguments,[])).length>1?f:f[0]}export{b as default}; 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/api/api-vscode.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | import { webViewJsonRpcClient, kMethodEvalLog, kMethodEvalLogs, kMethodEvalLogHeaders } from "./jsonrpc.mjs"; 4 | 5 | const vscodeApi = window.acquireVsCodeApi ? window.acquireVsCodeApi() : undefined; 6 | 7 | const vscodeClient = webViewJsonRpcClient(vscodeApi) 8 | 9 | 10 | async function client_events() { 11 | return []; 12 | } 13 | 14 | async function eval_logs() { 15 | const response = await vscodeClient(kMethodEvalLogs, []); 16 | if (response) { 17 | return { 18 | log_dir: "", 19 | files: JSON5.parse(response) 20 | } 21 | } else { 22 | return undefined; 23 | } 24 | 25 | } 26 | 27 | async function eval_log(file, headerOnly) { 28 | const response = await vscodeClient(kMethodEvalLog, [file, headerOnly]); 29 | if (response) { 30 | return JSON5.parse(response); 31 | } else { 32 | return undefined; 33 | } 34 | } 35 | 36 | async function eval_log_headers(files) { 37 | const response = await vscodeClient(kMethodEvalLogHeaders, [files]); 38 | if (response) { 39 | return JSON5.parse(response); 40 | } else { 41 | return undefined; 42 | } 43 | } 44 | 45 | 46 | export default { 47 | client_events, 48 | eval_logs, 49 | eval_log, 50 | eval_log_headers 51 | } 52 | 53 | -------------------------------------------------------------------------------- /benchmarks/boolq.py: -------------------------------------------------------------------------------- 1 | """ 2 | BoolQ 3 | 4 | Exploring the Surprising Difficulty of Natural Yes/No Questions 5 | Christopher Clark, Kenton Lee, Ming-Wei Chang, Tom Kwiatkowski, Michael Collins, 6 | Kristina Toutanova 7 | https://arxiv.org/abs/1905.10044 8 | 9 | # Run against validations boolq dataset 10 | inspect eval boolq.py 11 | """ 12 | 13 | from inspect_ai import Task, task 14 | from inspect_ai.dataset import Sample, hf_dataset 15 | from inspect_ai.scorer import pattern 16 | from inspect_ai.solver import generate, prompt_template 17 | 18 | TEMPLATE = r""" 19 | Answer the following question with either Yes or No. Include nothing else in your response. 
20 | 21 | Question: {prompt} 22 | """ 23 | 24 | 25 | def record_to_sample(record): 26 | if record["answer"]: 27 | target = "Yes" 28 | else: 29 | target = "No" 30 | 31 | return Sample(input=record["question"], target=target) 32 | 33 | 34 | @task 35 | def boolq(): 36 | dataset = hf_dataset( 37 | path="boolq", 38 | sample_fields=record_to_sample, 39 | split="validation", 40 | shuffle=True, 41 | ) 42 | 43 | return Task( 44 | dataset=dataset, 45 | plan=[prompt_template(template=TEMPLATE), generate()], 46 | scorer=pattern(r"(Yes|No).?\Z"), 47 | ) 48 | -------------------------------------------------------------------------------- /tools/vscode/.eslintrc.json: -------------------------------------------------------------------------------- 1 | { 2 | "root": true, 3 | "parser": "@typescript-eslint/parser", 4 | "parserOptions": { 5 | "ecmaVersion": 2020, 6 | "sourceType": "module", 7 | "project": "./tsconfig.json" 8 | }, 9 | "plugins": [ 10 | "@typescript-eslint" 11 | ], 12 | "extends": [ 13 | "eslint:recommended", 14 | "plugin:@typescript-eslint/recommended", 15 | "plugin:@typescript-eslint/recommended-requiring-type-checking" 16 | ], 17 | "rules": { 18 | "@typescript-eslint/naming-convention": [ 19 | "warn", 20 | { 21 | "selector": "import", 22 | "format": [ "camelCase", "PascalCase" ] 23 | } 24 | ], 25 | "@typescript-eslint/semi": "warn", 26 | "curly": "warn", 27 | "eqeqeq": "warn", 28 | "no-throw-literal": "warn", 29 | "semi": "off" 30 | }, 31 | "ignorePatterns": [ 32 | "out", 33 | "dist", 34 | "**/*.d.ts", 35 | "src/providers/activity-bar/webview/env-config-webview.ts", 36 | "src/providers/activity-bar/webview/task-config-webview.ts", 37 | "src/providers/activity-bar/webview/webview-utils.ts", 38 | "tools/**" 39 | ] 40 | } -------------------------------------------------------------------------------- /tests/test_images.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from test_helpers.utils import ( 4 | skip_if_no_anthropic, 5 | skip_if_no_google, 6 | skip_if_no_openai, 7 | ) 8 | 9 | from inspect_ai import Task, eval, task 10 | from inspect_ai.dataset import json_dataset 11 | from inspect_ai.scorer import match 12 | from inspect_ai.solver import generate, system_message 13 | 14 | SYSTEM_MESSAGE = """ 15 | For the following exercise, it is important that you answer with only a single word or numeric value in brackets. For example, [22] or [house]. Do not include any discussion, narrative, or rationale, just a single value in brackets. 
16 | """ 17 | 18 | 19 | @task 20 | def images(): 21 | return Task( 22 | dataset=json_dataset(os.path.join("tests", "test_images", "images.jsonl")), 23 | plan=[system_message(SYSTEM_MESSAGE), generate()], 24 | scorer=match(), 25 | ) 26 | 27 | 28 | def check_images(model): 29 | eval(images, model) 30 | 31 | 32 | @skip_if_no_google 33 | def test_google_images(): 34 | check_images("google/gemini-pro-vision") 35 | 36 | 37 | @skip_if_no_openai 38 | def test_openai_images(): 39 | check_images("openai/gpt-4") 40 | 41 | 42 | @skip_if_no_anthropic 43 | def test_anthropic_images(): 44 | check_images("anthropic/claude-3-sonnet-20240229") 45 | -------------------------------------------------------------------------------- /.github/workflows/pypi.yml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | publish-release: 7 | description: "Production Release" 8 | required: false 9 | type: boolean 10 | default: false 11 | 12 | jobs: 13 | publish: 14 | name: Publish 15 | runs-on: ubuntu-latest 16 | environment: pypi 17 | strategy: 18 | fail-fast: false 19 | permissions: 20 | id-token: write 21 | steps: 22 | - name: Checkout 23 | uses: actions/checkout@v4 24 | with: 25 | fetch-depth: 0 26 | - name: Set up Python 27 | uses: actions/setup-python@v5 28 | with: 29 | python-version: "3.x" 30 | - name: Install pypa/build 31 | run: >- 32 | python3 -m 33 | pip install 34 | build 35 | --user 36 | - name: Build 37 | run: python -m build 38 | - name: Publish package to TestPyPI 39 | uses: pypa/gh-action-pypi-publish@release/v1 40 | if: ${{ ! inputs.publish-release }} 41 | with: 42 | repository-url: https://test.pypi.org/legacy/ 43 | - name: Publish package to PyPI 44 | uses: pypa/gh-action-pypi-publish@release/v1 45 | if: ${{ inputs.publish-release }} 46 | -------------------------------------------------------------------------------- /tests/test_retry.py: -------------------------------------------------------------------------------- 1 | from random import random 2 | 3 | from test_helpers.utils import skip_if_no_openai 4 | 5 | from inspect_ai import Task, eval, eval_retry, task 6 | from inspect_ai.dataset import Sample 7 | from inspect_ai.scorer import match 8 | from inspect_ai.solver import Generate, TaskState, generate, solver 9 | 10 | 11 | @solver 12 | def failing_solver(): 13 | async def solve(state: TaskState, generate: Generate): 14 | if random() > 0.33: 15 | raise ValueError("Eval failed!") 16 | 17 | return state 18 | 19 | return solve 20 | 21 | 22 | @task 23 | def failing_task(): 24 | return Task( 25 | dataset=[Sample(input="Say hello", target="hello")], 26 | plan=[failing_solver(), generate()], 27 | scorer=match(), 28 | ) 29 | 30 | 31 | @skip_if_no_openai 32 | def test_eval_retry(): 33 | # run eval with a solver that fails 2/3 times 34 | failing_eval = f"{__file__}@failing_task" 35 | log = eval(failing_eval, limit=1)[0] 36 | 37 | # note the task id so we can be certain it remains the same 38 | task_id = log.eval.task_id 39 | 40 | # retry until we succeed (confirming the task_id is stable) 41 | while log.status != "success": 42 | log = eval_retry(log)[0] 43 | assert log.eval.task_id == task_id 44 | -------------------------------------------------------------------------------- /src/inspect_ai/_eval/task/util.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | from typing import cast 4 | 5 | from inspect_ai._util.path import 
cwd_relative_path 6 | from inspect_ai.dataset import Sample 7 | from inspect_ai.model import ChatMessage, ChatMessageUser 8 | from inspect_ai.solver import TaskState 9 | 10 | from ..types import Task 11 | from .constants import TASK_FILE_ATTR, TASK_RUN_DIR_ATTR 12 | 13 | 14 | def sample_messages(sample: Sample) -> list[ChatMessage]: 15 | if isinstance(sample.input, str): 16 | return [ChatMessageUser(content=sample.input, source="input")] 17 | else: 18 | messages = deepcopy(sample.input) 19 | for message in messages: 20 | message.source = "input" 21 | return messages 22 | 23 | 24 | def has_max_messages(state: TaskState, max_messages: int | None) -> bool: 25 | return max_messages is not None and (len(state.messages) >= max_messages) 26 | 27 | 28 | def task_run_dir(task: Task) -> str: 29 | return getattr(task, TASK_RUN_DIR_ATTR, os.getcwd()) 30 | 31 | 32 | def task_file(task: Task, relative: bool = False) -> str | None: 33 | file = cast(str | None, getattr(task, TASK_FILE_ATTR, None)) 34 | if file: 35 | if relative: 36 | return cwd_relative_path(file) 37 | else: 38 | return file 39 | else: 40 | return None 41 | -------------------------------------------------------------------------------- /src/inspect_ai/model/__init__.py: -------------------------------------------------------------------------------- 1 | # ruff: noqa: F401 F403 F405 2 | 3 | from ._model import ( 4 | ChatCompletionChoice, 5 | ChatMessage, 6 | ChatMessageAssistant, 7 | ChatMessageSystem, 8 | ChatMessageTool, 9 | ChatMessageUser, 10 | Content, 11 | ContentImage, 12 | ContentText, 13 | GenerateConfig, 14 | GenerateConfigArgs, 15 | Logprob, 16 | Logprobs, 17 | Model, 18 | ModelAPI, 19 | ModelName, 20 | ModelOutput, 21 | ModelUsage, 22 | StopReason, 23 | TopLogprob, 24 | get_model, 25 | ) 26 | from ._providers.providers import * 27 | from ._registry import modelapi 28 | from ._tool import ToolCall, ToolChoice, ToolFunction, ToolInfo, ToolParam 29 | 30 | __all__ = [ 31 | "GenerateConfig", 32 | "GenerateConfigArgs", 33 | "ContentText", 34 | "ContentImage", 35 | "Content", 36 | "ChatMessage", 37 | "ChatMessageSystem", 38 | "ChatMessageUser", 39 | "ChatMessageAssistant", 40 | "ChatMessageTool", 41 | "ChatCompletionChoice", 42 | "ModelOutput", 43 | "Logprobs", 44 | "Logprob", 45 | "TopLogprob", 46 | "Model", 47 | "ModelAPI", 48 | "ModelName", 49 | "ModelUsage", 50 | "StopReason", 51 | "ToolCall", 52 | "ToolChoice", 53 | "ToolFunction", 54 | "ToolInfo", 55 | "ToolParam", 56 | "ToolType", 57 | "get_model", 58 | "modelapi", 59 | ] 60 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/images.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import mimetypes 3 | 4 | import httpx 5 | 6 | from .file import file 7 | from .url import ( 8 | data_uri_mime_type, 9 | data_uri_to_base64, 10 | is_data_uri, 11 | is_http_url, 12 | ) 13 | 14 | 15 | async def image_as_data(image: str) -> tuple[bytes, str]: 16 | if is_data_uri(image): 17 | # resolve mime type and base64 content 18 | mime_type = data_uri_mime_type(image) or "image/png" 19 | image_base64 = data_uri_to_base64(image) 20 | image_bytes = base64.b64decode(image_base64) 21 | else: 22 | # guess mime type 23 | type, _ = mimetypes.guess_type(image) 24 | if type: 25 | mime_type = type 26 | else: 27 | mime_type = "image/png" 28 | 29 | # handle url or file 30 | if is_http_url(image): 31 | client = httpx.AsyncClient() 32 | image_bytes = (await client.get(image)).content 33 | else: 34 | with 
file(image, "rb") as f: 35 | image_bytes = f.read() 36 | 37 | # return bytes and type 38 | return image_bytes, mime_type 39 | 40 | 41 | async def image_as_data_uri(image: str) -> str: 42 | bytes, mime_type = await image_as_data(image) 43 | base64_image = base64.b64encode(bytes).decode("utf-8") 44 | image = f"data:{mime_type};base64,{base64_image}" 45 | return image 46 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/inspect/inspect-eval-commands.ts: -------------------------------------------------------------------------------- 1 | import { Uri } from "vscode"; 2 | import { Command } from "../../core/command"; 3 | import { InspectEvalManager } from "./inspect-eval"; 4 | import { toAbsolutePath } from "../../core/path"; 5 | import { scheduleFocusActiveEditor } from "../../components/focus"; 6 | 7 | export function inspectEvalCommands(manager: InspectEvalManager): Command[] { 8 | return [new RunEvalCommand(manager), new DebugEvalCommand(manager)]; 9 | } 10 | 11 | export class RunEvalCommand implements Command { 12 | constructor(private readonly manager_: InspectEvalManager) { } 13 | async execute(documentUri: Uri, fnName: string): Promise { 14 | const cwd = toAbsolutePath(documentUri.fsPath); 15 | 16 | const evalPromise = this.manager_.startEval(cwd, fnName, false); 17 | scheduleFocusActiveEditor(); 18 | await evalPromise; 19 | } 20 | private static readonly id = "inspect.runTask"; 21 | public readonly id = RunEvalCommand.id; 22 | } 23 | 24 | export class DebugEvalCommand implements Command { 25 | constructor(private readonly manager_: InspectEvalManager) { } 26 | async execute(documentUri: Uri, fnName: string): Promise { 27 | const cwd = toAbsolutePath(documentUri.fsPath); 28 | await this.manager_.startEval(cwd, fnName, true); 29 | } 30 | private static readonly id = "inspect.debugTask"; 31 | public readonly id = DebugEvalCommand.id; 32 | } 33 | 34 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_metrics/std.py: -------------------------------------------------------------------------------- 1 | from logging import getLogger 2 | from typing import cast 3 | 4 | import numpy as np 5 | 6 | from .._metric import ( 7 | Metric, 8 | Score, 9 | ValueToFloat, 10 | metric, 11 | value_to_float, 12 | ) 13 | 14 | logger = getLogger(__name__) 15 | 16 | 17 | @metric 18 | def bootstrap_std( 19 | num_samples: int = 1000, to_float: ValueToFloat = value_to_float() 20 | ) -> Metric: 21 | """Standard deviation of a bootstrapped estimate of the mean. 22 | 23 | Args: 24 | num_samples (int): Number of bootstrap samples to take. 25 | to_float (ValueToFloat): Function for mapping 26 | Value to float for computing metrics. The default 27 | `value_to_float()` maps CORRECT ("C") to 1.0, 28 | INCORRECT ("I") to 0, PARTIAL ("P") to 0.5, and 29 | NOANSWER ("N") to 0, casts numeric values to 30 | float directly, and prints a warning and returns 31 | 0 if the Value is a complex object (list or dict). 
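            For example, under the default mapping a list of scores with
            values ["C", "I", "C", "C"] becomes [1.0, 0.0, 1.0, 1.0]
            before the bootstrap resampling is applied.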
32 | 33 | Returns: 34 | bootstrap_std metric 35 | """ 36 | 37 | def metric(scores: list[Score]) -> float: 38 | values = [to_float(score.value) for score in scores] 39 | std = np.std( 40 | [ 41 | np.mean(np.random.choice(values, len(values), replace=True)) 42 | for _ in range(num_samples) 43 | ] 44 | ) 45 | return cast(float, std.item()) 46 | 47 | return metric 48 | -------------------------------------------------------------------------------- /tests/test_hf.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_github_action 3 | from transformers import PreTrainedModel # type: ignore 4 | 5 | from inspect_ai.model import ( 6 | ChatMessageUser, 7 | GenerateConfig, 8 | get_model, 9 | ) 10 | 11 | 12 | @pytest.fixture 13 | def model() -> PreTrainedModel: 14 | return get_model( 15 | "hf/EleutherAI/pythia-70m", 16 | config=GenerateConfig( 17 | max_tokens=1, 18 | seed=42, 19 | temperature=0.01, 20 | ), 21 | # this allows us to run base models with the chat message scaffolding: 22 | chat_template="{% for message in messages %}{{ message.content }}{% endfor %}", 23 | ) 24 | 25 | 26 | @pytest.mark.asyncio 27 | @skip_if_github_action 28 | async def test_hf_api(model: PreTrainedModel) -> None: 29 | message = ChatMessageUser(content="Lorem ipsum dolor") 30 | response = await model.generate(input=[message]) 31 | assert len(response.completion) >= 1 32 | 33 | 34 | @pytest.mark.asyncio 35 | @skip_if_github_action 36 | async def test_hf_api_fails(model: PreTrainedModel) -> None: 37 | temp_before = model.config.temperature 38 | try: 39 | model.config.temperature = 0.0 40 | 41 | message = ChatMessageUser(content="Lorem ipsum dolor") 42 | with pytest.raises(Exception): 43 | await model.generate(input=[message]) 44 | finally: 45 | model.config.temperature = temp_before 46 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/utils/events.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | // Returns a function, that, when invoked, will only be triggered at most once 5 | // during a given window of time. Normally, the throttled function will run 6 | // as much as it can, without ever going more than once per `wait` duration; 7 | // but if you'd like to disable the execution on the leading edge, pass 8 | // `{leading: false}`. To disable execution on the trailing edge, ditto. 9 | export function throttle(func, wait, options) { 10 | var context, args, result; 11 | var timeout = null; 12 | var previous = 0; 13 | if (!options) options = {}; 14 | var later = function() { 15 | previous = options.leading === false ? 
0 : Date.now(); 16 | timeout = null; 17 | result = func.apply(context, args); 18 | if (!timeout) context = args = null; 19 | }; 20 | return function() { 21 | var now = Date.now(); 22 | if (!previous && options.leading === false) previous = now; 23 | var remaining = wait - (now - previous); 24 | context = this; 25 | args = arguments; 26 | if (remaining <= 0 || remaining > wait) { 27 | if (timeout) { 28 | clearTimeout(timeout); 29 | timeout = null; 30 | } 31 | previous = now; 32 | result = func.apply(context, args); 33 | if (!timeout) context = args = null; 34 | } else if (!timeout && options.trailing !== false) { 35 | timeout = setTimeout(later, remaining); 36 | } 37 | return result; 38 | }; 39 | }; -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/MorePopOver.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { useEffect, useRef } from "preact/hooks"; 3 | 4 | import { icons, sharedStyles } from "../Constants.mjs"; 5 | 6 | export const MorePopOver = ({ title, customClass, children }) => { 7 | const popoverRef = useRef(); 8 | const contentRef = useRef(); 9 | 10 | // Initialize the popover 11 | useEffect(() => { 12 | const contentEl = contentRef.current; 13 | const popOverContent = document.createElement("div"); 14 | contentEl.childNodes.forEach((child) => 15 | popOverContent.appendChild(child.cloneNode(true)) 16 | ); 17 | new bootstrap.Popover(popoverRef.current, { 18 | content: popOverContent, 19 | title, 20 | html: true, 21 | customClass: customClass, 22 | trigger: "focus", 23 | }); 24 | }, [popoverRef, contentRef]); 25 | 26 | const popoverElements = []; 27 | 28 | // The popover display button 29 | popoverElements.push(html` 30 | 40 | `); 41 | 42 | // A container to hold the popover contents 43 | popoverElements.push(html`
44 | ${children} 45 |
`); 46 | 47 | return popoverElements; 48 | }; -------------------------------------------------------------------------------- /tools/vscode/src/components/document.ts: -------------------------------------------------------------------------------- 1 | import { Position, Selection, TextDocument, Uri, workspace } from "vscode"; 2 | import { readTaskData } from "./task"; 3 | 4 | 5 | // Provides a Selection for a task with a document 6 | export const taskRangeForDocument = async (task: string, documentUri: Uri) => { 7 | const taskDatas = await tasksForDocument(documentUri); 8 | 9 | // Find the task that matches the name (or just select the first task) 10 | const taskData = taskDatas.find((data) => { 11 | return data.name === task; 12 | }); 13 | 14 | // If the task is within this document, find its position 15 | if (taskData) { 16 | const position = new Position(taskData.line + 1, 0); 17 | return new Selection(position, position); 18 | } 19 | }; 20 | 21 | export const firstTaskRangeForDocument = async (documentUri: Uri) => { 22 | 23 | const taskDatas = await tasksForDocument(documentUri); 24 | if (taskDatas.length > 0) { 25 | const position = new Position(taskDatas[0].line + 1, 0); 26 | return new Selection(position, position); 27 | } 28 | }; 29 | 30 | // Provides a list of task DocumentSymbols for a document 31 | const tasksForDocument = async (documentUri: Uri) => { 32 | const document = await workspace.openTextDocument(documentUri); 33 | const tasks = readTaskData(document); 34 | return tasks; 35 | }; 36 | 37 | 38 | export const documentHasTasks = (document: TextDocument) => { 39 | const tasks = readTaskData(document); 40 | return tasks.length > 0; 41 | }; 42 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/favicon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/ErrorPanel.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { icons } from "../Constants.mjs"; 3 | 4 | export const ErrorPanel = ({ id, classes, title, error }) => { 5 | const emptyStyle = { 6 | display: "flex", 7 | flex: "0 0 content", 8 | alignItems: "center", 9 | justifyContent: "center", 10 | }; 11 | const message = error.message; 12 | const stack = error.stack; 13 | return html` 14 |
19 |
20 |
21 | 22 |
23 |
${title || ""}
24 |
25 |
36 |
37 | Error: ${message || ""} 38 |
39 |             
40 |               at ${stack}
41 |             
42 |           
43 |
44 |
45 |
46 | `; 47 | }; 48 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/active-task/active-task-command.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { toAbsolutePath } from "../../core/path"; 3 | import { InspectEvalManager } from "../inspect/inspect-eval"; 4 | import { ActiveTaskManager } from "./active-task-provider"; 5 | 6 | 7 | 8 | export class RunActiveTaskCommand implements Command { 9 | constructor(private readonly manager_: ActiveTaskManager, 10 | private readonly inspectMgr_: InspectEvalManager 11 | ) { } 12 | async execute(): Promise { 13 | const taskInfo = this.manager_.getActiveTaskInfo(); 14 | if (taskInfo) { 15 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 16 | await this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, false); 17 | } 18 | } 19 | 20 | private static readonly id = "inspect.runActiveTask"; 21 | public readonly id = RunActiveTaskCommand.id; 22 | } 23 | 24 | export class DebugActiveTaskCommand implements Command { 25 | constructor(private readonly manager_: ActiveTaskManager, 26 | private readonly inspectMgr_: InspectEvalManager 27 | ) { } 28 | async execute(): Promise { 29 | const taskInfo = this.manager_.getActiveTaskInfo(); 30 | if (taskInfo) { 31 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 32 | await this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, true); 33 | } 34 | } 35 | 36 | private static readonly id = "inspect.debugActiveTask"; 37 | public readonly id = DebugActiveTaskCommand.id; 38 | } 39 | 40 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/MarkdownDiv.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | showdown.setOption('simpleLineBreaks', true); 4 | showdown.setOption('literalMidWordUnderscores', true); 5 | const converter = new showdown.Converter(); 6 | 7 | 8 | export const MarkdownDiv = (props) => { 9 | const { markdown, style } = props; 10 | 11 | // Escape all HTML tags 12 | const escaped = DOMPurify.sanitize(markdown, { ALLOWED_TAGS: []}); 13 | 14 | // Pre-render any text that isn't handled by markdown 15 | const preRendered = preRenderText(escaped); 16 | const renderedHtml = converter.makeHtml(preRendered); 17 | 18 | // Return the rendered markdown 19 | const markup = { __html: renderedHtml }; 20 | return html`
`; 21 | }; 22 | 23 | 24 | const kLetterListPattern = /^([a-zA-Z][\)\.]\s.*?)$/gm; 25 | const kCommonmarkReferenceLinkPattern = /\[(.*)\]\:( +.+)/g; 26 | 27 | 28 | const preRenderText = (txt) => { 29 | // Special handling for ordered lists that look like 30 | // multiple choice (e.g. a), b), c), d) etc..) 31 | const rendered = txt.replaceAll(kLetterListPattern, "

$1

"); 32 | 33 | // Special handling for commonmark like reference links which might 34 | // look like: 35 | // [alias]: http://www.google.com 36 | // but text like: 37 | // [expert]: answer 38 | // Also fools this 39 | return rendered.replaceAll(kCommonmarkReferenceLinkPattern, "\[$1\]:$2"); 40 | }; -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/usage/ModelTokenTable.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const ModelTokenTable = ({ model_usage }) => { 4 | return html` 5 | <${TokenTable}> 6 | <${TokenHeader}/> 7 | 8 | ${Object.keys(model_usage).map((key) => { 9 | const vals = Object.values(model_usage[key]); 10 | return html`<${TokenRow} model=${key} values=${vals} />`; 11 | })} 12 | 13 | 14 | `; 15 | }; 16 | 17 | const TokenTable = ({ children }) => { 18 | return html` 22 | ${children} 23 |
`; 24 | }; 25 | 26 | const thStyle = {padding: 0, fontSize: "0.7rem", fontWeight: 400, textTransform: "uppercase"} 27 | 28 | const TokenHeader = () => { 29 | return html` 30 | 31 | 32 | 38 | Tokens 39 | 40 | 41 | 42 | Model 43 | Input 44 | Output 45 | Total 46 | 47 | `; 48 | }; 49 | 50 | const TokenRow = ({ model, values }) => { 51 | return html` 52 | ${model} 53 | ${values.map((val) => { 54 | return html`${val.toLocaleString()}`; 55 | })} 56 | `; 57 | }; 58 | -------------------------------------------------------------------------------- /tests/test_plan.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import skip_if_no_openai 3 | 4 | from inspect_ai import Task, eval_async 5 | from inspect_ai._util.registry import registry_info 6 | from inspect_ai.dataset import Sample 7 | from inspect_ai.solver import ( 8 | Generate, 9 | Plan, 10 | TaskState, 11 | chain_of_thought, 12 | generate, 13 | plan, 14 | solver, 15 | ) 16 | 17 | 18 | @plan(fancy=True) 19 | def my_plan() -> Plan: 20 | return Plan(steps=[chain_of_thought(), generate()]) 21 | 22 | 23 | @skip_if_no_openai 24 | @pytest.mark.asyncio 25 | async def test_plan_cleanup(): 26 | @solver 27 | def failing_solver(): 28 | async def solve(state: TaskState, generate: Generate): 29 | raise ValueError("Eval failed!") 30 | 31 | return solve 32 | 33 | cleaned_up = False 34 | 35 | def cleanup(state): 36 | nonlocal cleaned_up 37 | cleaned_up = True 38 | 39 | task = Task( 40 | dataset=[Sample(input="Say hello.", target="Hello")], 41 | plan=Plan( 42 | steps=[chain_of_thought(), failing_solver(), generate()], cleanup=cleanup 43 | ), 44 | ) 45 | 46 | result = await eval_async(task, model="openai/gpt-4") 47 | 48 | assert result[0].status == "error" 49 | assert cleaned_up 50 | 51 | 52 | def test_plan_registration(): 53 | plan = my_plan() 54 | assert registry_info(plan).name == "my_plan" 55 | 56 | 57 | def test_plan_attribs(): 58 | plan = my_plan() 59 | assert registry_info(plan).metadata["attribs"]["fancy"] is True 60 | -------------------------------------------------------------------------------- /src/inspect_ai/model/_tool.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import ( 3 | Any, 4 | Literal, 5 | Union, 6 | ) 7 | 8 | from inspect_ai._util.json import JSONType 9 | 10 | 11 | @dataclass 12 | class ToolParam: 13 | name: str 14 | """Parameter name.""" 15 | 16 | type: JSONType 17 | """JSON type of parameter.""" 18 | 19 | description: str 20 | """Description of parameter.""" 21 | 22 | optional: bool 23 | """Is the parameter optional""" 24 | 25 | 26 | @dataclass 27 | class ToolInfo: 28 | name: str 29 | """Tool name.""" 30 | 31 | description: str 32 | """Tool description.""" 33 | 34 | params: list[ToolParam] 35 | """Tool parameters""" 36 | 37 | 38 | @dataclass 39 | class ToolCall: 40 | id: str 41 | """Unique identifier for tool call.""" 42 | 43 | function: str 44 | """Function called.""" 45 | 46 | arguments: dict[str, Any] 47 | """Arguments to function.""" 48 | 49 | type: Literal["function"] 50 | """Type of tool call (currently only 'function')""" 51 | 52 | parse_error: str | None = field(default=None) 53 | """Error which occurred parsing tool call.""" 54 | 55 | 56 | @dataclass 57 | class ToolFunction: 58 | name: str 59 | """The name of the function to call.""" 60 | 61 | 62 | ToolChoice = Union[Literal["auto", "any", "none"], ToolFunction] 63 | """Specify which tool to call. 
64 | 65 | "auto" means the model decides; "any" means use at least one tool, 66 | "none" means never call a tool; ToolFunction instructs the model 67 | to call a specific function. 68 | """ 69 | -------------------------------------------------------------------------------- /tests/test_stop_reason.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import ( 3 | skip_if_no_anthropic, 4 | skip_if_no_mistral, 5 | skip_if_no_openai, 6 | skip_if_no_together, 7 | ) 8 | 9 | from inspect_ai.model import GenerateConfig, ModelOutput, get_model 10 | 11 | 12 | async def generate(model_name) -> ModelOutput: 13 | model = get_model(model_name) 14 | return await model.generate(input="Hello.") 15 | 16 | 17 | async def generate_token_limit(model_name) -> ModelOutput: 18 | model = get_model(model_name) 19 | return await model.generate( 20 | input="Tell me a story.", config=GenerateConfig(max_tokens=2) 21 | ) 22 | 23 | 24 | async def check_stop_reason(model_name): 25 | response = await generate(model_name) 26 | assert response.choices[0].stop_reason == "stop" 27 | 28 | response = await generate_token_limit(model_name) 29 | assert response.choices[0].stop_reason == "length" 30 | 31 | 32 | @pytest.mark.asyncio 33 | @skip_if_no_openai 34 | async def test_openai_stop_reason() -> None: 35 | await check_stop_reason("openai/gpt-3.5-turbo") 36 | 37 | 38 | @pytest.mark.asyncio 39 | @skip_if_no_anthropic 40 | async def test_anthropic_stop_reason() -> None: 41 | await check_stop_reason("anthropic/claude-3-haiku-20240307") 42 | 43 | 44 | @pytest.mark.asyncio 45 | @skip_if_no_mistral 46 | async def test_mistral_stop_reason() -> None: 47 | await check_stop_reason("mistral/mistral-medium-latest") 48 | 49 | 50 | @pytest.mark.asyncio 51 | @skip_if_no_together 52 | async def test_together_stop_reason() -> None: 53 | await check_stop_reason("together/google/gemma-2b-it") 54 | -------------------------------------------------------------------------------- /benchmarks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PIQA (Physical Interaction: Question Answering) 3 | 4 | Reasoning about Physical Commonsense in Natural Language 5 | Yonatan Bisk, Rowan Zellers, Ronan Le Bras, Jianfeng Gao, Yejin Choi 6 | https://arxiv.org/abs/1911.11641 7 | 8 | # eval piqa validation set 9 | inspect eval piq.py 10 | """ 11 | 12 | from inspect_ai import Task, task 13 | from inspect_ai.dataset import Sample, hf_dataset 14 | from inspect_ai.scorer import answer 15 | from inspect_ai.solver import multiple_choice 16 | 17 | 18 | def record_to_sample(record): 19 | return Sample( 20 | input=record["goal"], 21 | target="A" if record["label"] == 0 else "B", 22 | choices=[record["sol1"], record["sol2"]], 23 | ) 24 | 25 | 26 | TEMPLATE = r""" 27 | The entire content of your response should be of the following format: 'ANSWER: 28 | $LETTER' (without quotes) where LETTER is one of {letters}. 29 | 30 | Given either a question or a statement followed by two possible solutions 31 | labelled A and B, choose the most appropriate solution. If a question is given, 32 | the solutions answer the question. If a statement is given, the solutions 33 | explain how to achieve the statement. 
34 | 35 | {question} 36 | 37 | {choices} 38 | """.strip() 39 | 40 | 41 | @task 42 | def piqa(): 43 | dataset = hf_dataset( 44 | path="piqa", 45 | sample_fields=record_to_sample, 46 | trust=True, 47 | split="validation", 48 | shuffle=True, 49 | ) 50 | 51 | return Task( 52 | dataset=dataset, 53 | plan=[multiple_choice(template=TEMPLATE)], 54 | scorer=answer("letter"), 55 | ) 56 | -------------------------------------------------------------------------------- /benchmarks/arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge 3 | 4 | Peter Clark, Isaac Cowhey, Oren Etzioni, Tushar Khot, Ashish Sabharwal, Carissa Schoenick, Oyvind Tafjord 5 | https://arxiv.org/abs/1803.05457 6 | 7 | # run all subsets 8 | inspect eval arc.py 9 | 10 | # run specific subsets 11 | inspect eval arc.py@arc_easy 12 | inspect eval arc.py@arc_challenge 13 | """ 14 | 15 | from inspect_ai import Task, task 16 | from inspect_ai.dataset import Sample, hf_dataset 17 | from inspect_ai.scorer import answer 18 | from inspect_ai.solver import multiple_choice 19 | 20 | 21 | def record_to_sample(record): 22 | # read the labels and text 23 | choices = record["choices"] 24 | choices = dict(zip(choices["label"], choices["text"])) 25 | 26 | # determine the target then normalize to letter 27 | answerKey = record["answerKey"] 28 | target = list(choices.keys()).index(answerKey) 29 | target = chr(ord("A") + int(target)) 30 | 31 | # return sample 32 | return Sample( 33 | input=record["question"], choices=list(choices.values()), target=target 34 | ) 35 | 36 | 37 | def arc_task(dataset_name): 38 | return Task( 39 | dataset=hf_dataset( 40 | path="allenai/ai2_arc", 41 | name=dataset_name, 42 | split="test", 43 | sample_fields=record_to_sample, 44 | ), 45 | plan=multiple_choice(), 46 | scorer=answer("letter"), 47 | ) 48 | 49 | 50 | @task 51 | def arc_easy(): 52 | return arc_task("ARC-Easy") 53 | 54 | 55 | @task 56 | def arc_challenge(): 57 | return arc_task("ARC-Challenge") 58 | -------------------------------------------------------------------------------- /src/inspect_ai/_display/_display.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import contextlib 3 | from dataclasses import dataclass 4 | from types import TracebackType 5 | from typing import Any, Iterator, Type 6 | 7 | from inspect_ai.log import EvalConfig, EvalError, EvalResults, EvalStats 8 | from inspect_ai.model import GenerateConfig, ModelName 9 | 10 | 11 | class Progress(abc.ABC): 12 | @abc.abstractmethod 13 | def update(self, n: float = 1) -> None: ... 14 | 15 | 16 | class TaskDisplay(abc.ABC): 17 | @abc.abstractmethod 18 | @contextlib.contextmanager 19 | def progress(self, total: int) -> Iterator[Progress]: ... 20 | 21 | @abc.abstractmethod 22 | def summary(self, results: EvalResults, stats: EvalStats) -> None: ... 23 | 24 | @abc.abstractmethod 25 | def error( 26 | self, 27 | error: EvalError, 28 | exc_type: Type[Any], 29 | exc_value: BaseException, 30 | traceback: TracebackType | None, 31 | ) -> None: ... 
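# Illustrative sketch only (not part of this module's API): the simplest
# conforming Progress just accumulates update() calls; real implementations
# typically forward them to an on-screen progress bar.
class _CountingProgress(Progress):
    def __init__(self) -> None:
        self.completed: float = 0.0

    def update(self, n: float = 1) -> None:
        self.completed += n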
32 | 33 | 34 | @dataclass 35 | class TaskProfile: 36 | name: str 37 | sequence: tuple[int, int] 38 | model: ModelName 39 | dataset: str 40 | scorer: str 41 | samples: int 42 | eval_config: EvalConfig 43 | task_args: dict[str, Any] 44 | generate_config: GenerateConfig 45 | log_location: str 46 | 47 | 48 | class Display(abc.ABC): 49 | @abc.abstractmethod 50 | def print(self, message: str) -> None: ... 51 | 52 | @abc.abstractmethod 53 | @contextlib.contextmanager 54 | def progress(self, total: int) -> Iterator[Progress]: ... 55 | 56 | @abc.abstractmethod 57 | @contextlib.contextmanager 58 | def task(self, profile: TaskProfile) -> Iterator[TaskDisplay]: ... 59 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/json.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | JSONType = Literal["string", "integer", "number", "boolean", "array", "object", "null"] 4 | 5 | PythonType = Literal["str", "int", "float", "bool", "list", "dict", "None"] 6 | 7 | 8 | def python_type_to_json_type(python_type: str | None) -> JSONType: 9 | match python_type: 10 | case "str": 11 | return "string" 12 | case "int": 13 | return "integer" 14 | case "float": 15 | return "number" 16 | case "bool": 17 | return "boolean" 18 | case "list": 19 | return "array" 20 | case "dict": 21 | return "object" 22 | case "None": 23 | return "null" 24 | # treat 'unknown' as string as anything can be converted to string 25 | case None: 26 | return "string" 27 | case _: 28 | raise ValueError( 29 | f"Unsupported type: {python_type} for Python to JSON conversion." 30 | ) 31 | 32 | 33 | def json_type_to_python_type(json_type: str) -> PythonType: 34 | match json_type: 35 | case "string": 36 | return "str" 37 | case "integer": 38 | return "int" 39 | case "number": 40 | return "float" 41 | case "boolean": 42 | return "bool" 43 | case "array": 44 | return "list" 45 | case "object": 46 | return "dict" 47 | case "null": 48 | return "None" 49 | case _: 50 | raise ValueError( 51 | f"Unsupported type: {json_type} for JSON to Python conversion." 52 | ) 53 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_multi.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from collections import Counter 3 | from typing import ( 4 | Protocol, 5 | runtime_checkable, 6 | ) 7 | 8 | from inspect_ai.solver import TaskState 9 | 10 | from ._metric import Score 11 | from ._scorer import Scorer, Target 12 | 13 | 14 | @runtime_checkable 15 | class ScoreReducer(Protocol): 16 | def __call__(self, scores: list[Score]) -> Score: ... 17 | 18 | 19 | def multi_scorer(scorers: list[Scorer], reducer: ScoreReducer) -> Scorer: 20 | r"""Returns a Scorer that runs multiple Scorers in parallel and aggregates their results into a single Score using the provided reducer function. 21 | 22 | Args: 23 | scorers: a list of Scorers. 24 | reducer: a function which takes in a list of Scores and returns a single Score. 25 | """ 26 | 27 | async def score(state: TaskState, target: Target) -> Score: 28 | scores = await asyncio.gather(*[_scorer(state, target) for _scorer in scorers]) 29 | return reducer(scores) 30 | 31 | return score 32 | 33 | 34 | def majority_vote(scores: list[Score]) -> Score: 35 | r"""A utility function for taking a majority vote over a list of scores. 36 | 37 | Args: 38 | scores: a list of Scores. 
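
    For example, three scores with values "C", "I", "C" reduce to a single
    score with value "C" (two votes to one).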
39 | """ 40 | counts: Counter[str | int | float | bool] = Counter() 41 | for score in scores: 42 | counts[score._as_scalar()] += 1 43 | return Score( 44 | value=counts.most_common(1)[0][0], 45 | answer=scores[0].answer, 46 | explanation=scores[0].explanation, 47 | metadata={ 48 | "individual_scores": scores 49 | }, # TODO: massage into format better for display 50 | ) 51 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/schema.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import subprocess 4 | from pathlib import Path 5 | from typing import Any 6 | 7 | from inspect_ai.log import EvalLog 8 | 9 | WWW_DIR = os.path.abspath((Path(__file__).parent / "www").as_posix()) 10 | 11 | 12 | def sync_view_schema() -> None: 13 | """Generate a JSON schema and Typescript types for EvalLog. 14 | 15 | This is useful for keeping log file viewer JS development 16 | in sync w/ Python development 17 | """ 18 | # export schema file 19 | schema_path = Path(WWW_DIR, "log-schema.json") 20 | types_path = Path(WWW_DIR, "log.d.ts") 21 | with open(schema_path, "w", encoding="utf-8") as f: 22 | # make everything required 23 | schema = EvalLog.model_json_schema() 24 | defs: dict[str, Any] = schema["$defs"] 25 | for key in defs.keys(): 26 | defs[key] = schema_to_strict(defs[key]) 27 | f.write(json.dumps(schema, indent=2)) 28 | 29 | # generate types w/ json-schema-to-typescript 30 | subprocess.run( 31 | [ 32 | "json2ts", 33 | "--input", 34 | schema_path, 35 | "--output", 36 | types_path, 37 | "--additionalProperties", 38 | "false", 39 | ] 40 | ) 41 | 42 | 43 | def schema_to_strict(schema: dict[str, Any]) -> dict[str, Any]: 44 | properties = schema.get("properties", None) 45 | if properties: 46 | schema["required"] = list(properties.keys()) 47 | schema["additionalProperties"] = False 48 | 49 | return schema 50 | 51 | 52 | if __name__ == "__main__": 53 | sync_view_schema() 54 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/settings/user-settings.ts: -------------------------------------------------------------------------------- 1 | import { ConfigurationTarget, workspace } from "vscode"; 2 | 3 | const kPackageIndexDepthsSetting = "packageIndexDepths"; 4 | 5 | export const initializeGlobalSettings = async () => { 6 | const pythonAnalysis = workspace.getConfiguration("python.analysis") || []; 7 | const pkgIndexDepths = 8 | pythonAnalysis.get>( 9 | kPackageIndexDepthsSetting 10 | ) || []; 11 | 12 | try { 13 | kInspectPackageIndexDepth.forEach((pkgDep) => { 14 | if ( 15 | !pkgIndexDepths.find((p) => { 16 | return pkgDep.name === p.name; 17 | }) 18 | ) { 19 | pkgIndexDepths.push(pkgDep); 20 | } 21 | }); 22 | await pythonAnalysis.update( 23 | kPackageIndexDepthsSetting, 24 | pkgIndexDepths, 25 | ConfigurationTarget.Global 26 | ); 27 | } catch { 28 | // This can happen if the user disables the Pylance extension 29 | // in that case, since this is a Pylance setting, we're safe to just 30 | // ignore it 31 | // 32 | // Don't log since this is an allowed state (we don't require Pylance) 33 | // and continue for any exception since we shouldn't allow this setting 34 | // to block extension init 35 | } 36 | 37 | const config = workspace.getConfiguration("editor", { languageId: "json" }); 38 | await config.update("wordWrap", "on", true); 39 | }; 40 | 41 | const kInspectPackageIndexDepth = [ 42 | { 43 | name: "inspect_ai", 44 | depth: 2, 45 | }, 46 | 
]; 47 | -------------------------------------------------------------------------------- /tools/vscode/src/core/path.ts: -------------------------------------------------------------------------------- 1 | import path, { basename, dirname, join } from "path"; 2 | import { activeWorkspaceFolder } from "./workspace"; 3 | import { existsSync } from "fs"; 4 | 5 | export type UnknownPath = string; 6 | 7 | export type AbsolutePath = { 8 | path: string; 9 | dirname: () => AbsolutePath; 10 | filename: () => string; 11 | child: (file: string) => AbsolutePath; 12 | }; 13 | 14 | export const activeWorkspacePath = (): AbsolutePath => { 15 | const root = activeWorkspaceFolder(); 16 | return toAbsolutePath(root.uri.fsPath); 17 | }; 18 | 19 | // Resolves a workspace relative path into an absolute path 20 | export const workspacePath = (unknownPath: UnknownPath) => { 21 | if (path.isAbsolute(unknownPath)) { 22 | return toAbsolutePath(unknownPath); 23 | } else { 24 | const workspaceRoot = activeWorkspaceFolder().uri; 25 | const absolutePath = path.resolve(workspaceRoot.fsPath, unknownPath); 26 | return toAbsolutePath(absolutePath); 27 | } 28 | }; 29 | 30 | export const workspaceRelativePath = (absPath: AbsolutePath) => { 31 | const workspaceRoot = activeWorkspaceFolder(); 32 | return path.relative(workspaceRoot.uri.fsPath, absPath.path); 33 | }; 34 | 35 | export const toAbsolutePath = (path: string): AbsolutePath => { 36 | return { 37 | path, 38 | dirname: () => { 39 | return toAbsolutePath(dirname(path)); 40 | }, 41 | filename: () => { 42 | return basename(path); 43 | }, 44 | child: (file: string) => { 45 | return toAbsolutePath(join(path, file)); 46 | } 47 | }; 48 | }; 49 | 50 | export const pathExists = (path: string) => { 51 | const wsPath = workspacePath(path); 52 | return existsSync(wsPath.path); 53 | }; 54 | -------------------------------------------------------------------------------- /tools/vscode/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | ## 0.3.19 4 | 5 | - Fix an issue showing the log viewer when an evaluation completes (specific to Inspect 0.3.10 or later) 6 | 7 | ## 0.3.18 8 | 9 | - Fix issues with task params when type hints are provided 10 | - Improve metric appearance in `inspect view` 11 | 12 | ## 0.3.17 13 | 14 | - Improve `inspect view` title bar treatment 15 | 16 | ## 0.3.16 17 | 18 | - Fix an issue that prevented the extension from loading when the `Pylance` extension was disabled or uninstalled. 19 | - Don't send task params that have been removed from tasks 20 | - Ensure that debugger breakpoints are available outside of user code 21 | - Ensure that evaluations are run from the workspace directory 22 | - Only show the logview in the VS Code window that started an eval 23 | 24 | ## 0.3.14 25 | 26 | - Fix issue where the run/debug task option would be disabled for the task configuration pane if a file containing no tasks was being edited. 27 | - Improve Inspect binary detection on Linux platforms 28 | 29 | ## 0.3.13 30 | 31 | - Ensure that the inspect CLI is in the path for terminals using a global Python environment 32 | - Add 'Show Logs' command to the environment panel.
33 | - Improve models in the environment panel 34 | - Display literal provider names (rather than pretty names) 35 | - Remember the last used model for each provider 36 | - Allow free-form provider in the model input 37 | - Add autocomplete for Ollama 38 | - Fix 'Restart' when debugging to properly restart the Inspect debugging session 39 | - Improve performance loading task tree, selecting tasks within outline, and navigating to tasks 40 | - Improve task selection behavior when the activity bar is first shown -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/Dialog.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | export const DialogButton = ({ id, btnType, classes, style, children }) => { 4 | return html``; 13 | }; 14 | 15 | export const DialogAfterBody = ({ 16 | id, 17 | title, 18 | classes, 19 | scrollable, 20 | centered, 21 | styles, 22 | children, 23 | }) => { 24 | return html` 25 | 60 | `; 61 | }; -------------------------------------------------------------------------------- /tools/vscode/src/providers/settings/inspect-settings.ts: -------------------------------------------------------------------------------- 1 | import { workspace } from "vscode"; 2 | 3 | // Inspect Settings 4 | export interface InspectSettings { 5 | logViewAuto: boolean; 6 | logViewType: InspectLogViewStyle; 7 | } 8 | export type InspectLogViewStyle = "html" | "text"; 9 | 10 | // Settings namespace and constants 11 | const kInspectConfigSection = "inspect_ai"; 12 | const kInspectConfigLogViewAuto = "logViewAuto"; 13 | const kInspectConfigLogViewType = "logViewType"; 14 | 15 | // Manages the settings for the inspect extension 16 | export class InspectSettingsManager { 17 | constructor(private readonly onChanged_: (() => void) | undefined) { 18 | workspace.onDidChangeConfiguration((event) => { 19 | if (event.affectsConfiguration(kInspectConfigSection)) { 20 | // Configuration for the section has changed 21 | this.settings_ = undefined; 22 | if (this.onChanged_) { 23 | this.onChanged_(); 24 | } 25 | } 26 | }); 27 | } 28 | private settings_ : InspectSettings | undefined; 29 | 30 | // get the current settings values 31 | getSettings(): InspectSettings { 32 | if (!this.settings_) { 33 | this.settings_ = this.readSettings(); 34 | } 35 | return this.settings_; 36 | } 37 | 38 | // Read settings values directly from VS Code 39 | private readSettings() { 40 | const configuration = workspace.getConfiguration(kInspectConfigSection); 41 | const logViewType = 42 | configuration.get(kInspectConfigLogViewType) || "html"; 43 | const logViewAuto = configuration.get(kInspectConfigLogViewAuto); 44 | return { 45 | logViewType, 46 | logViewAuto: logViewAuto !== undefined ?
logViewAuto : true, 47 | }; 48 | } 49 | 50 | } -------------------------------------------------------------------------------- /tests/scorer/test_answer.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import simple_task_state 3 | 4 | from inspect_ai.scorer import CORRECT, INCORRECT, Target, answer 5 | 6 | 7 | @pytest.mark.asyncio 8 | async def test_letter_success(): 9 | scorer = answer("letter") 10 | state = simple_task_state("ANSWER: B") 11 | result = await scorer(state, Target(["B"])) 12 | 13 | assert result.text == CORRECT 14 | 15 | 16 | @pytest.mark.asyncio 17 | async def test_letter_failure(): 18 | scorer = answer("letter") 19 | state = simple_task_state("ANSWER: B") 20 | result = await scorer(state, Target(["C"])) 21 | 22 | assert result.text == INCORRECT 23 | 24 | 25 | @pytest.mark.asyncio 26 | async def test_word_success(): 27 | scorer = answer("word") 28 | state = simple_task_state("ANSWER: Yes") 29 | result = await scorer(state, Target(["Yes"])) 30 | 31 | assert result.text == CORRECT 32 | 33 | 34 | @pytest.mark.asyncio 35 | async def test_word_failure(): 36 | scorer = answer("letter") 37 | state = simple_task_state("ANSWER: Yes") 38 | result = await scorer(state, Target(["No"])) 39 | 40 | assert result.text == INCORRECT 41 | 42 | 43 | @pytest.mark.asyncio 44 | async def test_line_success(): 45 | scorer = answer("line") 46 | state = simple_task_state("ANSWER:\nThis is a whole new line") 47 | result = await scorer(state, Target(["This is a whole new line"])) 48 | 49 | assert result.text == CORRECT 50 | 51 | 52 | @pytest.mark.asyncio 53 | async def test_line_failure(): 54 | scorer = answer("line") 55 | state = simple_task_state("ANSWER:\nThis is a whole new line") 56 | result = await scorer(state, Target(["This doesn't match does it?"])) 57 | 58 | assert result.text == INCORRECT 59 | -------------------------------------------------------------------------------- /tools/vscode/assets/www/view/view-overrides.css: -------------------------------------------------------------------------------- 1 | /* custom title block treatment for vscode */ 2 | 3 | body[class^="vscode-"] .workspace { 4 | margin-top: 0 !important; 5 | } 6 | 7 | body[class^="vscode-"] .workspace > div:first-of-type { 8 | padding-top: 0 !important; 9 | padding-bottom: 0 !important; 10 | } 11 | 12 | body[class^="vscode-"] .font-title { 13 | font-size: 0.9em; 14 | } 15 | 16 | body[class^="vscode-"] .font-subtitle { 17 | font-size: 0.8em; 18 | } 19 | 20 | body[class^="vscode-"] .font-title > span:last-of-type { 21 | font-size: 0.8em !important; 22 | } 23 | 24 | body[class^="vscode-"] .workspace > div > div > div:last-child > div > div > div:last-child { 25 | font-size: 1rem !important; 26 | } 27 | 28 | body[class^="vscode-"] #title-plan-summary > div > div > div:last-child, 29 | body[class^="vscode-"] #title-plan-summary > div > div > div:last-child > div { 30 | font-size: 0.7rem !important; 31 | } 32 | 33 | body[class^="vscode-"] #title-hyperparameters > div > div:last-child { 34 | font-size: 0.7rem !important; 35 | } 36 | 37 | /* custom sidebar treatment for vscode */ 38 | body[class^="vscode-"] .sidebar .list-group .list-group-item { 39 | font-size: 0.6rem !important; 40 | } 41 | 42 | body[class^="vscode-"] #sidebarOffCanvas > div > span { 43 | font-size: 0.8rem !important; 44 | } 45 | 46 | body[class^="vscode-"] code:not(.sourceCode) { 47 | color: var(--bs-body-color); 48 | } 49 | 50 | /* temporary hack to improve the appearance of 
metrics in the navbar 51 | to truly fix, remove 'navbar-brand' from metrics div and use `navbar-metrics` 52 | to properly style it */ 53 | body[class^="vscode-"] .navbar > div > .navbar-text:not(.navbar-brand) > div > div > div:last-of-type { 54 | margin-top: -10px; 55 | transform: scale(0.7); 56 | } -------------------------------------------------------------------------------- /tools/vscode/src/providers/logview/logview-link-provider.ts: -------------------------------------------------------------------------------- 1 | import { Uri } from "vscode"; 2 | 3 | import { InspectLogviewManager } from "./logview-manager"; 4 | import { workspacePath } from "../../core/path"; 5 | import { showError } from "../../components/error"; 6 | import { TerminalLink, TerminalLinkContext } from "vscode"; 7 | 8 | const kLogFilePattern = /^.*Log: (\S*?\.json)\s*/g; 9 | 10 | interface LogViewTerminalLink extends TerminalLink { 11 | data: string; 12 | } 13 | 14 | export const logviewTerminalLinkProvider = (manager: InspectLogviewManager) => { 15 | return { 16 | provideTerminalLinks: ( 17 | context: TerminalLinkContext, 18 | ) => { 19 | // Find the log file result, if present 20 | const matches = [...context.line.matchAll(kLogFilePattern)]; 21 | if (matches.length === 0) { 22 | return []; 23 | } 24 | 25 | // Forward matches 26 | const result = matches.map((match) => { 27 | // The path from the terminal. 28 | const path = match[1]; 29 | 30 | // Sort out the decoration range for the link 31 | const line = context.line; 32 | const startIndex = line.indexOf(path); 33 | return { 34 | startIndex, 35 | length: path.length, 36 | tooltip: "View Log", 37 | data: path, 38 | } as LogViewTerminalLink; 39 | }); 40 | return result; 41 | }, 42 | handleTerminalLink: (link: LogViewTerminalLink) => { 43 | 44 | const logFile = /^[a-z0-9]+:\/\//.test(link.data) ? 
Uri.parse(link.data) : Uri.file(workspacePath(link.data).path); 45 | 46 | 47 | manager.showLogFile(logFile).catch(async (err: Error) => { 48 | await showError("Failed to preview log file - failed to start Inspect View", err); 49 | }); 50 | }, 51 | }; 52 | }; 53 | -------------------------------------------------------------------------------- /tools/vscode/src/providers/activity-bar/task-config-commands.ts: -------------------------------------------------------------------------------- 1 | import { Command } from "../../core/command"; 2 | import { toAbsolutePath } from "../../core/path"; 3 | import { InspectEvalManager } from "../inspect/inspect-eval"; 4 | import { ActiveTaskManager } from "../active-task/active-task-provider"; 5 | import { scheduleReturnFocus } from "../../components/focus"; 6 | 7 | export class RunConfigTaskCommand implements Command { 8 | constructor(private readonly manager_: ActiveTaskManager, 9 | private readonly inspectMgr_: InspectEvalManager 10 | ) { } 11 | async execute(): Promise { 12 | const taskInfo = this.manager_.getActiveTaskInfo(); 13 | if (taskInfo) { 14 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 15 | const evalPromise = this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, false); 16 | scheduleReturnFocus("inspect_ai.task-configuration.focus"); 17 | await evalPromise; 18 | } 19 | } 20 | 21 | private static readonly id = "inspect.runConfigTask"; 22 | public readonly id = RunConfigTaskCommand.id; 23 | } 24 | 25 | export class DebugConfigTaskCommand implements Command { 26 | constructor(private readonly manager_: ActiveTaskManager, 27 | private readonly inspectMgr_: InspectEvalManager 28 | ) { } 29 | async execute(): Promise { 30 | const taskInfo = this.manager_.getActiveTaskInfo(); 31 | if (taskInfo) { 32 | const docPath = toAbsolutePath(taskInfo.document.fsPath); 33 | const evalPromise = this.inspectMgr_.startEval(docPath, taskInfo.activeTask?.name, true); 34 | scheduleReturnFocus("inspect_ai.task-configuration.focus"); 35 | await evalPromise; 36 | } 37 | } 38 | 39 | private static readonly id = "inspect.debugConfigTask"; 40 | public readonly id = DebugConfigTaskCommand.id; 41 | } 42 | -------------------------------------------------------------------------------- /tests/test_eval_log/log_version_2.txt: -------------------------------------------------------------------------------- 1 | { 2 | "version": 2, 3 | "status": "success", 4 | "eval": { 5 | "task": "wikipedia", 6 | "task_version": 0, 7 | "task_file": "examples/agents/langchain/wikipedia.py", 8 | "task_id": "YAdbKczyeSb6mEgPd3R9Qs", 9 | "run_id": "i5LyrzaUdD9K4EW5WTAd5t", 10 | "created": "2024-05-05T07:59:35", 11 | "dataset": { 12 | "name": "wikipedia", 13 | "location": "wikipedia.jsonl" 14 | }, 15 | "model": "openai/gpt-4", 16 | "task_attribs": {}, 17 | "task_args": {}, 18 | "model_args": {}, 19 | "config": { 20 | "limit": 20 21 | } 22 | }, 23 | "plan": { 24 | "name": "plan", 25 | "steps": [ 26 | { 27 | "solver": "wikipedia_search", 28 | "params": {} 29 | } 30 | ], 31 | "config": {} 32 | }, 33 | "results": { 34 | "scorer": { 35 | "name": "model_graded_fact", 36 | "params": {} 37 | }, 38 | "metrics": { 39 | "accuracy": { 40 | "name": "accuracy", 41 | "value": 1, 42 | "options": {} 43 | }, 44 | "bootstrap_std": { 45 | "name": "bootstrap_std", 46 | "value": 0.0, 47 | "options": {} 48 | } 49 | } 50 | }, 51 | "stats": { 52 | "started_at": "2024-05-05T07:59:35", 53 | "completed_at": "2024-05-05T08:00:03", 54 | "model_usage": { 55 | "openai/gpt-4": { 56 | "input_tokens": 8868,
57 | "output_tokens": 1351, 58 | "total_tokens": 10219 59 | } 60 | } 61 | }, 62 | "logging": [] 63 | } -------------------------------------------------------------------------------- /tests/test_eval_log/log_with_nan.txt: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "status": "success", 4 | "eval": { 5 | "task": "wikipedia", 6 | "task_version": 0, 7 | "task_file": "examples/agents/langchain/wikipedia.py", 8 | "task_id": "YAdbKczyeSb6mEgPd3R9Qs", 9 | "run_id": "i5LyrzaUdD9K4EW5WTAd5t", 10 | "created": "2024-05-05T07:59:35", 11 | "dataset": { 12 | "name": "wikipedia", 13 | "location": "wikipedia.jsonl" 14 | }, 15 | "model": "openai/gpt-4", 16 | "task_attribs": {}, 17 | "task_args": {}, 18 | "model_args": {}, 19 | "config": { 20 | "limit": 20 21 | } 22 | }, 23 | "plan": { 24 | "name": "plan", 25 | "steps": [ 26 | { 27 | "solver": "wikipedia_search", 28 | "params": {} 29 | } 30 | ], 31 | "config": {} 32 | }, 33 | "results": { 34 | "scorer": { 35 | "name": "model_graded_fact", 36 | "params": {} 37 | }, 38 | "metrics": { 39 | "accuracy": { 40 | "name": "accuracy", 41 | "value": NaN, 42 | "options": {} 43 | }, 44 | "bootstrap_std": { 45 | "name": "bootstrap_std", 46 | "value": 0.0, 47 | "options": {} 48 | } 49 | } 50 | }, 51 | "stats": { 52 | "started_at": "2024-05-05T07:59:35", 53 | "completed_at": "2024-05-05T08:00:03", 54 | "model_usage": { 55 | "openai/gpt-4": { 56 | "input_tokens": 8868, 57 | "output_tokens": 1351, 58 | "total_tokens": 10219 59 | } 60 | } 61 | }, 62 | "logging": [] 63 | } -------------------------------------------------------------------------------- /src/inspect_ai/model/_providers/util.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from logging import getLogger 4 | from typing import Any 5 | 6 | from .._model import StopReason 7 | from .._tool import ToolCall 8 | 9 | logger = getLogger(__name__) 10 | 11 | 12 | def as_stop_reason(reason: str | None) -> StopReason: 13 | """Encode common reason strings into standard StopReason.""" 14 | match reason: 15 | case "stop" | "eos": 16 | return "stop" 17 | case "length" | "content_filter": 18 | return reason 19 | case "model_length": 20 | return "length" 21 | case "tool_calls" | "function_call": 22 | return "tool_calls" 23 | case _: 24 | return "unknown" 25 | 26 | 27 | def model_base_url(base_url: str | None, env_vars: str | list[str]) -> str | None: 28 | if base_url: 29 | return base_url 30 | 31 | if isinstance(env_vars, str): 32 | env_vars = [env_vars] 33 | 34 | for env_var in env_vars: 35 | base_url = os.getenv(env_var, None) 36 | if base_url: 37 | return base_url 38 | 39 | return os.getenv("INSPECT_EVAL_MODEL_BASE_URL", None) 40 | 41 | 42 | def parse_tool_call(id: str, function: str, arguments: str) -> ToolCall: 43 | error: str | None = None 44 | arguments_dict: dict[str, Any] = {} 45 | try: 46 | arguments_dict = json.loads(arguments) 47 | except json.JSONDecodeError as ex: 48 | # define and log error 49 | error = f"Error parsing the following tool call arguments:\n{arguments}\nError details: {ex}" 50 | logger.warning(error) 51 | 52 | # return ToolCall with error payload 53 | return ToolCall( 54 | id=id, 55 | function=function, 56 | arguments=arguments_dict, 57 | type="function", 58 | parse_error=error, 59 | ) 60 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/libs/prism/prism.min.css: 
-------------------------------------------------------------------------------- 1 | /* PrismJS 1.29.0 2 | https://prismjs.com/download.html#themes=prism&languages=markup+css+clike+javascript */ 3 | code[class*=language-],pre[class*=language-]{color:#000;background:0 0;text-shadow:0 1px #fff;font-family:Consolas,Monaco,'Andale Mono','Ubuntu Mono',monospace;font-size:1em;text-align:left;white-space:pre;word-spacing:normal;word-break:normal;word-wrap:normal;line-height:1.5;-moz-tab-size:4;-o-tab-size:4;tab-size:4;-webkit-hyphens:none;-moz-hyphens:none;-ms-hyphens:none;hyphens:none}code[class*=language-] ::-moz-selection,code[class*=language-]::-moz-selection,pre[class*=language-] ::-moz-selection,pre[class*=language-]::-moz-selection{text-shadow:none;background:#b3d4fc}code[class*=language-] ::selection,code[class*=language-]::selection,pre[class*=language-] ::selection,pre[class*=language-]::selection{text-shadow:none;background:#b3d4fc}@media print{code[class*=language-],pre[class*=language-]{text-shadow:none}}pre[class*=language-]{padding:1em;margin:.5em 0;overflow:auto}:not(pre)>code[class*=language-],pre[class*=language-]{background:#f5f2f0}:not(pre)>code[class*=language-]{padding:.1em;border-radius:.3em;white-space:normal}.token.cdata,.token.comment,.token.doctype,.token.prolog{color:#708090}.token.punctuation{color:#999}.token.namespace{opacity:.7}.token.boolean,.token.constant,.token.deleted,.token.number,.token.property,.token.symbol,.token.tag{color:#905}.token.attr-name,.token.builtin,.token.char,.token.inserted,.token.selector,.token.string{color:#690}.language-css .token.string,.style .token.string,.token.entity,.token.operator,.token.url{color:#9a6e3a;background:hsla(0,0%,100%,.5)}.token.atrule,.token.attr-value,.token.keyword{color:#07a}.token.class-name,.token.function{color:#dd4a68}.token.important,.token.regex,.token.variable{color:#e90}.token.bold,.token.important{font-weight:700}.token.italic{font-style:italic}.token.entity{cursor:help} 4 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/components/MessageContent.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | import { MarkdownDiv } from "./MarkdownDiv.mjs"; 3 | 4 | export const MessageContent = (props) => { 5 |   const { contents } = props; 6 |   if (Array.isArray(contents)) { 7 |     return contents.map((content, index) => { 8 |       if (typeof content === "string") { 9 |         return messageRenderers["text"].render({ type: "text", text: content }, index === contents.length - 1); 10 |       } else { 11 |         const renderer = messageRenderers[content.type]; 12 |         if (renderer) { 13 |           return renderer.render(content, index === contents.length - 1); 14 |         } else { 15 |           console.error(`Unknown message content type '${content.type}'`); 16 |         } 17 |       } 18 |     }); 19 |   } else { 20 |     // This is a simple string 21 |     return messageRenderers["text"].render({ text: contents }); 22 |   } 23 | }; 24 | 25 | const messageRenderers = { 26 |   text: { 27 |     render: (content, isLast) => { 28 |       return html`<${MarkdownDiv} 29 |         markdown=${content.text} 30 |         class=${isLast ? "no-last-para-padding" : ""} 31 |       />`; 32 |     }, 33 |   }, 34 |   image: { 35 |     render: (content, isLast) => { 36 |       return html`<img 37 |         src="${content.image}" 38 |         style=${{ 39 |           maxWidth: "400px", 40 |           border: "solid var(--bs-border-color) 1px", 41 |         }} 42 |       />`; 43 |     }, 44 |   }, 45 |   tool: { 46 |     render: (content, isLast) => { 47 |       return html`<pre class=${isLast ? "no-last-para-padding" : ""}> 48 |       <code class="sourceCode"> 49 |       ${content.text} 50 |       </code> 51 |       </pre>`; 52 |     }, 53 |   }, 54 | }; 55 | -------------------------------------------------------------------------------- /benchmarks/gpqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | GPQA: A Graduate-Level Google-Proof Q&A Benchmark 3 | 4 | David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard 5 | Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman 6 | https://arxiv.org/abs/2311.12022 7 | 8 | Based on: https://github.com/openai/simple-evals/blob/main/gpqa_eval.py 9 | 10 | # eval for default epochs (4) 11 | inspect eval gpqa.py 12 | 13 | # eval with 1 epoch 14 | inspect eval gpqa.py --epochs 1 15 | 16 | # without chain of thought 17 | inspect eval gpqa.py -T cot=false 18 | """ 19 | 20 | 21 | from inspect_ai import Task, task 22 | from inspect_ai.dataset import Sample, csv_dataset 23 | from inspect_ai.model import GenerateConfig 24 | from inspect_ai.scorer import answer 25 | from inspect_ai.solver import multiple_choice 26 | 27 | # default epochs to run eval for 28 | DEFAULT_EPOCHS = 4 29 | 30 | 31 | # map records to inspect samples (note that target is always "A" in the 32 | # dataset; we will shuffle the presentation of options to mitigate this) 33 | def record_to_sample(record): 34 |     return Sample( 35 |         input=record["Question"], 36 |         choices=[ 37 |             str(record["Correct Answer"]), 38 |             str(record["Incorrect Answer 1"]), 39 |             str(record["Incorrect Answer 2"]), 40 |             str(record["Incorrect Answer 3"]), 41 |         ], 42 |         target="A", 43 |         id=record["Record ID"], 44 |     ) 45 | 46 | 47 | @task 48 | def gpqa_diamond(cot=True): 49 |     return Task( 50 |         dataset=csv_dataset( 51 |             csv_file="https://openaipublic.blob.core.windows.net/simple-evals/gpqa_diamond.csv", 52 |             sample_fields=record_to_sample, 53 |         ), 54 |         plan=[ 55 |             multiple_choice(cot=cot, shuffle=True), 56 |         ], 57 |         scorer=answer("letter"), 58 |         config=GenerateConfig(temperature=0.5), 59 |         epochs=DEFAULT_EPOCHS, 60 |     ) 61 | -------------------------------------------------------------------------------- /src/inspect_ai/solver/_tool/use_tools.py: -------------------------------------------------------------------------------- 1 | from inspect_ai.model import ( 2 |     ChatMessageSystem, 3 |     ToolChoice, 4 | ) 5 | 6 | from .._solver import Generate, Solver, TaskState, solver 7 | from .._util import append_system_message 8 | from .tool import Tool 9 | from .tool_def import tool_defs 10 | 11 | 12 | @solver 13 | def use_tools( 14 |     tools: Tool | list[Tool] | None = None, tool_choice: ToolChoice = "auto" 15 | ) -> Solver: 16 |     """ 17 |     Solver that injects tools into the task state to be used in generate(). 18 | 19 |     Args: 20 |         tools (Tool | list[Tool] | None): one or more tools to inject into the task state. 21 |         tool_choice (ToolChoice): Directive indicating which 22 |           tools the model should use. 23 | 24 |     Returns: 25 |         A solver that injects the tools and tool_choice into the task state. 26 |     """ 27 |     # create tool defs 28 |     tools = tools if isinstance(tools, list) else [tools] if tools else None 29 |     tdefs = tool_defs(tools) if tools else None 30 | 31 |     async def solve(state: TaskState, generate: Generate) -> TaskState: 32 |         # register the tools 33 |         if tools and tdefs: 34 |             state.tools.extend(tools) 35 | 36 |         # append the tools system prompts.
mark the 'source' of messages 37 |         # as tool so they can be removed if tool_choice == "none" 38 |         for tool in tdefs: 39 |             if tool.prompt: 40 |                 append_system_message( 41 |                     state.messages, 42 |                     ChatMessageSystem(content=tool.prompt, tool=tool.name), 43 |                 ) 44 | 45 |         # set tool choice (note you can call this function w/o tools 46 |         # for just the side effect of enabling/disabling tool usage) 47 |         state.tool_choice = tool_choice 48 | 49 |         # return state 50 |         return state 51 | 52 |     return solve 53 | -------------------------------------------------------------------------------- /src/inspect_ai/scorer/_match.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | from ._common import match_str, str_match_scorer 4 | from ._metrics import accuracy, bootstrap_std 5 | from ._scorer import Scorer, scorer 6 | 7 | 8 | @scorer(metrics=[accuracy(), bootstrap_std()]) 9 | def match( 10 |     location: Literal["begin", "end", "any", "exact"] = "end", 11 |     *, 12 |     ignore_case: bool = True, 13 |     numeric: bool = False, 14 | ) -> Scorer: 15 |     """Scorer which matches text or a number. 16 | 17 |     Args: 18 |        location (Literal["begin", "end", "any", "exact"]): 19 |           Location to match at. "any" matches anywhere in the 20 |           output; "exact" requires the output be exactly 21 |           equal to the target (modulo whitespace, etc.) 22 |        ignore_case (bool): Do case insensitive comparison. 23 |        numeric (bool): Is this a numeric match? (in this 24 |           case different punctuation removal rules are 25 |           used and numbers are normalized before comparison). 26 |     """ 27 | 28 |     def check(value: str, target: str) -> tuple[str, bool]: 29 |         return match_str( 30 |             value=value, 31 |             target=target, 32 |             location=location, 33 |             ignore_case=ignore_case, 34 |             numeric=numeric, 35 |         ) 36 | 37 |     return str_match_scorer(check) 38 | 39 | 40 | @scorer(metrics=[accuracy(), bootstrap_std()]) 41 | def includes(ignore_case: bool = True) -> Scorer: 42 |     """Check whether the specified text is included in the model output. 43 | 44 |     Args: 45 |        ignore_case (bool): Use a case insensitive comparison.
46 | 47 |     """ 48 | 49 |     def check(value: str, target: str) -> tuple[str, bool]: 50 |         if ignore_case: 51 |             idx = value.lower().rfind(target.lower()) 52 |         else: 53 |             idx = value.rfind(target) 54 |         return value, idx != -1 55 | 56 |     return str_match_scorer(check) 57 | -------------------------------------------------------------------------------- /tools/vscode/src/core/git.ts: -------------------------------------------------------------------------------- 1 | import { existsSync, readFileSync, writeFileSync } from "fs"; 2 | import path from "path"; 3 | import { lines } from "./text"; 4 | import { runProcess } from "./process"; 5 | import { AbsolutePath } from "./path"; 6 | import { platform } from "os"; 7 | 8 | export function ensureGitignore( 9 |   dir: AbsolutePath, 10 |   entries: string[] 11 | ): boolean { 12 |   // if .gitignore exists, then ensure it has the requisite entries 13 |   const gitignorePath = path.join(dir.path, ".gitignore"); 14 |   if (existsSync(gitignorePath)) { 15 |     const gitignore = lines( 16 |       readFileSync(gitignorePath, { 17 |         encoding: "utf-8", 18 |       }) 19 |     ).map((line) => line.trim()); 20 |     const requiredEntries: string[] = []; 21 |     for (const requiredEntry of entries) { 22 |       if (!gitignore.includes(requiredEntry)) { 23 |         requiredEntries.push(requiredEntry); 24 |       } 25 |     } 26 |     if (requiredEntries.length > 0) { 27 |       writeGitignore(dir.path, gitignore.concat(requiredEntries)); 28 |       return true; 29 |     } else { 30 |       return false; 31 |     } 32 |   } else { 33 |     // if it doesn't exist then auto-create it if we are inside a git repo. 34 |     // `git rev-parse` succeeds silently when run inside a repo; outside of 35 |     // one it exits non-zero, which runProcess surfaces by throwing, so 36 |     // reaching the createGitignore call means we are in a repo 37 |     try { 38 |       runProcess("git", ["rev-parse"], dir); 39 |       createGitignore(dir.path, entries); 40 |       return true; 41 |     } catch { 42 |       // not inside a git repo -- nothing to do 43 |       return false; 44 |     } 45 |   } 46 | } 47 | 48 | export function createGitignore(dir: string, entries: string[]) { 49 |   writeGitignore(dir, entries); 50 | } 51 | 52 | function writeGitignore(dir: string, lines: string[]) { 53 |   const lineEnding = platform() === "win32" ?
"\r\n" : "\n"; 54 | writeFileSync( 55 | path.join(dir, ".gitignore"), 56 | lines.join(lineEnding) + lineEnding, 57 | { encoding: "utf-8" } 58 | ); 59 | } 60 | 61 | -------------------------------------------------------------------------------- /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Build 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | - "release/**" 11 | 12 | jobs: 13 | ruff: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10", "3.11"] 18 | steps: 19 | - uses: actions/checkout@v4 20 | - name: Lint and format with Ruff 21 | uses: chartboost/ruff-action@v1 22 | 23 | mypy: 24 | runs-on: ubuntu-latest 25 | strategy: 26 | matrix: 27 | python-version: ["3.10", "3.11"] 28 | steps: 29 | - uses: actions/checkout@v4 30 | - name: Set up Python ${{ matrix.python-version }} 31 | uses: actions/setup-python@v4 32 | with: 33 | python-version: ${{ matrix.python-version }} 34 | - name: Install dependencies 35 | run: | 36 | python -m pip install --upgrade pip 37 | python -m pip install .[dev] 38 | - name: Run mypy 39 | run: | 40 | mypy --exclude tests/test_package src tests 41 | 42 | test: 43 | runs-on: ubuntu-latest 44 | strategy: 45 | matrix: 46 | python-version: ["3.10", "3.11"] 47 | 48 | steps: 49 | - uses: actions/checkout@v4 50 | - name: Set up Python ${{ matrix.python-version }} 51 | uses: actions/setup-python@v4 52 | with: 53 | python-version: ${{ matrix.python-version }} 54 | - name: Install dependencies 55 | run: | 56 | python -m pip install --upgrade pip 57 | python -m pip install .[dev] 58 | - name: Test with pytest 59 | run: | 60 | pytest -rA --doctest-modules --color=yes --cov=inspect_ai 61 | 62 | package: 63 | name: Build & inspect the package. 
64 | runs-on: ubuntu-latest 65 | 66 | steps: 67 | - uses: actions/checkout@v4 68 | - uses: hynek/build-and-inspect-python-package@v1 69 | -------------------------------------------------------------------------------- /tests/test_subprocess.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pytest 5 | 6 | from inspect_ai.util import subprocess 7 | 8 | 9 | @pytest.mark.asyncio 10 | async def test_subprocess_execute(): 11 | result = await subprocess(["python3", "-c", "print('foo')"]) 12 | assert result.stdout.strip() == "foo" 13 | 14 | 15 | @pytest.mark.asyncio 16 | async def test_subprocess_fail(): 17 | result = await subprocess(["python4"]) 18 | assert result.success is False 19 | 20 | 21 | @pytest.mark.asyncio 22 | async def test_subprocess_stdin(): 23 | input = "tell me a story" 24 | result = await subprocess( 25 | ["python3", "-c", "import sys; print(sys.stdin.read())"], input=input 26 | ) 27 | assert result.stdout.strip() == input 28 | 29 | 30 | @pytest.mark.asyncio 31 | async def test_subprocess_binary(): 32 | input = "tell me a story".encode() 33 | result = await subprocess( 34 | ["python3", "-c", "import sys; print(sys.stdin.read())"], 35 | text=False, 36 | input=input, 37 | ) 38 | assert result.stdout.decode().strip() == input.decode() 39 | 40 | 41 | @pytest.mark.asyncio 42 | async def test_subprocess_cwd(): 43 | parent_dir = Path(os.getcwd()).parent.as_posix() 44 | result = await subprocess( 45 | ["python3", "-c", "import os; print(os.getcwd())"], cwd=parent_dir 46 | ) 47 | assert result.stdout.strip() == parent_dir 48 | 49 | 50 | @pytest.mark.asyncio 51 | async def test_subprocess_env(): 52 | ENV_VAR = "TEST_SUBPROCESS_ENV" 53 | ENV_VALUE = "test value" 54 | result = await subprocess( 55 | ["python3", "-c", f"import os; print(os.getenv('{ENV_VAR}'))"], 56 | env={ENV_VAR: ENV_VALUE}, 57 | ) 58 | assert result.stdout.strip() == ENV_VALUE 59 | 60 | 61 | @pytest.mark.asyncio 62 | async def test_subprocess_timeout(): 63 | result = await subprocess(["sleep", "2"], timeout=1) 64 | assert result.returncode == 1 65 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/api/api-browser.mjs: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | const loaded_time = Date.now() 5 | let last_eval_time = 0 6 | 7 | 8 | async function client_events() { 9 | const params = new URLSearchParams() 10 | params.append("loaded_time", loaded_time.valueOf()) 11 | params.append("last_eval_time", last_eval_time.valueOf()) 12 | return api("GET", `/api/events?${params.toString()}`) 13 | } 14 | 15 | async function eval_logs() { 16 | const logs = await api("GET", `/api/logs`) 17 | last_eval_time = Date.now() 18 | return logs 19 | } 20 | 21 | async function eval_log(file, headerOnly) { 22 | if (headerOnly) { 23 | return api("GET", `/api/logs/${file}?header-only=true`) 24 | } else { 25 | return api("GET", `/api/logs/${file}`) 26 | } 27 | } 28 | 29 | async function eval_log_headers(files) { 30 | const params = new URLSearchParams(); 31 | for (const file of files) { 32 | params.append("file", file); 33 | } 34 | return api("GET", `/api/log-headers?${params.toString()}`) 35 | } 36 | 37 | async function api(method, path, body) { 38 | // build headers 39 | const headers = { 40 | Accept: "application/json", 41 | Pragma: "no-cache", 42 | Expires: "0", 43 | ['Cache-Control']: 'no-cache', 44 | } 45 | if (body) { 46 | 
headers["Content-Type"] = "application/json"; 47 |   } 48 | 49 |   // make request 50 |   const response = await fetch(`${path}`, { method, headers, body }); 51 |   if (response.ok) { 52 |     const text = await response.text(); 53 |     // parse with JSON5 (expected to be loaded globally by the host page) so 54 |     // that non-standard values such as NaN in eval logs survive parsing 55 |     return JSON5.parse(text); 56 |   } else { 57 |     const message = (await response.text()) || response.statusText; 58 |     throw new Error(`Error ${response.status}: ${message}`); 59 |   } 60 | 61 | } 62 | 63 | 64 | export default { 65 |   client_events, 66 |   eval_logs, 67 |   eval_log, 68 |   eval_log_headers 69 | } 70 | 71 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/samples/SampleDialog.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | import { icons } from "../Constants.mjs"; 4 | import { EmptyPanel } from "../components/EmptyPanel.mjs"; 5 | import { LargeModal } from "../components/LargeModal.mjs"; 6 | 7 | import { SampleDisplay } from "./SampleDisplay.mjs" 8 | 9 | export const SampleDialog = (props) => { 10 |   const { 11 |     id, 12 |     index, 13 |     task, 14 |     model, 15 |     title, 16 |     sample, 17 |     sampleDescriptor, 18 |     nextSample, 19 |     prevSample, 20 |     context, 21 |   } = props; 22 | 23 |   // If there is no sample, just show an empty panel 24 |   // This should never happen 25 |   if (!sample) { 26 |     return html`<${LargeModal} id=${id} title="No Sample"><${EmptyPanel}>No Sample Selected<//><//>`; 27 |   } 28 | 29 |   const nextTool = { 30 |     label: "Next Sample", 31 |     icon: icons.next, 32 |     onclick: nextSample, 33 |     enabled: !!nextSample, 34 |   }; 35 | 36 |   const prevTool = { 37 |     label: "Previous Sample", 38 |     icon: icons.previous, 39 |     onclick: prevSample, 40 |     enabled: !!prevSample, 41 |   }; 42 | 43 |   // Provide the dialog 44 |   return html` 45 |   <${LargeModal} 46 |     id=${id} 47 |     detail=${title} 48 |     detailTools=${{ 49 |       left: [prevTool], 50 |       right: [nextTool], 51 |     }} 52 |     onkeyup=${(e) => { 53 |       switch (e.key) { 54 |         case "ArrowRight": 55 |           if (nextSample) { 56 |             nextSample(); 57 |           } 58 |           break; 59 |         case "ArrowLeft": 60 |           if (prevSample) { 61 |             prevSample(); 62 |           } 63 |           break; 64 |       } 65 |     }} 66 |   > 67 |     <${SampleDisplay} 68 |       index=${index} 69 |       id=${id} 70 |       sample=${sample} 71 |       sampleDescriptor=${sampleDescriptor} 72 |       context=${context}/> 73 |   <//> 74 |   `; 75 | }; 76 | 77 | -------------------------------------------------------------------------------- /src/inspect_ai/_cli/common.py: -------------------------------------------------------------------------------- 1 | import functools 2 | from typing import Any, Callable, Tuple, cast 3 | 4 | import click 5 | from typing_extensions import TypedDict 6 | 7 | from inspect_ai._util.constants import DEFAULT_LOG_LEVEL 8 | 9 | 10 | class CommonOptions(TypedDict): 11 |     log_level: str 12 |     log_dir: str 13 |     debug: bool 14 |     debug_port: int 15 | 16 | 17 | def common_options(func: Callable[..., Any]) -> Callable[..., click.Context]: 18 |     @click.option( 19 |         "--log-level", 20 |         type=click.Choice( 21 |             ["debug", "http", "info", "warning", "error", "critical"], 22 |             case_sensitive=False, 23 |         ), 24 |         default=DEFAULT_LOG_LEVEL, 25 |         envvar="INSPECT_LOG_LEVEL", 26 |         help=f"Set the log level (defaults to '{DEFAULT_LOG_LEVEL}')", 27 |     ) 28 |     @click.option( 29 |         "--log-dir", 30 |         type=str, 31 |         default="./logs", 32 |         envvar="INSPECT_LOG_DIR", 33 |         help="Directory for log files.", 34 |     ) 35 |
@click.option( 36 | "--debug", is_flag=True, envvar="INSPECT_DEBUG", help="Wait to attach debugger" 37 | ) 38 | @click.option( 39 | "--debug-port", 40 | default=5678, 41 | envvar="INSPECT_DEBUG_PORT", 42 | help="Port number for debugger", 43 | ) 44 | @functools.wraps(func) 45 | def wrapper(*args: Any, **kwargs: Any) -> click.Context: 46 | return cast(click.Context, func(*args, **kwargs)) 47 | 48 | return wrapper 49 | 50 | 51 | def resolve_common_options(options: CommonOptions) -> Tuple[str, str]: 52 | # attach debugger if requested 53 | if options["debug"]: 54 | import debugpy # type: ignore 55 | 56 | debugpy.listen(options["debug_port"]) 57 | print("Waiting for debugger attach") 58 | debugpy.wait_for_client() 59 | print("Debugger attached") 60 | 61 | # return resolved options 62 | return (options["log_dir"], options["log_level"]) 63 | -------------------------------------------------------------------------------- /tests/test_collapse_user_message.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from inspect_ai.model import ( 4 | ChatMessageAssistant, 5 | ChatMessageUser, 6 | ContentImage, 7 | ContentText, 8 | ) 9 | from inspect_ai.model._model import collapse_consecutive_user_messages 10 | 11 | 12 | @pytest.fixture 13 | def user_message_str(): 14 | return ChatMessageUser(content="User message") 15 | 16 | 17 | @pytest.fixture 18 | def user_message_image_and_str(): 19 | return ChatMessageUser( 20 | content=[ContentImage(image="foo"), ContentText(text="Message")] 21 | ) 22 | 23 | 24 | @pytest.fixture 25 | def assistant_message(): 26 | return ChatMessageAssistant(content="Assistant message") 27 | 28 | 29 | @pytest.fixture 30 | def combined_user_message(): 31 | return ChatMessageUser( 32 | content=[ContentText(text="Message 1"), ContentText(text="Message 2")] 33 | ) 34 | 35 | 36 | def test_collapse_consecutive_user_messages_single_user_message(user_message_str): 37 | messages = [user_message_str] 38 | assert collapse_consecutive_user_messages(messages) == messages 39 | 40 | 41 | def test_collapse_consecutive_user_messages_alternating_messages( 42 | user_message_str, assistant_message 43 | ): 44 | messages = [user_message_str, assistant_message, user_message_str] 45 | assert collapse_consecutive_user_messages(messages) == messages 46 | 47 | 48 | def test_collapse_consecutive_user_messages_consecutive_user_messages(user_message_str): 49 | messages = [user_message_str, user_message_str, user_message_str] 50 | assert len(collapse_consecutive_user_messages(messages)) == 1 51 | 52 | 53 | def test_collapse_consecutive_user_messages_with_image_message( 54 | user_message_image_and_str, 55 | ): 56 | messages = [user_message_image_and_str, user_message_image_and_str] 57 | assert len(collapse_consecutive_user_messages(messages)) == 1 58 | assert isinstance( 59 | collapse_consecutive_user_messages(messages)[0].content[0], ContentImage 60 | ) 61 | -------------------------------------------------------------------------------- /tests/test_collapse_assistant_message.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from inspect_ai.model import ( 4 | ChatMessageAssistant, 5 | ChatMessageUser, 6 | ContentImage, 7 | ContentText, 8 | ) 9 | from inspect_ai.model._model import collapse_consecutive_assistant_messages 10 | 11 | 12 | @pytest.fixture 13 | def user_message_str(): 14 | return ChatMessageUser(content="User message") 15 | 16 | 17 | @pytest.fixture 18 | def 
user_message_image_and_str(): 19 |     return ChatMessageUser( 20 |         content=[ContentImage(image="foo"), ContentText(text="Message")] 21 |     ) 22 | 23 | 24 | @pytest.fixture 25 | def assistant_message_str(): 26 |     return ChatMessageAssistant(content="Assistant message") 27 | 28 | 29 | def test_collapse_consecutive_assistant_messages_single_assistant_message( 30 |     assistant_message_str, 31 | ): 32 |     messages = [assistant_message_str] 33 |     assert collapse_consecutive_assistant_messages(messages) == messages 34 | 35 | 36 | def test_collapse_consecutive_assistant_messages_alternating_messages( 37 |     user_message_str, user_message_image_and_str, assistant_message_str 38 | ): 39 |     messages = [user_message_str] 40 |     assert collapse_consecutive_assistant_messages(messages) == messages 41 | 42 |     messages = [user_message_str, assistant_message_str] 43 |     assert collapse_consecutive_assistant_messages(messages) == messages 44 | 45 |     messages = [user_message_str, assistant_message_str, user_message_str] 46 |     assert collapse_consecutive_assistant_messages(messages) == messages 47 | 48 |     messages = [user_message_str, assistant_message_str, user_message_image_and_str] 49 |     assert collapse_consecutive_assistant_messages(messages) == messages 50 | 51 | 52 | def test_collapse_consecutive_assistant_messages_consecutive_assistant_messages( 53 |     assistant_message_str, 54 | ): 55 |     messages = [assistant_message_str, assistant_message_str, assistant_message_str] 56 |     assert len(collapse_consecutive_assistant_messages(messages)) == 1 57 | -------------------------------------------------------------------------------- /src/inspect_ai/_util/platform.py: -------------------------------------------------------------------------------- 1 | import importlib.util 2 | import os 3 | 4 | 5 | def running_in_notebook() -> bool: 6 |     try: 7 |         from IPython import get_ipython  # type: ignore 8 | 9 |         if "IPKernelApp" not in get_ipython().config:  # type: ignore 10 |             return False 11 |     except ImportError: 12 |         return False 13 |     except AttributeError: 14 |         return False 15 |     return True 16 | 17 | 18 | def platform_init() -> None: 19 |     # if we are running in a notebook, confirm that we have ipywidgets 20 |     if running_in_notebook(): 21 |         # check for required packages 22 |         if not have_package("ipywidgets"): 23 |             raise ModuleNotFoundError( 24 |                 "To use inspect_ai within a notebook, please install ipywidgets with:\n\n" 25 |                 + "pip install ipywidgets\n" 26 |             ) 27 | 28 |         # activate nest_asyncio (required so we operate properly within 29 |         # the Jupyter async event loop) 30 |         import nest_asyncio  # type: ignore 31 | 32 |         nest_asyncio.apply() 33 | 34 | 35 | def have_package(package: str) -> bool: 36 |     return importlib.util.find_spec(package) is not None 37 | 38 | 39 | def is_running_in_jupyterlab() -> bool: 40 |     return os.getenv("JPY_SESSION_NAME", None) is not None 41 | 42 | 43 | def is_running_in_vscode() -> bool: 44 |     # Check if running in VS Code Jupyter notebook or interactive window 45 |     if ( 46 |         os.getenv("VSCODE_IPYTHON_KERNEL") is not None 47 |         or os.getenv("VSCODE_CLI_REQUIRE_TOKEN") is not None 48 |         or os.getenv("VSCODE_PID") is not None 49 |         or os.getenv("VSCODE_CWD") is not None 50 |     ): 51 |         return True 52 |     # Check if running in a VS Code terminal 53 |     if os.getenv("TERM_PROGRAM") == "vscode": 54 |         return True 55 | 56 |     # If none of the conditions are met, we assume it's not running in VS Code 57 |     return False 58 | 59 | 60 | def is_windows() -> bool: 61 |     return os.name == "nt" 62 |
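63 | 64 | # Illustrative usage (a hypothetical entry point; only platform_init() above is real): 65 | # 66 | #     from inspect_ai._util.platform import platform_init 67 | # 68 | #     platform_init()  # in a notebook this verifies ipywidgets is installed 69 | #                      # and applies nest_asyncio; elsewhere it is a no-op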
-------------------------------------------------------------------------------- /tests/test_logprobs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from test_helpers.utils import ( 3 |     skip_if_github_action, 4 |     skip_if_no_openai, 5 |     skip_if_no_together, 6 | ) 7 | 8 | from inspect_ai.model import ChatMessageUser, GenerateConfig, ModelOutput, get_model 9 | 10 | 11 | async def generate_with_logprobs(model_name, **model_kwargs) -> ModelOutput: 12 |     model = get_model( 13 |         model_name, 14 |         config=GenerateConfig( 15 |             logprobs=True, top_logprobs=2, temperature=0.001, max_tokens=50 16 |         ), 17 |         **model_kwargs, 18 |     ) 19 | 20 |     message = ChatMessageUser(content="Hello.") 21 |     return await model.generate(input=[message]) 22 | 23 | 24 | @pytest.mark.asyncio 25 | @skip_if_no_openai 26 | async def test_openai_logprobs() -> None: 27 |     response = await generate_with_logprobs("openai/gpt-3.5-turbo") 28 |     assert response.choices[0].logprobs is not None 29 |     assert response.choices[0].logprobs.content[0].top_logprobs is not None 30 |     assert len(response.choices[0].logprobs.content[0].top_logprobs) == 2 31 | 32 | 33 | @pytest.mark.asyncio 34 | @skip_if_no_together 35 | async def test_together_logprobs() -> None: 36 |     response = await generate_with_logprobs("together/lmsys/vicuna-13b-v1.5") 37 |     assert ( 38 |         response.choices[0].logprobs is not None 39 |         and response.choices[0].logprobs.content[0].top_logprobs 40 |         is None  # together only ever returns top-1, so top_logprobs should always be None 41 |     ) 42 | 43 | 44 | @pytest.mark.asyncio 45 | @skip_if_github_action 46 | async def test_hf_logprobs() -> None: 47 |     response = await generate_with_logprobs( 48 |         "hf/EleutherAI/pythia-70m", 49 |         chat_template="{% for message in messages %}{{ message.content }}{% endfor %}", 50 |     ) 51 |     assert ( 52 |         response.choices[0].logprobs 53 |         and response.choices[0].logprobs.content[0].top_logprobs is not None 54 |     ) 55 |     assert len(response.choices[0].logprobs.content[0].top_logprobs) == 2 56 | -------------------------------------------------------------------------------- /tests/test_eval_log.py: -------------------------------------------------------------------------------- 1 | import math 2 | import os 3 | 4 | import pytest 5 | from pydantic_core import PydanticSerializationError 6 | from test_helpers.utils import skip_if_no_openai 7 | 8 | from inspect_ai import Task, eval 9 | from inspect_ai.dataset import Sample 10 | from inspect_ai.log import read_eval_log 11 | from inspect_ai.solver import ( 12 |     Generate, 13 |     Plan, 14 |     TaskState, 15 |     generate, 16 |     solver, 17 | ) 18 | 19 | 20 | def log_path(file: str) -> str: 21 |     # use .txt extension so vscode linter doesn't complain about invalid json 22 |     return os.path.join("tests", "test_eval_log", f"{file}.txt") 23 | 24 | 25 | class NotSerializable: 26 |     name: str 27 | 28 | 29 | @skip_if_no_openai 30 | def test_ignore_unserializable(): 31 |     @solver 32 |     def inject_unserializable(): 33 |         async def solve(state: TaskState, generate: Generate): 34 |             state.metadata["not serializable"] = NotSerializable 35 |             return state 36 | 37 |         return solve 38 | 39 |     task = Task( 40 |         dataset=[Sample(input="Say hello.", target="Hello")], 41 |         plan=Plan(steps=[inject_unserializable(), generate()]), 42 |     ) 43 | 44 |     try: 45 |         eval(task, model="openai/gpt-4") 46 |     except PydanticSerializationError: 47 |         assert False, "Eval raised Pydantic serialization error."
48 | 49 | 50 | def test_read_nan(): 51 | def check_for_nan(log): 52 | assert math.isnan(log.results.metrics.get("accuracy").value) 53 | 54 | log_file = log_path("log_with_nan") 55 | check_for_nan(read_eval_log(log_file)) 56 | check_for_nan(read_eval_log(log_file, header_only=True)) 57 | 58 | 59 | def test_fail_invalid(): 60 | check_log_raises(log_path("log_invalid")) 61 | 62 | 63 | def test_fail_version(): 64 | check_log_raises(log_path("log_version_2")) 65 | 66 | 67 | def check_log_raises(log_file): 68 | with pytest.raises(ValueError): 69 | read_eval_log(log_file) 70 | with pytest.raises(ValueError): 71 | read_eval_log(log_file, header_only=True) 72 | -------------------------------------------------------------------------------- /src/inspect_ai/dataset/_examples/biology_qa.jsonl: -------------------------------------------------------------------------------- 1 | {"id": "q1", "question": "Hansen's disease is more commonly known by which name?", "answer": "Leprosy"} 2 | {"id": "q2", "question": "Botany is the study of what life form?", "answer": "Plants"} 3 | {"id": "q3", "question": "What is the human body's largest organ?", "answer": "Skin"} 4 | {"id": "q4", "question": "True or false: snails have teeth", "answer": "True"} 5 | {"id": "q5", "question": "What part of the human body is the Mandible?", "answer": "Lower Jawbone"} 6 | {"id": "q6", "question": "How many bones does an adult human have?", "answer": "206"} 7 | {"id": "q7", "question": "True or false: jellyfish have hearts", "answer": "False"} 8 | {"id": "q8", "question": "Which French microbiologist discovered the process of pasteurisation?", "answer": "Louis Pasteur"} 9 | {"id": "q9", "question": "What year was the first animal cloned?", "answer": "1996"} 10 | {"id": "q10", "question": "Who discovered penicillin?", "answer": "Alexander Fleming"} 11 | {"id": "q11", "question": "When was the Human Genome project completed?", "answer": "2003"} 12 | {"id": "q12", "question": "How many species are estimated to live on Earth?", "answer": "8.7 million"} 13 | {"id": "q13", "question": "A DNA molecule is described as being what shape?", "answer": "Double helix"} 14 | {"id": "q14", "question": "Heterochromia results in which change in physical appearance?", "answer": "Different coloured eyes"} 15 | {"id": "q15", "question": "Crohn's disease is part of which disease group?", "answer": "Inflammatory bowel disease"} 16 | {"id": "q16", "question": "How many neck vertebrae do giraffes have, compared to a human's seven?", "answer": "Seven"} 17 | {"id": "q17", "question": "Which food substance helps move waste through the body?", "answer": "Fibre"} 18 | {"id": "q18", "question": "The term 'renal' refers to which organs?", "answer": "Kidneys"} 19 | {"id": "q19", "question": "What is the name of the biggest part of the human brain?", "answer": "The cerebrum"} 20 | {"id": "q20", "question": "Can cell walls be found in plant cells, animal cells or both?", "answer": "Plant cells"} 21 | -------------------------------------------------------------------------------- /src/inspect_ai/dataset/_sources/example.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Literal 3 | 4 | from .._dataset import Dataset, FieldSpec, MemoryDataset, RecordToSample 5 | from .csv import csv_dataset 6 | from .json import json_dataset 7 | 8 | EXAMPLES_PATH = Path(__file__).parent.parent / "_examples" 9 | 10 | 11 | def example_dataset( 12 | name: Literal["security_guide", "theory_of_mind", 
"popularity", "biology_qa"], 13 | sample_fields: FieldSpec | RecordToSample | None = None, 14 | ) -> Dataset: 15 | """Read a dataset from inspect_ai package examples. 16 | 17 | This is primarily used for sharing runnable example 18 | snippets that don't need to read an external dataset. 19 | 20 | Args: 21 | name (Literal["security_guide", "theory_of_mind", "popularity", "biology_qa"]): 22 | Example dataset name. One of 'security_guide', 'theory_of_mind', 23 | 'popularity', or 'biology_qa' 24 | sample_fields (SampleFieldSpec | RecordToSample): Method of mapping underlying 25 | fields in the data source to `Sample` objects. Pass `None` if the data is already 26 | stored in `Sample` form (i.e. object with "input" and "target" fields); Pass a 27 | `SampleFieldSpec` to specify mapping fields by name; Pass a `RecordToSample` to 28 | handle mapping with a custom function. 29 | 30 | 31 | Returns: 32 | Dataset read from example file. 33 | """ 34 | json_file = (EXAMPLES_PATH / f"{name}.jsonl").as_posix() 35 | csv_file = (EXAMPLES_PATH / f"{name}.csv").as_posix() 36 | if not Path(json_file).exists() and Path(csv_file).exists(): 37 | raise ValueError(f"Sample dataset {name} not found.") 38 | 39 | if Path(json_file).exists(): 40 | dataset = json_dataset( 41 | json_file=json_file, 42 | sample_fields=sample_fields, 43 | ) 44 | else: 45 | dataset = csv_dataset( 46 | csv_file=csv_file, 47 | sample_fields=sample_fields, 48 | ) 49 | 50 | return MemoryDataset(samples=list(dataset), name=name, location=f"example://{name}") 51 | -------------------------------------------------------------------------------- /tools/vscode/README.md: -------------------------------------------------------------------------------- 1 | # inspect-vscode 2 | 3 | VS Code extension for the Inspect framework for large language model evaluations. This extension provides support for developing evaluations using Inspect, including: 4 | 5 | - Integrated viewer for evaluation log files 6 | - Panel to browse, run, and debug tasks in the workspace 7 | - Panel for editing Inspect `.env` file 8 | - Panel for configuring task CLI options and args 9 | - Commands and key-bindings for running tasks 10 | - Commands and key-bindings for debugging tasks 11 | 12 | ## Log Viewer 13 | 14 | The `inspect view` command is used to automatically display the log for tasks executed within the workspace (this behavior can be controlled with an option). 15 | 16 | ## Task Navigation 17 | 18 | The Tasks panel displays a listing of all the Inspect tasks within your workspace. Selecting the source file or task within the listing will open the task source code in the source editor (or Notebook viewer). You can display a tree of tasks including folders and hierarchy or a flat list of tasks sorted alphabetically. 19 | 20 | ## Configuration Panel 21 | 22 | Use the Configuration (.env) panel to edit common settings in your `.env.` file including the model provider and name, and the log directory and level. 23 | 24 | ## Task Panel 25 | 26 | Use the Task panel to edit CLI options for a task, set task args, and run or debug a task. Values will be saved for each task and used whenever the task is run or debugged from within the Inspect VS Code extension. 27 | 28 | ## Running and Debugging 29 | 30 | The Inspect VS Code extension includes commands and keyboard shortcuts for running or debugging tasks. After the task has been completed, `inspect view` is used behind the scenes to provide a results pane within VS Code alongside your source code. 
37 | 38 | Use the run or debug commands to execute the current task. You can alternatively use the Ctrl+Shift+U keyboard shortcut to run a task, or the Ctrl+Shift+T keyboard shortcut to debug a task. 39 | 40 | > Note that on the Mac you should use `Cmd` rather than `Ctrl` as the prefix for all Inspect keyboard shortcuts. 41 | 42 | 43 | -------------------------------------------------------------------------------- /examples/agents/langchain/README.md: -------------------------------------------------------------------------------- 1 | ## LangChain Agent 2 | 3 | This example demonstrates creating a custom solver that utilises a LangChain agent to perform Q and A using Wikipedia. The example includes the following source files: 4 | 5 | | File                   | Description                                                                                       | 6 | |------------------------|-------------------------------------------------------------------------------------------------| 7 | | `.gitignore`           | Ignore the `.venv` directory and the `.env` file containing environment variables for the eval.  | 8 | | `.env.example`         | Prototype of `.env` file (copy this to `.env` and provide your `TAVILY_API_KEY`).                 | 9 | | `inspect_langchain.py` | Utilities for creating inspect solvers that use LangChain agents.                                 | 10 | | `wikipedia.py`         | Evaluation task and custom solver that uses the search agent.                                     | 11 | | `wikipedia.jsonl`      | Dataset with questions and ideal answers.                                                         | 12 | 13 | To run this example, first be sure to provide a `.env` file that defines a `TAVILY_API_KEY` ([Tavily](https://tavily.com/) is a search API for LLM agents). Note that `.env` files should always be included in `.gitignore` as they often contain secrets! 14 | 15 | Next, create a virtual environment and install the required dependencies: 16 | 17 | ``` bash 18 | $ python3 -m venv .venv 19 | $ source .venv/bin/activate 20 | $ pip install -r requirements.txt 21 | ``` 22 | 23 | Now you should be able to run the example as follows: 24 | 25 | ``` bash 26 | $ inspect eval wikipedia.py --model openai/gpt-4 27 | ``` 28 | 29 | This example will run with any model provider that supports tool use (so Anthropic, Google Gemini, and Mistral will all work as well). 30 | 31 | If you want to run in verbose mode (to see the agent's queries printed out), pass the `verbose` task parameter: 32 | 33 | ``` bash 34 | $ inspect eval wikipedia.py --model openai/gpt-4 -T verbose=true --limit 1 35 | ``` 36 | 37 | Note that we specify `--limit 1` so that the verbose output from multiple samples is not intermixed. -------------------------------------------------------------------------------- /src/inspect_ai/_util/retry.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Callable 3 | 4 | from httpx import ConnectError, ConnectTimeout, HTTPStatusError, ReadTimeout 5 | from tenacity import RetryCallState 6 | 7 | from inspect_ai._util.constants import HTTP 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | def httpx_should_retry(ex: BaseException) -> bool: 13 |     """Check whether an exception raised from httpx should be retried.
14 | 15 | Implements the strategy described here: https://cloud.google.com/storage/docs/retry-strategy 16 | 17 | Args: 18 | ex (BaseException): Exception to examine for retry behavior 19 | 20 | Returns: 21 | True if a retry should occur 22 | """ 23 | # httpx status exception 24 | if isinstance(ex, HTTPStatusError): 25 | # request timeout 26 | if ex.response.status_code == 408: 27 | return True 28 | # lock timeout 29 | elif ex.response.status_code == 409: 30 | return True 31 | # rate limit 32 | elif ex.response.status_code == 429: 33 | return True 34 | # internal errors 35 | elif ex.response.status_code >= 500: 36 | return True 37 | else: 38 | return False 39 | 40 | # connection error 41 | elif is_httpx_connection_error(ex): 42 | return True 43 | 44 | # don't retry 45 | else: 46 | return False 47 | 48 | 49 | def log_rate_limit_retry(context: str, retry_state: RetryCallState) -> None: 50 | logger.log( 51 | HTTP, 52 | f"{context} rate limit retry {retry_state.attempt_number} after waiting for {retry_state.idle_for}", 53 | ) 54 | 55 | 56 | def log_retry_attempt(context: str) -> Callable[[RetryCallState], None]: 57 | def log_attempt(retry_state: RetryCallState) -> None: 58 | logger.log( 59 | HTTP, 60 | f"{context} connection retry {retry_state.attempt_number} after waiting for {retry_state.idle_for}", 61 | ) 62 | 63 | return log_attempt 64 | 65 | 66 | def is_httpx_connection_error(ex: BaseException) -> bool: 67 | return isinstance(ex, ConnectTimeout | ConnectError | ConnectionError | ReadTimeout) 68 | -------------------------------------------------------------------------------- /src/inspect_ai/_view/www/src/usage/UsageCard.mjs: -------------------------------------------------------------------------------- 1 | import { html } from "htm/preact"; 2 | 3 | import { icons } from "../Constants.mjs"; 4 | import { formatTime } from "../utils/Format.mjs"; 5 | import { Card, CardHeader, CardBody } from "../components/Card.mjs"; 6 | import { MetaDataView } from "../components/MetaDataView.mjs"; 7 | import { ModelTokenTable } from "./ModelTokenTable.mjs" 8 | 9 | const kUsageCardBodyId = "usage-card-body"; 10 | 11 | export const UsageCard = ({ stats, context }) => { 12 | if (!stats) { 13 | return ""; 14 | } 15 | 16 | const totalDuration = duration(stats); 17 | 18 | const usageMetadataStyle = { 19 | fontSize: "0.8em", 20 | }; 21 | 22 | return html` 23 | 24 | <${Card}> 25 | <${CardHeader} icon=${icons.usage} label="Usage"/> 26 | <${CardBody} id=${kUsageCardBodyId} style=${{ 27 | paddingTop: "0", 28 | paddingBottom: "0", 29 | borderTop: "solid var(--bs-border-color) 1px", 30 | }}> 31 |
<div 32 |         style=${{ 33 |           paddingTop: "0.4rem", 34 |           paddingBottom: "0.4rem", 35 |           display: "grid", 36 |           gridTemplateColumns: "1fr 1fr", 37 |           columnGap: "1em", 38 |         }} 39 |       > 40 |       <div> 41 |         <div style=${{ fontWeight: "600" }}>Duration</div> 42 |         <${MetaDataView} 43 |           entries="${{ 44 |             ["Start"]: new Date(stats.started_at).toLocaleString(), 45 |             ["End"]: new Date(stats.completed_at).toLocaleString(), 46 |             ["Duration"]: totalDuration, 47 |           }}" 48 |           tableOptions="borderless,sm" 49 |           context=${context} 50 |           style=${usageMetadataStyle} 51 |         /> 52 |       </div> 53 |       <div> 54 |         <${ModelTokenTable} model_usage=${stats.model_usage}/> 55 |       </div> 56 |       </div> 57 |     <//> 58 |   <//> 59 |   `; 60 | }; 61 | 62 | const duration = (stats) => { 63 |   const start = new Date(stats.started_at); 64 |   const end = new Date(stats.completed_at); 65 |   const durationMs = end.getTime() - start.getTime(); 66 |   const durationSec = durationMs / 1000; 67 |   return formatTime(durationSec); 68 | }; 69 | 70 | -------------------------------------------------------------------------------- /tools/vscode/src/core/process.ts: -------------------------------------------------------------------------------- 1 | import { SpawnSyncOptionsWithStringEncoding, spawn, spawnSync } from "child_process"; 2 | import { AbsolutePath } from "./path"; 3 | 4 | 5 | export function runProcess( 6 |   cmd: string | AbsolutePath, 7 |   args: string[], 8 |   cwd?: AbsolutePath 9 | ) { 10 | 11 |   // Process options 12 |   const options: SpawnSyncOptionsWithStringEncoding = { 13 |     cwd: cwd?.path, 14 |     encoding: "utf-8", 15 |     windowsHide: true, 16 |     maxBuffer: 1000 * 1000 * 100 17 |   }; 18 | 19 |   cmd = typeof (cmd) === "string" ? cmd : cmd.path; 20 |   const result = spawnSync(cmd, args, options); 21 |   if (result.error) { 22 |     throw new Error( 23 |       `The process could not be started\n${result.error.message}` 24 |     ); 25 |   } else if (result.status === 0) { 26 |     return result.stdout; 27 |   } else { 28 |     throw new Error( 29 |       `Command failed with code ${result.status}: ${result.stderr}` 30 |     ); 31 |   } 32 | } 33 | 34 | 35 | export function spawnProcess( 36 |   cmd: string, 37 |   args: string[], 38 |   cwd: AbsolutePath, 39 |   io?: { 40 |     stdout?: (data: Buffer | string) => void; 41 |     stderr?: (data: Buffer | string) => void; 42 |   }, 43 |   lifecycle?: { 44 |     onError?: (error: Error) => void; 45 |     onClose?: (code: number) => void; 46 |   } 47 | ) { 48 |   // Process options 49 |   const options = { 50 |     cwd: cwd.path, 51 |     detached: true, 52 |   }; 53 | 54 |   // Start the actual process 55 |   const process = spawn(cmd, args, options); 56 | 57 |   // Capture stdout 58 |   if (process.stdout) { 59 |     if (io?.stdout) { 60 |       process.stdout.on("data", io.stdout); 61 |     } 62 |   } else { 63 |     throw new Error("Unexpectedly missing stdout from server"); 64 |   } 65 | 66 |   // Capture stderr 67 |   if (process.stderr) { 68 |     if (io?.stderr) { 69 |       process.stderr.on("data", io.stderr); 70 |     } 71 |   } else { 72 |     throw new Error("Unexpectedly missing stderr from server"); 73 |   } 74 | 75 |   // Note errors 76 |   if (lifecycle?.onError) { 77 |     process.on("error", lifecycle.onError); 78 |   } 79 | 80 |   if (lifecycle?.onClose) { 81 |     process.on("close", lifecycle?.onClose); 82 |   } 83 |   return process; 84 | } --------------------------------------------------------------------------------