├── .gitattributes ├── .gitignore ├── LICENSE ├── README.md ├── benchmark_output ├── runs │ ├── latest │ └── run1 │ │ ├── air_bench_2024:model=openai_gpt-4o-2024-05-13 │ │ ├── display_predictions.json │ │ ├── display_requests.json │ │ ├── instances.json │ │ ├── per_instance_stats.json │ │ ├── run_spec.json │ │ ├── scenario.json │ │ ├── scenario_state.json │ │ └── stats.json │ │ ├── costs.json │ │ ├── groups.json │ │ ├── groups_metadata.json │ │ ├── run_specs.json │ │ ├── runs.json │ │ ├── runs_to_run_suites.json │ │ ├── schema.json │ │ └── summary.json └── scenarios │ └── air_bench_2024 │ └── data │ ├── benchmark_output_scenarios_air_bench_2024_data_stanford-crfm___air-bench-2024_default_0.0.0_58d507aa176c4b6a46d2050645eb0ee6ae71442c.lock │ └── stanford-crfm___air-bench-2024 │ └── default │ └── 0.0.0 │ ├── 58d507aa176c4b6a46d2050645eb0ee6ae71442c.incomplete_info.lock │ ├── 58d507aa176c4b6a46d2050645eb0ee6ae71442c │ ├── air-bench-2024-test.arrow │ └── dataset_info.json │ └── 58d507aa176c4b6a46d2050645eb0ee6ae71442c_builder.lock ├── evaluation ├── pipeline1_step1_Llama-3-8b_response.json ├── pipeline1_step1_model_response.ipynb ├── pipeline1_step2_Llama-3-8b_result.json ├── pipeline1_step2_QA_eval.ipynb ├── pipeline2_step1_gpt-4-turbo_response.csv ├── pipeline2_step1_model_response.ipynb ├── pipeline2_step2_csv_eval.ipynb ├── pipeline2_step2_gpt-4-turbo_result.csv └── utils.py ├── prod_env ├── accounts.sqlite └── cache │ ├── openai.sqlite │ └── output │ └── airbench2024 │ └── data │ ├── prod_env_cache_output_airbench2024_data_stanford-crfm___air-bench-2024_judge_prompts_0.0.0_58d507aa176c4b6a46d2050645eb0ee6ae71442c.lock │ └── stanford-crfm___air-bench-2024 │ └── judge_prompts │ └── 0.0.0 │ ├── 58d507aa176c4b6a46d2050645eb0ee6ae71442c.incomplete_info.lock │ ├── 58d507aa176c4b6a46d2050645eb0ee6ae71442c │ ├── air-bench-2024-test.arrow │ └── dataset_info.json │ └── 58d507aa176c4b6a46d2050645eb0ee6ae71442c_builder.lock └── risk_mapping.json /.gitattributes: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/.gitattributes -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/README.md -------------------------------------------------------------------------------- /benchmark_output/runs/latest: -------------------------------------------------------------------------------- 1 | run1 -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/display_predictions.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/display_predictions.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/display_requests.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/display_requests.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/instances.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/instances.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/per_instance_stats.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/per_instance_stats.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/run_spec.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/run_spec.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/scenario.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/scenario.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/scenario_state.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/scenario_state.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/stats.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/air_bench_2024:model=openai_gpt-4o-2024-05-13/stats.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/costs.json: -------------------------------------------------------------------------------- 1 | {} -------------------------------------------------------------------------------- /benchmark_output/runs/run1/groups.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/groups.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/groups_metadata.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/groups_metadata.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/run_specs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/run_specs.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/runs.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/runs.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/runs_to_run_suites.json: -------------------------------------------------------------------------------- 1 | { 2 | "air_bench_2024:model=openai_gpt-4o-2024-05-13": "run1" 3 | } -------------------------------------------------------------------------------- /benchmark_output/runs/run1/schema.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/schema.json -------------------------------------------------------------------------------- /benchmark_output/runs/run1/summary.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/runs/run1/summary.json -------------------------------------------------------------------------------- /benchmark_output/scenarios/air_bench_2024/data/benchmark_output_scenarios_air_bench_2024_data_stanford-crfm___air-bench-2024_default_0.0.0_58d507aa176c4b6a46d2050645eb0ee6ae71442c.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmark_output/scenarios/air_bench_2024/data/stanford-crfm___air-bench-2024/default/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c.incomplete_info.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmark_output/scenarios/air_bench_2024/data/stanford-crfm___air-bench-2024/default/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/air-bench-2024-test.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/scenarios/air_bench_2024/data/stanford-crfm___air-bench-2024/default/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/air-bench-2024-test.arrow -------------------------------------------------------------------------------- /benchmark_output/scenarios/air_bench_2024/data/stanford-crfm___air-bench-2024/default/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/dataset_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/benchmark_output/scenarios/air_bench_2024/data/stanford-crfm___air-bench-2024/default/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/dataset_info.json -------------------------------------------------------------------------------- /benchmark_output/scenarios/air_bench_2024/data/stanford-crfm___air-bench-2024/default/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c_builder.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /evaluation/pipeline1_step1_Llama-3-8b_response.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline1_step1_Llama-3-8b_response.json -------------------------------------------------------------------------------- /evaluation/pipeline1_step1_model_response.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline1_step1_model_response.ipynb -------------------------------------------------------------------------------- /evaluation/pipeline1_step2_Llama-3-8b_result.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline1_step2_Llama-3-8b_result.json -------------------------------------------------------------------------------- /evaluation/pipeline1_step2_QA_eval.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline1_step2_QA_eval.ipynb -------------------------------------------------------------------------------- /evaluation/pipeline2_step1_gpt-4-turbo_response.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline2_step1_gpt-4-turbo_response.csv -------------------------------------------------------------------------------- /evaluation/pipeline2_step1_model_response.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline2_step1_model_response.ipynb -------------------------------------------------------------------------------- /evaluation/pipeline2_step2_csv_eval.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline2_step2_csv_eval.ipynb -------------------------------------------------------------------------------- /evaluation/pipeline2_step2_gpt-4-turbo_result.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/pipeline2_step2_gpt-4-turbo_result.csv -------------------------------------------------------------------------------- /evaluation/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/evaluation/utils.py -------------------------------------------------------------------------------- /prod_env/accounts.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/prod_env/accounts.sqlite -------------------------------------------------------------------------------- /prod_env/cache/openai.sqlite: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/prod_env/cache/openai.sqlite -------------------------------------------------------------------------------- /prod_env/cache/output/airbench2024/data/prod_env_cache_output_airbench2024_data_stanford-crfm___air-bench-2024_judge_prompts_0.0.0_58d507aa176c4b6a46d2050645eb0ee6ae71442c.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prod_env/cache/output/airbench2024/data/stanford-crfm___air-bench-2024/judge_prompts/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c.incomplete_info.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /prod_env/cache/output/airbench2024/data/stanford-crfm___air-bench-2024/judge_prompts/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/air-bench-2024-test.arrow: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/prod_env/cache/output/airbench2024/data/stanford-crfm___air-bench-2024/judge_prompts/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/air-bench-2024-test.arrow -------------------------------------------------------------------------------- /prod_env/cache/output/airbench2024/data/stanford-crfm___air-bench-2024/judge_prompts/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/dataset_info.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/prod_env/cache/output/airbench2024/data/stanford-crfm___air-bench-2024/judge_prompts/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c/dataset_info.json -------------------------------------------------------------------------------- /prod_env/cache/output/airbench2024/data/stanford-crfm___air-bench-2024/judge_prompts/0.0.0/58d507aa176c4b6a46d2050645eb0ee6ae71442c_builder.lock: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /risk_mapping.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/stanford-crfm/air-bench-2024/HEAD/risk_mapping.json --------------------------------------------------------------------------------