├── .devcontainer ├── .gitignore ├── Dockerfile └── devcontainer.json ├── .dockerignore ├── .dvc ├── .gitignore └── config ├── .dvcignore ├── .editorconfig ├── .github └── workflows │ ├── ci.yml │ └── sync-data.yml ├── .gitignore ├── CODEOWNERS ├── README.md ├── data ├── .gitignore ├── external │ ├── .gitignore │ ├── all_runs.jsonl │ ├── ensembled_annotations_public.csv │ ├── messiness.csv │ ├── messiness_tasks.csv │ ├── release_dates.yaml │ └── swebench_baseline_times.yaml ├── metrics │ └── messiness │ │ └── .gitignore ├── processed │ └── ga_swebench_comparison_table.tex └── wrangled │ └── ga_swebench_comparison_table.tex ├── dvc.lock ├── dvc.yaml ├── example_analysis.ipynb ├── fig_params └── figs.yaml ├── matplotlibrc ├── metrics ├── baseline_statistics.yaml ├── costs │ ├── savings_info.csv │ └── savings_non_bucketed_info.csv ├── horizon_trend │ └── .gitignore └── multiverse_boxplot.yaml ├── params.yaml ├── plots └── .gitignore ├── poetry.lock ├── pyproject.toml ├── src ├── __init__.py ├── calculate_baseline_statistics.py ├── fetch_swe_bench_runs.py ├── ga_swebench_comparison_table.py ├── plot │ ├── bar_chart.py │ ├── bar_chart_weighted_scores.py │ ├── bootstrap_ci.py │ ├── cost.py │ ├── generate_model_task_table.py │ ├── individual_binned_residuals.py │ ├── individual_histograms.py │ ├── individual_qqs.py │ ├── logistic.py │ ├── logistic_alternative_fits.py │ ├── logistic_individual.py │ ├── messiness.py │ ├── multiverse_boxplot.py │ ├── success_correlations.py │ ├── success_rates.py │ ├── success_trend_by_messiness.py │ ├── success_trend_by_messiness_and_length.py │ └── task_distribution.py ├── utils │ ├── __init__.py │ ├── logistic.py │ └── plots.py └── wrangle │ ├── bootstrap.py │ ├── cost.py │ ├── logistic.py │ └── multiverse_boxplot.py └── tests ├── __init__.py ├── test_bootstrap.py ├── test_calculate_baseline_statistics.py └── test_logistic.py /.devcontainer/.gitignore: -------------------------------------------------------------------------------- 1 | !/* -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.devcontainer/Dockerfile -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.devcontainer/devcontainer.json -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | * 2 | 3 | !poetry.lock 4 | !pyproject.toml 5 | !README.md 6 | -------------------------------------------------------------------------------- /.dvc/.gitignore: -------------------------------------------------------------------------------- 1 | /config.local 2 | /tmp 3 | /cache 4 | -------------------------------------------------------------------------------- /.dvc/config: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.dvc/config -------------------------------------------------------------------------------- /.dvcignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.dvcignore -------------------------------------------------------------------------------- /.editorconfig: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.editorconfig -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.github/workflows/ci.yml -------------------------------------------------------------------------------- /.github/workflows/sync-data.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.github/workflows/sync-data.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/.gitignore -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @METR/ctr-pipeline-devs 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/README.md -------------------------------------------------------------------------------- /data/.gitignore: -------------------------------------------------------------------------------- 1 | *.csv 2 | *.jsonl 3 | *.yaml 4 | /swe_bench_results 5 | -------------------------------------------------------------------------------- /data/external/.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/.gitignore -------------------------------------------------------------------------------- /data/external/all_runs.jsonl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/all_runs.jsonl -------------------------------------------------------------------------------- /data/external/ensembled_annotations_public.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/ensembled_annotations_public.csv -------------------------------------------------------------------------------- /data/external/messiness.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/messiness.csv -------------------------------------------------------------------------------- /data/external/messiness_tasks.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/messiness_tasks.csv -------------------------------------------------------------------------------- /data/external/release_dates.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/release_dates.yaml -------------------------------------------------------------------------------- /data/external/swebench_baseline_times.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/external/swebench_baseline_times.yaml -------------------------------------------------------------------------------- /data/metrics/messiness/.gitignore: -------------------------------------------------------------------------------- 1 | /analysis_results.txt 2 | -------------------------------------------------------------------------------- /data/processed/ga_swebench_comparison_table.tex: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/data/processed/ga_swebench_comparison_table.tex -------------------------------------------------------------------------------- /data/wrangled/ga_swebench_comparison_table.tex: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /dvc.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/dvc.lock -------------------------------------------------------------------------------- /dvc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/dvc.yaml -------------------------------------------------------------------------------- /example_analysis.ipynb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/example_analysis.ipynb -------------------------------------------------------------------------------- /fig_params/figs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/fig_params/figs.yaml -------------------------------------------------------------------------------- /matplotlibrc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/matplotlibrc -------------------------------------------------------------------------------- /metrics/baseline_statistics.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/metrics/baseline_statistics.yaml -------------------------------------------------------------------------------- /metrics/costs/savings_info.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/metrics/costs/savings_info.csv -------------------------------------------------------------------------------- /metrics/costs/savings_non_bucketed_info.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/metrics/costs/savings_non_bucketed_info.csv -------------------------------------------------------------------------------- /metrics/horizon_trend/.gitignore: -------------------------------------------------------------------------------- 1 | /multiverse.yaml 2 | -------------------------------------------------------------------------------- /metrics/multiverse_boxplot.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/metrics/multiverse_boxplot.yaml -------------------------------------------------------------------------------- /params.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/params.yaml -------------------------------------------------------------------------------- /plots/.gitignore: -------------------------------------------------------------------------------- 1 | *.png 2 | /model_task_table.tex 3 | -------------------------------------------------------------------------------- /poetry.lock: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/poetry.lock -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/pyproject.toml -------------------------------------------------------------------------------- /src/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/calculate_baseline_statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/calculate_baseline_statistics.py -------------------------------------------------------------------------------- /src/fetch_swe_bench_runs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/fetch_swe_bench_runs.py -------------------------------------------------------------------------------- /src/ga_swebench_comparison_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/ga_swebench_comparison_table.py -------------------------------------------------------------------------------- /src/plot/bar_chart.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/bar_chart.py -------------------------------------------------------------------------------- /src/plot/bar_chart_weighted_scores.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/bar_chart_weighted_scores.py -------------------------------------------------------------------------------- /src/plot/bootstrap_ci.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/bootstrap_ci.py -------------------------------------------------------------------------------- /src/plot/cost.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/cost.py -------------------------------------------------------------------------------- /src/plot/generate_model_task_table.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/generate_model_task_table.py -------------------------------------------------------------------------------- /src/plot/individual_binned_residuals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/individual_binned_residuals.py -------------------------------------------------------------------------------- /src/plot/individual_histograms.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/individual_histograms.py -------------------------------------------------------------------------------- /src/plot/individual_qqs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/individual_qqs.py -------------------------------------------------------------------------------- /src/plot/logistic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/logistic.py -------------------------------------------------------------------------------- /src/plot/logistic_alternative_fits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/logistic_alternative_fits.py -------------------------------------------------------------------------------- /src/plot/logistic_individual.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/logistic_individual.py -------------------------------------------------------------------------------- /src/plot/messiness.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/messiness.py -------------------------------------------------------------------------------- /src/plot/multiverse_boxplot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/multiverse_boxplot.py -------------------------------------------------------------------------------- /src/plot/success_correlations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/success_correlations.py -------------------------------------------------------------------------------- /src/plot/success_rates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/success_rates.py -------------------------------------------------------------------------------- /src/plot/success_trend_by_messiness.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/success_trend_by_messiness.py -------------------------------------------------------------------------------- /src/plot/success_trend_by_messiness_and_length.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/success_trend_by_messiness_and_length.py -------------------------------------------------------------------------------- /src/plot/task_distribution.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/plot/task_distribution.py -------------------------------------------------------------------------------- /src/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/utils/logistic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/utils/logistic.py -------------------------------------------------------------------------------- /src/utils/plots.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/utils/plots.py -------------------------------------------------------------------------------- /src/wrangle/bootstrap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/wrangle/bootstrap.py -------------------------------------------------------------------------------- /src/wrangle/cost.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/wrangle/cost.py -------------------------------------------------------------------------------- /src/wrangle/logistic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/wrangle/logistic.py -------------------------------------------------------------------------------- /src/wrangle/multiverse_boxplot.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/src/wrangle/multiverse_boxplot.py -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/test_bootstrap.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/tests/test_bootstrap.py -------------------------------------------------------------------------------- /tests/test_calculate_baseline_statistics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/tests/test_calculate_baseline_statistics.py -------------------------------------------------------------------------------- /tests/test_logistic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/METR/eval-analysis-public/HEAD/tests/test_logistic.py --------------------------------------------------------------------------------