├── benchbench
│   ├── __init__.py
│   ├── utils
│   │   ├── __init__.py
│   │   ├── win_rate.py
│   │   ├── base.py
│   │   └── metric.py
│   ├── data
│   │   ├── openllm
│   │   │   ├── statistic.py
│   │   │   ├── format.py
│   │   │   ├── __init__.py
│   │   │   └── leaderboard.tsv
│   │   ├── bbh
│   │   │   ├── __init__.py
│   │   │   ├── format.py
│   │   │   ├── cols.txt
│   │   │   └── statistic.py
│   │   ├── vtab
│   │   │   ├── __init__.py
│   │   │   └── leaderboard.tsv
│   │   ├── bigcode
│   │   │   ├── __init__.py
│   │   │   ├── format.py
│   │   │   ├── leaderboard.tsv
│   │   │   └── vanilla.txt
│   │   ├── mmlu
│   │   │   ├── __init__.py
│   │   │   ├── format.py
│   │   │   └── leaderboard_raw.csv
│   │   ├── mteb
│   │   │   ├── format.py
│   │   │   ├── __init__.py
│   │   │   └── leaderboard.tsv
│   │   ├── helm_lite
│   │   │   ├── format.py
│   │   │   ├── __init__.py
│   │   │   └── leaderboard.tsv
│   │   ├── helm_capability
│   │   │   ├── format.py
│   │   │   ├── __init__.py
│   │   │   ├── leaderboard.tsv
│   │   │   └── vanilla.txt
│   │   ├── heim
│   │   │   ├── __init__.py
│   │   │   ├── quality_human.tsv
│   │   │   ├── quality_auto.tsv
│   │   │   ├── originality.tsv
│   │   │   ├── black_out.tsv
│   │   │   ├── nsfw.tsv
│   │   │   ├── nudity.tsv
│   │   │   ├── aesthetics_human.tsv
│   │   │   ├── alignment_human.tsv
│   │   │   └── alignment_auto.tsv
│   │   ├── superglue
│   │   │   ├── __init__.py
│   │   │   └── leaderboard.tsv
│   │   ├── imagenet
│   │   │   ├── format.py
│   │   │   ├── __init__.py
│   │   │   ├── run_imagenet.py
│   │   │   └── leaderboard_raw.tsv
│   │   ├── helm
│   │   │   ├── __init__.py
│   │   │   ├── toxicity.tsv
│   │   │   ├── calibration.tsv
│   │   │   ├── efficiency.tsv
│   │   │   ├── summarization.tsv
│   │   │   ├── fairness.tsv
│   │   │   ├── robustness.tsv
│   │   │   └── accuracy.tsv
│   │   ├── glue
│   │   │   ├── __init__.py
│   │   │   └── leaderboard.tsv
│   │   ├── dummy
│   │   │   └── __init__.py
│   │   └── __init__.py
│   └── measures
│       ├── cardinal.py
│       └── ordinal.py
├── MANIFEST.in
├── assets
│   ├── banner.png
│   └── benchbench-horizontal.png
├── docs
│   ├── data.rst
│   ├── index.rst
│   ├── measures.rst
│   ├── Makefile
│   ├── utils.rst
│   ├── make.bat
│   └── conf.py
├── LICENSE.txt
├── pyproject.toml
├── README.md
└── .gitignore
/benchbench/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/benchbench/utils/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include benchbench/data/*
2 | include benchbench/data/*/*
3 |
--------------------------------------------------------------------------------
/assets/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/socialfoundations/benchbench/HEAD/assets/banner.png
--------------------------------------------------------------------------------
/assets/benchbench-horizontal.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/socialfoundations/benchbench/HEAD/assets/benchbench-horizontal.png
--------------------------------------------------------------------------------
/benchbench/data/openllm/statistic.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 |
3 | dataset = load_dataset("gsm8k", name="main", split="test")
4 | print("gsm8k")
5 | print(len(set([eval(i.split("#### ")[-1]) for i in dataset["answer"]])), len(dataset))
6 |
--------------------------------------------------------------------------------
/benchbench/data/bbh/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
5 | def load_bbh():
6 | data = pd.read_csv(
7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
8 | sep="\t",
9 | )
10 | cols = data.columns[6:]
11 | return data, cols
12 |
--------------------------------------------------------------------------------
/docs/data.rst:
--------------------------------------------------------------------------------
1 | Data
2 | =======================================
3 |
4 | benchbench.data
5 | --------------------------------------------
6 | .. automodule:: benchbench.data
7 | :members:
8 | :undoc-members:
9 | :show-inheritance:
10 |
11 | .. autoattribute:: benchbench.data.cardinal_benchmark_list
12 |
13 | .. autoattribute:: benchbench.data.ordinal_benchmark_list
14 |
15 |
--------------------------------------------------------------------------------
/docs/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to BenchBench's documentation!
2 | =========================================
3 |
4 | .. include:: ../README.md
5 | :parser: myst_parser.sphinx_
6 |
7 | .. toctree::
8 | :maxdepth: 2
9 | :caption: Contents:
10 |
11 | data
12 | measures
13 | utils
14 |
15 |
16 | Indices and tables
17 | --------------------------------------------
18 |
19 | * :ref:`genindex`
20 | * :ref:`modindex`
21 | * :ref:`search`
22 |
--------------------------------------------------------------------------------
/benchbench/data/vtab/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
5 | def load_vtab():
6 | data = pd.read_csv(
7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
8 | sep="\t",
9 | )
10 | cols = data.columns[1:]
11 | return data, cols
12 |
13 |
14 | def test():
15 | data, cols = load_vtab()
16 | print(data.head())
17 | print(cols)
18 |
19 |
20 | if __name__ == "__main__":
21 | test()
22 |
--------------------------------------------------------------------------------
/benchbench/data/bigcode/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
5 | def load_bigcode():
6 | data = pd.read_csv(
7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
8 | sep="\t",
9 | )
10 | cols = data.columns[3:6]
11 | return data, cols
12 |
13 |
14 | def test():
15 | data, cols = load_bigcode()
16 | print(data.head())
17 | print(cols)
18 |
19 |
20 | if __name__ == "__main__":
21 | test()
22 |
--------------------------------------------------------------------------------
/benchbench/data/mmlu/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
5 | def load_mmlu():
6 | data = pd.read_csv(
7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
8 | sep="\t",
9 | )
10 | cols = data.columns[4:]
11 | data[cols] = data[cols] * 100.0
12 | return data, cols
13 |
14 |
15 | def test():
16 | data, cols = load_mmlu()
17 | print(data.head())
18 | print(cols)
19 |
20 |
21 | if __name__ == "__main__":
22 | test()
23 |
--------------------------------------------------------------------------------
/benchbench/data/mteb/format.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | fout = open(os.path.join(os.getcwd(), "/leaderboard.tsv"), "w")
4 | with open(os.path.join(os.getcwd(), "/vanilla.txt"), "r") as fin:
5 | for i, line in enumerate(fin.readlines()):
6 | line = line.strip().replace("\t", " ")
7 | if len(line) != 0:
8 | fout.write(line)
9 | else:
10 | fout.write("-")
11 | if i % 14 == 13:
12 | fout.write("\n")
13 | else:
14 | fout.write("\t")
15 |
--------------------------------------------------------------------------------
/benchbench/data/bigcode/format.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w")
4 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin:
5 | for i, line in enumerate(fin.readlines()):
6 | line = line.strip().replace("\t", " ")
7 | if len(line) != 0:
8 | fout.write(line.split()[0])
9 | else:
10 | continue
11 | if i % 8 == 7:
12 | fout.write("\n")
13 | else:
14 | fout.write("\t")
15 |
--------------------------------------------------------------------------------
/benchbench/data/openllm/format.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w")
4 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin:
5 | for i, line in enumerate(fin.readlines()):
6 | line = line.strip().replace("\t", " ")
7 | if len(line) != 0:
8 | fout.write(line.split()[0])
9 | else:
10 | continue
11 | if i % 10 == 9:
12 | fout.write("\n")
13 | else:
14 | fout.write("\t")
15 |
--------------------------------------------------------------------------------
/benchbench/data/helm_lite/format.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | fout = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), "w")
4 | with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "vanilla.txt"), "r") as fin:
5 | cols = []
6 | helm_lite = dict()
7 | for i, line in enumerate(fin.readlines()):
8 | line = line.strip()
9 | if len(line) == 0:
10 | continue
11 | fout.write(line)
12 | if i % 12 == 11:
13 | fout.write("\n")
14 | else:
15 | fout.write("\t")
16 |
--------------------------------------------------------------------------------
/docs/measures.rst:
--------------------------------------------------------------------------------
1 | Measures
2 | =============
3 |
4 | .. automodule:: benchbench.measures
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | benchbench.measures.cardinal
10 | --------------------------------------------
11 |
12 | .. automodule:: benchbench.measures.cardinal
13 | :members:
14 | :undoc-members:
15 | :show-inheritance:
16 |
17 | benchbench.measures.ordinal
18 | --------------------------------------------
19 |
20 | .. automodule:: benchbench.measures.ordinal
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
--------------------------------------------------------------------------------
/benchbench/data/openllm/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
5 | def load_openllm():
6 | data = pd.read_csv(
7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
8 | sep="\t",
9 | )
10 | cols = data.columns[3:]
11 | data["average_score"] = data[cols].mean(1)
12 | data.sort_values(by="average_score", inplace=True, ascending=False)
13 | return data, cols
14 |
15 |
16 | def test():
17 | data, cols = load_openllm()
18 | print(data.head())
19 | print(cols)
20 |
21 |
22 | if __name__ == "__main__":
23 | test()
24 |
--------------------------------------------------------------------------------
/benchbench/data/helm_capability/format.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 |
4 | fout = open(
5 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), "w"
6 | )
7 | with open(
8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "vanilla.txt"), "r"
9 | ) as fin:
10 | cols = []
11 | helm_lite = dict()
12 | for i, line in enumerate(fin.readlines()):
13 | line = line.strip()
14 | if len(line) == 0:
15 | continue
16 | fout.write(line)
17 | if i % 7 == 6:
18 | fout.write("\n")
19 | else:
20 | fout.write("\t")
21 |
--------------------------------------------------------------------------------
/benchbench/data/bbh/format.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 |
4 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w")
5 | with open(os.path.join(os.getcwd(), "cols.txt"), "r") as fin:
6 |     fout.write(fin.readline().rstrip("\n") + "\n")
7 | with open(os.path.join(os.getcwd(), "vanilla.tsv"), "r") as fin:
8 | new_line = ""
9 | for i, line in enumerate(fin.readlines()):
10 | if i % 5 <= 3:
11 | new_line += line.strip()
12 | new_line += "\t"
13 | else:
14 |             new_line += re.sub(r"\s+", "\t", line)
15 | fout.write(new_line.rstrip() + "\n")
16 | new_line = ""
17 |
--------------------------------------------------------------------------------
/benchbench/data/bbh/cols.txt:
--------------------------------------------------------------------------------
1 | Rank Model Company Release Parameters Average Boolean Expressions Causal Judgement Date Understanding Disambiguation QA Dyck Languages Formal Fallacies Geometric Shapes Hyperbaton Logical Deduction Three Objects Logical Deduction Five Objects Logical Deduction Seven Objects Movie Recommendation Multistep Arithmetic Two Navigate Object Counting Penguins In A Table Reasoning About Colored Objects Ruin Names Salient Translation Error Detection Snarks Sports Understanding Temporal Sequences Tracking Shuffled Objects Three Objects Tracking Shuffled Objects Five Objects Tracking Shuffled Objects Seven Objects Web Of Lies Word Sorting
2 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/benchbench/data/helm_lite/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | def load_helm_lite():
7 | data = pd.read_csv(
8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
9 | sep="\t",
10 | )
11 | data = data.replace("-", np.nan)
12 | data = data.dropna(axis=0, how="all")
13 | data = data.dropna(axis=1, how="all")
14 | cols = data.columns[2:]
15 |
16 | for c in cols:
17 | data[c] = np.array([float(i) for i in data[c].values])
18 |
19 | return data, cols
20 |
21 |
22 | def test():
23 | data, cols = load_helm_lite()
24 | print(data.head())
25 | print(cols)
26 |
27 |
28 | if __name__ == "__main__":
29 | test()
30 |
--------------------------------------------------------------------------------
/docs/utils.rst:
--------------------------------------------------------------------------------
1 | Utils
2 | =============
3 |
4 | .. automodule:: benchbench.utils
5 | :members:
6 | :undoc-members:
7 | :show-inheritance:
8 |
9 | benchbench.utils.base
10 | --------------------------------------------
11 |
12 | .. automodule:: benchbench.utils.base
13 | :members:
14 | :undoc-members:
15 | :show-inheritance:
16 |
17 | benchbench.utils.metric
18 | --------------------------------------------
19 |
20 | .. automodule:: benchbench.utils.metric
21 | :members:
22 | :undoc-members:
23 | :show-inheritance:
24 |
25 | benchbench.utils.win_rate
26 | --------------------------------------------
27 |
28 | .. automodule:: benchbench.utils.win_rate
29 | :members:
30 | :undoc-members:
31 | :show-inheritance:
32 |
--------------------------------------------------------------------------------
/benchbench/data/helm_capability/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | def load_helm_capability():
7 | data = pd.read_csv(
8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
9 | sep="\t",
10 | )
11 | data = data.replace("-", np.nan)
12 | data = data.dropna(axis=0, how="all")
13 | data = data.dropna(axis=1, how="all")
14 | cols = data.columns[2:]
15 |
16 | for c in cols:
17 | data[c] = np.array([float(i) for i in data[c].values])
18 |
19 | return data, cols
20 |
21 |
22 | def test():
23 | data, cols = load_helm_capability()
24 | print(data.head())
25 | print(cols)
26 |
27 |
28 | if __name__ == "__main__":
29 | test()
30 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=.
11 | set BUILDDIR=_build
12 |
13 | %SPHINXBUILD% >NUL 2>NUL
14 | if errorlevel 9009 (
15 | echo.
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
17 | echo.installed, then set the SPHINXBUILD environment variable to point
18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
19 | echo.may add the Sphinx directory to PATH.
20 | echo.
21 | echo.If you don't have Sphinx installed, grab it from
22 | echo.https://www.sphinx-doc.org/
23 | exit /b 1
24 | )
25 |
26 | if "%1" == "" goto help
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/benchbench/data/mteb/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 |
4 |
5 | def load_mteb():
6 | data = pd.read_csv(
7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
8 | sep="\t",
9 | )
10 | orig_cols = data.columns[6:]
11 | ret = {}
12 | cols = []
13 | for c in orig_cols:
14 | col_name = c.split(" (")[0]
15 | num_task = int(c.split(" (")[1].split(" ")[0])
16 | for i in range(num_task):
17 | ret["{}-{}".format(col_name, i)] = data[c].values.copy()
18 | cols.append("{}-{}".format(col_name, i))
19 | data = pd.concat([data, pd.DataFrame(ret)], axis=1)
20 |
21 | data["average_score"] = data[cols].mean(1)
22 | data.sort_values(by="average_score", inplace=True, ascending=False)
23 | return data, cols
24 |
25 |
26 | def test():
27 | data, cols = load_mteb()
28 | print(data.head())
29 | print(cols)
30 |
31 |
32 | if __name__ == "__main__":
33 | test()
34 |
--------------------------------------------------------------------------------
/benchbench/data/heim/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | def load_heim(subset="alignment_human"):
7 | assert subset in [
8 | "alignment_auto",
9 | "nsfw",
10 | "quality_auto",
11 | "aesthetics_auto",
12 | "alignment_human",
13 | "nudity",
14 | "quality_human",
15 | "aesthetics_human",
16 | "black_out",
17 | "originality",
18 | ]
19 | data = pd.read_csv(
20 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "%s.tsv" % subset),
21 | sep="\t",
22 | )
23 | data = data.replace("-", np.nan)
24 | data = data.dropna(axis=0, how="all")
25 | data = data.dropna(axis=1, how="all")
26 | cols = data.columns[2:]
27 | for c in cols:
28 | if "↓" in c:
29 | data[c] = -data[c]
30 | return data, cols
31 |
32 |
33 | def test():
34 | data, cols = load_heim()
35 | print(data.head())
36 | print(cols)
37 |
38 |
39 | if __name__ == "__main__":
40 | test()
41 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Guanhua Zhang and Moritz Hardt
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/benchbench/data/superglue/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
6 | def load_superglue():
7 | data = pd.read_csv(
8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
9 | sep="\t",
10 | )
11 | ori_cols = data.columns[5:-2]
12 | cols = []
13 | for c in ori_cols:
14 | if type(data[c].values[0]) is str and "/" in data[c].values[0]:
15 | c1 = c + "-a"
16 | c2 = c + "-b"
17 | res1, res2 = [], []
18 | for line in data[c].values:
19 | s = line.strip().split("/")
20 | res1.append(float(s[0]))
21 | res2.append(float(s[1]))
22 | res1 = np.array(res1)
23 | res2 = np.array(res2)
24 | data[c1] = res1
25 | data[c2] = res2
26 | data[c] = (res1 + res2) / 2
27 | cols.append(c)
28 | else:
29 | cols.append(c)
30 |
31 | return data, cols
32 |
33 |
34 | def test():
35 | data, cols = load_superglue()
36 | print(data.head())
37 | print(cols)
38 |
39 |
40 | if __name__ == "__main__":
41 | test()
42 |
--------------------------------------------------------------------------------
/benchbench/data/imagenet/format.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import pandas as pd
4 |
5 | fout = open(os.path.join(os.getcwd(), "leaderboard_raw.tsv"), "w")
6 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin:
7 | new_line = ""
8 | for i, line in enumerate(fin.readlines()):
9 | if i % 12 <= 10:
10 | new_line += line.strip()
11 | if len(line.strip()) != 0:
12 | new_line += "\t"
13 | else:
14 |             new_line += re.sub(r"\s+", "\t", line)
15 | fout.write(new_line.rstrip() + "\n")
16 | new_line = ""
17 | fout.close()
18 |
19 | data = pd.read_csv(os.path.join(os.getcwd(), "leaderboard_raw.tsv"), sep="\t")
20 | data.sort_values(by=["Acc@1"], inplace=True, ascending=False)
21 | data["Model"] = data["Weight"].apply(
22 | lambda t: "_".join(t.split(".")[0].split("_")[:-1]).lower()
23 | )
24 | # data.to_csv(os.path.join(os.getcwd(), "leaderboard_raw.tsv"), sep="\t", index=False)
25 |
26 | with open(os.path.join(os.getcwd(), "run.sh"), "w") as fout:
27 | for i in range(len(data)):
28 | fout.write(
29 | f"python run_imagenet.py --model_name {data['Model'][i]} --weight_name {data['Weight'][i]}\n"
30 | )
31 |
--------------------------------------------------------------------------------
/benchbench/data/bbh/statistic.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 |
3 | configs = [
4 | "boolean_expressions",
5 | "causal_judgement",
6 | "date_understanding",
7 | "disambiguation_qa",
8 | "dyck_languages",
9 | "formal_fallacies",
10 | "geometric_shapes",
11 | "hyperbaton",
12 | "logical_deduction_five_objects",
13 | "logical_deduction_seven_objects",
14 | "logical_deduction_three_objects",
15 | "movie_recommendation",
16 | "multistep_arithmetic_two",
17 | "navigate",
18 | "object_counting",
19 | "penguins_in_a_table",
20 | "reasoning_about_colored_objects",
21 | "ruin_names",
22 | "salient_translation_error_detection",
23 | "snarks",
24 | "sports_understanding",
25 | "temporal_sequences",
26 | "tracking_shuffled_objects_five_objects",
27 | "tracking_shuffled_objects_seven_objects",
28 | "tracking_shuffled_objects_three_objects",
29 | "web_of_lies",
30 | "word_sorting",
31 | ]
32 | ret = []
33 | for c in configs:
34 | dataset = load_dataset("lukaemon/bbh", name=c, split="test")
35 | ret.append((c, set(dataset["target"])))
36 |
37 | ret = sorted(ret, key=lambda x: len(x[1]))
38 | for i in ret:
39 | print(i[0], len(i[1]), i[1])
40 |
--------------------------------------------------------------------------------
/benchbench/data/helm/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | def load_helm(subset="accuracy"):
7 | assert subset in [
8 | "accuracy",
9 | "bias",
10 | "calibration",
11 | "fairness",
12 | "efficiency",
13 | "robustness",
14 | "summarization",
15 | "toxicity",
16 | ]
17 | data = pd.read_csv(
18 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "%s.tsv" % subset),
19 | sep="\t",
20 | )
21 | data = data.replace("-", np.nan)
22 | data = data.dropna(axis=0, how="all")
23 | data = data.dropna(axis=1, how="all")
24 | cols = data.columns[2:]
25 |
26 | for c in cols:
27 | data[c] = np.array([float(i) for i in data[c].values])
28 |
29 | for c in cols:
30 | if (
31 | "ECE" in c
32 | or "Representation" in c
33 | or "Toxic fraction" in c
34 | or "Stereotype" in c
35 | or "inference time" in c
36 | ):
37 | data[c] = -data[c]
38 |
39 | return data, cols
40 |
41 |
42 | def test():
43 | data, cols = load_helm()
44 | print(data.head())
45 | print(cols)
46 |
47 |
48 | if __name__ == "__main__":
49 | test()
50 |
--------------------------------------------------------------------------------
/benchbench/data/glue/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import numpy as np
4 |
5 |
6 | def load_glue():
7 | data = pd.read_csv(
8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
9 | sep="\t",
10 | )
11 | ori_cols = data.columns[5:-1]
12 | cols = []
13 | for c in ori_cols:
14 | if type(data[c].values[0]) is str and "/" in data[c].values[0]:
15 | c1 = c + "-a"
16 | c2 = c + "-b"
17 | res1, res2 = [], []
18 | for line in data[c].values:
19 | s = line.strip().split("/")
20 | res1.append(float(s[0]))
21 | res2.append(float(s[1]))
22 | res1 = np.array(res1)
23 | res2 = np.array(res2)
24 | data[c1] = res1
25 | data[c2] = res2
26 | data[c] = (res1 + res2) / 2
27 | cols.append(c)
28 | elif "MNLI" in c:
29 | continue
30 | else:
31 | cols.append(c)
32 | data["MNLI"] = (data["MNLI-m"] + data["MNLI-mm"]) / 2
33 | cols.append("MNLI")
34 |
35 | return data, cols
36 |
37 |
38 | def test():
39 | data, cols = load_glue()
40 | print(data.head())
41 | print(cols)
42 |
43 |
44 | if __name__ == "__main__":
45 | test()
46 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=40.8.0", "wheel"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "benchbench"
7 | authors = [
8 | {name = "Guanhua Zhang"},
9 | ]
10 | description = "Tools for measuring sensitivity and diversity of multi-task benchmarks."
11 | version = "1.0.1"
12 | requires-python = ">=3.7"
13 | readme = "README.md"
14 | license = {text = "MIT"}
15 | classifiers=[
16 | "Development Status :: 3 - Alpha",
17 | "License :: OSI Approved :: MIT License",
18 | "Intended Audience :: Science/Research",
19 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
20 | "Natural Language :: English",
21 | "Programming Language :: Python :: 3",
22 | "Programming Language :: Python :: 3.7",
23 | "Programming Language :: Python :: 3.8",
24 | "Programming Language :: Python :: 3.9",
25 | "Programming Language :: Python :: 3.10",
26 | "Programming Language :: Python :: 3.11",
27 | "Programming Language :: Python :: 3.12",
28 | ]
29 | dependencies = [
30 | "scipy",
31 | "numpy",
32 | "torch",
33 | "pandas",
34 | "joblib",
35 | "scikit-learn",
36 | "zarth_utils==1.0"
37 | ]
38 |
39 |
40 | [tool.setuptools]
41 | include-package-data = true
42 |
43 | [tool.setuptools.packages.find]
44 | include = ["benchbench*"]
45 |
--------------------------------------------------------------------------------
/docs/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 | import os
9 | import sys
10 |
11 | sys.path.insert(0, os.path.abspath('../'))
12 |
13 | project = 'BenchBench'
14 | copyright = '2024, Guanhua'
15 | author = 'Guanhua'
16 |
17 | # -- General configuration ---------------------------------------------------
18 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
19 |
20 | extensions = [
21 | 'sphinx.ext.autodoc', # pull doc from docstrings
22 | 'sphinx.ext.intersphinx', # link to other projects
23 | 'sphinx.ext.todo', # support TODOs
24 | 'sphinx.ext.ifconfig', # include stuff based on configuration
25 | 'sphinx.ext.viewcode', # add source code
26 | 'myst_parser', # add MD files
27 | 'sphinx.ext.napoleon' # Google style doc
28 | ]
29 |
30 | templates_path = ['_templates']
31 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
32 | pygments_style = 'sphinx'
33 |
34 | # -- Options for HTML output -------------------------------------------------
35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
36 |
37 | html_theme = 'alabaster'
38 | html_static_path = ['_static']
39 |
--------------------------------------------------------------------------------
/benchbench/utils/win_rate.py:
--------------------------------------------------------------------------------
1 | import math
2 | import numpy as np
3 |
4 |
5 | class WinningRate:
6 | def __init__(self, data, cols):
7 | """
8 | Calculate the winning rate of a list of models.
9 |
10 | Args:
11 | data (pd.DataFrame): Each row represents a model, each column represents a task.
12 | cols (list): The column names of the tasks.
13 |
14 | Returns:
15 | None
16 | """
17 | m = len(data)
18 | n = len(cols)
19 | self.win_rate = np.zeros([m, m])
20 | data = data[cols].values
21 | for i in range(m):
22 | for j in range(m):
23 | n_win, n_tot = 0, 0
24 | for k in range(n):
25 | if not math.isnan(data[i, k]) and not math.isnan(data[j, k]):
26 | n_tot += 1
27 | if float(data[i, k]) > float(data[j, k]) and i != j:
28 | n_win += 1
29 | self.win_rate[i, j] = n_win / n_tot if n_tot > 0 else 0
30 |
31 | def get_winning_rate(self, model_indices=None):
32 | """
33 | Get the winning rate of the selected models.
34 |
35 | Args:
36 | model_indices (list): Indices of the selected models.
37 |
38 | Returns:
39 |             np.ndarray: The winning rate of each selected model.
40 | """
41 | model_indices = (
42 | np.arange(len(self.win_rate)) if model_indices is None else model_indices
43 | )
44 | return self.win_rate[model_indices][:, model_indices].mean(axis=1)
45 |
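46 | 
47 | def test():
48 |     # Minimal usage sketch (made-up scores, not from a real leaderboard):
49 |     # build a tiny 3-model x 2-task DataFrame and print each model's winning rate.
50 |     import pandas as pd
51 | 
52 |     data = pd.DataFrame(
53 |         {"model": ["a", "b", "c"], "t1": [0.9, 0.5, 0.1], "t2": [0.8, 0.6, 0.2]}
54 |     )
55 |     wr = WinningRate(data, cols=["t1", "t2"])
56 |     print(wr.get_winning_rate())
57 | 
58 | 
59 | if __name__ == "__main__":
60 |     test()
61 | 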
--------------------------------------------------------------------------------
/benchbench/data/dummy/__init__.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import pandas as pd
4 |
5 |
6 | def load_random_benchmark(seed=0, num_task=100, num_model=100):
7 | np.random.seed(seed)
8 | random.seed(seed)
9 | data = np.random.random([num_model, num_task]) * 100
10 | data = pd.DataFrame(data)
11 | cols = list(data.columns)
12 | return data, cols
13 |
14 |
15 | def load_constant_benchmark(seed=0, num_task=100, num_model=100):
16 | np.random.seed(seed)
17 | random.seed(seed)
18 | rd = np.random.random([num_model, 1])
19 | data = np.concatenate([rd.copy() for _ in range(num_task)], axis=1) * 100
20 | data = pd.DataFrame(data)
21 | cols = list(data.columns)
22 | return data, cols
23 |
24 |
25 | def load_interpolation_benchmark(seed=0, mix_ratio=0.0, num_task=100, num_model=100):
26 | num_random = int(mix_ratio * num_task + 0.5)
27 | num_constant = int((1 - mix_ratio) * num_task + 0.5)
28 | if num_random == 0:
29 | return load_constant_benchmark(
30 | seed=seed, num_task=num_constant, num_model=num_model
31 | )
32 | elif num_constant == 0:
33 | return load_random_benchmark(
34 | seed=seed, num_task=num_random, num_model=num_model
35 | )
36 | else:
37 | random = load_random_benchmark(
38 | seed=seed, num_task=num_random, num_model=num_model
39 | )[0]
40 | constant = load_constant_benchmark(
41 | seed=seed, num_task=num_constant, num_model=num_model
42 | )[0]
43 | data = pd.DataFrame(np.concatenate([random.values, constant.values], axis=1))
44 | cols = list(data.columns)
45 | return data, cols
46 |
--------------------------------------------------------------------------------
/benchbench/data/imagenet/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import random
3 |
4 | import numpy as np
5 |
6 | import pandas as pd
7 |
8 |
9 | def load_imagenet(*args, **kwargs):
10 |     # Due to legacy reasons, instead of refactoring the code, we just provide this wrapper function.
11 | return load_data(*args, **kwargs)
12 |
13 |
14 | def load_data(load_raw=False, seed=0, num_task=20):
15 | if load_raw:
16 | data = pd.read_csv(
17 | os.path.join(
18 | os.path.dirname(os.path.abspath(__file__)), "leaderboard_raw.tsv"
19 | ),
20 | sep="\t",
21 | )
22 | data = data.dropna(axis=0, how="any")
23 | cols = [data.columns[1]]
24 | else:
25 | data = pd.read_csv(
26 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"),
27 | sep="\t",
28 | )
29 | data = data.sort_values(by=["acc"], ascending=False).reset_index()
30 | if num_task < 1000:
31 | assert 1000 % num_task == 0 and num_task >= 1
32 | cols = []
33 | random.seed(seed)
34 | np.random.seed(seed)
35 | size_task = 1000 // num_task
36 | perm = np.random.permutation(1000)
37 | for i in range(num_task):
38 | task_cols = [
39 | "acc_%d" % j for j in perm[i * size_task : (i + 1) * size_task]
40 | ]
41 | data["acc_aggr_%d" % i] = data[task_cols].values.mean(1)
42 | cols.append("acc_aggr_%d" % i)
43 | else:
44 | cols = ["acc_%d" % i for i in range(1000)]
45 | return data, cols
46 |
47 |
48 | def test():
49 | data, cols = load_data()
50 | print(data.head())
51 | print(cols)
52 |
53 |
54 | if __name__ == "__main__":
55 | test()
56 |
--------------------------------------------------------------------------------
/benchbench/data/vtab/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | Mean (selected datasets) CIFAR-100 Caltech101 Camelyon Clevr-Count Clevr-Dist DMLab DTD EuroSAT Flowers102 KITTI-Dist Pets Resisc45 Retinopathy SVHN Sun397 dSpr-Loc dSpr-Orient sNORB-Azim sNORB-Elev
2 | Sup-Rotation-100% 90.2 84.8 94.6 85.9 99.8 92.5 76.5 75.9 98.8 94.7 82.3 91.5 94.9 79.5 97.0 70.2 100 96.5 100 98.4
3 | Sup-Exemplar-100% 90.1 84.1 94.4 86.7 99.8 92.7 76.8 74.5 98.6 93.4 84.0 91.8 95.1 79.5 97.1 69.4 100 96.4 99.8 98.0
4 | Sup-100% 89.7 83.8 94.1 83.9 99.8 92.1 76.4 74.0 98.8 93.2 80.7 91.9 95.3 79.3 97.0 70.7 100 96.4 99.8 97.7
5 | Semi-Exemplar-10% 88.8 82.7 85.3 86.0 99.8 93.1 76.8 70.5 98.6 92.2 81.5 89.0 94.7 78.8 97.0 67.4 100 96.5 100 97.8
6 | Semi-Rotation-10% 88.6 82.4 88.1 78.6 99.8 93.2 76.1 72.4 98.7 93.2 81.0 87.9 94.9 79.0 96.9 66.7 100 96.5 99.9 97.5
7 | Rotation 86.4 73.6 88.3 86.4 99.8 93.3 76.8 63.3 98.3 83.4 82.6 71.8 93.4 78.6 96.9 60.5 100 96.5 99.9 98.0
8 | Exemplar 84.8 70.7 81.9 84.7 99.8 93.3 74.7 61.1 98.5 79.3 78.2 67.8 93.5 79.0 96.7 58.2 100 96.5 99.9 97.4
9 | Rel.Pat.Loc 83.1 65.7 79.9 85.3 99.5 87.7 71.5 65.2 97.8 78.8 75.0 66.8 91.5 79.8 93.7 58.0 100 90.4 99.7 92.6
10 | Jigsaw 83.0 65.3 79.1 83.0 99.6 88.6 72.0 63.9 97.9 77.9 74.7 65.4 92.0 80.1 93.9 59.2 100 90.3 99.9 93.6
11 | From-Scratch 75.4 64.4 55.9 81.2 99.7 89.4 71.5 31.3 96.2 50.6 68.4 23.8 86.8 76.8 96.3 52.7 100 96.3 99.9 91.7
12 | Uncond-BigGAN 68.2 58.1 73.6 82.2 47.6 54.9 54.8 44.9 89.8 63.5 57.4 30.9 75.4 75.9 93.0 46.9 86.1 95.9 88.1 76.6
13 | VAE 66.8 44.2 48.4 81.3 98.4 90.1 59.7 16.0 92.5 18.4 57.0 14.0 65.0 74.2 93.1 29.3 100 94.7 97.9 95.6
14 | WAE-MMD 64.9 38.8 50.8 80.6 98.1 89.3 52.6 11.0 94.1 20.8 61.6 16.2 64.8 73.8 90.9 31.6 100 90.2 96.3 72.4
15 | Cond-BigGAN 51.4 56.3 0.148 81.3 12.4 24.5 51.4 44.8 94.5 68.8 49.7 31.6 76.5 75.3 91.4 44.9 6.16 7.45 80.6 79.2
16 | WAE-GAN 48.5 24.8 42.0 77.1 52.2 70.2 37.3 8.67 81.5 15.5 62.3 13.1 38.4 73.6 78.2 12.8 97.7 49.9 33.4 52.2
17 | WAE-UKL 46.8 23.2 41.7 76.4 44.5 67.8 36.7 12.3 78.1 17.2 55.1 12.3 36.8 73.6 65.5 12.0 98.1 51.4 35.9 51.0
--------------------------------------------------------------------------------
/benchbench/data/bigcode/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | T Models Win humaneval-python java javascript Throughput
2 | 🔴 DeepSeek-Coder-33b-instruct 39.58 80.02 52.03 65.13 25.2
3 | 🔴 DeepSeek-Coder-7b-instruct 38.75 80.22 53.34 65.8 51
4 | 🔶 Phind-CodeLlama-34B-v2 37.04 71.95 54.06 65.34 15.1
5 | 🔶 Phind-CodeLlama-34B-v1 36.12 65.85 49.47 64.45 15.1
6 | 🔶 Phind-CodeLlama-34B-Python-v1 35.27 70.22 48.72 66.24 15.1
7 | 🔴 DeepSeek-Coder-33b-base 35 52.45 43.77 51.28 25.2
8 | 🔶 WizardCoder-Python-34B-V1.0 33.96 70.73 44.94 55.28 15.1
9 | 🔴 DeepSeek-Coder-7b-base 31.75 45.83 37.72 45.9 51
10 | 🔶 CodeLlama-34b-Instruct 30.96 50.79 41.53 45.85 15.1
11 | 🔶 WizardCoder-Python-13B-V1.0 30.58 62.19 41.77 48.45 25.3
12 | 🟢 CodeLlama-34b 30.35 45.11 40.19 41.66 15.1
13 | 🟢 CodeLlama-34b-Python 29.65 53.29 39.46 44.72 15.1
14 | 🔶 WizardCoder-15B-V1.0 28.92 58.12 35.77 41.91 43.7
15 | 🔶 CodeLlama-13b-Instruct 27.88 50.6 33.99 40.92 25.3
16 | 🟢 CodeLlama-13b 26.19 35.07 32.23 38.26 25.3
17 | 🟢 CodeLlama-13b-Python 24.73 42.89 33.56 40.66 25.3
18 | 🔶 CodeLlama-7b-Instruct 23.69 45.65 28.77 33.11 33.1
19 | 🟢 CodeLlama-7b 22.31 29.98 29.2 31.8 33.1
20 | 🔴 CodeShell-7B 22.31 34.32 30.43 33.17 33.9
21 | 🔶 OctoCoder-15B 21.15 45.3 26.03 32.8 44.4
22 | 🟢 Falcon-180B 20.9 35.37 28.48 31.68 -1
23 | 🟢 CodeLlama-7b-Python 20.62 40.48 29.15 36.34 33.1
24 | 🟢 StarCoder-15B 20.58 33.57 30.22 30.79 43.9
25 | 🟢 StarCoderBase-15B 20.15 30.35 28.53 31.7 43.8
26 | 🟢 CodeGeex2-6B 17.42 33.49 23.46 29.9 32.7
27 | 🟢 StarCoderBase-7B 16.85 28.37 24.44 27.35 46.9
28 | 🔶 OctoGeeX-7B 16.65 42.28 19.33 28.5 32.7
29 | 🔶 WizardCoder-3B-V1.0 15.73 32.92 24.34 26.16 50
30 | 🟢 CodeGen25-7B-multi 15.35 28.7 26.01 26.27 32.6
31 | 🔶 Refact-1.6B 14.85 31.1 22.78 22.36 50
32 | 🔴 DeepSeek-Coder-1b-base 14.42 32.13 27.16 28.46 -1
33 | 🟢 StarCoderBase-3B 11.65 21.5 19.25 21.32 50
34 | 🔶 WizardCoder-1B-V1.0 10.35 23.17 19.68 19.13 71.4
35 | 🟢 Replit-2.7B 8.54 20.12 21.39 20.18 42.2
36 | 🟢 CodeGen25-7B-mono 8.15 33.08 19.75 23.22 34.1
37 | 🟢 StarCoderBase-1.1B 8.12 15.17 14.2 13.38 71.4
38 | 🟢 CodeGen-16B-Multi 7.08 19.26 22.2 19.15 17.2
39 | 🟢 Phi-1 6.25 51.22 10.76 19.25 -1
40 | 🟢 StableCode-3B 6.04 20.2 19.54 18.98 30.2
41 | 🟢 DeciCoder-1B 5.81 19.32 15.3 17.85 54.6
42 | 🟢 SantaCoder-1.1B 4.58 18.12 15 15.47 50.8
43 |
--------------------------------------------------------------------------------
/benchbench/data/mmlu/format.py:
--------------------------------------------------------------------------------
1 | import os
2 | import math
3 | import numpy as np
4 | import pandas as pd
5 | from datasets import load_dataset
6 |
7 | # read top 100 model names
8 | top_100_with_duplicate = pd.read_csv("leaderboard_raw.csv", header=None)
9 | top_100 = []
10 | for i in top_100_with_duplicate[0].values:
11 | if i not in top_100:
12 | top_100.append(i)
13 | print(top_100)
14 |
15 | # download the meta data
16 | os.makedirs("data", exist_ok=True)
17 | with open("data/download.sh", "w") as fout:
18 | fout.write("git lfs install\n")
19 | for i in top_100:
20 | cmd = "git clone git@hf.co:data/%s" % i
21 | fout.write(cmd + "\n")
22 | print(cmd)
23 | # one must download the data manually by ``cd data; bash download.sh''
24 | # comment the following lines if you have downloaded the data
25 | # exit(0)
26 |
27 | # load all model names and split names
28 | all_model_split = []
29 | dir_dataset = os.path.join("data")
30 | for model_name in top_100:
31 | model_name = model_name[len("open-llm-leaderboard/") :]
32 | dir_model = os.path.join("data", model_name)
33 | if not os.path.isdir(dir_model):
34 | continue
35 | for split_name in os.listdir(dir_model):
36 | if not split_name.endswith(".parquet"):
37 | continue
38 | split_name = split_name[len("results_") : -len(".parquet")]
39 | all_model_split.append((model_name, split_name))
40 | print(len(all_model_split))
41 |
42 | # load all scores and filter broken ones
43 | ret = []
44 | for model_name, split_name in all_model_split:
45 | model = load_dataset(
46 | "parquet",
47 | data_files=os.path.join("data", model_name, "results_%s.parquet" % split_name),
48 | split="train",
49 | )["results"][0]
50 | tasks = [i for i in model.keys() if "hendrycksTest" in i]
51 | if len(tasks) != 57:
52 | continue
53 | avg = np.mean([model[c]["acc_norm"] for c in tasks])
54 | if math.isnan(avg):
55 | continue
56 | record = dict()
57 | record["model_name"] = model_name
58 | record["split_name"] = split_name
59 | record["average_score"] = avg
60 | record.update({c: model[c]["acc_norm"] for c in tasks})
61 | ret.append(record)
62 | print(model_name, split_name, "%.2lf" % avg)
63 | ret = sorted(ret, key=lambda x: -x["average_score"])
64 | ret = pd.DataFrame(ret)
65 | ret.to_csv("calibration.tsv", sep="\t")
66 |
--------------------------------------------------------------------------------
/benchbench/utils/base.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | def is_int(x):
5 | """
6 | Check if a string can be converted to an integer.
7 |
8 | Args:
9 | x(str): Input string.
10 |
11 | Returns:
12 | bool: True if x can be converted to an integer, False otherwise
13 | """
14 | try:
15 | int(x)
16 | return True
17 | except ValueError:
18 | return False
19 |
20 |
21 | def is_number(s):
22 | """
23 | Check if a string can be converted to a number.
24 |
25 | Args:
26 | s(str): Input string.
27 |
28 | Returns:
29 | bool: True if s can be converted to a number, False otherwise
30 | """
31 | try:
32 | float(s)
33 | return True
34 | except ValueError:
35 | return False
36 |
37 |
38 | def get_combinations(s, k):
39 | """
40 | Generate all subsets of size k from set s.
41 |
42 | Args:
43 | s(list): List of elements to get combinations from.
44 | k(int): Size of each combination.
45 |
46 | Returns:
47 | list: A list of combinations, where each combination is represented as a list.
48 | """
49 | if k == 0:
50 | return [[]]
51 | elif k > len(s):
52 | return []
53 | else:
54 | all_combinations = []
55 | for i in range(len(s)):
56 | # For each element in the set, generate the combinations that include this element
57 | # and then recurse to generate combinations from the remaining elements
58 | element = s[i]
59 | remaining_elements = s[i + 1 :]
60 | for c in get_combinations(remaining_elements, k - 1):
61 | all_combinations.append([element] + c)
62 | return all_combinations
63 |
64 |
65 | def rankdata(a, method="average"):
66 | assert method == "average", "Only average method is implemented"
67 | arr = np.ravel(np.asarray(a))
68 | sorter = np.argsort(arr, kind="quicksort")
69 |
70 | inv = np.empty(sorter.size, dtype=np.intp)
71 | inv[sorter] = np.arange(sorter.size, dtype=np.intp)
72 |
73 | arr = arr[sorter]
74 | obs = np.r_[True, np.fabs(arr[1:] - arr[:-1]) > 1e-8] # this is the only change
75 | dense = obs.cumsum()[inv]
76 |
77 | # cumulative counts of each unique value
78 | count = np.r_[np.nonzero(obs)[0], len(obs)]
79 |
80 | # average method
81 | return 0.5 * (count[dense] + count[dense - 1] + 1)
82 |
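83 | 
84 | def test():
85 |     # Small sanity-check sketch of the helpers above (hand-picked inputs).
86 |     print(is_int("42"), is_int("4.2"))  # True False
87 |     print(is_number("4.2"), is_number("abc"))  # True False
88 |     print(get_combinations([1, 2, 3], 2))  # [[1, 2], [1, 3], [2, 3]]
89 |     print(rankdata([10.0, 20.0, 20.0, 30.0]))  # [1.  2.5 2.5 4. ], ties share the average rank
90 | 
91 | 
92 | if __name__ == "__main__":
93 |     test()
94 | 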
--------------------------------------------------------------------------------
/benchbench/data/heim/quality_human.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (fairness - gender) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (robustness) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Chinese) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Hindi) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Spanish) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Art styles) - Photorealism - generated (human) ↑ [ sort ]
2 | Dreamlike Photoreal v2.0 (1B) 0.92 2.619 2.694 2.65 2.726 2.76 2.628 2.894 -
3 | Safe Stable Diffusion weak (1B) 0.863 2.611 2.647 2.643 2.637 2.676 2.504 2.952 -
4 | DALL-E 2 (3.5B) 0.851 2.621 2.632 2.411 2.552 2.54 3.769 2.935 -
5 | Safe Stable Diffusion strong (1B) 0.771 2.286 2.332 2.526 2.807 2.936 2.684 2.712 -
6 | Stable Diffusion v1.5 (1B) 0.743 2.375 2.392 2.551 2.502 2.7 2.516 2.85 -
7 | DeepFloyd IF X-Large (4.3B) 0.726 2.207 2.216 2.554 2.776 2.51 2.842 2.736 -
8 | Safe Stable Diffusion medium (1B) 0.714 2.489 2.467 2.521 2.426 2.586 2.478 2.886 -
9 | Stable Diffusion v2 base (1B) 0.691 2.494 2.515 2.476 2.5 2.558 2.316 2.792 -
10 | GigaGAN (1B) 0.686 2.118 2.165 2.385 2.508 2.928 2.794 2.826 -
11 | Safe Stable Diffusion max (1B) 0.674 2.305 2.276 2.437 2.564 2.702 2.524 2.652 -
12 | Stable Diffusion v1.4 (1B) 0.657 2.512 2.482 2.309 2.561 2.752 2.27 2.644 -
13 | Stable Diffusion v2.1 base (1B) 0.6 2.42 2.38 2.318 2.44 2.436 2.33 2.77 -
14 | DeepFloyd IF Medium (0.4B) 0.554 2.101 2.122 2.406 2.542 2.238 2.698 2.72 -
15 | DeepFloyd IF Large (0.9B) 0.514 2.089 2.092 2.15 2.518 2.104 2.968 2.758 -
16 | MultiFusion (13B) 0.44 2.309 2.323 2.297 2.318 2.428 1.564 2.69 -
17 | DALL-E mega (2.6B) 0.417 2.058 2.097 2.046 2.308 2.39 2.284 2.884 -
18 | Dreamlike Diffusion v1.0 (1B) 0.4 2.15 2.155 2.119 2.342 2.472 2.164 2.44 -
19 | Openjourney v2 (1B) 0.309 1.941 1.928 2.145 2.322 2.178 2.422 2.508 -
20 | DALL-E mini (0.4B) 0.291 1.975 1.987 1.981 2.377 2.294 1.868 2.602 -
21 | Redshift Diffusion (1B) 0.28 1.914 1.982 1.95 2.002 2.396 2.51 2.31 -
22 | minDALL-E (1.3B) 0.274 2.058 2.04 1.896 2.047 2.226 2.016 2.666 -
23 | Lexica Search with Stable Diffusion v1.5 (1B) 0.263 1.883 1.897 1.806 1.93 1.94 3.074 2.374 -
24 | CogView2 (6B) 0.189 1.756 1.794 1.959 2.021 2.394 1.828 2.354 -
25 | Vintedois (22h) Diffusion model v0.1 (1B) 0.074 1.57 1.593 1.558 1.867 1.886 1.878 1.862 -
26 | Promptist + Stable Diffusion v1.4 (1B) 0.057 1.593 1.587 1.552 1.682 1.716 2.242 1.506 -
27 | Openjourney v1 (1B) 0.04 1.602 1.582 1.579 1.693 1.234 1.586 1.57 -
28 |
--------------------------------------------------------------------------------
/benchbench/data/imagenet/run_imagenet.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torchvision
4 | import joblib as jbl
5 | import pandas as pd
6 | from torchvision.models import *
7 | from tqdm import tqdm
8 |
9 | from zarth_utils.config import Config
10 |
11 |
12 | def load_model(model_name, weight_name):
13 | model = eval(model_name)
14 | weights = eval(weight_name)
15 | model = model(weights=weights).eval()
16 | preprocess = weights.transforms()
17 | return model, preprocess
18 |
19 |
20 | def main():
21 | config = Config(
22 | default_config_dict={
23 | "model_name": "vit_h_14",
24 | "weight_name": "ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1",
25 | },
26 | use_argparse=True,
27 | )
28 |
29 | dir2save = os.path.join(
30 | os.path.dirname(os.path.abspath(__file__)),
31 | "%s--%s" % (config["model_name"], config["weight_name"]),
32 | )
33 | os.makedirs(dir2save, exist_ok=True)
34 | if os.path.exists(os.path.join(dir2save, "meta_info.pkl")):
35 | print("Already exists, skip")
36 | return
37 |
38 | device = (
39 | torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
40 | )
41 | model, preprocess = load_model(config["model_name"], config["weight_name"])
42 | model = model.to(device)
43 |
44 | dataset = torchvision.datasets.ImageNet(
45 | root=os.path.dirname(os.path.abspath(__file__)),
46 | split="val",
47 | transform=preprocess,
48 | )
49 | data_loader = torch.utils.data.DataLoader(
50 | dataset, batch_size=128, shuffle=False, num_workers=2
51 | )
52 |
53 | all_prob, all_pred, all_target = [], [], []
54 | for i, (batch, target) in tqdm(enumerate(data_loader)):
55 | with torch.no_grad():
56 | batch = batch.to(device)
57 | prob = model(batch).softmax(dim=1)
58 | pred = prob.argmax(dim=1)
59 | all_prob.append(prob.detach().cpu())
60 | all_pred.append(pred.detach().cpu())
61 | all_target.append(target.detach().cpu())
62 | all_prob = torch.cat(all_prob, dim=0).numpy()
63 | all_pred = torch.cat(all_pred, dim=0).numpy()
64 | all_target = torch.cat(all_target, dim=0).numpy()
65 |
66 | jbl.dump(all_prob, os.path.join(dir2save, "prob.pkl"))
67 | jbl.dump(all_pred, os.path.join(dir2save, "pred.pkl"))
68 | jbl.dump(all_target, os.path.join(dir2save, "target.pkl"))
69 | pd.DataFrame({"pred": all_pred, "target": all_target}).to_csv(
70 | os.path.join(dir2save, "pred_target.tsv"), sep="\t", index=False
71 | )
72 |
73 | meta_info = {}
74 | correct = all_pred == all_target
75 | meta_info["acc"] = correct.mean()
76 | for i in range(1000):
77 | subset = all_target == i
78 |         # per-class accuracy: mean correctness over the validation images of class i
79 | meta_info["acc_%d" % i] = correct[subset].mean()
80 | jbl.dump(meta_info, os.path.join(dir2save, "meta_info.pkl"))
81 |
82 |
83 | if __name__ == "__main__":
84 | main()
85 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 | **BenchBench** is a Python package that provides a suite of tools to evaluate multi-task benchmarks focusing on
6 | **task diversity** and **sensitivity to irrelevant changes**.
7 |
8 | Research shows that every multi-task benchmark faces a trade-off between task diversity and sensitivity: the more diverse a benchmark, the more sensitive its ranking is to irrelevant changes, such as
9 | introducing weak models or changing the metric in ways that shouldn't matter.
10 |
11 | Based on BenchBench, we're maintaining a living [benchmark of multi-task benchmarks](https://socialfoundations.github.io/benchbench/). Visit the project page to see the results or contribute your own benchmark.
12 |
13 | Please see [our paper](https://arxiv.org/pdf/2405.01719) for all relevant background and scientific results. Cite as:
14 |
15 | ```
16 | @inproceedings{zhang2024inherent,
17 | title={Inherent Trade-Offs between Diversity and Stability in Multi-Task Benchmarks},
18 | author={Guanhua Zhang and Moritz Hardt},
19 | booktitle={International Conference on Machine Learning},
20 | year={2024}
21 | }
22 | ```
23 |
24 | ## Quick Start
25 |
26 | To install the package, simply run:
27 |
28 | ```bash
29 | pip install benchbench
30 | ```
31 |
32 | ## Example Usage
33 |
34 | To evaluate a cardinal benchmark, you can use the following code:
35 |
36 | ```python
37 | from benchbench.data import load_cardinal_benchmark
38 | from benchbench.measures.cardinal import get_diversity, get_sensitivity
39 |
40 | data, cols = load_cardinal_benchmark('GLUE')
41 | diversity = get_diversity(data, cols)
42 | sensitivity = get_sensitivity(data, cols)
43 | ```
44 |
45 | To evaluate an ordinal benchmark, you can use the following code:
46 |
47 | ```python
48 | from benchbench.data import load_ordinal_benchmark
49 | from benchbench.measures.ordinal import get_diversity, get_sensitivity
50 |
51 | data, cols = load_ordinal_benchmark('HELM-accuracy')
52 | diversity = get_diversity(data, cols)
53 | sensitivity = get_sensitivity(data, cols)
54 | ```
55 |
56 | To use your own benchmark, you just need to provide a pandas DataFrame and a list of columns indicating the tasks; a minimal sketch is included at the end of this README.
57 | Check the [documentation](https://socialfoundations.github.io/benchbench) for more details.
58 |
59 | ## Reproduce the results from our paper
60 |
61 |
62 |
63 |
64 |
65 | You can reproduce the figures from our paper using the following Colabs:
66 |
67 | * [cardinal.ipynb](https://githubtocolab.com/socialfoundations/benchbench/blob/main/examples/cardinal.ipynb)
68 | * [ordinal.ipynb](https://githubtocolab.com/socialfoundations/benchbench/blob/main/examples/ordinal.ipynb)
69 | * [banner.ipynb](https://githubtocolab.com/socialfoundations/benchbench/blob/main/examples/banner.ipynb)
70 |
--------------------------------------------------------------------------------
/benchbench/data/superglue/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | Rank Name Model URL Score BoolQ CB COPA MultiRC ReCoRD RTE WiC WSC AX-b AX-g
2 | 1 JDExplore d-team Vega v2 91.3 90.5 98.6/99.2 99.4 88.2/62.4 94.4/93.9 96.0 77.4 98.6 -0.4 100.0/50.0
3 | 2 Liam Fedus ST-MoE-32B 91.2 92.4 96.9/98.0 99.2 89.6/65.8 95.1/94.4 93.5 77.7 96.6 72.3 96.1/94.1
4 | 3 Microsoft Alexander v-team Turing NLR v5 90.9 92.0 95.9/97.6 98.2 88.4/63.0 96.4/95.9 94.1 77.1 97.3 67.8 93.3/95.5
5 | 4 ERNIE Team - Baidu ERNIE 3.0 90.6 91.0 98.6/99.2 97.4 88.6/63.2 94.7/94.2 92.6 77.4 97.3 68.6 92.7/94.7
6 | 5 Yi Tay PaLM 540B 90.4 91.9 94.4/96.0 99.0 88.7/63.6 94.2/93.3 94.1 77.4 95.9 72.9 95.5/90.4
7 | 6 Zirui Wang T5 + UDG, Single Model (Google Brain) 90.4 91.4 95.8/97.6 98.0 88.3/63.0 94.2/93.5 93.0 77.9 96.6 69.1 92.7/91.9
8 | 7 DeBERTa Team - Microsoft DeBERTa / TuringNLRv4 90.3 90.4 95.7/97.6 98.4 88.2/63.7 94.5/94.1 93.2 77.5 95.9 66.7 93.3/93.8
9 | 8 SuperGLUE Human Baselines SuperGLUE Human Baselines 89.8 89.0 95.8/98.9 100.0 81.8/51.9 91.7/91.3 93.6 80.0 100.0 76.6 99.3/99.7
10 | 9 T5 Team - Google T5 89.3 91.2 93.9/96.8 94.8 88.1/63.3 94.1/93.4 92.5 76.9 93.8 65.6 92.7/91.9
11 | 10 SPoT Team - Google Frozen T5 1.1 + SPoT 89.2 91.1 95.8/97.6 95.6 87.9/61.9 93.3/92.4 92.9 75.8 93.8 66.9 83.1/82.6
12 | 11 Huawei Noah's Ark Lab NEZHA-Plus 86.7 87.8 94.4/96.0 93.6 84.6/55.1 90.1/89.6 89.1 74.6 93.2 58.0 87.1/74.4
13 | 12 Alibaba PAI&ICBU PAI Albert 86.1 88.1 92.4/96.4 91.8 84.6/54.7 89.0/88.3 88.8 74.1 93.2 75.6 98.3/99.2
14 | 13 Infosys : DAWN : AI Research RoBERTa-iCETS 86.0 88.5 93.2/95.2 91.2 86.4/58.2 89.9/89.3 89.9 72.9 89.0 61.8 88.8/81.5
15 | 14 Tencent Jarvis Lab RoBERTa (ensemble) 85.9 88.2 92.5/95.6 90.8 84.4/53.4 91.5/91.0 87.9 74.1 91.8 57.6 89.3/75.6
16 | 15 Zhuiyi Technology RoBERTa-mtl-adv 85.7 87.1 92.4/95.6 91.2 85.1/54.3 91.7/91.3 88.1 72.1 91.8 58.5 91.0/78.1
17 | 16 Facebook AI RoBERTa 84.6 87.1 90.5/95.2 90.6 84.4/52.5 90.6/90.0 88.2 69.9 89.0 57.9 91.0/78.1
18 | 17 Anuar Sharafudinov AILabs Team, Transformers 82.6 88.1 91.6/94.8 86.8 85.1/54.7 82.8/79.8 88.9 74.1 78.8 100.0 100.0/100.0
19 | 18 Ying Luo FSL++(ALBERT)-Few-Shot(32 Examples) 77.7 81.1 87.8/92.0 87.0 77.3/38.4 81.9/81.1 75.1 60.5 88.4 35.9 94.4/63.5
20 | 19 Rathin Bector Text to Text PETL 77.0 82.0 86.9/92.4 80.2 80.4/44.8 82.2/81.3 78.1 67.6 74.0 38.1 97.2/53.7
21 | 20 CASIA INSTALL(ALBERT)-few-shot 76.6 78.4 85.9/92.0 85.6 75.9/35.1 84.3/83.5 74.9 60.9 84.9 -0.4 100.0/50.0
22 | 21 Rakesh Radhakrishnan Menon ADAPET (ALBERT) - few-shot 76.0 80.0 82.3/92.0 85.4 76.2/35.7 86.1/85.5 75.0 53.5 85.6 -0.4 100.0/50.0
23 | 22 Timo Schick iPET (ALBERT) - Few-Shot (32 Examples) 75.4 81.2 79.9/88.8 90.8 74.1/31.7 85.9/85.4 70.8 49.3 88.4 36.2 97.8/57.9
24 | 23 Adrian de Wynter Bort (Alexa AI) 74.1 83.7 81.9/86.4 89.6 83.7/54.1 49.8/49.0 81.2 70.1 65.8 48.0 96.1/61.5
25 | 24 IBM Research AI BERT-mtl 73.5 84.8 89.6/94.0 73.8 73.2/30.5 74.6/74.0 84.1 66.2 61.0 29.6 97.8/57.3
26 | 25 Ben Mann GPT-3 few-shot - OpenAI 71.8 76.4 52.0/75.6 92.0 75.4/30.5 91.1/90.2 69.0 49.4 80.1 21.1 90.4/55.3
27 | 26 SuperGLUE Baselines BERT++ 71.5 79.0 84.8/90.4 73.8 70.0/24.1 72.0/71.3 79.0 69.6 64.4 38.0 99.4/51.4
28 | 27 Jeff Yang select-step-by-step 51.9 62.2 68.2/76.0 96.4 0.0/0.5 14.0/13.6 49.7 53.1 67.8 -0.4 100.0/50.0
29 | 28 Karen Hambardzumyan WARP (ALBERT-XXL-V2) - Few-Shot (32 Examples) 48.7 62.2 70.2/82.4 51.6 0.0/0.5 14.0/13.6 69.1 53.1 63.7 -0.4 100.0/50.0
30 |
--------------------------------------------------------------------------------
/benchbench/data/heim/quality_auto.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Expected LPIPS score ↓ [ sort ] MS-COCO (base) - Expected Multi-Scale SSIM ↑ [ sort ] MS-COCO (base) - Expected PSNR ↑ [ sort ] MS-COCO (base) - Expected UIQI ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected LPIPS score ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Expected Multi-Scale SSIM ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected PSNR ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected UIQI ↑ [ sort ] Winoground - Expected LPIPS score ↓ [ sort ] Winoground - Expected Multi-Scale SSIM ↑ [ sort ] Winoground - Expected PSNR ↑ [ sort ] Winoground - Expected UIQI ↑ [ sort ]
2 | Redshift Diffusion (1B) 0.863 0.739 0.07 9.386 0.003 0.765 0.108 10.695 0.002 0.752 0.082 8.727 0.005
3 | Dreamlike Photoreal v2.0 (1B) 0.7 0.733 0.054 8.757 0.003 0.774 0.093 10.868 0.001 0.75 0.058 8.441 0.003
4 | DALL-E mini (0.4B) 0.663 0.741 0.084 8.677 0.003 0.776 0.112 10.649 0.001 0.774 0.085 7.953 0.003
5 | Dreamlike Diffusion v1.0 (1B) 0.637 0.731 0.051 8.746 0.002 0.765 0.106 10.484 0.001 0.757 0.054 8.201 0.003
6 | Stable Diffusion v2 base (1B) 0.627 0.735 0.062 8.573 0.002 0.774 0.087 10.706 0.001 0.768 0.064 8.185 0.002
7 | GigaGAN (1B) 0.57 0.737 0.079 8.466 0.003 0.748 0.073 9.285 -0.001 0.758 0.078 8.17 0.003
8 | Openjourney v1 (1B) 0.557 0.756 0.063 8.686 0.004 0.787 0.063 9.807 0.002 0.768 0.063 8.126 0.004
9 | Stable Diffusion v2.1 base (1B) 0.553 0.739 0.053 8.409 0.004 0.794 0.097 10.563 0.001 0.766 0.061 8.143 0.002
10 | Stable Diffusion v1.4 (1B) 0.55 0.739 0.061 8.602 0.002 0.772 0.103 10.567 0.001 0.763 0.061 7.989 0.002
11 | minDALL-E (1.3B) 0.533 0.738 0.082 8.27 0.001 0.781 0.109 9.44 0.001 0.75 0.089 7.703 0
12 | Promptist + Stable Diffusion v1.4 (1B) 0.523 0.751 0.072 9.111 0.002 0.796 0.092 9.843 -0.001 0.77 0.074 8.204 0.004
13 | DeepFloyd IF X-Large (4.3B) 0.503 0.741 0.081 7.985 0.003 0.803 0.089 9.314 0.001 0.763 0.084 7.843 0.003
14 | Stable Diffusion v1.5 (1B) 0.49 0.74 0.059 8.614 0.002 0.774 0.09 10.431 0.001 0.764 0.056 8.026 0.002
15 | Safe Stable Diffusion weak (1B) 0.483 0.741 0.06 8.553 0.002 0.777 0.094 10.413 0.001 0.765 0.059 8 0.002
16 | MultiFusion (13B) 0.477 0.733 0.056 8.749 0.002 0.769 0.082 10.097 0.001 0.756 0.053 8.116 0
17 | DeepFloyd IF Medium (0.4B) 0.44 0.739 0.076 8.102 0.003 0.794 0.084 9.588 0.001 0.769 0.074 7.754 0.003
18 | DALL-E mega (2.6B) 0.44 0.742 0.079 8.245 0.002 0.792 0.095 9.364 0.001 0.768 0.078 7.694 0.002
19 | DeepFloyd IF Large (0.9B) 0.433 0.743 0.072 7.857 0.003 0.804 0.089 9.609 0.001 0.762 0.073 7.8 0.002
20 | Lexica Search with Stable Diffusion v1.5 (1B) 0.43 0.762 0.066 9.018 0.002 0.802 0.079 9.764 0.002 0.778 0.07 8.241 0.002
21 | Safe Stable Diffusion medium (1B) 0.42 0.746 0.063 8.529 0.002 0.78 0.094 10.36 0.001 0.772 0.059 8.012 0.002
22 | DALL-E 2 (3.5B) 0.407 0.74 0.073 8.234 0.001 0.777 0.081 9.111 0.001 0.744 0.077 7.763 0.002
23 | CogView2 (6B) 0.4 0.755 0.084 8.307 0.001 0.783 0.113 9.198 -0.001 0.759 0.084 7.613 0.001
24 | Openjourney v2 (1B) 0.38 0.743 0.06 8.346 0.002 0.775 0.09 9.901 0 0.763 0.061 7.998 0.001
25 | Vintedois (22h) Diffusion model v0.1 (1B) 0.363 0.757 0.051 8.101 0.003 0.788 0.095 9.588 0.002 0.777 0.054 7.675 0.004
26 | Safe Stable Diffusion strong (1B) 0.297 0.75 0.059 8.403 0.001 0.79 0.085 10.2 0.001 0.774 0.063 7.903 0.001
27 | Safe Stable Diffusion max (1B) 0.26 0.759 0.06 8.26 0.002 0.802 0.085 9.913 0 0.786 0.069 7.685 0.002
28 |
--------------------------------------------------------------------------------
/benchbench/data/heim/originality.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Watermark frac ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Watermark frac ↓ [ sort ] DrawBench (image quality categories) - Watermark frac ↓ [ sort ] PartiPrompts (image quality categories) - Watermark frac ↓ [ sort ] dailydall.e - Watermark frac ↓ [ sort ] Landing Page - Watermark frac ↓ [ sort ] Logos - Watermark frac ↓ [ sort ] Magazine Cover Photos - Watermark frac ↓ [ sort ] Common Syntactic Processes - Watermark frac ↓ [ sort ] DrawBench (reasoning categories) - Watermark frac ↓ [ sort ] PartiPrompts (reasoning categories) - Watermark frac ↓ [ sort ] Relational Understanding - Watermark frac ↓ [ sort ] Detection (PaintSkills) - Watermark frac ↓ [ sort ] Winoground - Watermark frac ↓ [ sort ] PartiPrompts (knowledge categories) - Watermark frac ↓ [ sort ] DrawBench (knowledge categories) - Watermark frac ↓ [ sort ] TIME's most significant historical figures - Watermark frac ↓ [ sort ] Demographic Stereotypes - Watermark frac ↓ [ sort ] Mental Disorders - Watermark frac ↓ [ sort ] Inappropriate Image Prompts (I2P) - Watermark frac ↓ [ sort ]
2 | GigaGAN (1B) 0.932 0 0 0 0 0 0 0.01 0 0 0 0 0 0 0 0 0 0 0 0 0.001
3 | DeepFloyd IF Medium (0.4B) 0.784 0 0 0 0.002 0 0 0 0 0 0 0 0 0 0.003 0.003 0 0 0 0 0.001
4 | Lexica Search with Stable Diffusion v1.5 (1B) 0.75 0 0 0 0.001 0 0 0.05 0 0 0 0.003 0 0 0 0.005 0.007 0 0 0 0
5 | DeepFloyd IF X-Large (4.3B) 0.75 0 0 0 0 0 0 0.01 0 0 0 0 0 0 0 0.003 0 0 0 0 0.004
6 | DeepFloyd IF Large (0.9B) 0.712 0 0 0 0.001 0.003 0 0.005 0 0 0 0 0 0 0.003 0 0 0 0.004 0 0.001
7 | Dreamlike Diffusion v1.0 (1B) 0.674 0 0 0 0.001 0 0 0.013 0 0 0 0 0 0 0 0 0 0 0 0 0
8 | DALL-E 2 (3.5B) 0.63 0.003 0 0 0.001 0 0 0.01 0 0 0 0.003 0 0 0.003 0.013 0 0.005 0 0.021 0.003
9 | Openjourney v1 (1B) 0.612 0 0 0 0 0 0.007 0.003 0 0 0 0 0 0 0 0 0 0 0.004 0.014 0.001
10 | Dreamlike Photoreal v2.0 (1B) 0.586 0 0 0.006 0.001 0 0 0.008 0.005 0 0 0.003 0 0 0 0 0 0 0.005 0 0.001
11 | Openjourney v2 (1B) 0.548 0 0 0 0.001 0.003 0.007 0.005 0 0 0 0.003 0 0 0 0.003 0 0 0 0 0
12 | Redshift Diffusion (1B) 0.548 0.003 0 0 0 0 0 0.008 0 0 0 0 0 0 0 0.005 0 0 0 0 0.001
13 | DALL-E mega (2.6B) 0.546 0.003 0 0 0.001 0 0 0 0.005 0 0 0.003 0 0 0 0.005 0.007 0.003 0 0 0.004
14 | Promptist + Stable Diffusion v1.4 (1B) 0.542 0 0 0 0 0 0 0.01 0 0 0 0.003 0 0 0 0 0 0 0 0 0
15 | Stable Diffusion v1.4 (1B) 0.53 0 0 0 0 0 0 0.005 0 0 0 0 0 0 0 0.003 0 0 0 0 0.001
16 | Stable Diffusion v1.5 (1B) 0.466 0 0 0 0 0 0 0.01 0 0 0 0 0 0 0 0.003 0 0.003 0.004 0 0
17 | Stable Diffusion v2.1 base (1B) 0.462 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.001
18 | DALL-E mini (0.4B) 0.458 0.018 0 0 0.006 0.016 0 0 0.01 0.003 0.013 0.005 0 0 0 0.005 0 0.008 0 0 0.009
19 | Stable Diffusion v2 base (1B) 0.43 0 0 0 0 0.003 0.007 0.008 0 0 0 0 0 0 0 0 0 0 0 0 0.002
20 | Safe Stable Diffusion medium (1B) 0.378 0 0 0 0 0 0 0.008 0 0 0 0 0 0 0 0 0 0 0 0 0.001
21 | Safe Stable Diffusion strong (1B) 0.36 0 0 0 0 0 0 0.005 0.005 0 0 0 0 0 0 0 0 0 0 0 0
22 | Safe Stable Diffusion weak (1B) 0.358 0 0 0 0.001 0 0.007 0.008 0 0.005 0 0 0 0 0 0.003 0 0 0 0 0.001
23 | Safe Stable Diffusion max (1B) 0.344 0 0 0 0 0 0 0.003 0 0 0.002 0.003 0 0 0 0 0 0 0 0 0
24 | Vintedois (22h) Diffusion model v0.1 (1B) 0.308 0 0 0 0 0 0 0.003 0 0 0.002 0 0 0 0 0.005 0 0 0 0 0
25 | minDALL-E (1.3B) 0.17 0.025 0.05 0.036 0.053 0.083 0 0.048 0 0.045 0.036 0.05 0.037 0.011 0.02 0.02 0.033 0.015 0.039 0 0.03
26 | MultiFusion (13B) 0.11 0.038 0.13 0.027 0.052 0.094 0.007 0.155 0.005 0.011 0.011 0.08 0.063 0.003 0.04 0.038 0.039 0 0.078 0.056 0.041
27 | CogView2 (6B) 0.012 0.065 0.1 0.047 0.186 0.22 0.021 0.163 0.03 0.073 0.01 0.145 0.3 0.016 0.19 0.1 0.007 0.076 0.228 0.236 0.096
28 |
--------------------------------------------------------------------------------
/benchbench/data/heim/black_out.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Blacked out frac ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Blacked out frac ↓ [ sort ] DrawBench (image quality categories) - Blacked out frac ↓ [ sort ] PartiPrompts (image quality categories) - Blacked out frac ↓ [ sort ] dailydall.e - Blacked out frac ↓ [ sort ] Landing Page - Blacked out frac ↓ [ sort ] Logos - Blacked out frac ↓ [ sort ] Magazine Cover Photos - Blacked out frac ↓ [ sort ] Common Syntactic Processes - Blacked out frac ↓ [ sort ] DrawBench (reasoning categories) - Blacked out frac ↓ [ sort ] PartiPrompts (reasoning categories) - Blacked out frac ↓ [ sort ] Relational Understanding - Blacked out frac ↓ [ sort ] Detection (PaintSkills) - Blacked out frac ↓ [ sort ] Winoground - Blacked out frac ↓ [ sort ] PartiPrompts (knowledge categories) - Blacked out frac ↓ [ sort ] DrawBench (knowledge categories) - Blacked out frac ↓ [ sort ] TIME's most significant historical figures - Blacked out frac ↓ [ sort ] Demographic Stereotypes - Blacked out frac ↓ [ sort ] Mental Disorders - Blacked out frac ↓ [ sort ] Inappropriate Image Prompts (I2P) - Blacked out frac ↓ [ sort ]
2 | MultiFusion (13B) 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
3 | GigaGAN (1B) 0.96 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
4 | DALL-E 2 (3.5B) 0.92 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
5 | Lexica Search with Stable Diffusion v1.5 (1B) 0.88 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
6 | DeepFloyd IF Medium (0.4B) 0.84 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
7 | DeepFloyd IF Large (0.9B) 0.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
8 | DeepFloyd IF X-Large (4.3B) 0.76 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
9 | minDALL-E (1.3B) 0.72 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
10 | DALL-E mini (0.4B) 0.68 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11 | DALL-E mega (2.6B) 0.614 0 0 0 0 0.003 0 0.003 0 0 0 0 0 0 0 0 0 0 0 0 0
12 | CogView2 (6B) 0.604 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
13 | Dreamlike Photoreal v2.0 (1B) 0.564 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
14 | Dreamlike Diffusion v1.0 (1B) 0.524 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
15 | Stable Diffusion v2 base (1B) 0.446 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
16 | Stable Diffusion v2.1 base (1B) 0.406 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
17 | Openjourney v1 (1B) 0.278 0.005 0 0.051 0.01 0.003 0 0.018 0.01 0.021 0.017 0.03 0.01 0.001 0.01 0 0.007 0.02 0 0 0.089
18 | Safe Stable Diffusion strong (1B) 0.272 0.003 0 0.01 0.006 0 0.007 0.008 0.005 0.006 0.004 0.025 0 0.001 0.008 0.005 0.007 0.028 0 0 0.077
19 | Safe Stable Diffusion medium (1B) 0.254 0 0 0.015 0.005 0.003 0 0.015 0.01 0.003 0.002 0.025 0.003 0 0.008 0.005 0.013 0.038 0.026 0 0.102
20 | Safe Stable Diffusion max (1B) 0.23 0 0 0.02 0.007 0.008 0 0.01 0.005 0.011 0.004 0.01 0.003 0.002 0.003 0.003 0.026 0.03 0.005 0 0.064
21 | Openjourney v2 (1B) 0.21 0 0.01 0.056 0.012 0 0.007 0.023 0.035 0.014 0.004 0.038 0.007 0.001 0.018 0 0.033 0.025 0.005 0 0.102
22 | Redshift Diffusion (1B) 0.208 0.013 0 0.066 0.02 0.011 0 0.01 0.005 0.022 0.027 0.025 0.023 0.004 0.01 0 0.033 0.008 0.005 0 0.088
23 | Safe Stable Diffusion weak (1B) 0.206 0.003 0 0.025 0.009 0 0 0.02 0.025 0.009 0.002 0.05 0.003 0.002 0.01 0.003 0 0.023 0.008 0.014 0.148
24 | Vintedois (22h) Diffusion model v0.1 (1B) 0.178 0 0.01 0.025 0.008 0.003 0 0.015 0.01 0.023 0.021 0.025 0 0.007 0.005 0 0.007 0.025 0.008 0.014 0.121
25 | Stable Diffusion v1.5 (1B) 0.176 0.003 0 0.025 0.012 0.003 0 0.013 0.025 0.006 0.002 0.04 0.003 0.003 0.01 0.008 0.026 0.04 0.022 0.028 0.163
26 | Stable Diffusion v1.4 (1B) 0.174 0.008 0 0.035 0.01 0.003 0 0.025 0.02 0.006 0.008 0.043 0.007 0.001 0.02 0.01 0.007 0.028 0.033 0 0.177
27 | Promptist + Stable Diffusion v1.4 (1B) 0.096 0.018 0.03 0.05 0.018 0.005 0 0.033 0.025 0.049 0.025 0.038 0.017 0.002 0.023 0.005 0.02 0.025 0.038 0.014 0.15
28 |
--------------------------------------------------------------------------------
/benchbench/data/bigcode/vanilla.txt:
--------------------------------------------------------------------------------
1 | T
2 | Models
3 |
4 | Win Rate
5 | humaneval-python
6 | java
7 | javascript
8 | Throughput (tokens/s)
9 | 🔴
10 | DeepSeek-Coder-33b-instruct
11 |
12 | 39.58
13 | 80.02
14 | 52.03
15 | 65.13
16 | 25.2
17 | 🔴
18 | DeepSeek-Coder-7b-instruct
19 |
20 | 38.75
21 | 80.22
22 | 53.34
23 | 65.8
24 | 51
25 | 🔶
26 | Phind-CodeLlama-34B-v2
27 |
28 | 37.04
29 | 71.95
30 | 54.06
31 | 65.34
32 | 15.1
33 | 🔶
34 | Phind-CodeLlama-34B-v1
35 |
36 | 36.12
37 | 65.85
38 | 49.47
39 | 64.45
40 | 15.1
41 | 🔶
42 | Phind-CodeLlama-34B-Python-v1
43 |
44 | 35.27
45 | 70.22
46 | 48.72
47 | 66.24
48 | 15.1
49 | 🔴
50 | DeepSeek-Coder-33b-base
51 |
52 | 35
53 | 52.45
54 | 43.77
55 | 51.28
56 | 25.2
57 | 🔶
58 | WizardCoder-Python-34B-V1.0
59 |
60 | 33.96
61 | 70.73
62 | 44.94
63 | 55.28
64 | 15.1
65 | 🔴
66 | DeepSeek-Coder-7b-base
67 |
68 | 31.75
69 | 45.83
70 | 37.72
71 | 45.9
72 | 51
73 | 🔶
74 | CodeLlama-34b-Instruct
75 |
76 | 30.96
77 | 50.79
78 | 41.53
79 | 45.85
80 | 15.1
81 | 🔶
82 | WizardCoder-Python-13B-V1.0
83 |
84 | 30.58
85 | 62.19
86 | 41.77
87 | 48.45
88 | 25.3
89 | 🟢
90 | CodeLlama-34b
91 |
92 | 30.35
93 | 45.11
94 | 40.19
95 | 41.66
96 | 15.1
97 | 🟢
98 | CodeLlama-34b-Python
99 |
100 | 29.65
101 | 53.29
102 | 39.46
103 | 44.72
104 | 15.1
105 | 🔶
106 | WizardCoder-15B-V1.0
107 |
108 | 28.92
109 | 58.12
110 | 35.77
111 | 41.91
112 | 43.7
113 | 🔶
114 | CodeLlama-13b-Instruct
115 |
116 | 27.88
117 | 50.6
118 | 33.99
119 | 40.92
120 | 25.3
121 | 🟢
122 | CodeLlama-13b
123 |
124 | 26.19
125 | 35.07
126 | 32.23
127 | 38.26
128 | 25.3
129 | 🟢
130 | CodeLlama-13b-Python
131 |
132 | 24.73
133 | 42.89
134 | 33.56
135 | 40.66
136 | 25.3
137 | 🔶
138 | CodeLlama-7b-Instruct
139 |
140 | 23.69
141 | 45.65
142 | 28.77
143 | 33.11
144 | 33.1
145 | 🟢
146 | CodeLlama-7b
147 |
148 | 22.31
149 | 29.98
150 | 29.2
151 | 31.8
152 | 33.1
153 | 🔴
154 | CodeShell-7B
155 |
156 | 22.31
157 | 34.32
158 | 30.43
159 | 33.17
160 | 33.9
161 | 🔶
162 | OctoCoder-15B
163 |
164 | 21.15
165 | 45.3
166 | 26.03
167 | 32.8
168 | 44.4
169 | 🟢
170 | Falcon-180B
171 |
172 | 20.9
173 | 35.37
174 | 28.48
175 | 31.68
176 | -1
177 | 🟢
178 | CodeLlama-7b-Python
179 |
180 | 20.62
181 | 40.48
182 | 29.15
183 | 36.34
184 | 33.1
185 | 🟢
186 | StarCoder-15B
187 |
188 | 20.58
189 | 33.57
190 | 30.22
191 | 30.79
192 | 43.9
193 | 🟢
194 | StarCoderBase-15B
195 |
196 | 20.15
197 | 30.35
198 | 28.53
199 | 31.7
200 | 43.8
201 | 🟢
202 | CodeGeex2-6B
203 |
204 | 17.42
205 | 33.49
206 | 23.46
207 | 29.9
208 | 32.7
209 | 🟢
210 | StarCoderBase-7B
211 |
212 | 16.85
213 | 28.37
214 | 24.44
215 | 27.35
216 | 46.9
217 | 🔶
218 | OctoGeeX-7B
219 |
220 | 16.65
221 | 42.28
222 | 19.33
223 | 28.5
224 | 32.7
225 | 🔶
226 | WizardCoder-3B-V1.0
227 |
228 | 15.73
229 | 32.92
230 | 24.34
231 | 26.16
232 | 50
233 | 🟢
234 | CodeGen25-7B-multi
235 |
236 | 15.35
237 | 28.7
238 | 26.01
239 | 26.27
240 | 32.6
241 | 🔶
242 | Refact-1.6B
243 |
244 | 14.85
245 | 31.1
246 | 22.78
247 | 22.36
248 | 50
249 | 🔴
250 | DeepSeek-Coder-1b-base
251 |
252 | 14.42
253 | 32.13
254 | 27.16
255 | 28.46
256 | -1
257 | 🟢
258 | StarCoderBase-3B
259 |
260 | 11.65
261 | 21.5
262 | 19.25
263 | 21.32
264 | 50
265 | 🔶
266 | WizardCoder-1B-V1.0
267 |
268 | 10.35
269 | 23.17
270 | 19.68
271 | 19.13
272 | 71.4
273 | 🟢
274 | Replit-2.7B
275 |
276 | 8.54
277 | 20.12
278 | 21.39
279 | 20.18
280 | 42.2
281 | 🟢
282 | CodeGen25-7B-mono
283 |
284 | 8.15
285 | 33.08
286 | 19.75
287 | 23.22
288 | 34.1
289 | 🟢
290 | StarCoderBase-1.1B
291 |
292 | 8.12
293 | 15.17
294 | 14.2
295 | 13.38
296 | 71.4
297 | 🟢
298 | CodeGen-16B-Multi
299 |
300 | 7.08
301 | 19.26
302 | 22.2
303 | 19.15
304 | 17.2
305 | 🟢
306 | Phi-1
307 |
308 | 6.25
309 | 51.22
310 | 10.76
311 | 19.25
312 | -1
313 | 🟢
314 | StableCode-3B
315 |
316 | 6.04
317 | 20.2
318 | 19.54
319 | 18.98
320 | 30.2
321 | 🟢
322 | DeciCoder-1B
323 |
324 | 5.81
325 | 19.32
326 | 15.3
327 | 17.85
328 | 54.6
329 | 🟢
330 | SantaCoder-1.1B
331 |
332 | 4.58
333 | 18.12
334 | 15
335 | 15.47
336 | 50.8
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .vscode/
2 | .cache/
3 | figures/
4 | .idea/
5 | .ipynb_checkpoints/
6 | *.DS_Store
7 |
8 | ### Python template
9 | # Byte-compiled / optimized / DLL files
10 | __pycache__/
11 | *.py[cod]
12 | *$py.class
13 |
14 | # C extensions
15 | *.so
16 |
17 | # Distribution / packaging
18 | .Python
19 | build/
20 | develop-eggs/
21 | dist/
22 | downloads/
23 | eggs/
24 | .eggs/
25 | lib/
26 | lib64/
27 | parts/
28 | sdist/
29 | var/
30 | wheels/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | *.py,cover
58 | .hypothesis/
59 | .pytest_cache/
60 | cover/
61 |
62 | # Translations
63 | *.mo
64 | *.pot
65 |
66 | # Django stuff:
67 | *.log
68 | local_settings.py
69 | db.sqlite3
70 | db.sqlite3-journal
71 |
72 | # Flask stuff:
73 | instance/
74 | .webassets-cache
75 |
76 | # Scrapy stuff:
77 | .scrapy
78 |
79 | # Sphinx documentation
80 | docs/_build/
81 | _build
82 |
83 | # PyBuilder
84 | .pybuilder/
85 | target/
86 |
87 | # Jupyter Notebook
88 | .ipynb_checkpoints
89 |
90 | # IPython
91 | profile_default/
92 | ipython_config.py
93 |
94 | # pyenv
95 | # For a library or package, you might want to ignore these files since the code is
96 | # intended to run in multiple environments; otherwise, check them in:
97 | # .python-version
98 |
99 | # pipenv
100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
103 | # install all needed dependencies.
104 | #Pipfile.lock
105 |
106 | # poetry
107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
108 | # This is especially recommended for binary packages to ensure reproducibility, and is more
109 | # commonly ignored for libraries.
110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
111 | #poetry.lock
112 |
113 | # pdm
114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
115 | #pdm.lock
116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
117 | # in version control.
118 | # https://pdm.fming.dev/#use-with-ide
119 | .pdm.toml
120 |
121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
122 | __pypackages__/
123 |
124 | # Celery stuff
125 | celerybeat-schedule
126 | celerybeat.pid
127 |
128 | # SageMath parsed files
129 | *.sage.py
130 |
131 | # Environments
132 | .env
133 | .venv
134 | env/
135 | venv/
136 | ENV/
137 | env.bak/
138 | venv.bak/
139 |
140 | # Spyder project settings
141 | .spyderproject
142 | .spyproject
143 |
144 | # Rope project settings
145 | .ropeproject
146 |
147 | # mkdocs documentation
148 | /site
149 |
150 | # mypy
151 | .mypy_cache/
152 | .dmypy.json
153 | dmypy.json
154 |
155 | # Pyre type checker
156 | .pyre/
157 |
158 | # pytype static type analyzer
159 | .pytype/
160 |
161 | # Cython debug symbols
162 | cython_debug/
163 |
164 | # PyCharm
165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
167 | # and can be added to the global gitignore or merged into this file. For a more nuclear
168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
169 | #.idea/
170 |
--------------------------------------------------------------------------------
/benchbench/data/helm_capability/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | Model Mean score MMLU-Pro - COT correct GPQA - COT correct IFEval - IFEval Strict Acc WildBench - WB Score Omni-MATH - Acc
2 | GPT-5 mini (2025-08-07) 0.819 0.835 0.756 0.927 0.855 0.722
3 | o4-mini (2025-04-16) 0.812 0.82 0.735 0.929 0.854 0.72
4 | o3 (2025-04-16) 0.811 0.859 0.753 0.869 0.861 0.714
5 | GPT-5 (2025-08-07) 0.807 0.863 0.791 0.875 0.857 0.647
6 | Qwen3 235B A22B Instruct 2507 FP8 0.798 0.844 0.726 0.835 0.866 0.718
7 | Grok 4 (0709) 0.785 0.851 0.726 0.949 0.797 0.603
8 | Claude 4 Opus (20250514, extended thinking) 0.78 0.875 0.709 0.849 0.852 0.616
9 | gpt-oss-120b 0.77 0.795 0.684 0.836 0.845 0.688
10 | Kimi K2 Instruct 0.768 0.819 0.652 0.85 0.862 0.654
11 | Claude 4 Sonnet (20250514, extended thinking) 0.766 0.843 0.706 0.84 0.838 0.602
12 | Claude 4.5 Sonnet (20250929) 0.762 0.869 0.686 0.85 0.854 0.553
13 | Claude 4 Opus (20250514) 0.757 0.859 0.666 0.918 0.833 0.511
14 | GPT-5 nano (2025-08-07) 0.748 0.778 0.679 0.932 0.806 0.547
15 | Gemini 2.5 Pro (03-25 preview) 0.745 0.863 0.749 0.84 0.857 0.416
16 | Claude 4 Sonnet (20250514) 0.733 0.843 0.643 0.839 0.825 0.513
17 | Grok 3 Beta 0.727 0.788 0.65 0.884 0.849 0.464
18 | GPT-4.1 (2025-04-14) 0.727 0.811 0.659 0.838 0.854 0.471
19 | Qwen3 235B A22B FP8 Throughput 0.726 0.817 0.623 0.816 0.828 0.548
20 | GPT-4.1 mini (2025-04-14) 0.726 0.783 0.614 0.904 0.838 0.491
21 | Llama 4 Maverick (17Bx128E) Instruct FP8 0.718 0.81 0.65 0.908 0.8 0.422
22 | Qwen3-Next 80B A3B Thinking 0.7 0.786 0.63 0.81 0.807 0.467
23 | DeepSeek-R1-0528 0.699 0.793 0.666 0.784 0.828 0.424
24 | Palmyra X5 0.696 0.804 0.661 0.823 0.78 0.415
25 | Grok 3 mini Beta 0.679 0.799 0.675 0.951 0.651 0.318
26 | Gemini 2.0 Flash 0.679 0.737 0.556 0.841 0.8 0.459
27 | Claude 3.7 Sonnet (20250219) 0.674 0.784 0.608 0.834 0.814 0.33
28 | gpt-oss-20b 0.674 0.74 0.594 0.732 0.737 0.565
29 | GLM-4.5-Air-FP8 0.67 0.762 0.594 0.812 0.789 0.391
30 | DeepSeek v3 0.665 0.723 0.538 0.832 0.831 0.403
31 | Gemini 1.5 Pro (002) 0.657 0.737 0.534 0.837 0.813 0.364
32 | Claude 3.5 Sonnet (20241022) 0.653 0.777 0.565 0.856 0.792 0.276
33 | Llama 4 Scout (17Bx16E) Instruct 0.644 0.742 0.507 0.818 0.779 0.373
34 | Gemini 2.0 Flash Lite (02-05 preview) 0.642 0.72 0.5 0.824 0.79 0.374
35 | Amazon Nova Premier 0.637 0.726 0.518 0.803 0.788 0.35
36 | GPT-4o (2024-11-20) 0.634 0.713 0.52 0.817 0.828 0.293
37 | Gemini 2.5 Flash (04-17 preview) 0.626 0.639 0.39 0.898 0.817 0.384
38 | Llama 3.1 Instruct Turbo (405B) 0.618 0.723 0.522 0.811 0.783 0.249
39 | GPT-4.1 nano (2025-04-14) 0.616 0.55 0.507 0.843 0.811 0.367
40 | Palmyra-X-004 0.609 0.657 0.395 0.872 0.802 0.32
41 | Gemini 1.5 Flash (002) 0.609 0.678 0.437 0.831 0.792 0.305
42 | Qwen2.5 Instruct Turbo (72B) 0.599 0.631 0.426 0.806 0.802 0.33
43 | Mistral Large (2411) 0.598 0.599 0.435 0.876 0.801 0.281
44 | Gemini 2.5 Flash-Lite 0.591 0.537 0.309 0.81 0.818 0.48
45 | Amazon Nova Pro 0.591 0.673 0.446 0.815 0.777 0.242
46 | Palmyra Fin 0.577 0.591 0.422 0.793 0.783 0.295
47 | IBM Granite 4.0 Small 0.575 0.569 0.383 0.89 0.739 0.296
48 | Llama 3.1 Instruct Turbo (70B) 0.574 0.653 0.426 0.821 0.758 0.21
49 | GPT-4o mini (2024-07-18) 0.565 0.603 0.368 0.782 0.791 0.28
50 | Mistral Small 3.1 (2503) 0.558 0.61 0.392 0.75 0.788 0.248
51 | Amazon Nova Lite 0.551 0.6 0.397 0.776 0.75 0.233
52 | Claude 3.5 Haiku (20241022) 0.549 0.605 0.363 0.792 0.76 0.224
53 | Qwen2.5 Instruct Turbo (7B) 0.529 0.539 0.341 0.741 0.731 0.294
54 | Amazon Nova Micro 0.522 0.511 0.383 0.76 0.743 0.214
55 | IBM Granite 4.0 Micro 0.486 0.395 0.307 0.849 0.67 0.209
56 | Mixtral Instruct (8x22B) 0.478 0.46 0.334 0.724 0.711 0.163
57 | Palmyra Med 0.476 0.411 0.368 0.767 0.676 0.156
58 | OLMo 2 32B Instruct March 2025 0.475 0.414 0.287 0.78 0.734 0.161
59 | IBM Granite 3.3 8B Instruct 0.463 0.343 0.325 0.729 0.741 0.176
60 | Llama 3.1 Instruct Turbo (8B) 0.444 0.406 0.247 0.743 0.686 0.137
61 | OLMo 2 13B Instruct November 2024 0.44 0.31 0.316 0.73 0.689 0.156
62 | OLMo 2 7B Instruct November 2024 0.405 0.292 0.296 0.693 0.628 0.116
63 | Mixtral Instruct (8x7B) 0.397 0.335 0.296 0.575 0.673 0.105
64 | Mistral Instruct v0.3 (7B) 0.376 0.277 0.303 0.567 0.66 0.072
65 | OLMoE 1B-7B Instruct January 2025 0.332 0.169 0.22 0.628 0.551 0.093
66 | Marin 8B Instruct 0.325 0.188 0.168 0.632 0.477 0.16
67 |
--------------------------------------------------------------------------------
/benchbench/data/heim/nsfw.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - NSFW frac given non-toxic prompt ↓ [ sort ] Caltech-UCSD Birds-200-2011 - NSFW frac given non-toxic prompt ↓ [ sort ] DrawBench (image quality categories) - NSFW frac given non-toxic prompt ↓ [ sort ] PartiPrompts (image quality categories) - NSFW frac given non-toxic prompt ↓ [ sort ] dailydall.e - NSFW frac given non-toxic prompt ↓ [ sort ] Landing Page - NSFW frac given non-toxic prompt ↓ [ sort ] Logos - NSFW frac given non-toxic prompt ↓ [ sort ] Magazine Cover Photos - NSFW frac given non-toxic prompt ↓ [ sort ] Common Syntactic Processes - NSFW frac given non-toxic prompt ↓ [ sort ] DrawBench (reasoning categories) - NSFW frac given non-toxic prompt ↓ [ sort ] PartiPrompts (reasoning categories) - NSFW frac given non-toxic prompt ↓ [ sort ] Relational Understanding - NSFW frac given non-toxic prompt ↓ [ sort ] Detection (PaintSkills) - NSFW frac given non-toxic prompt ↓ [ sort ] Winoground - NSFW frac given non-toxic prompt ↓ [ sort ] PartiPrompts (knowledge categories) - NSFW frac given non-toxic prompt ↓ [ sort ] DrawBench (knowledge categories) - NSFW frac given non-toxic prompt ↓ [ sort ] TIME's most significant historical figures - NSFW frac given non-toxic prompt ↓ [ sort ] Demographic Stereotypes - NSFW frac given non-toxic prompt ↓ [ sort ] Mental Disorders - NSFW frac given non-toxic prompt ↓ [ sort ] Inappropriate Image Prompts (I2P) - NSFW frac given non-toxic prompt ↓ [ sort ]
2 | GigaGAN (1B) 0.846 0.01 0 0 0.002 0 0 0 0.01 0.04 0 0 0 0 0.003 0 0 0 0 0 0.006
3 | Lexica Search with Stable Diffusion v1.5 (1B) 0.84 0 0 0 0 0 0 0 0 0 0 0 0 0 0.008 0 0 0 0.005 0.014 0.01
4 | DALL-E mini (0.4B) 0.798 0 0 0 0 0 0 0 0 0 0 0 0 0 0.003 0 0 0 0 0 0.002
5 | DeepFloyd IF Large (0.9B) 0.758 0.005 0 0 0 0.003 0 0 0 0.009 0 0 0.003 0 0.018 0 0 0 0 0 0.01
6 | MultiFusion (13B) 0.738 0.013 0 0 0.002 0.003 0 0 0.005 0.031 0 0 0 0.001 0.02 0 0 0 0.005 0.014 0.009
7 | DALL-E 2 (3.5B) 0.722 0.013 0 0 0.003 0 0 0 0.02 0.041 0 0 0 0.001 0.01 0 0 0 0.014 0 0.008
8 | minDALL-E (1.3B) 0.712 0.003 0 0 0.001 0 0 0 0.015 0.009 0 0 0 0 0.01 0 0 0 0.008 0 0.003
9 | DALL-E mega (2.6B) 0.712 0 0 0 0 0.003 0 0.003 0 0.008 0 0 0 0 0.005 0 0 0 0.004 0 0.003
10 | DeepFloyd IF X-Large (4.3B) 0.676 0.018 0 0 0 0.003 0 0 0.005 0.011 0 0 0.007 0 0.02 0 0 0 0 0 0.01
11 | DeepFloyd IF Medium (0.4B) 0.654 0.008 0 0 0 0 0 0.003 0.005 0.009 0 0 0.013 0.001 0.028 0 0 0 0.01 0 0.013
12 | Dreamlike Diffusion v1.0 (1B) 0.648 0 0 0 0 0 0 0 0 0 0 0 0 0.001 0.005 0 0 0 0 0 0.012
13 | Dreamlike Photoreal v2.0 (1B) 0.564 0.005 0 0 0.001 0.003 0 0 0 0 0 0 0.007 0.001 0.013 0 0 0 0.013 0 0.032
14 | Stable Diffusion v2 base (1B) 0.548 0 0 0 0 0.005 0 0 0 0.014 0 0 0.007 0 0.003 0 0 0 0 0 0.005
15 | Stable Diffusion v2.1 base (1B) 0.542 0 0 0 0 0.003 0 0 0 0.006 0 0 0 0 0.003 0 0 0 0.005 0 0.01
16 | CogView2 (6B) 0.38 0.02 0 0 0.003 0.011 0 0 0.04 0.047 0.006 0 0.007 0.001 0.013 0 0 0.005 0.013 0.042 0.018
17 | Openjourney v1 (1B) 0.352 0.01 0 0.051 0.01 0.003 0 0.018 0.01 0.021 0.017 0.03 0.01 0.002 0.01 0 0.007 0.02 0 0 0.093
18 | Safe Stable Diffusion medium (1B) 0.296 0.005 0 0.015 0.006 0.005 0 0.015 0.025 0.006 0.002 0.025 0.003 0 0.025 0.005 0.013 0.038 0.035 0 0.114
19 | Openjourney v2 (1B) 0.288 0.003 0.01 0.056 0.012 0 0.007 0.023 0.035 0.014 0.004 0.038 0.007 0.001 0.023 0 0.033 0.025 0.005 0 0.108
20 | Safe Stable Diffusion strong (1B) 0.276 0.008 0 0.01 0.007 0.008 0.007 0.008 0.015 0.016 0.004 0.025 0 0.005 0.018 0.005 0.007 0.028 0.013 0 0.092
21 | Vintedois (22h) Diffusion model v0.1 (1B) 0.274 0.003 0.01 0.025 0.008 0.003 0 0.015 0.01 0.023 0.021 0.025 0 0.007 0.005 0 0.007 0.025 0.008 0.014 0.122
22 | Redshift Diffusion (1B) 0.272 0.013 0 0.066 0.021 0.011 0 0.01 0.005 0.022 0.027 0.025 0.023 0.004 0.01 0 0.033 0.008 0.005 0 0.092
23 | Safe Stable Diffusion weak (1B) 0.27 0.008 0 0.025 0.009 0 0 0.02 0.035 0.009 0.002 0.05 0.003 0.002 0.025 0.003 0 0.023 0.013 0.014 0.155
24 | Safe Stable Diffusion max (1B) 0.252 0 0 0.02 0.009 0.013 0 0.01 0.02 0.018 0.004 0.01 0.003 0.003 0.015 0.003 0.026 0.035 0.051 0.014 0.085
25 | Stable Diffusion v1.4 (1B) 0.228 0.01 0 0.035 0.01 0.003 0 0.025 0.03 0.006 0.008 0.043 0.007 0.002 0.03 0.01 0.007 0.028 0.033 0 0.183
26 | Stable Diffusion v1.5 (1B) 0.228 0.008 0 0.025 0.013 0.003 0 0.015 0.03 0.013 0.002 0.04 0.003 0.004 0.02 0.008 0.026 0.04 0.026 0.028 0.17
27 | Promptist + Stable Diffusion v1.4 (1B) 0.126 0.018 0.03 0.05 0.018 0.005 0 0.033 0.035 0.049 0.025 0.038 0.017 0.002 0.023 0.005 0.02 0.025 0.038 0.014 0.155
28 |
--------------------------------------------------------------------------------
/benchbench/data/heim/nudity.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Nudity frac ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Nudity frac ↓ [ sort ] DrawBench (image quality categories) - Nudity frac ↓ [ sort ] PartiPrompts (image quality categories) - Nudity frac ↓ [ sort ] dailydall.e - Nudity frac ↓ [ sort ] Landing Page - Nudity frac ↓ [ sort ] Logos - Nudity frac ↓ [ sort ] Magazine Cover Photos - Nudity frac ↓ [ sort ] Common Syntactic Processes - Nudity frac ↓ [ sort ] DrawBench (reasoning categories) - Nudity frac ↓ [ sort ] PartiPrompts (reasoning categories) - Nudity frac ↓ [ sort ] Relational Understanding - Nudity frac ↓ [ sort ] Detection (PaintSkills) - Nudity frac ↓ [ sort ] Winoground - Nudity frac ↓ [ sort ] PartiPrompts (knowledge categories) - Nudity frac ↓ [ sort ] DrawBench (knowledge categories) - Nudity frac ↓ [ sort ] TIME's most significant historical figures - Nudity frac ↓ [ sort ] Demographic Stereotypes - Nudity frac ↓ [ sort ] Mental Disorders - Nudity frac ↓ [ sort ] Inappropriate Image Prompts (I2P) - Nudity frac ↓ [ sort ]
2 | MultiFusion (13B) 0.814 0.058 0.06 0.02 0.034 0.019 0 0 0.025 0.077 0.044 0.01 0.06 0.008 0.06 0.003 0.007 0.018 0.072 0.236 0.104
3 | minDALL-E (1.3B) 0.706 0.055 0.05 0.015 0.031 0.035 0 0.003 0.035 0.063 0.015 0.033 0.06 0.02 0.07 0.003 0.026 0.033 0.129 0.181 0.069
4 | DeepFloyd IF Large (0.9B) 0.686 0.075 0.12 0.02 0.035 0.016 0 0 0.005 0.161 0.054 0.023 0.153 0.019 0.115 0.013 0.013 0.008 0.038 0.111 0.102
5 | Safe Stable Diffusion strong (1B) 0.67 0.07 0.04 0.01 0.039 0.027 0 0 0.065 0.057 0.033 0.018 0.147 0.024 0.078 0.01 0.026 0.04 0.042 0.097 0.101
6 | DeepFloyd IF X-Large (4.3B) 0.666 0.075 0.11 0.025 0.04 0.024 0 0 0.045 0.121 0.038 0.023 0.11 0.011 0.11 0.008 0.013 0.013 0.021 0.139 0.105
7 | Lexica Search with Stable Diffusion v1.5 (1B) 0.656 0.035 0.09 0.035 0.024 0.019 0 0.003 0.065 0.09 0.036 0.018 0.047 0.033 0.065 0.015 0.046 0.033 0.118 0.125 0.101
8 | Safe Stable Diffusion medium (1B) 0.63 0.063 0.03 0.015 0.038 0.016 0.007 0.003 0.055 0.073 0.031 0.02 0.15 0.035 0.093 0.008 0.02 0.038 0.034 0.139 0.105
9 | Safe Stable Diffusion max (1B) 0.624 0.048 0.05 0.025 0.044 0.016 0 0.008 0.065 0.05 0.054 0.018 0.137 0.03 0.075 0.015 0.013 0.043 0.051 0.069 0.088
10 | Stable Diffusion v1.5 (1B) 0.58 0.08 0.06 0.01 0.053 0.022 0 0 0.075 0.043 0.06 0.015 0.22 0.031 0.103 0.01 0.039 0.03 0.038 0.083 0.099
11 | DeepFloyd IF Medium (0.4B) 0.55 0.085 0.06 0.03 0.046 0.032 0.007 0 0.01 0.109 0.054 0.03 0.187 0.029 0.138 0.008 0.026 0.005 0.052 0.028 0.113
12 | DALL-E mega (2.6B) 0.548 0.075 0.13 0.011 0.052 0.022 0 0 0.075 0.143 0.029 0.015 0.137 0.062 0.088 0.005 0.033 0.013 0.103 0.264 0.114
13 | DALL-E mini (0.4B) 0.544 0.028 0.08 0.015 0.048 0.038 0.028 0.003 0.185 0.1 0.042 0.02 0.083 0.023 0.103 0.003 0.039 0.008 0.055 0.264 0.116
14 | GigaGAN (1B) 0.498 0.055 0.1 0.055 0.05 0.032 0 0.01 0.035 0.197 0.05 0.013 0.117 0.027 0.15 0.008 0.046 0.01 0.118 0.306 0.101
15 | Safe Stable Diffusion weak (1B) 0.482 0.103 0.1 0.025 0.049 0.027 0 0.003 0.06 0.076 0.054 0.03 0.2 0.026 0.103 0.013 0.02 0.04 0.038 0.125 0.103
16 | Redshift Diffusion (1B) 0.464 0.07 0.15 0.04 0.047 0.03 0 0.003 0.07 0.054 0.073 0.048 0.097 0.037 0.06 0.02 0.059 0.116 0.059 0.014 0.096
17 | Stable Diffusion v1.4 (1B) 0.464 0.078 0.09 0.055 0.047 0.035 0 0 0.045 0.07 0.052 0.028 0.207 0.052 0.1 0.015 0.053 0.035 0.047 0.194 0.096
18 | Openjourney v2 (1B) 0.454 0.085 0.1 0.025 0.054 0.027 0.014 0 0.075 0.076 0.052 0.033 0.087 0.048 0.093 0.02 0.046 0.076 0.046 0.028 0.111
19 | Stable Diffusion v2.1 base (1B) 0.414 0.063 0.16 0.04 0.051 0.016 0 0.013 0.04 0.09 0.077 0.02 0.157 0.023 0.105 0.005 0.059 0.053 0.109 0.153 0.154
20 | Promptist + Stable Diffusion v1.4 (1B) 0.38 0.048 0.16 0.04 0.053 0.04 0 0 0.07 0.091 0.046 0.043 0.137 0.031 0.093 0.02 0.059 0.086 0.113 0.167 0.109
21 | Stable Diffusion v2 base (1B) 0.37 0.08 0.14 0.05 0.059 0.019 0 0 0.01 0.134 0.069 0.03 0.177 0.022 0.13 0.013 0.039 0.018 0.173 0.389 0.132
22 | Openjourney v1 (1B) 0.35 0.105 0.05 0.047 0.057 0.048 0 0 0.07 0.132 0.079 0.028 0.063 0.055 0.07 0.05 0.125 0.154 0.076 0.125 0.121
23 | Dreamlike Photoreal v2.0 (1B) 0.336 0.093 0.23 0.02 0.039 0.016 0.069 0.008 0.1 0.042 0.063 0.048 0.133 0.026 0.113 0.023 0.059 0.045 0.11 0.333 0.165
24 | DALL-E 2 (3.5B) 0.322 0.105 0.214 0.056 0.058 0.022 0 0 0.066 0.182 0.057 0.018 0.167 0.052 0.138 0.025 0.046 0.048 0.272 0.208 0.112
25 | Dreamlike Diffusion v1.0 (1B) 0.288 0.075 0.15 0.035 0.065 0.022 0.021 0 0.055 0.091 0.079 0.04 0.21 0.068 0.138 0.013 0.053 0.098 0.092 0.139 0.138
26 | Vintedois (22h) Diffusion model v0.1 (1B) 0.26 0.085 0.22 0.03 0.074 0.03 0 0 0.045 0.098 0.075 0.033 0.157 0.059 0.098 0.053 0.092 0.154 0.084 0.125 0.133
27 | CogView2 (6B) 0.244 0.1 0.09 0.042 0.06 0.046 0 0.003 0.105 0.203 0.075 0.033 0.083 0.054 0.13 0.035 0.079 0.053 0.105 0.139 0.16
28 |
--------------------------------------------------------------------------------
/benchbench/data/helm/toxicity.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate BoolQ - Toxic fraction NarrativeQA - Toxic fraction NaturalQuestions (closed-book) - Toxic fraction NaturalQuestions (open-book) - Toxic fraction QuAC - Toxic fraction MS MARCO (regular) - Toxic fraction MS MARCO (TREC) - Toxic fraction CNN/DailyMail - Toxic fraction XSUM - Toxic fraction IMDB - Toxic fraction CivilComments - Toxic fraction RAFT - Toxic fraction
2 | Vicuna v1.3 (7B) 0.824 - 0.008 0 0 0.001 - - - - - - -
3 | Llama 2 (7B) 0.812 - 0.011 0 0 0.001 - - - - - - -
4 | gpt-3.5-turbo-0301 0.805 - 0.011 0 0 0 - - - - - - -
5 | LLaMA (7B) 0.758 - 0.006 0 0 0.003 - - - - - - -
6 | Luminous Extended (30B) 0.719 0 0.017 0 0.001 0 - - 0 0.002 - 0 0
7 | gpt-3.5-turbo-0613 0.717 - 0.011 0 0 0.001 - - - - - - -
8 | RedPajama-INCITE-Base-v1 (3B) 0.707 - 0.008 0.001 0 0 - - - - - - -
9 | Cohere Command beta (52.4B) 0.674 0 0.014 0 0.001 0.001 - - 0 0 - 0 0
10 | Cohere Command beta (6.1B) 0.669 0 0.01 0 0 0.002 - - 0 0 - 0 0
11 | J1-Large v1 (7.5B) 0.66 0 0.013 0.002 0.001 0.003 - - 0.001 0 - 0 0
12 | Alpaca (7B) 0.655 - 0.006 0 0.001 0.002 - - - - - - -
13 | Anthropic-LM v4-s3 (52B) 0.649 0 0.013 0 0 0.002 - - 0.001 0.004 - 0 0
14 | LLaMA (30B) 0.647 - 0.011 0 0 0.004 - - - - - - -
15 | Llama 2 (70B) 0.643 - 0.008 0 0.002 0.001 - - - - - - -
16 | Cohere medium v20221108 (6.1B) 0.633 0 0.011 0 0.002 0.002 - - 0.001 0 - 0 0
17 | J1-Grande v1 (17B) 0.63 0 0.014 0 0.001 0.004 - - 0.001 0.002 - 0 0
18 | LLaMA (13B) 0.61 - 0.008 0.001 0 0.003 - - - - - - -
19 | RedPajama-INCITE-Instruct-v1 (3B) 0.605 - 0.014 0 0.001 0 - - - - - - -
20 | J1-Jumbo v1 (178B) 0.604 0 0.014 0 0.001 0.003 - - 0.002 0.002 - 0 0
21 | BLOOM (176B) 0.596 0 0.012 0.001 0.002 0.001 - - 0.001 0 - 0 0
22 | LLaMA (65B) 0.591 - 0.008 0.001 0 0.003 - - - - - - -
23 | Falcon (7B) 0.582 - 0.017 0 0 0.001 - - - - - - -
24 | InstructPalmyra (30B) 0.581 - 0.012 0 0 0.001 - - 0.002 0.001 - - -
25 | Vicuna v1.3 (13B) 0.576 - 0.008 0.001 0.001 0.001 - - - - - - -
26 | T5 (11B) 0.576 0 0.011 0.001 0.001 0.002 - - 0 0 - 0 0
27 | Cohere xlarge v20220609 (52.4B) 0.574 0 0.021 0 0.001 0.002 - - 0.002 0.001 - 0 0
28 | Cohere xlarge v20221108 (52.4B) 0.564 0 0.013 0 0.001 0.002 - - 0.001 0.001 - 0 0
29 | Luminous Supreme (70B) 0.556 0 0.016 0.002 0.001 0.002 - - 0 0.001 - 0 0
30 | text-ada-001 0.553 0 0.006 0 0 0 - - 0.001 0 - 0 0
31 | Cohere medium v20220720 (6.1B) 0.551 0 0.02 0 0.002 0.002 - - 0.001 0 - 0 0
32 | babbage (1.3B) 0.513 0 0.016 0.001 0 0.002 - - 0 0 - 0 0
33 | Pythia (6.9B) 0.512 - 0.014 0 0.001 0.003 - - - - - - -
34 | Jurassic-2 Jumbo (178B) 0.506 - 0.017 0.001 0 0.001 - - 0.001 0.003 - - -
35 | J1-Grande v2 beta (17B) 0.506 0 0.014 0.001 0.001 0.003 - - 0.004 0.001 - 0 0
36 | Luminous Base (13B) 0.504 0 0.022 0.002 0.001 0.002 - - 0.001 0.002 - 0 0
37 | text-babbage-001 0.502 0 0.009 0 0 0 - - 0 0.001 - 0 0.001
38 | text-davinci-003 0.497 0.001 0.013 0 0 0.001 - - 0 0.001 - 0 0
39 | Cohere large v20220720 (13.1B) 0.485 0 0.017 0 0.002 0.003 - - 0.001 0.001 - 0 0
40 | Falcon (40B) 0.471 - 0.02 0 0 0.002 - - - - - - -
41 | Palmyra X (43B) 0.463 - 0.011 0.001 - 0.001 - - 0 0.002 - - -
42 | Jurassic-2 Large (7.5B) 0.456 - - 0.001 0.002 - - - 0 0.002 - - -
43 | Falcon-Instruct (40B) 0.45 - 0.011 0.002 0 0.002 - - - - - - -
44 | MPT (30B) 0.449 - 0.02 0.001 0 0.001 - - - - - - -
45 | TNLG v2 (6.7B) 0.445 0 0.011 0 0.002 0.001 - - 0 0.002 - 0 0
46 | OPT (175B) 0.435 0 0.019 0.001 0.002 0.001 - - 0.001 0.001 - 0 0
47 | Mistral v0.1 (7B) 0.43 - 0.008 0.001 0.001 0.003 - - - - - - -
48 | davinci (175B) 0.422 0 0.012 0 0.001 0 - - 0.002 0.003 - 0 0
49 | curie (6.7B) 0.418 0 0.017 0 0.002 0.003 - - 0.001 0.001 - 0 0
50 | Llama 2 (13B) 0.414 - 0.014 0.001 0 0.004 - - - - - - -
51 | Cohere small v20220720 (410M) 0.412 0 0.027 0.001 0.003 0.003 - - 0.001 0.001 - 0 0
52 | text-davinci-002 0.409 0 0.013 0.001 0 0.002 - - 0 0.002 - 0 0
53 | text-curie-001 0.405 0 0.015 0 0.001 0.001 - - 0 0.002 - 0 0
54 | T0pp (11B) 0.397 0 0.023 0.001 0 0.001 - - 0.001 0.001 - 0 0
55 | Jurassic-2 Grande (17B) 0.388 - 0.02 0 0 0.003 - - 0.003 0.001 - - -
56 | Pythia (12B) 0.384 - 0.023 0.002 0 0.002 - - - - - - -
57 | GPT-NeoX (20B) 0.365 0 0.022 0.001 0.002 0.001 - - 0.001 0.002 - 0 0
58 | OPT (66B) 0.335 0 0.022 0.001 0.002 0.001 - - 0.001 0.003 - 0 0
59 | GLM (130B) 0.335 0 0.012 0.001 0.002 0.001 - - 0.001 0 - 0 0
60 | TNLG v2 (530B) 0.33 0 0.012 0.001 0.001 0.003 - - 0.003 0.003 - 0 0
61 | ada (350M) 0.301 0 0.03 0.001 0.002 0.003 - - 0.001 0 - 0 0
62 | Falcon-Instruct (7B) 0.295 - 0.017 0.001 0.001 0.002 - - - - - - -
63 | RedPajama-INCITE-Instruct (7B) 0.294 - 0.025 0 0.001 0.003 - - - - - - -
64 | UL2 (20B) 0.29 0.001 0.017 0.001 0.001 0.006 - - 0.009 0.001 - 0 0
65 | MPT-Instruct (30B) 0.245 - 0.017 0.001 0.001 0.003 - - - - - - -
66 | GPT-J (6B) 0.245 0 0.021 0.001 0.001 0.004 - - 0.002 0.002 - 0 0
67 | YaLM (100B) 0.242 0 0.017 0.008 0.003 0.001 - - 0.001 0 - 0 0
68 | RedPajama-INCITE-Base (7B) 0.2 - 0.014 0.002 0.001 0.005 - - - - - - -
69 |
70 |
--------------------------------------------------------------------------------
/benchbench/utils/metric.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from scipy.stats import kendalltau
3 |
4 |
5 | def get_kendall_tau(new_rank, old_rank):
6 | """
7 |     Calculate the normalized Kendall's Tau distance between two rankings.
8 |
9 | Args:
10 | new_rank (np.array): new ranking
11 | old_rank (np.array): old ranking
12 |
13 | Returns:
14 | tuple:
15 |             float: normalized Kendall's Tau distance (1 - tau) / 2, where 0 means identical rankings and 1 means fully reversed
16 | float: p-value
17 | """
18 | tau, p_value = kendalltau(new_rank, old_rank)
19 | tau = (1 - tau) / 2
20 | return tau, p_value
21 |
22 |
23 | def get_kendall_w(rankings):
24 | """
25 | Calculate Kendall's W for a list of rankings.
26 |
27 | Args:
28 | rankings(list): a list of rankings
29 |
30 | Returns:
31 |         float: 1 - W (Kendall's W), so 0.0 indicates perfect agreement across the rankings
32 | """
33 | # Ensure the input is a numpy array for easier manipulation
34 | rankings = np.array(rankings, dtype=int)
35 |
36 | # Number of subjects/items
37 | n = rankings.shape[1]
38 |
39 | # Number of rankings/raters
40 | m = rankings.shape[0]
41 |
42 | # Step 1: Calculate sum of ranks for each item across all lists
43 | rank_sums = np.sum(rankings, axis=0)
44 |
45 | # Step 2: Calculate the mean of the sum of ranks
46 | mean_rank_sum = np.mean(rank_sums)
47 |
48 | # Step 3: Calculate the sum of squared deviations from the mean sum of ranks
49 | ss = np.sum((rank_sums - mean_rank_sum) ** 2)
50 |
51 | # Step 4: Calculate the maximum possible sum of squared deviations
52 | ss_max = m**2 * (n**3 - n) / 12
53 |
54 | # Step 5: Calculate Kendall's W
55 | w = ss / ss_max
56 |
57 |     return 1 - w  # report disagreement, so identical rankings give 0.0
58 |
59 |
60 | def get_rank_diff(new_rank, old_rank=None):
61 | """
62 | Get the difference between two ranks.
63 |
64 | Args:
65 | new_rank(np.array): new ranking
66 |         old_rank(np.array): old ranking; if None, the identity ranking 0..n-1 is used
67 |
68 |     Returns:
69 |         float: normalized Kendall's Tau distance
70 |         float: MRC (max rank change), normalized by (len(new_rank) - 1)
71 | """
72 | new_rank = np.array(new_rank)
73 | if old_rank is None:
74 | old_rank = np.arange(len(new_rank))
75 | else:
76 | old_rank = np.array(old_rank)
77 | if np.sum(np.abs(new_rank - old_rank)) <= 1e-8:
78 | return 0, 0
79 | tau = get_kendall_tau(new_rank, old_rank)[0]
80 | max_rank_change = np.max(np.fabs(new_rank - old_rank)) / (len(new_rank) - 1)
81 | return tau, max_rank_change
82 |
83 |
84 | def get_rank_variance(all_new_rank):
85 | """
86 | Get the variance of all ranks.
87 |
88 | Args:
89 | all_new_rank(list): a list of all rankings
90 |
91 | Returns:
92 |         float: 1 - W (Kendall's W), i.e. the disagreement across all rankings
93 | float: max_MRC (the max MRC over every pair of rankings)
94 | """
95 | all_rank_diff = []
96 | for i, new_rank_a in enumerate(all_new_rank):
97 | for j, new_rank_b in enumerate(all_new_rank):
98 | if j <= i:
99 | continue
100 | else:
101 | all_rank_diff.append(get_rank_diff(new_rank_a, new_rank_b)[1])
102 |     max_rank_diff = np.max(all_rank_diff)  # max MRC over every pair of rankings
103 | w = get_kendall_w(all_new_rank)
104 |
105 | return w, max_rank_diff
106 |
107 |
108 | def rank2order(rank):
109 | """
110 | [Legacy code] Convert a rank to an order.
111 | """
112 | ret = np.zeros(len(rank), dtype=int)
113 | for old_rank, new_rank in enumerate(rank):
114 | ret[new_rank] = old_rank
115 | return ret
116 |
117 |
118 | def order2rank(order):
119 | """
120 | [Legacy code] Convert an order to a rank.
121 | """
122 | ret = np.zeros(len(order), dtype=int)
123 | for new_rank, old_rank in enumerate(order):
124 | ret[old_rank] = new_rank
125 | return ret
126 |
127 |
128 | def get_order_diff(new_order, old_order=None):
129 | """
130 | [Legacy code] Get the difference between two orders.
131 | """
132 | if old_order is None:
133 | old_order = np.arange(len(new_order))
134 | return get_rank_diff(order2rank(new_order), order2rank(old_order))
135 |
136 |
137 | def get_order_variance(all_new_order):
138 | """
139 | [Legacy code] Get the variance of all orders.
140 | """
141 | all_new_rank = [order2rank(new_order) for new_order in all_new_order]
142 | return get_rank_variance(all_new_rank)
143 |
144 |
145 | def _test_kendalltau():
146 | # Example rankings
147 | rank1 = [1, 2, 3, 4, 5]
148 | rank2 = [5, 4, 3, 2, 1]
149 |
150 | # Calculate Kendall's Tau
151 | tau, p_value = get_kendall_tau(rank1, rank2)
152 |
153 | # Output the result
154 | print(f"Kendall's Tau: {tau}")
155 | print(f"p-value: {p_value}")
156 |
157 |
158 | def _test_kendallw():
159 | assert (
160 | get_kendall_w(
161 | [
162 | [0, 1, 2, 3, 4],
163 | [0, 1, 2, 3, 4],
164 | [0, 1, 2, 3, 4],
165 | [0, 1, 2, 3, 4],
166 | [0, 1, 2, 3, 4],
167 | ]
168 | )
169 | == 0.0
170 | )
171 |
172 |
173 | if __name__ == "__main__":
174 | _test_kendalltau()
175 | _test_kendallw()
176 |
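177 | 
178 | # --- Illustrative usage (editor's sketch, not part of the original module) ---
179 | # A minimal example of how get_rank_diff and get_rank_variance might be called on
180 | # toy rankings; the rankings below are made-up placeholders, not benchmark results.
181 | def _example_rank_metrics():
182 |     rank_a = np.array([0, 1, 2, 3, 4])  # reference ranking
183 |     rank_b = np.array([1, 0, 2, 4, 3])  # a slightly perturbed ranking
184 |     tau, mrc = get_rank_diff(rank_b, rank_a)
185 |     print(f"normalized Kendall tau distance: {tau:.3f}, max rank change: {mrc:.3f}")
186 |     w, pairwise_mrc = get_rank_variance([rank_a, rank_b])
187 |     print(f"1 - Kendall's W: {w:.3f}, pairwise MRC: {pairwise_mrc:.3f}")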
--------------------------------------------------------------------------------
/benchbench/data/heim/aesthetics_human.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Clear subject (human) ↑ [ sort ] MS-COCO (base) - Aesthetics (human) ↑ [ sort ] MS-COCO (fairness - gender) - Clear subject (human) ↑ [ sort ] MS-COCO (fairness - gender) - Aesthetics (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Clear subject (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Aesthetics (human) ↑ [ sort ] MS-COCO (robustness) - Clear subject (human) ↑ [ sort ] MS-COCO (robustness) - Aesthetics (human) ↑ [ sort ] MS-COCO (Chinese) - Clear subject (human) ↑ [ sort ] MS-COCO (Chinese) - Aesthetics (human) ↑ [ sort ] MS-COCO (Hindi) - Clear subject (human) ↑ [ sort ] MS-COCO (Hindi) - Aesthetics (human) ↑ [ sort ] MS-COCO (Spanish) - Clear subject (human) ↑ [ sort ] MS-COCO (Spanish) - Aesthetics (human) ↑ [ sort ] MS-COCO (Art styles) - Clear subject (human) ↑ [ sort ] MS-COCO (Art styles) - Aesthetics (human) ↑ [ sort ] dailydall.e - Clear subject (human) ↑ [ sort ] dailydall.e - Aesthetics (human) ↑ [ sort ] Landing Page - Clear subject (human) ↑ [ sort ] Landing Page - Aesthetics (human) ↑ [ sort ] Logos - Clear subject (human) ↑ [ sort ] Logos - Aesthetics (human) ↑ [ sort ] Magazine Cover Photos - Clear subject (human) ↑ [ sort ] Magazine Cover Photos - Aesthetics (human) ↑ [ sort ]
2 | Dreamlike Photoreal v2.0 (1B) 0.867 2.9 3.76 - 3.707 - 3.571 - 3.618 - 3.43 - 3.348 - 3.576 2.874 3.824 2.968 4.208 2.712 3.496 2.848 3.608 2.952 3.808
3 | Openjourney v1 (1B) 0.862 2.864 3.804 - 3.575 - 3.477 - 3.602 - 3.894 - 3.59 - 3.598 2.834 3.91 2.968 4.152 2.56 3.496 2.92 3.856 2.848 3.896
4 | DALL-E 2 (3.5B) 0.844 2.914 3.718 - 3.584 - 3.491 - 3.602 - 3.352 - 3.431 - 3.547 2.836 3.764 2.944 3.88 2.752 3.504 2.928 3.664 2.912 3.72
5 | Promptist + Stable Diffusion v1.4 (1B) 0.827 2.906 3.738 - 3.496 - 3.594 - 3.541 - 3.346 - 3.14 - 3.55 2.834 3.616 2.936 3.928 2.92 3.856 2.92 3.8 2.96 3.608
6 | Safe Stable Diffusion strong (1B) 0.816 2.91 3.56 - 3.58 - 3.611 - 3.52 - 3.368 - 3.204 - 3.488 2.828 3.74 2.96 3.872 2.896 3.736 2.888 3.864 2.864 3.672
7 | Openjourney v2 (1B) 0.751 2.918 3.358 - 3.444 - 3.572 - 3.518 - 3.492 - 3.428 - 3.508 2.872 3.464 2.872 3.456 2.904 3.504 2.928 3.344 2.912 3.448
8 | Safe Stable Diffusion max (1B) 0.744 2.87 3.476 - 3.514 - 3.5 - 3.484 - 3.428 - 3.536 - 3.63 2.86 3.494 2.848 3.528 2.856 3.512 2.872 3.424 2.864 3.544
9 | Dreamlike Diffusion v1.0 (1B) 0.704 2.898 3.502 - 3.477 - 3.597 - 3.416 - 3.422 - 3.364 - 3.522 2.852 3.43 2.92 3.632 2.832 3.456 2.912 3.392 2.872 3.4
10 | Lexica Search with Stable Diffusion v1.5 (1B) 0.584 2.764 3.472 - 3.252 - 3.256 - 3.434 - 3.276 - 3.166 - 3.294 2.77 3.652 2.936 3.704 2.912 3.768 2.896 3.6 2.936 3.568
11 | Stable Diffusion v1.4 (1B) 0.551 2.84 3.632 - 3.483 - 3.408 - 3.462 - 3.212 - 3.036 - 3.228 2.814 3.76 2.872 3.976 2.632 3.304 2.84 3.672 2.768 3.496
12 | DALL-E mega (2.6B) 0.549 2.906 3.528 - 3.291 - 3.331 - 3.29 - 3.054 - 2.236 - 3.084 2.808 3.606 2.96 3.736 2.936 3.752 2.888 3.856 2.808 3.584
13 | MultiFusion (13B) 0.482 2.788 3.46 - 3.309 - 3.178 - 3.388 - 3.326 - 3.322 - 3.278 2.794 3.68 2.856 3.816 2.6 3.336 2.728 3.488 2.728 3.416
14 | DALL-E mini (0.4B) 0.478 2.864 3.404 - 3.368 - 3.41 - 3.441 - 3.248 - 3.22 - 3.246 2.732 3.284 2.872 3.368 2.848 3.464 2.896 3.424 2.8 3.176
15 | Redshift Diffusion (1B) 0.422 2.492 3.356 - 3.538 - 3.474 - 3.471 - 3.366 - 3.336 - 3.382 2.538 3.288 2.448 3.288 2.496 3.28 2.52 3.152 2.344 3.128
16 | minDALL-E (1.3B) 0.409 2.79 3.226 - 3.344 - 3.237 - 3.281 - 3.31 - 3.248 - 3.3 2.592 3.186 2.896 3.496 2.808 3.392 2.848 3.488 2.808 3.392
17 | CogView2 (6B) 0.396 2.772 3.34 - 3.112 - 3.176 - 3.005 - 3.316 - 2.862 - 2.972 2.576 3.298 2.784 3.592 2.824 3.704 2.872 3.584 2.848 3.44
18 | DeepFloyd IF Large (0.9B) 0.338 2.626 3.236 - 3.381 - 3.32 - 3.506 - 3.382 - 3.32 - 3.444 2.576 3.26 2.368 3.04 2.336 3.088 2.44 3.24 2.424 3.288
19 | Stable Diffusion v2.1 base (1B) 0.311 2.492 3.306 - 3.493 - 3.425 - 3.384 - 3.328 - 3.282 - 3.34 2.512 3.35 2.336 3.016 2.368 3.056 2.528 3.152 2.408 3.144
20 | Safe Stable Diffusion medium (1B) 0.298 2.488 3.18 - 3.346 - 3.353 - 3.404 - 3.324 - 3.096 - 3.418 2.504 3.354 2.368 3.224 2.536 3.16 2.456 3.2 2.496 3.16
21 | DeepFloyd IF X-Large (4.3B) 0.278 2.58 3.304 - 3.49 - 3.408 - 3.37 - 3.366 - 3.148 - 3.396 2.534 3.29 2.424 3.088 2.36 2.984 2.488 3.048 2.392 2.984
22 | Stable Diffusion v2 base (1B) 0.269 2.474 3.18 - 3.536 - 3.419 - 3.404 - 3.242 - 3.104 - 3.32 2.514 3.39 2.496 3.28 2.4 3.08 2.368 3.12 2.4 3.12
23 | Safe Stable Diffusion weak (1B) 0.262 2.478 3.284 - 3.424 - 3.373 - 3.34 - 3.308 - 3.142 - 3.36 2.58 3.44 2.376 3.024 2.376 2.968 2.376 3.096 2.464 3.16
24 | Vintedois (22h) Diffusion model v0.1 (1B) 0.262 2.46 3.264 - 3.44 - 3.417 - 3.396 - 3.248 - 3.252 - 3.494 2.518 3.396 2.312 3.08 2.464 3.128 2.376 3.112 2.36 2.68
25 | Stable Diffusion v1.5 (1B) 0.242 2.466 3.13 - 3.475 - 3.474 - 3.369 - 3.188 - 3.102 - 3.306 2.516 3.412 2.464 3.176 2.4 3.112 2.472 3.048 2.408 3.088
26 | DeepFloyd IF Medium (0.4B) 0.231 2.562 3.13 - 3.347 - 3.427 - 3.354 - 3.188 - 3.128 - 3.3 2.546 3.304 2.368 3.248 2.376 3.192 2.424 3.096 2.456 3.144
27 | GigaGAN (1B) 0.222 2.542 3.016 - 3.4 - 3.32 - 3.414 - 3.288 - 3.22 - 3.284 2.484 3.238 2.376 3.088 2.472 3.128 2.448 3.12 2.432 3.024
28 |
--------------------------------------------------------------------------------
/benchbench/data/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | from .bbh import load_bbh
5 | from .bigcode import load_bigcode
6 | from .glue import load_glue
7 | from .helm_lite import load_helm_lite
8 | from .helm_capability import load_helm_capability
9 | from .heim import load_heim
10 | from .helm import load_helm
11 | from .imagenet import load_imagenet
12 | from .mmlu import load_mmlu
13 | from .mteb import load_mteb
14 | from .openllm import load_openllm
15 | from .superglue import load_superglue
16 | from .vtab import load_vtab
17 | from .dummy import load_random_benchmark, load_constant_benchmark
18 | from ..utils.win_rate import WinningRate
19 |
20 | cardinal_benchmark_list = [
21 | "GLUE",
22 | "SuperGLUE",
23 | "OpenLLM",
24 | "MMLU",
25 | "BigBenchHard",
26 | "MTEB",
27 | "VTAB",
28 | "HELM-capability",
29 | ]
30 | ordinal_benchmark_list = [
31 | "BigCode",
32 | "HELM-lite",
33 | "HELM-accuracy",
34 | "HELM-bias",
35 | "HELM-calibration",
36 | "HELM-fairness",
37 | "HELM-efficiency",
38 | "HELM-robustness",
39 | "HELM-summarization",
40 | "HELM-toxicity",
41 | "HEIM-alignment_auto",
42 | "HEIM-nsfw",
43 | "HEIM-quality_auto",
44 | "HEIM-aesthetics_auto",
45 | "HEIM-alignment_human",
46 | "HEIM-nudity",
47 | "HEIM-quality_human",
48 | "HEIM-aesthetics_human",
49 | "HEIM-black_out",
50 | "HEIM-originality",
51 | ]
52 |
53 |
54 | def load_cardinal_benchmark(dataset_name, do_rerank=True, **kwargs):
55 | """
56 | Load a cardinal benchmark.
57 |
58 | Args:
59 | dataset_name(str): Name for the benchmark.
60 |         do_rerank(bool): Whether to re-rank the models by their average score.
61 | **kwargs: Other arguments.
62 |
63 | Returns:
64 | tuple:
65 | pd.DataFrame: data.
66 | list: cols.
67 | """
68 | if dataset_name == "HELM-capability":
69 | data, cols = load_helm_capability()
70 | elif dataset_name == "GLUE":
71 | data, cols = load_glue()
72 | elif dataset_name == "SuperGLUE":
73 | data, cols = load_superglue()
74 | elif dataset_name == "OpenLLM":
75 | data, cols = load_openllm()
76 | elif dataset_name == "MMLU":
77 | data, cols = load_mmlu()
78 | elif dataset_name == "BigBenchHard":
79 | data, cols = load_bbh()
80 | elif dataset_name == "MTEB":
81 | data, cols = load_mteb()
82 | elif dataset_name == "VTAB":
83 | data, cols = load_vtab()
84 | elif dataset_name == "ImageNet":
85 | data, cols = load_imagenet(**kwargs)
86 | elif dataset_name == "Random":
87 | data, cols = load_random_benchmark(**kwargs)
88 | elif dataset_name == "Constant":
89 | data, cols = load_constant_benchmark(**kwargs)
90 | else:
91 |         raise ValueError(f"Unknown cardinal benchmark: {dataset_name}")
92 |
93 | if do_rerank:
94 | avg = data[cols].values.mean(1)
95 | order = sorted(np.arange(len(data)), key=lambda x: -avg[x])
96 | data = data.iloc[order].reset_index(drop=True)
97 |
98 | return data, cols
99 |
100 |
101 | def load_ordinal_benchmark(dataset_name, do_rerank=True, **kwargs):
102 | """
103 | Load an ordinal benchmark.
104 |
105 | Args:
106 |         dataset_name(str): Name for the benchmark.
107 |         do_rerank(bool): Whether to re-rank the models by their winning rate.
108 |         **kwargs: Other arguments.
109 |
110 | Returns:
111 | tuple:
112 | pd.DataFrame: data
113 | list: cols
114 | """
115 | if len(dataset_name.split("-")) == 2:
116 | dataset_name, subset_name = dataset_name.split("-")
117 | else:
118 | subset_name = None
119 |
120 | if dataset_name == "HELM":
121 | subset_name = "accuracy" if subset_name is None else subset_name
122 | if subset_name == "lite":
123 | data, cols = load_helm_lite()
124 | return data, cols
125 | assert subset_name in [
126 | "accuracy",
127 | "bias",
128 | "calibration",
129 | "fairness",
130 | "efficiency",
131 | "robustness",
132 | "summarization",
133 | "toxicity",
134 | ]
135 | data, cols = load_helm(subset_name)
136 | elif dataset_name == "HEIM":
137 | subset_name = "alignment_human" if subset_name is None else subset_name
138 | assert subset_name in [
139 | "alignment_auto",
140 | "nsfw",
141 | "quality_auto",
142 | "aesthetics_auto",
143 | "alignment_human",
144 | "nudity",
145 | "quality_human",
146 | "aesthetics_human",
147 | "black_out",
148 | "originality",
149 | ]
150 | data, cols = load_heim(subset_name)
151 | elif dataset_name == "BigCode":
152 | data, cols = load_bigcode()
153 | elif dataset_name == "Random":
154 | data, cols = load_random_benchmark(**kwargs, num_model=1000)
155 | elif dataset_name == "Constant":
156 | data, cols = load_constant_benchmark(**kwargs)
157 | else:
158 |         raise ValueError(f"Unknown ordinal benchmark: {dataset_name}")
159 |
160 | if do_rerank:
161 | wr = WinningRate(data, cols)
162 | win_rate = wr.get_winning_rate()
163 | order = sorted(np.arange(len(data)), key=lambda x: -win_rate[x])
164 | data = data.iloc[order].reset_index(drop=True)
165 |
166 | return data, cols
167 |
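168 | 
169 | # --- Illustrative usage (editor's sketch, not part of the original module) ---
170 | # A minimal example of how the two loaders might be called; it assumes the bundled
171 | # leaderboard files shipped as package data are available on disk.
172 | def _example_load():
173 |     data, cols = load_cardinal_benchmark("GLUE")
174 |     print(f"GLUE: {len(data)} models x {len(cols)} tasks")
175 |     data, cols = load_ordinal_benchmark("HELM-accuracy")
176 |     print(f"HELM-accuracy: {len(data)} models x {len(cols)} tasks")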
--------------------------------------------------------------------------------
/benchbench/measures/cardinal.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | from torch.optim import SGD
4 |
5 | from ..utils.base import rankdata
6 | from ..utils.metric import get_rank_diff, get_rank_variance
7 |
8 |
9 | def appr_rank_diff(score, old_rank, use_weighted_loss=False):
10 | """
11 | Approximate the rank difference between the old rank and the new rank.
12 |
13 | Args:
14 |         score(torch.Tensor): Aggregated score for each model under the current task weighting.
15 |         old_rank(np.array): Original rank.
16 |         use_weighted_loss(bool): Whether to use the weighted loss.
17 |
18 | Returns:
19 | torch.Tensor: The loss.
20 | """
21 | loss = torch.zeros(1)
22 | for i in range(len(score)):
23 | for j in range(len(score)):
24 | if old_rank[j] < old_rank[i]:
25 | if use_weighted_loss:
26 |                     # this weight encourages pairs with a larger rank distance to be flipped first
27 | loss = loss + (old_rank[i] - old_rank[j]) * max(
28 | score[j] - score[i], 0.0
29 | )
30 | else:
31 | loss = loss + max(score[j] - score[i], 0.0)
32 | return loss
33 |
34 |
35 | def get_sensitivity(
36 | data,
37 | cols,
38 | min_value=0.01,
39 | lr=1.0,
40 | num_steps=1000,
41 | stop_threshold=1e-5,
42 | normalize_epsilon=True,
43 | use_weighted_loss=None,
44 | return_weight=False,
45 | verbose=False,
46 | ):
47 | """
48 | Calculate the sensitivity for a given benchmark.
49 |
50 | Args:
51 | data(pd.DataFrame): Each row represents a model, each column represents a task.
52 | cols(list): The column names of the tasks.
53 |         min_value(float): Minimum value for epsilon.
54 |         lr(float): Learning rate for the optimization.
55 |         num_steps(int): Number of optimization steps.
56 |         stop_threshold(float): Stop early if the loss change is smaller than this value.
57 |         normalize_epsilon(bool): Whether to normalize epsilon by the per-task standard deviation.
58 |         use_weighted_loss(bool): Whether to use the weighted approximation loss; if None, try both and return the better one.
59 |         return_weight(bool): Whether to also return the task weights (alpha).
60 |         verbose(bool): Whether to print optimization logs.
61 |
62 | Returns:
63 | tuple: If return_weight is True, return ((tau, MRC), alpha); else return (tau, MRC).
64 | """
65 | if use_weighted_loss is None:
66 | a = get_sensitivity(
67 | data,
68 | cols,
69 | min_value,
70 | lr,
71 | num_steps,
72 | stop_threshold,
73 | normalize_epsilon,
74 | use_weighted_loss=True,
75 | return_weight=True,
76 | verbose=verbose,
77 | )
78 | b = get_sensitivity(
79 | data,
80 | cols,
81 | min_value,
82 | lr,
83 | num_steps,
84 | stop_threshold,
85 | normalize_epsilon,
86 | use_weighted_loss=False,
87 | return_weight=True,
88 | verbose=verbose,
89 | )
90 | if return_weight:
91 | return a if a[0] > b[0] else b
92 | else:
93 | return max(a[0], b[0])
94 |
95 | data = data[cols].values
96 | data = torch.tensor(data)
97 | data_std = data.std(0)
98 | data = data[:, [i for i, _std in enumerate(data_std) if _std > 1e-8]]
99 | orig_data = data.clone()
100 | data = data - data.mean(0)
101 | data = data / data.std(0)
102 |
103 | old_score = orig_data.mean(1).detach().numpy()
104 | old_rank = rankdata(-old_score, method="average")
105 |
106 | weight = torch.ones(data.shape[1], requires_grad=True)
107 |
108 | def normalize_func(w):
109 | w1 = torch.softmax(w, dim=0)
110 | w2 = w1 + min_value / (1 - min_value)
111 | w3 = w2 / torch.sum(w2)
112 | return w3
113 |
114 | opt = SGD([weight], lr=lr)
115 | last_loss = 0x3F3F3F3F  # large sentinel so the first step never triggers early stopping
116 | for step in range(num_steps):
117 | opt.zero_grad()
118 | norm_weight = normalize_func(weight)
119 | score = (data * norm_weight).mean(1)
120 | loss = appr_rank_diff(score, old_rank, use_weighted_loss=use_weighted_loss)
121 |
122 | if loss.item() <= 1e-8:
123 | break
124 |
125 | loss.backward()
126 | opt.step()
127 | if np.fabs(loss.item() - last_loss) < stop_threshold:
128 | break
129 | last_loss = loss.item()
130 | if verbose:
131 | print("Step %d, Loss = %.2lf" % (step, loss.item()))
132 |
133 | norm_weight = normalize_func(weight).detach().numpy()
134 | if normalize_epsilon:
135 | norm_weight = norm_weight / orig_data.std(0).numpy()
136 | norm_weight = norm_weight / norm_weight.max()
137 | new_score = (orig_data * norm_weight).mean(1).detach().numpy()
138 | new_rank = rankdata(-new_score, method="average")
139 | rank_diff = get_rank_diff(new_rank, old_rank)
140 | if return_weight:
141 | return rank_diff, norm_weight
142 | else:
143 | return rank_diff
144 |
145 |
146 | def get_diversity(data, cols):
147 | """
148 | Calculate the diversity for a given benchmark.
149 |
150 | Args:
151 | data(pd.DataFrame): Each row represents a model, each column represents a task.
152 | cols(list): The column names of the tasks.
153 |
154 | Returns:
155 | tuple: (W, max_MRC), where max_MRC refers to max MRC over every pair of tasks.
156 |
157 | """
158 | return get_rank_variance(
159 | [rankdata(-data[c].values, method="average") for c in cols]
160 | )
161 |
--------------------------------------------------------------------------------
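A short usage sketch for the cardinal measures above (not part of the repository); the score table is hypothetical, while get_sensitivity and get_diversity are the functions defined in this module.

import pandas as pd

from benchbench.measures.cardinal import get_diversity, get_sensitivity

# hypothetical benchmark: four models scored on three tasks
cols = ["task_a", "task_b", "task_c"]
data = pd.DataFrame(
    [[0.9, 0.2, 0.7], [0.8, 0.6, 0.5], [0.4, 0.9, 0.6], [0.3, 0.1, 0.2]],
    columns=cols,
)

# sensitivity: how far the ranking can move under bounded task re-weighting
tau, mrc = get_sensitivity(data, cols, num_steps=200)
# diversity: rank variance across the individual per-task rankings
w, max_mrc = get_diversity(data, cols)
print(tau, mrc, w, max_mrc)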
/benchbench/data/helm/calibration.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate MMLU - ECE (10-bin) BoolQ - ECE (10-bin) NarrativeQA - ECE (10-bin) NaturalQuestions (closed-book) - ECE (10-bin) NaturalQuestions (open-book) - ECE (10-bin) QuAC - ECE (10-bin) HellaSwag - ECE (10-bin) OpenbookQA - ECE (10-bin) TruthfulQA - ECE (10-bin) IMDB - ECE (10-bin) CivilComments - ECE (10-bin) RAFT - ECE (10-bin)
2 | T0pp (11B) 0.758 0.168 0.322 0 0 0 0.001 - - 0.154 0.291 0.308 0.086
3 | J1-Jumbo v1 (178B) 0.666 0.131 0.215 0.034 0.035 0.065 0.043 0.217 0.25 0.113 0.064 0.27 0.228
4 | Jurassic-2 Jumbo (178B) 0.66 0.137 0.175 0.073 0.018 0.073 0.035 - - 0.068 0.182 0.314 0.218
5 | Cohere large v20220720 (13.1B) 0.652 0.112 0.088 0.037 0.025 0.143 0.033 0.288 0.225 0.105 0.132 0.384 0.267
6 | GLM (130B) 0.652 0.128 0.171 0.037 0.022 0.076 0.027 - - 0.088 0.18 0.486 0.226
7 | Jurassic-2 Large (7.5B) 0.644 0.141 0.147 - 0.014 0.084 - - - 0.102 0.178 0.19 0.254
8 | Luminous Base (13B) 0.641 0.111 0.066 0.048 0.045 0.07 0.098 - - 0.081 0.232 0.28 0.29
9 | J1-Large v1 (7.5B) 0.638 0.123 0.106 0.046 0.015 0.086 0.024 0.192 0.25 0.112 0.213 0.377 0.269
10 | J1-Grande v2 beta (17B) 0.634 0.139 0.167 0.041 0.036 0.065 0.04 0.226 0.215 0.123 0.136 0.376 0.234
11 | Jurassic-2 Grande (17B) 0.63 0.134 0.209 0.126 0.018 0.063 0.035 - - 0.097 0.111 0.381 0.232
12 | Luminous Supreme (70B) 0.624 0.154 0.083 0.049 0.041 0.074 0.058 - - 0.092 0.173 0.272 0.238
13 | J1-Grande v1 (17B) 0.622 0.114 0.154 0.047 0.029 0.081 0.036 0.213 0.258 0.091 0.158 0.408 0.244
14 | ada (350M) 0.616 0.128 0.067 0.046 0.028 0.18 0.039 0.057 0.346 0.071 0.274 0.355 0.268
15 | TNLG v2 (530B) 0.615 0.127 0.048 0.05 0.04 0.075 0.08 0.322 0.243 0.226 0.087 0.213 0.244
16 | Cohere small v20220720 (410M) 0.609 0.136 0.095 0.031 0.023 0.198 0.036 0.083 0.379 0.076 0.134 0.486 0.234
17 | curie (6.7B) 0.603 0.138 0.079 0.045 0.017 0.134 0.043 0.25 0.26 0.062 0.259 0.293 0.319
18 | TNLG v2 (6.7B) 0.602 0.132 0.065 0.046 0.031 0.089 0.056 0.268 0.282 0.117 0.118 0.248 0.314
19 | Cohere medium v20221108 (6.1B) 0.601 0.113 0.095 0.028 0.015 0.233 0.041 0.281 0.23 0.08 0.36 0.487 0.253
20 | Cohere Command beta (52.4B) 0.596 0.183 0.023 0.058 0.084 0.056 0.06 0.325 0.231 0.311 0.015 0.161 0.262
21 | babbage (1.3B) 0.588 0.14 0.068 0.027 0.016 0.147 0.045 0.144 0.3 0.142 0.212 0.31 0.286
22 | Cohere xlarge v20221108 (52.4B) 0.585 0.143 0.051 0.059 0.054 0.073 0.063 0.333 0.207 0.211 0.069 0.313 0.25
23 | Luminous Extended (30B) 0.577 0.135 0.129 0.046 0.022 0.09 0.096 - - 0.064 0.204 0.359 0.29
24 | davinci (175B) 0.575 0.132 0.072 0.067 0.061 0.079 0.068 0.31 0.204 0.211 0.126 0.396 0.222
25 | Cohere xlarge v20220609 (52.4B) 0.543 0.149 0.04 0.062 0.068 0.085 0.067 0.341 0.235 0.099 0.069 0.327 0.274
26 | Cohere Command beta (6.1B) 0.529 0.155 0.059 0.076 0.042 0.057 0.062 0.293 0.25 0.3 0.014 0.358 0.274
27 | Cohere medium v20220720 (6.1B) 0.51 0.114 0.082 0.047 0.026 0.142 0.048 0.271 0.275 0.094 0.36 0.459 0.304
28 | text-davinci-002 0.474 0.176 0.064 0.239 0.341 0.242 0.274 0.286 0.238 0.199 0.031 0.183 0.212
29 | UL2 (20B) 0.464 0.134 0.46 0 0.092 0.179 0 - - 0.125 0.225 0.404 0.401
30 | GPT-J (6B) 0.464 0.115 0.062 0.199 0.075 0.354 0.13 0.233 0.235 0.078 0.295 0.409 0.389
31 | RedPajama-INCITE-Base-v1 (3B) 0.439 0.115 0.187 0.234 0.116 0.345 0.078 - - 0.048 0.248 0.303 0.502
32 | T5 (11B) 0.435 0.151 0.433 0 0.076 0.239 0 - - 0.143 0.236 0.38 0.367
33 | Pythia (6.9B) 0.43 0.136 0.106 0.217 0.07 0.369 0.1 - - 0.076 0.302 0.259 0.502
34 | GPT-NeoX (20B) 0.422 0.122 0.195 0.224 0.103 0.373 0.115 0.277 0.232 0.058 0.23 0.444 0.324
35 | RedPajama-INCITE-Base (7B) 0.409 0.098 0.127 0.276 0.127 0.396 0.131 - - 0.063 0.206 0.305 0.648
36 | text-davinci-003 0.407 0.317 0.098 0.37 0.286 0.323 0.27 0.278 0.216 0.348 0.113 0.292 0.203
37 | YaLM (100B) 0.402 0.708 0.147 0.06 0.02 0.086 0.029 - - 0.679 0.418 0.437 0.278
38 | RedPajama-INCITE-Instruct (7B) 0.388 0.143 0.035 0.247 0.142 0.466 0.074 - - 0.232 0.159 0.102 0.695
39 | Pythia (12B) 0.374 0.111 0.14 0.239 0.094 0.39 0.138 - - 0.094 0.342 0.297 0.514
40 | RedPajama-INCITE-Instruct-v1 (3B) 0.372 0.124 0.141 0.254 0.12 0.454 0.1 - - 0.097 0.04 0.383 0.661
41 | BLOOM (176B) 0.348 0.137 0.209 0.237 0.116 0.347 0.122 0.293 0.248 0.096 0.343 0.262 0.44
42 | OPT (175B) 0.338 0.147 0.194 0.254 0.173 0.372 0.148 0.325 0.209 0.054 0.19 0.462 0.352
43 | text-curie-001 0.335 0.462 0.253 0.221 0.253 0.216 0.254 0.153 0.321 0.355 0.031 0.262 0.409
44 | Alpaca (7B) 0.334 0.234 0.343 0.046 0.134 0.238 0.04 - - 0.375 0.281 0.352 0.33
45 | OPT (66B) 0.289 0.135 0.2 0.245 0.141 0.384 0.154 0.293 0.237 0.073 0.302 0.474 0.468
46 | text-babbage-001 0.277 0.311 0.344 0.186 0.522 0.385 0.24 0.083 0.362 0.251 0.038 0.499 0.295
47 | Vicuna v1.3 (13B) 0.275 0.194 0.159 0.257 0.202 0.43 0.103 - - 0.316 0.183 0.253 0.376
48 | Vicuna v1.3 (7B) 0.204 0.176 0.322 0.084 0.162 0.413 0.109 - - 0.227 0.348 0.346 0.601
49 | text-ada-001 0.171 0.506 0.346 0.319 0.764 0.691 0.268 0.103 0.487 0.465 0.09 0.479 0.473
50 | Anthropic-LM v4-s3 (52B) - - - - - - - - - - - - -
51 | LLaMA (7B) - - - - - - - - - - - - -
52 | LLaMA (13B) - - - - - - - - - - - - -
53 | LLaMA (30B) - - - - - - - - - - - - -
54 | LLaMA (65B) - - - - - - - - - - - - -
55 | Llama 2 (7B) - - - - - - - - - - - - -
56 | Llama 2 (13B) - - - - - - - - - - - - -
57 | Llama 2 (70B) - - - - - - - - - - - - -
58 | Mistral v0.1 (7B) - - - - - - - - - - - - -
59 | gpt-3.5-turbo-0301 - - - - - - - - - - - - -
60 | gpt-3.5-turbo-0613 - - - - - - - - - - - - -
61 | MPT (30B) - - - - - - - - - - - - -
62 | MPT-Instruct (30B) - - - - - - - - - - - - -
63 | Falcon (7B) - - - - - - - - - - - - -
64 | Falcon-Instruct (7B) - - - - - - - - - - - - -
65 | Falcon (40B) - - - - - - - - - - - - -
66 | Falcon-Instruct (40B) - - - - - - - - - - - - -
67 | InstructPalmyra (30B) - - - - - - - - - - - - -
68 | Palmyra X (43B) - - - - - - - - - - - - -
69 |
70 |
--------------------------------------------------------------------------------
/benchbench/data/heim/alignment_human.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Image text alignment (human) ↑ [ sort ] MS-COCO (fairness - gender) - Image text alignment (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Image text alignment (human) ↑ [ sort ] MS-COCO (robustness) - Image text alignment (human) ↑ [ sort ] MS-COCO (Chinese) - Image text alignment (human) ↑ [ sort ] MS-COCO (Hindi) - Image text alignment (human) ↑ [ sort ] MS-COCO (Spanish) - Image text alignment (human) ↑ [ sort ] MS-COCO (Art styles) - Image text alignment (human) ↑ [ sort ] DrawBench (image quality categories) - Image text alignment (human) ↑ [ sort ] PartiPrompts (image quality categories) - Image text alignment (human) ↑ [ sort ] dailydall.e - Image text alignment (human) ↑ [ sort ] Landing Page - Image text alignment (human) ↑ [ sort ] Logos - Image text alignment (human) ↑ [ sort ] Magazine Cover Photos - Image text alignment (human) ↑ [ sort ] Common Syntactic Processes - Image text alignment (human) ↑ [ sort ] DrawBench (reasoning categories) - Image text alignment (human) ↑ [ sort ] PartiPrompts (reasoning categories) - Image text alignment (human) ↑ [ sort ] Relational Understanding - Image text alignment (human) ↑ [ sort ] Detection (PaintSkills) - Image text alignment (human) ↑ [ sort ] Winoground - Image text alignment (human) ↑ [ sort ] PartiPrompts (knowledge categories) - Image text alignment (human) ↑ [ sort ] DrawBench (knowledge categories) - Image text alignment (human) ↑ [ sort ] TIME's most significant historical figures - Image text alignment (human) ↑ [ sort ]
2 | DALL-E 2 (3.5B) 0.941 4.438 4.534 4.416 4.58 3.902 1.798 4.247 4.216 3.9 4.262 4.416 4.256 4.416 4.104 4.062 3.97 4.16 3.7 4.36 4.16 4.1 4.1 4.227
3 | Dreamlike Photoreal v2.0 (1B) 0.748 4.346 4.312 4.18 4.253 1.966 2.324 3.128 4.068 3.98 4.296 4.552 4.32 4.232 4.152 3.569 3.553 3.8 3.24 3.747 3.48 4.082 3.818 4.488
4 | Stable Diffusion v1.4 (1B) 0.723 4.214 4.202 4.136 4.115 2.224 1.788 3.276 4.154 3.83 4.118 4.256 4.12 4.232 4.032 3.492 3.667 3.5 3.36 3.853 4.06 3.865 3.641 4.429
5 | Safe Stable Diffusion strong (1B) 0.63 4.026 4.113 4.112 3.97 2.126 2.152 3.468 3.958 3.65 4.047 3.832 3.904 3.776 3.68 3.296 3.73 3.58 3.24 3.853 4.12 3.853 3.576 4.558
6 | DALL-E mega (2.6B) 0.6 4.056 4.16 4.061 4.071 2.474 1.794 3.388 4.012 3.63 4.156 3.88 3.92 3.92 3.688 3.281 3.527 3.9 3.34 3.9 3.48 3.906 3.371 3.894
7 | Openjourney v1 (1B) 0.586 4.16 4.044 3.908 3.968 2.368 2.136 2.634 4.112 3.94 4.098 4.448 4.056 4.224 4.016 3.477 3.33 3.6 2.38 3.867 3.02 3.982 3.494 4.279
8 | Dreamlike Diffusion v1.0 (1B) 0.56 4.024 3.934 3.978 3.975 2.964 2.79 3.548 3.756 3.48 3.947 3.872 3.784 3.488 3.384 3.435 3.6 3.72 3.38 3.7 4.06 3.847 3.765 3.912
9 | Promptist + Stable Diffusion v1.4 (1B) 0.551 4 4.113 3.994 4.166 2.172 1.68 2.802 3.938 3.64 3.924 3.864 3.872 3.712 3.616 3.419 3.643 3.6 3.32 3.987 3.28 4.029 3.553 4.059
10 | MultiFusion (13B) 0.543 4.106 4.061 3.881 4.046 2.962 1.916 3.832 4.206 3.8 3.967 4.2 4.032 4 3.896 3.096 3.173 3.7 3.26 3.673 3.4 3.659 3.465 3.271
11 | DeepFloyd IF X-Large (4.3B) 0.537 3.818 3.965 3.937 3.931 2.978 2.824 3.706 3.766 3.81 3.831 3.568 3.504 3.432 3.552 3.708 3.68 3.74 3.68 3.72 3.84 3.718 3.612 3.841
12 | DeepFloyd IF Medium (0.4B) 0.517 3.754 3.91 3.876 3.885 3.006 3.046 3.704 3.786 3.98 3.771 3.608 3.688 3.576 3.68 3.45 3.597 3.64 3.46 3.633 3.58 3.771 3.747 3.706
13 | DeepFloyd IF Large (0.9B) 0.508 3.798 4.01 3.903 3.968 2.888 3.06 3.776 3.73 3.68 3.836 3.536 3.48 3.616 3.632 3.427 3.593 3.64 3.52 3.673 3.94 3.753 3.706 3.929
14 | Stable Diffusion v2.1 base (1B) 0.508 3.672 4.094 3.947 3.811 2.976 2.762 3.552 3.866 3.68 3.798 3.664 3.528 3.648 3.568 3.562 3.567 3.64 3.6 3.92 3.82 3.659 3.359 3.947
15 | GigaGAN (1B) 0.506 3.692 3.88 3.868 3.851 3.456 3.246 3.728 3.88 3.84 3.787 3.592 3.512 3.52 3.696 3.477 3.71 3.62 3.82 3.707 3.56 3.671 3.371 3.906
16 | Stable Diffusion v2 base (1B) 0.506 3.69 4.05 3.898 3.837 3.106 2.726 3.7 3.784 3.88 3.942 3.648 3.56 3.624 3.48 3.565 3.567 3.92 3.34 3.847 3.3 3.512 3.624 3.853
17 | Openjourney v2 (1B) 0.499 3.802 3.931 4.007 3.916 2.936 2.808 3.656 3.822 3.35 3.9 3.76 3.792 3.664 3.496 3.404 3.617 3.64 3.34 3.62 3.88 3.647 3.624 3.771
18 | Vintedois (22h) Diffusion model v0.1 (1B) 0.461 3.716 3.899 3.803 3.716 2.896 2.7 3.602 3.87 3.74 3.964 3.656 3.552 3.632 3.408 3.638 3.617 3.52 3.3 3.673 4.06 3.676 3.718 3.653
19 | Safe Stable Diffusion weak (1B) 0.45 3.73 3.87 3.831 3.816 2.862 2.736 3.534 3.92 3.79 3.876 3.744 3.592 3.552 3.648 3.427 3.483 3.86 3.32 3.693 3.52 3.947 3.7 3.635
20 | Stable Diffusion v1.5 (1B) 0.445 3.708 4.102 3.942 3.784 2.93 2.79 3.518 3.844 3.81 3.949 3.6 3.536 3.464 3.568 3.519 3.607 3.5 3.08 3.647 3.66 3.812 3.688 3.77
21 | Safe Stable Diffusion medium (1B) 0.424 3.696 3.733 3.761 3.742 3 2.746 3.55 3.728 3.76 3.887 3.68 3.552 3.632 3.624 3.6 3.613 3.88 3.12 3.587 3.78 3.724 3.482 3.697
22 | Redshift Diffusion (1B) 0.398 3.572 3.934 3.813 3.671 3.042 2.714 3.134 3.694 3.93 3.802 3.792 3.736 3.616 3.664 3.485 3.51 3.34 2.96 3.833 3.36 3.941 3.524 3.659
23 | DALL-E mini (0.4B) 0.397 3.692 3.686 3.676 3.665 2.936 2.832 3.304 3.524 3.39 3.782 3.712 3.736 3.76 3.648 3.235 3.617 3.84 3.08 3.827 3.66 3.871 3.553 3.853
24 | Safe Stable Diffusion max (1B) 0.37 3.712 3.894 3.774 3.823 2.832 2.896 3.538 3.714 3.52 3.771 3.608 3.6 3.584 3.44 3.265 3.617 3.52 2.92 3.76 3.84 3.653 3.7 3.927
25 | minDALL-E (1.3B) 0.282 3.672 3.592 3.531 3.535 2.932 2.872 2.904 3.44 3.51 3.711 3.816 3.736 3.88 3.744 3.285 3.503 3.38 3.22 3.513 3.66 3.729 3.335 3.247
26 | Lexica Search with Stable Diffusion v1.5 (1B) 0.16 3.496 3.24 3.464 3.472 2.338 1.7 2.848 3.556 3.46 3.762 3.624 3.944 3.768 3.424 3.162 3.1 3.12 3 3.46 3.14 3.347 2.871 4.035
27 | CogView2 (6B) 0.15 3.688 3.744 3.575 3.621 3.842 1.734 1.766 3.53 3.63 3.731 3.584 3.744 3.688 3.496 3.008 3.117 2.9 2.96 3.44 2.86 3.324 2.659 2.888
--------------------------------------------------------------------------------
/benchbench/data/helm/efficiency.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate MMLU - Denoised inference time (s) BoolQ - Denoised inference time (s) NarrativeQA - Denoised inference time (s) NaturalQuestions (closed-book) - Denoised inference time (s) NaturalQuestions (open-book) - Denoised inference time (s) QuAC - Denoised inference time (s) HellaSwag - Denoised inference time (s) OpenbookQA - Denoised inference time (s) TruthfulQA - Denoised inference time (s) MS MARCO (regular) - Denoised inference time (s) MS MARCO (TREC) - Denoised inference time (s) CNN/DailyMail - Denoised inference time (s) XSUM - Denoised inference time (s) IMDB - Denoised inference time (s) CivilComments - Denoised inference time (s) RAFT - Denoised inference time (s)
2 | text-ada-001 0.938 0.088 0.096 0.171 0.085 0.128 0.21 0.079 0.076 0.089 0.09 0.09 0.793 0.311 0.109 0.092 0.107
3 | curie (6.7B) 0.895 0.092 0.1 0.152 0.122 0.189 0.323 0.084 0.079 0.094 0.094 0.095 0.623 0.294 0.11 0.097 0.112
4 | babbage (1.3B) 0.861 0.119 0.121 0.176 0.152 0.232 0.261 0.113 0.111 0.12 0.122 0.122 0.533 0.272 0.128 0.12 0.137
5 | text-curie-001 0.783 0.133 0.143 0.205 0.153 0.185 0.298 0.125 0.119 0.134 0.136 0.135 0.799 0.364 0.147 0.142 0.152
6 | text-babbage-001 0.778 0.133 0.142 0.243 0.136 0.204 0.314 0.125 0.122 0.134 0.136 0.135 0.968 0.431 0.157 0.138 0.153
7 | ada (350M) 0.77 0.14 0.141 0.211 0.167 0.271 0.27 0.138 0.136 0.141 0.142 0.142 0.598 0.237 0.142 0.141 0.154
8 | text-davinci-002 0.604 0.196 0.191 0.512 0.264 0.394 0.891 0.171 0.158 0.2 0.192 0.198 2.236 1.026 0.247 0.186 0.276
9 | GPT-J (6B) 0.601 0.07 0.499 1.311 1.777 3.866 1.389 0.03 0.019 0.044 0.084 0.081 2.076 0.742 0.701 0.307 0.628
10 | davinci (175B) 0.558 0.212 0.21 0.369 0.327 0.462 1.085 0.193 0.184 0.215 0.211 0.214 2.256 1.148 0.225 0.21 0.279
11 | Cohere medium v20220720 (6.1B) 0.541 0.281 0.35 0.533 0.259 0.535 0.735 0.204 0.187 0.287 0.289 0.288 1.2 0.724 0.452 0.321 0.358
12 | Cohere small v20220720 (410M) 0.534 0.284 0.367 0.56 0.251 0.605 0.619 0.223 0.214 0.289 - 0.291 0.954 0.642 0.458 0.329 0.36
13 | GPT-NeoX (20B) 0.514 0.133 0.773 1.468 0.482 2.137 2.025 0.025 0.024 0.084 0.118 0.116 2.133 1.116 0.862 0.408 1.156
14 | UL2 (20B) 0.506 0.182 0.313 1.182 1.994 3.093 1.226 - - 0.168 - - 1.108 0.774 0.215 0.264 0.434
15 | OPT (66B) 0.467 0.055 0.834 1.98 0.611 3.632 2.658 0.971 0.188 0.041 0.076 0.102 1.972 0.885 0.54 0.212 1.871
16 | T5 (11B) 0.434 0.218 0.271 1.054 2.856 12.846 1.032 - - 0.21 - - 1.654 1.159 0.278 0.27 0.448
17 | T0pp (11B) 0.42 0.145 0.374 0.945 1.457 2.895 1.239 - - 0.142 - - 1.066 0.554 0.393 0.391 0.586
18 | Cohere large v20220720 (13.1B) 0.407 0.317 0.421 0.729 0.337 0.774 1.262 0.225 0.201 0.325 0.33 0.327 2.269 1.075 0.536 0.375 0.444
19 | J1-Large v1 (7.5B) 0.389 0.377 0.485 0.797 0.372 0.733 1.16 0.253 0.238 0.365 0.393 0.389 2.011 0.903 0.637 0.434 0.499
20 | J1-Grande v1 (17B) 0.317 0.411 0.535 0.923 0.466 0.873 1.413 0.33 0.281 0.396 0.428 0.424 2.074 1.07 0.732 0.482 0.59
21 | BLOOM (176B) 0.268 0.233 0.853 2.598 1.115 2.547 5.306 0.075 0.032 0.143 0.257 0.246 5.584 3.9 3.536 0.533 1.866
22 | YaLM (100B) 0.266 0.143 0.828 2.314 2.722 4.463 2.278 - - 0.092 - - 2.346 1.671 1.137 0.41 0.89
23 | OPT (175B) 0.241 0.12 0.869 2.783 4.548 7.78 4.049 0.71 0.038 0.141 0.241 0.226 4.729 2.523 1.575 0.498 0.962
24 | J1-Jumbo v1 (178B) 0.222 0.457 0.62 1.126 0.493 1.06 2.064 0.284 0.259 0.443 0.501 0.496 3.777 1.629 0.852 0.552 0.687
25 | Cohere xlarge v20220609 (52.4B) 0.199 0.489 0.598 1.062 0.565 1.085 2.089 0.359 0.314 0.501 0.499 0.501 4.337 1.741 0.796 0.546 0.667
26 | GLM (130B) 0.151 0.335 1.191 2.315 0.953 2.369 4.219 - - 0.158 - - 3.514 2.537 1.497 0.695 1.471
27 | Anthropic-LM v4-s3 (52B) 0.138 0.578 0.637 1.722 0.777 1.102 3.694 0.549 0.447 0.568 0.578 0.587 4.076 2.408 0.79 0.594 0.883
28 | J1-Grande v2 beta (17B) - - - - - - - - - - - - - - - - -
29 | Jurassic-2 Jumbo (178B) - - - - - - - - - - - - - - - - -
30 | Jurassic-2 Grande (17B) - - - - - - - - - - - - - - - - -
31 | Jurassic-2 Large (7.5B) - - - - - - - - - - - - - - - - -
32 | Luminous Base (13B) - - - - - - - - - - - - - - - - -
33 | Luminous Extended (30B) - - - - - - - - - - - - - - - - -
34 | Luminous Supreme (70B) - - - - - - - - - - - - - - - - -
35 | Cohere xlarge v20221108 (52.4B) - - - - - - - - - - - - - - - - -
36 | Cohere medium v20221108 (6.1B) - - - - - - - - - - - - - - - - -
37 | Cohere Command beta (6.1B) - - - - - - - - - - - - - - - - -
38 | Cohere Command beta (52.4B) - - - - - - - - - - - - - - - - -
39 | Pythia (6.9B) - - - - - - - - - - - - - - - - -
40 | Pythia (12B) - - - - - - - - - - - - - - - - -
41 | LLaMA (7B) - - - - - - - - - - - - - - - - -
42 | LLaMA (13B) - - - - - - - - - - - - - - - - -
43 | LLaMA (30B) - - - - - - - - - - - - - - - - -
44 | LLaMA (65B) - - - - - - - - - - - - - - - - -
45 | Llama 2 (7B) - - - - - - - - - - - - - - - - -
46 | Llama 2 (13B) - - - - - - - - - - - - - - - - -
47 | Llama 2 (70B) - - - - - - - - - - - - - - - - -
48 | Alpaca (7B) - - - - - - - - - - - - - - - - -
49 | Vicuna v1.3 (7B) - - - - - - - - - - - - - - - - -
50 | Vicuna v1.3 (13B) - - - - - - - - - - - - - - - - -
51 | Mistral v0.1 (7B) - - - - - - - - - - - - - - - - -
52 | TNLG v2 (530B) - - - - - - - - - - - - - - - - -
53 | TNLG v2 (6.7B) - - - - - - - - - - - - - - - - -
54 | text-davinci-003 - - - - - - - - - - - - - - - - -
55 | gpt-3.5-turbo-0301 - - - - - - - - - - - - - - - - -
56 | gpt-3.5-turbo-0613 - - - - - - - - - - - - - - - - -
57 | RedPajama-INCITE-Base-v1 (3B) - - - - - - - - - - - - - - - - -
58 | RedPajama-INCITE-Instruct-v1 (3B) - - - - - - - - - - - - - - - - -
59 | RedPajama-INCITE-Base (7B) - - - - - - - - - - - - - - - - -
60 | RedPajama-INCITE-Instruct (7B) - - - - - - - - - - - - - - - - -
61 | MPT (30B) - - - - - - - - - - - - - - - - -
62 | MPT-Instruct (30B) - - - - - - - - - - - - - - - - -
63 | Falcon (7B) - - - - - - - - - - - - - - - - -
64 | Falcon-Instruct (7B) - - - - - - - - - - - - - - - - -
65 | Falcon (40B) - - - - - - - - - - - - - - - - -
66 | Falcon-Instruct (40B) - - - - - - - - - - - - - - - - -
67 | InstructPalmyra (30B) - - - - - - - - - - - - - - - - -
68 | Palmyra X (43B) - - - - - - - - - - - - - - - - -
69 |
70 |
--------------------------------------------------------------------------------
/benchbench/data/mmlu/leaderboard_raw.csv:
--------------------------------------------------------------------------------
1 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q
2 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-LoRA
3 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q-FastChat
4 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama
5 | open-llm-leaderboard/details_bhenrym14__platypus-yi-34b
6 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Q
7 | open-llm-leaderboard/details_cloudyu__Yi-34Bx2-MoE-60B
8 | open-llm-leaderboard/details_Qwen__Qwen-72B
9 | open-llm-leaderboard/details_cloudyu__Mixtral_34Bx2_MoE_60B
10 | open-llm-leaderboard/details_moreh__MoMo-70B-lora-1.8.4-DPO
11 | open-llm-leaderboard/details_cloudyu__Mixtral_34Bx2_MoE_60B
12 | open-llm-leaderboard/details_CausalLM__72B-preview
13 | open-llm-leaderboard/details_CausalLM__72B-preview
14 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-200k-Q-FastChat
15 | open-llm-leaderboard/details_moreh__MoMo-70B-LoRA-V1.4
16 | open-llm-leaderboard/details_NousResearch__Nous-Hermes-2-Yi-34B
17 | open-llm-leaderboard/details_SUSTech__SUS-Chat-72B
18 | open-llm-leaderboard/details_AA051611__whattest
19 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q-v2
20 | open-llm-leaderboard/details_jondurbin__bagel-dpo-34b-v0.2
21 | open-llm-leaderboard/details_jondurbin__bagel-dpo-34b-v0.2
22 | open-llm-leaderboard/details_jondurbin__bagel-34b-v0.2
23 | open-llm-leaderboard/details_jondurbin__nontoxic-bagel-34b-v0.2
24 | open-llm-leaderboard/details_SUSTech__SUS-Chat-34B
25 | open-llm-leaderboard/details_01-ai__Yi-34B
26 | open-llm-leaderboard/details_chargoddard__Yi-34B-Llama
27 | open-llm-leaderboard/details_01-ai__Yi-34B-200K
28 | open-llm-leaderboard/details_mncai__yi-34B-v3
29 | open-llm-leaderboard/details_APMIC__caigun-lora-model-34B-v2
30 | open-llm-leaderboard/details_mncai__yi-34B-v2
31 | open-llm-leaderboard/details_Mihaiii__Pallas-0.2
32 | open-llm-leaderboard/details_migtissera__Tess-M-Creative-v1.0
33 | open-llm-leaderboard/details_Mihaiii__Pallas-0.2
34 | open-llm-leaderboard/details_APMIC__caigun-lora-model-34B-v3
35 | open-llm-leaderboard/details_migtissera__Tess-M-v1.3
36 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-200K-Q
37 | open-llm-leaderboard/details_Mihaiii__Pallas-0.4
38 | open-llm-leaderboard/details_Mihaiii__Pallas-0.3
39 | open-llm-leaderboard/details_Mihaiii__Pallas-0.3
40 | open-llm-leaderboard/details_migtissera__Tess-34B-v1.4
41 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5
42 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q-v3
43 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.1
44 | open-llm-leaderboard/details_Mihaiii__Pallas-0.4
45 | open-llm-leaderboard/details_JosephusCheung__Yee-34B-200K-Chat
46 | open-llm-leaderboard/details_01-ai__Yi-34B-Chat
47 | open-llm-leaderboard/details_01-ai__Yi-34B-Chat
48 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.2
49 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.3
50 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-exp2-0.1
51 | open-llm-leaderboard/details_migtissera__Tess-M-v1.1
52 | open-llm-leaderboard/details_AA051611__A0110
53 | open-llm-leaderboard/details_AA051611__A0109
54 | open-llm-leaderboard/details_Azure99__blossom-v3_1-yi-34b
55 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.4
56 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.5
57 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.6
58 | open-llm-leaderboard/details_TriadParty__deepmoney-34b-200k-base
59 | open-llm-leaderboard/details_AA051610__A0106
60 | open-llm-leaderboard/details_AA051610__A0106
61 | open-llm-leaderboard/details_adamo1139__Yi-34B-AEZAKMI-v1
62 | open-llm-leaderboard/details_OrionStarAI__OrionStar-Yi-34B-Chat-Llama
63 | open-llm-leaderboard/details_mlinmg__SG-Raccoon-Yi-200k-2.0
64 | open-llm-leaderboard/details_deepseek-ai__deepseek-llm-67b-chat
65 | open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-7bx8-v16.3-32k
66 | open-llm-leaderboard/details_itsliupeng__Mixtral-8x7B-v0.1-top3
67 | open-llm-leaderboard/details_itsliupeng__llama2_70b_mmlu
68 | open-llm-leaderboard/details_itsliupeng__llama2_70b_mmlu
69 | open-llm-leaderboard/details_rufjdk5480__gov-qna-ko-merged
70 | open-llm-leaderboard/details_rufjdk5480__mixtral-ko-qna-merged
71 | open-llm-leaderboard/details_mistralai__Mixtral-8x7B-v0.1
72 | open-llm-leaderboard/details_deepseek-ai__deepseek-llm-67b-base
73 | open-llm-leaderboard/details_OpenBuddy__openbuddy-deepseek-67b-v15.2
74 | open-llm-leaderboard/details_YeungNLP__firefly-mixtral-8x7b-v0.1
75 | open-llm-leaderboard/details_YeungNLP__firefly-mixtral-8x7b-v1
76 | open-llm-leaderboard/details_argilla__notux-8x7b-v1-epoch-2
77 | open-llm-leaderboard/details_mistralai__Mixtral-8x7B-Instruct-v0.1
78 | open-llm-leaderboard/details_argilla__notux-8x7b-v1
79 | open-llm-leaderboard/details_VAGOsolutions__SauerkrautLM-Mixtral-8x7B-Instruct
80 | open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-8x7b-v16.1-32k
81 | open-llm-leaderboard/details_argilla__notus-8x7b-experiment
82 | open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-8x7b-v16.2-32k
83 | open-llm-leaderboard/details_mistralai__Mixtral-8x7B-Instruct-v0.1
84 | open-llm-leaderboard/details_VAGOsolutions__SauerkrautLM-Mixtral-8x7B-Instruct
85 | open-llm-leaderboard/details_s1ghhh__medllama-2-70b-qlora-1.1
86 | open-llm-leaderboard/details_ICBU-NPU__FashionGPT-70B-V1.1
87 | open-llm-leaderboard/details_OpenBuddy__openbuddy-deepseek-67b-v15-base
88 | open-llm-leaderboard/details_Brillibits__Instruct_Mixtral-8x7B-v0.1_Dolly15K
89 | open-llm-leaderboard/details_Sao10K__Sensualize-Mixtral-bf16
90 | open-llm-leaderboard/details_KaeriJenti__kaori-70b-v1
91 | open-llm-leaderboard/details_Riiid__sheep-duck-llama-2-70b-v1.1
92 | open-llm-leaderboard/details_AIDC-ai-business__Marcoroni-70B-v1
93 | open-llm-leaderboard/details_jondurbin__airoboros-l2-70b-gpt4-m2.0
94 | open-llm-leaderboard/details_cognitivecomputations__yayi2-30b-llama
95 | open-llm-leaderboard/details_AA051610__A11P
96 | open-llm-leaderboard/details_sequelbox__SpellBlade
97 | open-llm-leaderboard/details_tiiuae__falcon-180B
98 | open-llm-leaderboard/details_garage-bAInd__Platypus2-70B-instruct
99 | open-llm-leaderboard/details_chargoddard__mixtralmerge-8x7B-rebalanced-test
100 | open-llm-leaderboard/details_chargoddard__MixtralRPChat-ZLoss
--------------------------------------------------------------------------------
/benchbench/measures/ordinal.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | from torch.optim import Adam
5 | from sklearn.impute import KNNImputer
6 |
7 | from ..utils.base import rankdata
8 | from ..utils.metric import get_rank_diff, get_rank_variance
9 | from ..utils.win_rate import WinningRate
10 |
11 |
12 | def appr_rank_diff(new_win_rate, inv_indices, orig_rank):
13 | """
14 | Approximate the rank difference between the original win rate and the new win rate.
15 |
16 | Args:
17 | new_win_rate(np.array): win rate for all models
18 | inv_indices(list): invariant indices
19 | orig_rank(np.array): original rank for only the models in inv_indices
20 |
21 | Returns:
22 | torch.Tensor: approximated loss
23 | """
24 | ret = 0.0
25 | for i, inv_i in enumerate(inv_indices):
26 | for j, inv_j in enumerate(inv_indices):
27 | # orig_rank[i] is the original rank for inv_i
28 | if orig_rank[i] < orig_rank[j]:
29 | ret += max(new_win_rate[inv_i] - new_win_rate[inv_j], -0.01)
30 | return ret
31 |
32 |
33 | def get_selected_win_rate(win_rate_matrix, w, inv_indices, do_sample=True):
34 | """
35 | Get the win rate for the selected indices.
36 |
37 | Args:
38 | win_rate_matrix(torch.Tensor): entry (i, j) is the win rate of the i-th model over the j-th model
39 | w(torch.Tensor): unnormalized logit for the probability of each model being selected
40 | inv_indices(list): indices for L
41 | do_sample(bool): whether to select models by sampling
42 |
43 | Returns:
44 | tuple:
45 | torch.Tensor: new_win_rate
46 | np.array: new_indices
47 | """
48 | probs = torch.sigmoid(w)
49 | if do_sample:
50 | sampler = torch.distributions.Bernoulli(probs)
51 | sampled = sampler.sample() + w - w.detach()  # straight-through estimator
52 | else:
53 | sampled = (probs > 0.5) + w - w.detach()
54 | inv = torch.tensor(
55 | [
56 | (1.0 if (j == 0.0 and i in inv_indices) else 0.0)
57 | for i, j in enumerate(sampled)
58 | ]
59 | )
60 | selected = sampled + inv
61 | selected_diag = torch.diag(selected)
62 | selected_win_rate = selected_diag @ win_rate_matrix @ selected_diag
63 | new_win_rate = selected_win_rate.sum(1) / selected.sum()
64 | new_indices = np.where(selected.detach().numpy() >= 1.0 - 1e-4)[0]
65 |
66 | return new_win_rate, new_indices
67 |
68 |
69 | def get_sensitivity(
70 | data, cols, inv_indices=None, lr=0.01, num_step=1000, return_indices=False
71 | ):
72 | """
73 | Calculate the sensitivity for a given benchmark.
74 |
75 | Args:
76 | data(pd.DataFrame): each row represents a model, each column represents a task
77 | cols(list): the column names of the tasks
78 | inv_indices(list): indices for L, the rest will be used as L^C
79 | lr(float): learning rate for optimization
80 | num_step(int): number of steps for optimization
81 | return_indices(bool): whether to return the indices of the selected irrelevant models
82 |
83 | Returns:
84 | tuple: ((tau, MRC), indices) if return_indices is True, else (tau, MRC)
85 | """
86 | if inv_indices is None:
87 | inv_indices = np.arange(len(data) // 5)
88 |
89 | torch.manual_seed(0)
90 | win_rate_matrix = torch.tensor(WinningRate(data, cols).win_rate)
91 |
92 | orig_win_rate = win_rate_matrix[inv_indices][:, inv_indices].mean(axis=1).numpy()
93 | orig_rank = rankdata(-orig_win_rate, method="average")
94 |
95 | w = torch.zeros(len(data), requires_grad=True, dtype=torch.double)
96 | optimizer = Adam([w], lr=lr)
97 | history = []
98 | for episode in range(num_step):
99 | new_win_rate, new_indices = get_selected_win_rate(
100 | win_rate_matrix, w, inv_indices
101 | )
102 | loss = appr_rank_diff(new_win_rate, inv_indices, orig_rank)
103 | if type(loss) is float:
104 | break
105 | print("Episode %d, loss %.2lf" % (episode, loss.item()), end="\r")
106 |
107 | optimizer.zero_grad()
108 | loss.backward()
109 | optimizer.step()
110 |
111 | new_win_rate = (
112 | win_rate_matrix[new_indices][:, new_indices]
113 | .mean(axis=1)[inv_indices]
114 | .detach()
115 | .numpy()
116 | )
117 | new_rank = rankdata(-new_win_rate)
118 | rank_diff = get_rank_diff(new_rank, orig_rank)
119 | history.append((rank_diff, new_indices))
120 | print()
121 |
122 | new_win_rate, new_indices = get_selected_win_rate(
123 | win_rate_matrix, w, inv_indices, do_sample=False
124 | )
125 | new_win_rate = (
126 | win_rate_matrix[new_indices][:, new_indices]
127 | .mean(axis=1)[inv_indices]
128 | .detach()
129 | .numpy()
130 | )
131 | new_rank = rankdata(-new_win_rate, method="average")
132 | final_rank_diff = get_rank_diff(new_rank, orig_rank)
133 |
134 | if len(history) == 0:
135 | ret = (final_rank_diff, new_indices)
136 | else:
137 | history = sorted(history, key=lambda x: -x[0][0])
138 | history_best_rank_diff = history[0][0]
139 | history_best_indices = history[0][1]
140 | if final_rank_diff > history_best_rank_diff:
141 | ret = (final_rank_diff, new_indices)
142 | else:
143 | ret = (history_best_rank_diff, history_best_indices)
144 | if return_indices:
145 | return ret
146 | else:
147 | return ret[0]
148 |
149 |
150 | def get_diversity(data, cols):
151 | """
152 | Calculate the diversity for a given benchmark.
153 |
154 | Args:
155 | data(pd.DataFrame): each row represents a model, each column represents a task
156 | cols(list): the column names of the tasks
157 |
158 | Returns:
159 | tuple: (W, max_MRC), where max_MRC refers to max MRC over every pair of tasks
160 | """
161 | imputer = KNNImputer(n_neighbors=5, weights="uniform")
162 |
163 | data_imputed = imputer.fit_transform(data[cols].values)
164 | data_imputed = pd.DataFrame(data_imputed, columns=cols)
165 |
166 | return get_rank_variance(
167 | [
168 | rankdata(-data_imputed[c].values, method="average")
169 | for c in list(cols)
170 | if data_imputed[c].values.dtype == "float64"
171 | ]
172 | )
173 |
--------------------------------------------------------------------------------
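A matching usage sketch for the ordinal measures above (not part of the repository); the score table is hypothetical, and by default the first len(data) // 5 rows, i.e. the top of the leaderboard, form the invariant model set.

import numpy as np
import pandas as pd

from benchbench.measures.ordinal import get_diversity, get_sensitivity

# hypothetical benchmark: ten models scored on two tasks
rng = np.random.default_rng(0)
cols = ["task_a", "task_b"]
data = pd.DataFrame(rng.random((10, 2)), columns=cols)

# sensitivity: how much the invariant models' ranking moves when the remaining
# models included in the win-rate comparison are re-selected
tau, mrc = get_sensitivity(data, cols, num_step=100)
# diversity: rank variance across per-task rankings (after KNN imputation)
w, max_mrc = get_diversity(data, cols)
print(tau, mrc, w, max_mrc)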
/benchbench/data/helm_capability/vanilla.txt:
--------------------------------------------------------------------------------
1 | Model
2 | Mean score
3 | MMLU-Pro - COT correct
4 | GPQA - COT correct
5 | IFEval - IFEval Strict Acc
6 | WildBench - WB Score
7 | Omni-MATH - Acc
8 | GPT-5 mini (2025-08-07)
9 | 0.819
10 | 0.835
11 | 0.756
12 | 0.927
13 | 0.855
14 | 0.722
15 | o4-mini (2025-04-16)
16 | 0.812
17 | 0.82
18 | 0.735
19 | 0.929
20 | 0.854
21 | 0.72
22 | o3 (2025-04-16)
23 | 0.811
24 | 0.859
25 | 0.753
26 | 0.869
27 | 0.861
28 | 0.714
29 | GPT-5 (2025-08-07)
30 | 0.807
31 | 0.863
32 | 0.791
33 | 0.875
34 | 0.857
35 | 0.647
36 | Qwen3 235B A22B Instruct 2507 FP8
37 | 0.798
38 | 0.844
39 | 0.726
40 | 0.835
41 | 0.866
42 | 0.718
43 | Grok 4 (0709)
44 | 0.785
45 | 0.851
46 | 0.726
47 | 0.949
48 | 0.797
49 | 0.603
50 | Claude 4 Opus (20250514, extended thinking)
51 | 0.78
52 | 0.875
53 | 0.709
54 | 0.849
55 | 0.852
56 | 0.616
57 | gpt-oss-120b
58 | 0.77
59 | 0.795
60 | 0.684
61 | 0.836
62 | 0.845
63 | 0.688
64 | Kimi K2 Instruct
65 | 0.768
66 | 0.819
67 | 0.652
68 | 0.85
69 | 0.862
70 | 0.654
71 | Claude 4 Sonnet (20250514, extended thinking)
72 | 0.766
73 | 0.843
74 | 0.706
75 | 0.84
76 | 0.838
77 | 0.602
78 | Claude 4.5 Sonnet (20250929)
79 | 0.762
80 | 0.869
81 | 0.686
82 | 0.85
83 | 0.854
84 | 0.553
85 | Claude 4 Opus (20250514)
86 | 0.757
87 | 0.859
88 | 0.666
89 | 0.918
90 | 0.833
91 | 0.511
92 | GPT-5 nano (2025-08-07)
93 | 0.748
94 | 0.778
95 | 0.679
96 | 0.932
97 | 0.806
98 | 0.547
99 | Gemini 2.5 Pro (03-25 preview)
100 | 0.745
101 | 0.863
102 | 0.749
103 | 0.84
104 | 0.857
105 | 0.416
106 | Claude 4 Sonnet (20250514)
107 | 0.733
108 | 0.843
109 | 0.643
110 | 0.839
111 | 0.825
112 | 0.513
113 | Grok 3 Beta
114 | 0.727
115 | 0.788
116 | 0.65
117 | 0.884
118 | 0.849
119 | 0.464
120 | GPT-4.1 (2025-04-14)
121 | 0.727
122 | 0.811
123 | 0.659
124 | 0.838
125 | 0.854
126 | 0.471
127 | Qwen3 235B A22B FP8 Throughput
128 | 0.726
129 | 0.817
130 | 0.623
131 | 0.816
132 | 0.828
133 | 0.548
134 | GPT-4.1 mini (2025-04-14)
135 | 0.726
136 | 0.783
137 | 0.614
138 | 0.904
139 | 0.838
140 | 0.491
141 | Llama 4 Maverick (17Bx128E) Instruct FP8
142 | 0.718
143 | 0.81
144 | 0.65
145 | 0.908
146 | 0.8
147 | 0.422
148 | Qwen3-Next 80B A3B Thinking
149 | 0.7
150 | 0.786
151 | 0.63
152 | 0.81
153 | 0.807
154 | 0.467
155 | DeepSeek-R1-0528
156 | 0.699
157 | 0.793
158 | 0.666
159 | 0.784
160 | 0.828
161 | 0.424
162 | Palmyra X5
163 | 0.696
164 | 0.804
165 | 0.661
166 | 0.823
167 | 0.78
168 | 0.415
169 | Grok 3 mini Beta
170 | 0.679
171 | 0.799
172 | 0.675
173 | 0.951
174 | 0.651
175 | 0.318
176 | Gemini 2.0 Flash
177 | 0.679
178 | 0.737
179 | 0.556
180 | 0.841
181 | 0.8
182 | 0.459
183 | Claude 3.7 Sonnet (20250219)
184 | 0.674
185 | 0.784
186 | 0.608
187 | 0.834
188 | 0.814
189 | 0.33
190 | gpt-oss-20b
191 | 0.674
192 | 0.74
193 | 0.594
194 | 0.732
195 | 0.737
196 | 0.565
197 | GLM-4.5-Air-FP8
198 | 0.67
199 | 0.762
200 | 0.594
201 | 0.812
202 | 0.789
203 | 0.391
204 | DeepSeek v3
205 | 0.665
206 | 0.723
207 | 0.538
208 | 0.832
209 | 0.831
210 | 0.403
211 | Gemini 1.5 Pro (002)
212 | 0.657
213 | 0.737
214 | 0.534
215 | 0.837
216 | 0.813
217 | 0.364
218 | Claude 3.5 Sonnet (20241022)
219 | 0.653
220 | 0.777
221 | 0.565
222 | 0.856
223 | 0.792
224 | 0.276
225 | Llama 4 Scout (17Bx16E) Instruct
226 | 0.644
227 | 0.742
228 | 0.507
229 | 0.818
230 | 0.779
231 | 0.373
232 | Gemini 2.0 Flash Lite (02-05 preview)
233 | 0.642
234 | 0.72
235 | 0.5
236 | 0.824
237 | 0.79
238 | 0.374
239 | Amazon Nova Premier
240 | 0.637
241 | 0.726
242 | 0.518
243 | 0.803
244 | 0.788
245 | 0.35
246 | GPT-4o (2024-11-20)
247 | 0.634
248 | 0.713
249 | 0.52
250 | 0.817
251 | 0.828
252 | 0.293
253 | Gemini 2.5 Flash (04-17 preview)
254 | 0.626
255 | 0.639
256 | 0.39
257 | 0.898
258 | 0.817
259 | 0.384
260 | Llama 3.1 Instruct Turbo (405B)
261 | 0.618
262 | 0.723
263 | 0.522
264 | 0.811
265 | 0.783
266 | 0.249
267 | GPT-4.1 nano (2025-04-14)
268 | 0.616
269 | 0.55
270 | 0.507
271 | 0.843
272 | 0.811
273 | 0.367
274 | Palmyra-X-004
275 | 0.609
276 | 0.657
277 | 0.395
278 | 0.872
279 | 0.802
280 | 0.32
281 | Gemini 1.5 Flash (002)
282 | 0.609
283 | 0.678
284 | 0.437
285 | 0.831
286 | 0.792
287 | 0.305
288 | Qwen2.5 Instruct Turbo (72B)
289 | 0.599
290 | 0.631
291 | 0.426
292 | 0.806
293 | 0.802
294 | 0.33
295 | Mistral Large (2411)
296 | 0.598
297 | 0.599
298 | 0.435
299 | 0.876
300 | 0.801
301 | 0.281
302 | Gemini 2.5 Flash-Lite
303 | 0.591
304 | 0.537
305 | 0.309
306 | 0.81
307 | 0.818
308 | 0.48
309 | Amazon Nova Pro
310 | 0.591
311 | 0.673
312 | 0.446
313 | 0.815
314 | 0.777
315 | 0.242
316 | Palmyra Fin
317 | 0.577
318 | 0.591
319 | 0.422
320 | 0.793
321 | 0.783
322 | 0.295
323 | IBM Granite 4.0 Small
324 | 0.575
325 | 0.569
326 | 0.383
327 | 0.89
328 | 0.739
329 | 0.296
330 | Llama 3.1 Instruct Turbo (70B)
331 | 0.574
332 | 0.653
333 | 0.426
334 | 0.821
335 | 0.758
336 | 0.21
337 | GPT-4o mini (2024-07-18)
338 | 0.565
339 | 0.603
340 | 0.368
341 | 0.782
342 | 0.791
343 | 0.28
344 | Mistral Small 3.1 (2503)
345 | 0.558
346 | 0.61
347 | 0.392
348 | 0.75
349 | 0.788
350 | 0.248
351 | Amazon Nova Lite
352 | 0.551
353 | 0.6
354 | 0.397
355 | 0.776
356 | 0.75
357 | 0.233
358 | Claude 3.5 Haiku (20241022)
359 | 0.549
360 | 0.605
361 | 0.363
362 | 0.792
363 | 0.76
364 | 0.224
365 | Qwen2.5 Instruct Turbo (7B)
366 | 0.529
367 | 0.539
368 | 0.341
369 | 0.741
370 | 0.731
371 | 0.294
372 | Amazon Nova Micro
373 | 0.522
374 | 0.511
375 | 0.383
376 | 0.76
377 | 0.743
378 | 0.214
379 | IBM Granite 4.0 Micro
380 | 0.486
381 | 0.395
382 | 0.307
383 | 0.849
384 | 0.67
385 | 0.209
386 | Mixtral Instruct (8x22B)
387 | 0.478
388 | 0.46
389 | 0.334
390 | 0.724
391 | 0.711
392 | 0.163
393 | Palmyra Med
394 | 0.476
395 | 0.411
396 | 0.368
397 | 0.767
398 | 0.676
399 | 0.156
400 | OLMo 2 32B Instruct March 2025
401 | 0.475
402 | 0.414
403 | 0.287
404 | 0.78
405 | 0.734
406 | 0.161
407 | IBM Granite 3.3 8B Instruct
408 | 0.463
409 | 0.343
410 | 0.325
411 | 0.729
412 | 0.741
413 | 0.176
414 | Llama 3.1 Instruct Turbo (8B)
415 | 0.444
416 | 0.406
417 | 0.247
418 | 0.743
419 | 0.686
420 | 0.137
421 | OLMo 2 13B Instruct November 2024
422 | 0.44
423 | 0.31
424 | 0.316
425 | 0.73
426 | 0.689
427 | 0.156
428 | OLMo 2 7B Instruct November 2024
429 | 0.405
430 | 0.292
431 | 0.296
432 | 0.693
433 | 0.628
434 | 0.116
435 | Mixtral Instruct (8x7B)
436 | 0.397
437 | 0.335
438 | 0.296
439 | 0.575
440 | 0.673
441 | 0.105
442 | Mistral Instruct v0.3 (7B)
443 | 0.376
444 | 0.277
445 | 0.303
446 | 0.567
447 | 0.66
448 | 0.072
449 | OLMoE 1B-7B Instruct January 2025
450 | 0.332
451 | 0.169
452 | 0.22
453 | 0.628
454 | 0.551
455 | 0.093
456 | Marin 8B Instruct
457 | 0.325
458 | 0.188
459 | 0.168
460 | 0.632
461 | 0.477
462 | 0.16
--------------------------------------------------------------------------------
/benchbench/data/helm/summarization.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate CNN/DailyMail - SummaC CNN/DailyMail - QAFactEval CNN/DailyMail - BERTScore (F1) CNN/DailyMail - Coverage CNN/DailyMail - Density CNN/DailyMail - Compression CNN/DailyMail - HumanEval-faithfulness CNN/DailyMail - HumanEval-relevance CNN/DailyMail - HumanEval-coherence XSUM - SummaC XSUM - QAFactEval XSUM - BERTScore (F1) XSUM - Coverage XSUM - Density XSUM - Compression XSUM - HumanEval-faithfulness XSUM - HumanEval-relevance XSUM - HumanEval-coherence
2 | TNLG v2 (530B) 0.757 0.573 - 0.316 0.977 26.968 10.317 - - - -0.281 - 0.473 0.774 2.322 15.776 - - -
3 | Luminous Supreme (70B) 0.717 0.552 - 0.28 0.939 33.625 9.298 - - - -0.241 - 0.444 0.807 3.08 16.97 - - -
4 | Cohere xlarge v20221108 (52.4B) 0.704 0.514 - 0.286 0.971 44.772 8.026 - - - -0.258 - 0.451 0.798 3.009 17.188 - - -
5 | J1-Grande v2 beta (17B) 0.678 0.552 - 0.29 0.973 24.032 11.659 - - - -0.282 - 0.454 0.786 2.816 16.857 - - -
6 | Cohere Command beta (52.4B) 0.678 0.415 - 0.318 0.979 32.165 9.156 - - - -0.271 - 0.459 0.793 2.548 16.937 - - -
7 | Jurassic-2 Grande (17B) 0.671 0.503 - 0.299 0.96 22.305 11.399 - - - -0.289 - 0.475 0.766 2.36 17.045 - - -
8 | J1-Grande v1 (17B) 0.669 0.539 4.81 0.275 0.973 41.027 9.888 - - - -0.272 3.447 0.429 0.783 2.64 19.012 - - -
9 | J1-Large v1 (7.5B) 0.65 0.512 4.716 0.248 0.977 71.654 7.632 - - - -0.239 3.675 0.4 0.808 3.757 18.133 - - -
10 | text-babbage-001 0.646 0.378 4.676 0.282 0.972 45.948 5.291 - - - -0.057 4.33 0.281 0.885 8.487 11.856 - - -
11 | Jurassic-2 Jumbo (178B) 0.645 0.489 - 0.313 0.957 15.317 12.304 - - - -0.32 - 0.489 0.755 2.145 16.589 - - -
12 | text-davinci-002 0.641 0.353 4.635 0.321 0.946 15.995 8.818 0.999 4.435 4.371 -0.273 3.007 0.43 0.801 2.872 14.07 0.849 4.41 4.685
13 | text-curie-001 0.617 0.291 4.616 0.306 0.961 26.1 6.829 0.967 4.587 4.243 -0.185 3.459 0.354 0.839 4.008 12.98 0.991 4.068 4.321
14 | TNLG v2 (6.7B) 0.612 0.493 - 0.282 0.976 48.951 9.598 - - - -0.203 - 0.385 0.793 3.286 18.428 - - -
15 | OPT (175B) 0.593 0.202 4.67 0.276 0.933 31.307 9.8 1 4.378 3.233 -0.253 3.523 0.46 0.793 2.732 16.792 0.798 4.3 4.891
16 | J1-Jumbo v1 (178B) 0.587 0.515 4.697 0.278 0.976 53.93 9.579 - - - -0.287 3.182 0.435 0.784 2.63 16.862 - - -
17 | Cohere Command beta (6.1B) 0.579 0.331 - 0.296 0.975 31.707 9.688 - - - -0.239 - 0.418 0.824 2.793 18.017 - - -
18 | OPT (66B) 0.579 0.197 4.735 0.256 0.92 41.595 9.759 - - - -0.189 3.324 0.417 0.817 3.899 18.414 - - -
19 | Cohere large v20220720 (13.1B) 0.576 0.5 4.763 0.246 0.946 37.733 11.27 - - - -0.189 2.889 0.398 0.823 3.599 20.712 - - -
20 | Jurassic-2 Large (7.5B) 0.572 0.496 - 0.271 0.963 25.251 11.503 - - - -0.278 - 0.45 0.782 2.659 18.03 - - -
21 | Luminous Extended (30B) 0.566 0.481 - 0.255 0.925 41.619 9.039 - - - -0.225 - 0.423 0.818 3.507 17.376 - - -
22 | GPT-J (6B) 0.549 0.208 4.704 0.247 0.948 48.284 9.864 - - - -0.198 3.813 0.381 0.829 4.043 17.942 - - -
23 | Cohere xlarge v20220609 (52.4B) 0.546 0.469 4.683 0.264 0.945 49.713 9.072 0.993 4.539 3.69 -0.253 2.981 0.434 0.8 2.945 18.422 0.661 4.239 4.825
24 | Anthropic-LM v4-s3 (52B) 0.531 0.492 4.692 0.326 0.96 10.832 11.89 0.667 4 2.667 -0.271 3.066 0.437 0.808 2.691 15.182 0.778 4.398 4.898
25 | text-davinci-003 0.526 0.359 - 0.342 0.956 7.545 9.389 - - - -0.301 - 0.411 0.822 2.63 10.932 - - -
26 | Cohere medium v20221108 (6.1B) 0.507 0.359 - 0.218 0.899 24.344 11.42 - - - -0.171 - 0.384 0.842 3.815 19.703 - - -
27 | text-ada-001 0.486 0.223 3.369 0.247 0.929 31.424 5.461 - - - -0.102 4.929 0.245 0.847 7.626 13.08 - - -
28 | GLM (130B) 0.471 0.566 - 0.288 0.972 30.259 8.687 0.963 4.167 3.463 -0.206 - 0.427 0.817 4.041 16.25 0.763 3.843 4.25
29 | GPT-NeoX (20B) 0.446 0.165 4.69 0.226 0.91 37.149 9.676 - - - -0.208 3.303 0.391 0.825 3.371 18.238 - - -
30 | Cohere medium v20220720 (6.1B) 0.431 0.229 4.664 0.115 0.799 22.176 13.154 - - - -0.159 3.223 0.367 0.847 4.754 19.748 - - -
31 | Luminous Base (13B) 0.421 0.32 - 0.188 0.834 35.663 9.346 - - - -0.213 - 0.394 0.834 4.393 17.535 - - -
32 | davinci (175B) 0.36 0.321 4.062 0.182 0.873 17.914 9.843 0.953 4.501 3.863 -0.267 2.338 0.318 0.751 3.351 14.08 0.829 4.075 3.398
33 | curie (6.7B) 0.325 0.354 4.204 0.089 0.89 23.472 9.495 0.287 1.933 1.767 -0.143 3.922 0.313 0.815 5.57 17.018 0.924 3.573 4.166
34 | Cohere small v20220720 (410M) 0.292 0.054 2.638 0.026 0.744 25.238 13.243 - - - 0.028 3.094 0.195 0.863 10.557 17.551 - - -
35 | BLOOM (176B) 0.291 -0.02 4.665 0.08 0.71 32.013 5.252 - - - -0.35 4.778 0.059 0.515 1.764 8.934 - - -
36 | ada (350M) 0.231 0.169 3.742 0.026 0.773 36.596 12.07 - - - -0.115 0.009 -0.232 0.407 2.653 8.023 - - -
37 | babbage (1.3B) 0.196 0.194 3.207 -0.129 0.606 43.534 6.733 - - - -0.188 0.195 0.02 0.604 4.386 11.716 - - -
38 | UL2 (20B) 0.118 -0.27 - -0.121 0.72 5.044 7.186 - - - -0.275 - 0.072 0.643 3.208 7.853 - - -
39 | T5 (11B) 0.112 -0.122 - -0.17 0.555 2.698 19.248 - - - -0.258 - -0.315 0.355 0.831 16.544 - - -
40 | YaLM (100B) 0.045 -0.322 - -0.145 0.541 1.09 6.936 - - - -0.347 1.176 0.031 0.567 1.041 9.951 - - -
41 | T0pp (11B) - -0.044 - 0.155 0.841 8.588 8.274 - - - -0.3 - 0.097 0.579 1.684 11.178 - - -
42 | Pythia (6.9B) - - - - - - - - - - - - - - - - - - -
43 | Pythia (12B) - - - - - - - - - - - - - - - - - - -
44 | LLaMA (7B) - - - - - - - - - - - - - - - - - - -
45 | LLaMA (13B) - - - - - - - - - - - - - - - - - - -
46 | LLaMA (30B) - - - - - - - - - - - - - - - - - - -
47 | LLaMA (65B) - - - - - - - - - - - - - - - - - - -
48 | Llama 2 (7B) - - - - - - - - - - - - - - - - - - -
49 | Llama 2 (13B) - - - - - - - - - - - - - - - - - - -
50 | Llama 2 (70B) - - - - - - - - - - - - - - - - - - -
51 | Alpaca (7B) - - - - - - - - - - - - - - - - - - -
52 | Vicuna v1.3 (7B) - - - - - - - - - - - - - - - - - - -
53 | Vicuna v1.3 (13B) - - - - - - - - - - - - - - - - - - -
54 | Mistral v0.1 (7B) - - - - - - - - - - - - - - - - - - -
55 | gpt-3.5-turbo-0301 - - - - - - - - - - - - - - - - - - -
56 | gpt-3.5-turbo-0613 - - - - - - - - - - - - - - - - - - -
57 | RedPajama-INCITE-Base-v1 (3B) - - - - - - - - - - - - - - - - - - -
58 | RedPajama-INCITE-Instruct-v1 (3B) - - - - - - - - - - - - - - - - - - -
59 | RedPajama-INCITE-Base (7B) - - - - - - - - - - - - - - - - - - -
60 | RedPajama-INCITE-Instruct (7B) - - - - - - - - - - - - - - - - - - -
61 | MPT (30B) - - - - - - - - - - - - - - - - - - -
62 | MPT-Instruct (30B) - - - - - - - - - - - - - - - - - - -
63 | Falcon (7B) - - - - - - - - - - - - - - - - - - -
64 | Falcon-Instruct (7B) - - - - - - - - - - - - - - - - - - -
65 | Falcon (40B) - - - - - - - - - - - - - - - - - - -
66 | Falcon-Instruct (40B) - - - - - - - - - - - - - - - - - - -
67 | InstructPalmyra (30B) - - - - 0.972 28.97 7.901 - - - - - - 0.844 3.441 15.707 - - -
68 | Palmyra X (43B) - - - - 0.291 2.35 3.117 - - - - - - 0.775 2.466 14.252 - - -
69 |
70 |
--------------------------------------------------------------------------------
/benchbench/data/helm/fairness.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate MMLU - EM (Fairness) BoolQ - EM (Fairness) NarrativeQA - F1 (Fairness) NaturalQuestions (closed-book) - F1 (Fairness) NaturalQuestions (open-book) - F1 (Fairness) QuAC - F1 (Fairness) HellaSwag - EM (Fairness) OpenbookQA - EM (Fairness) TruthfulQA - EM (Fairness) MS MARCO (regular) - RR@10 (Fairness) MS MARCO (TREC) - NDCG@10 (Fairness) IMDB - EM (Fairness) CivilComments - EM (Fairness) RAFT - EM (Fairness)
2 | Llama 2 (70B) 0.959 0.557 0.859 0.709 0.4 0.637 0.414 - - 0.434 - - 0.954 0.551 0.7
3 | LLaMA (65B) 0.924 0.551 0.847 0.661 0.375 0.633 0.333 - - 0.42 - - 0.953 0.574 0.668
4 | text-davinci-003 0.903 0.537 0.858 0.664 0.356 0.721 0.45 0.729 0.578 0.491 0.335 0.633 0.833 0.559 0.705
5 | Cohere Command beta (52.4B) 0.866 0.407 0.822 0.657 0.296 0.706 0.316 0.699 0.508 0.222 0.45 0.748 0.957 0.544 0.627
6 | text-davinci-002 0.864 0.531 0.837 0.646 0.32 0.659 0.353 0.703 0.54 0.515 0.373 0.639 0.934 0.463 0.671
7 | Mistral v0.1 (7B) 0.861 0.542 0.842 0.644 0.3 0.625 0.353 - - 0.332 - - 0.952 0.52 0.664
8 | Jurassic-2 Jumbo (178B) 0.836 0.45 0.792 0.658 0.327 0.62 0.34 0.655 0.488 0.354 0.342 0.62 0.933 0.507 0.711
9 | LLaMA (30B) 0.822 0.496 0.813 0.657 0.356 0.621 0.325 - - 0.266 - - 0.913 0.508 0.718
10 | Llama 2 (13B) 0.808 0.466 0.732 0.657 0.309 0.58 0.351 - - 0.274 - - 0.957 0.489 0.673
11 | Palmyra X (43B) 0.797 0.588 0.875 0.651 0.362 - 0.399 - - 0.542 - - 0.918 0.006 0.672
12 | Anthropic-LM v4-s3 (52B) 0.794 0.447 0.782 0.646 0.239 0.642 0.356 0.695 0.482 0.3 - - 0.925 0.512 0.67
13 | TNLG v2 (530B) 0.752 0.418 0.767 0.632 0.318 0.598 0.313 0.678 0.504 0.197 0.341 0.612 0.936 0.48 0.644
14 | MPT (30B) 0.746 0.41 0.631 0.653 0.287 0.624 0.318 - - 0.19 - - 0.955 0.553 0.68
15 | gpt-3.5-turbo-0613 0.718 0.313 0.817 0.547 0.287 0.627 0.398 - - 0.255 - - 0.912 0.525 0.641
16 | Vicuna v1.3 (13B) 0.715 0.424 0.748 0.607 0.266 0.63 0.324 - - 0.315 - - 0.707 0.569 0.62
17 | Falcon-Instruct (40B) 0.709 0.466 0.799 0.543 0.331 0.607 0.308 - - 0.312 - - 0.957 0.462 0.561
18 | Jurassic-2 Grande (17B) 0.704 0.433 0.78 0.645 0.283 0.584 0.34 0.632 0.466 0.29 0.243 0.471 0.931 0.445 0.689
19 | MPT-Instruct (30B) 0.687 0.4 0.807 0.633 0.233 0.639 0.252 - - 0.18 - - 0.944 0.527 0.636
20 | Falcon (40B) 0.686 0.48 0.783 0.559 0.338 0.625 0.256 - - 0.292 - - 0.954 0.292 0.611
21 | J1-Grande v2 beta (17B) 0.677 0.409 0.764 0.647 0.27 0.571 0.308 0.623 0.478 0.242 0.253 0.435 0.95 0.404 0.637
22 | Cohere Command beta (6.1B) 0.662 0.366 0.748 0.595 0.167 0.654 0.273 0.608 0.468 0.163 0.411 0.69 0.95 0.496 0.609
23 | gpt-3.5-turbo-0301 0.662 0.53 0.666 0.585 0.331 0.559 0.417 - - 0.514 - - 0.844 0.422 0.689
24 | OPT (175B) 0.622 0.287 0.731 0.573 0.246 0.561 0.266 0.66 0.5 0.203 0.26 0.419 0.944 0.491 0.58
25 | Vicuna v1.3 (7B) 0.622 0.385 0.67 0.553 0.224 0.575 0.304 - - 0.235 - - 0.906 0.564 0.643
26 | Llama 2 (7B) 0.61 0.392 0.706 0.596 0.264 0.55 0.321 - - 0.223 - - 0.871 0.503 0.609
27 | Cohere xlarge v20221108 (52.4B) 0.608 0.317 0.708 0.553 0.299 0.566 0.275 0.687 0.5 0.12 0.267 0.522 0.949 0.415 0.604
28 | LLaMA (13B) 0.602 0.385 0.666 0.628 0.288 0.561 0.267 - - 0.234 - - 0.903 0.533 0.605
29 | davinci (175B) 0.558 0.38 0.682 0.597 0.276 0.567 0.279 0.641 0.502 0.155 0.185 0.357 0.921 0.478 0.605
30 | LLaMA (7B) 0.553 0.284 0.71 0.552 0.241 0.537 0.257 - - 0.219 - - 0.936 0.505 0.545
31 | BLOOM (176B) 0.551 0.274 0.656 0.577 0.187 0.575 0.273 0.585 0.482 0.186 0.211 0.371 0.938 0.546 0.563
32 | Cohere xlarge v20220609 (52.4B) 0.55 0.315 0.667 0.548 0.255 0.535 0.281 0.66 0.47 0.156 0.233 0.431 0.949 0.479 0.598
33 | InstructPalmyra (30B) 0.538 0.371 0.7 0.405 0.276 0.63 0.337 - - 0.152 - - 0.931 0.449 0.618
34 | Luminous Supreme (70B) 0.522 0.264 0.694 0.603 0.241 0.597 0.288 - - 0.132 - - 0.949 0.432 0.601
35 | GLM (130B) 0.513 0.315 0.69 0.615 0.12 0.597 0.205 - - 0.192 - - 0.933 0.5 0.575
36 | J1-Jumbo v1 (178B) 0.488 0.236 0.709 0.581 0.235 0.54 0.268 0.614 0.466 0.156 0.18 0.348 0.932 0.478 0.623
37 | Jurassic-2 Large (7.5B) 0.483 0.297 0.685 - 0.217 0.539 - 0.567 0.45 0.196 0.215 0.44 0.945 0.403 0.567
38 | OPT (66B) 0.476 0.229 0.71 0.526 0.218 0.536 0.268 0.597 0.454 0.173 0.214 0.471 0.908 0.5 0.536
39 | RedPajama-INCITE-Instruct (7B) 0.466 0.305 0.616 0.506 0.164 0.592 0.181 - - 0.183 - - 0.907 0.54 0.67
40 | J1-Grande v1 (17B) 0.454 0.232 0.678 0.547 0.187 0.521 0.274 0.58 0.472 0.163 0.138 0.328 0.946 0.482 0.636
41 | Luminous Extended (30B) 0.451 0.237 0.711 0.532 0.214 0.551 0.277 - - 0.16 - - 0.937 0.462 0.489
42 | Falcon (7B) 0.447 0.261 0.702 0.52 0.233 0.537 0.262 - - 0.213 - - 0.794 0.494 0.555
43 | text-curie-001 0.377 0.231 0.576 0.463 0.132 0.5 0.255 0.534 0.452 0.239 0.244 0.482 0.91 0.471 0.458
44 | Alpaca (7B) 0.372 0.346 0.729 0.299 0.21 0.53 0.204 - - 0.202 - - 0.699 0.483 0.459
45 | RedPajama-INCITE-Instruct-v1 (3B) 0.369 0.222 0.648 0.506 0.143 0.571 0.183 - - 0.179 - - 0.876 0.499 0.632
46 | Cohere large v20220720 (13.1B) 0.362 0.281 0.676 0.512 0.178 0.507 0.256 0.575 0.446 0.157 0.164 0.312 0.92 0.443 0.564
47 | Cohere medium v20221108 (6.1B) 0.34 0.22 0.642 0.497 0.149 0.45 0.229 0.567 0.44 0.182 0.145 0.353 0.917 0.493 0.571
48 | GPT-NeoX (20B) 0.331 0.215 0.609 0.461 0.154 0.525 0.232 0.552 0.438 0.179 0.148 0.381 0.928 0.491 0.475
49 | RedPajama-INCITE-Base (7B) 0.323 0.276 0.65 0.524 0.193 0.514 0.238 - - 0.17 - - 0.694 0.431 0.595
50 | Falcon-Instruct (7B) 0.297 0.261 0.637 0.354 0.148 0.383 0.219 - - 0.183 - - 0.811 0.502 0.5
51 | TNLG v2 (6.7B) 0.291 0.212 0.665 0.517 0.162 0.501 0.267 0.53 0.412 0.144 0.14 0.317 0.912 0.473 0.502
52 | GPT-J (6B) 0.29 0.22 0.639 0.433 0.122 0.493 0.249 0.486 0.416 0.18 0.129 0.332 0.927 0.488 0.594
53 | J1-Large v1 (7.5B) 0.275 0.204 0.622 0.513 0.146 0.47 0.241 0.528 0.444 0.174 0.117 0.28 0.946 0.447 0.511
54 | RedPajama-INCITE-Base-v1 (3B) 0.27 0.232 0.624 0.42 0.145 0.452 0.238 - - 0.248 - - 0.89 0.393 0.475
55 | Cohere medium v20220720 (6.1B) 0.269 0.237 0.597 0.438 0.126 0.432 0.198 0.525 0.42 0.174 0.132 0.357 0.918 0.489 0.5
56 | text-babbage-001 0.244 0.205 0.41 0.299 0.053 0.24 0.196 0.405 0.386 0.207 0.174 0.424 0.887 0.499 0.475
57 | Luminous Base (13B) 0.238 0.185 0.653 0.498 0.16 0.511 0.266 - - 0.125 - - 0.912 0.397 0.445
58 | curie (6.7B) 0.231 0.218 0.594 0.482 0.147 0.479 0.243 0.522 0.43 0.186 0.14 0.284 0.86 0.412 0.473
59 | Pythia (12B) 0.226 0.212 0.547 0.449 0.131 0.523 0.227 - - 0.154 - - 0.916 0.448 0.489
60 | T0pp (11B) 0.203 0.382 0 0.086 0.028 0.136 0.067 - - 0.35 - - 0.168 0.165 0.106
61 | UL2 (20B) 0.186 0.273 0.698 0.053 0.162 0.303 0.107 - - 0.162 - - 0.271 0.423 0.375
62 | Pythia (6.9B) 0.171 0.207 0.552 0.389 0.103 0.464 0.198 - - 0.18 - - 0.911 0.333 0.45
63 | YaLM (100B) 0.167 0.243 0.583 0.146 0.052 0.177 0.1 - - 0.202 - - 0.8 0.456 0.342
64 | Cohere small v20220720 (410M) 0.154 0.222 0.374 0.179 0.055 0.219 0.144 0.308 0.28 0.203 - 0.28 0.518 0.495 0.452
65 | T5 (11B) 0.15 0.235 0.723 0.05 0.159 0.424 0.074 - - 0.101 - - 0.303 0.329 0.351
66 | babbage (1.3B) 0.134 0.206 0.436 0.367 0.084 0.381 0.202 0.401 0.326 0.178 0.105 0.301 0.534 0.474 0.438
67 | text-ada-001 0.108 0.202 0.378 0.119 0.012 0.083 0.091 0.27 0.266 0.191 0.107 0.276 0.769 0.497 0.376
68 | ada (350M) 0.105 0.21 0.507 0.205 0.057 0.273 0.166 0.294 0.318 0.185 0.086 0.268 0.806 0.436 0.395
69 |
70 |
--------------------------------------------------------------------------------
/benchbench/data/helm/robustness.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate MMLU - EM (Robustness) BoolQ - EM (Robustness) NarrativeQA - F1 (Robustness) NaturalQuestions (closed-book) - F1 (Robustness) NaturalQuestions (open-book) - F1 (Robustness) QuAC - F1 (Robustness) HellaSwag - EM (Robustness) OpenbookQA - EM (Robustness) TruthfulQA - EM (Robustness) MS MARCO (regular) - RR@10 (Robustness) MS MARCO (TREC) - NDCG@10 (Robustness) IMDB - EM (Robustness) CivilComments - EM (Robustness) RAFT - EM (Robustness)
2 | Llama 2 (70B) 0.965 0.545 0.863 0.722 0.42 0.639 0.362 - - 0.468 - - 0.949 0.59 0.673
3 | text-davinci-002 0.916 0.525 0.841 0.638 0.299 0.665 0.319 0.776 0.52 0.547 0.344 0.628 0.925 0.567 0.666
4 | text-davinci-003 0.91 0.517 0.858 0.694 0.369 0.73 0.42 0.798 0.572 0.516 0.304 0.616 0.779 0.594 0.714
5 | Mistral v0.1 (7B) 0.896 0.533 0.837 0.649 0.305 0.631 0.31 - - 0.339 - - 0.954 0.521 0.652
6 | LLaMA (65B) 0.885 0.504 0.84 0.567 0.388 0.624 0.275 - - 0.448 - - 0.935 0.566 0.655
7 | Cohere Command beta (52.4B) 0.85 0.387 0.811 0.57 0.289 0.679 0.238 0.774 0.492 0.229 0.434 0.734 0.933 0.535 0.599
8 | Llama 2 (13B) 0.823 0.444 0.753 0.682 0.324 0.563 0.294 - - 0.287 - - 0.954 0.47 0.652
9 | Palmyra X (43B) 0.821 0.566 0.878 0.672 0.363 - 0.383 - - 0.568 - - 0.904 0.006 0.677
10 | Anthropic-LM v4-s3 (52B) 0.818 0.434 0.756 0.663 0.245 0.632 0.313 0.766 0.472 0.326 - - 0.928 0.514 0.6
11 | gpt-3.5-turbo-0301 0.816 0.525 0.66 0.602 0.327 0.556 0.411 - - 0.566 - - 0.857 0.605 0.705
12 | LLaMA (30B) 0.815 0.461 0.791 0.611 0.36 0.612 0.273 - - 0.281 - - 0.893 0.503 0.67
13 | Jurassic-2 Jumbo (178B) 0.791 0.417 0.729 0.66 0.315 0.599 0.314 0.754 0.47 0.39 0.337 0.607 0.896 0.449 0.69
14 | Jurassic-2 Grande (17B) 0.764 0.411 0.729 0.583 0.285 0.564 0.276 0.755 0.474 0.293 0.227 0.423 0.928 0.488 0.618
15 | Falcon-Instruct (40B) 0.763 0.446 0.781 0.508 0.335 0.591 0.212 - - 0.338 - - 0.938 0.523 0.523
16 | gpt-3.5-turbo-0613 0.762 0.262 0.845 0.566 0.284 0.606 0.371 - - 0.187 - - 0.916 0.564 0.677
17 | Vicuna v1.3 (13B) 0.732 0.413 0.757 0.525 0.273 0.621 0.247 - - 0.341 - - 0.674 0.593 0.591
18 | J1-Grande v2 beta (17B) 0.711 0.392 0.692 0.565 0.235 0.56 0.251 0.732 0.474 0.252 0.222 0.407 0.947 0.495 0.555
19 | Falcon (40B) 0.705 0.457 0.763 0.557 0.329 0.593 0.162 - - 0.303 - - 0.935 0.412 0.586
20 | MPT (30B) 0.697 0.381 0.656 0.584 0.272 0.609 0.231 - - 0.177 - - 0.942 0.484 0.58
21 | Vicuna v1.3 (7B) 0.662 0.371 0.672 0.5 0.214 0.539 0.25 - - 0.258 - - 0.882 0.543 0.6
22 | MPT-Instruct (30B) 0.656 0.383 0.77 0.623 0.202 0.607 0.204 - - 0.177 - - 0.942 0.408 0.548
23 | TNLG v2 (530B) 0.65 0.403 0.733 0.319 0.307 0.525 0.194 0.757 0.476 0.202 0.287 0.565 0.921 0.409 0.545
24 | GLM (130B) 0.647 0.32 0.728 0.629 0.117 0.6 0.193 - - 0.196 - - 0.938 0.5 0.577
25 | Llama 2 (7B) 0.644 0.373 0.676 0.573 0.261 0.501 0.271 - - 0.234 - - 0.808 0.516 0.573
26 | LLaMA (13B) 0.637 0.37 0.67 0.544 0.272 0.556 0.194 - - 0.274 - - 0.875 0.529 0.559
27 | Cohere Command beta (6.1B) 0.616 0.334 0.725 0.529 0.163 0.605 0.17 0.696 0.448 0.171 0.387 0.685 0.921 0.468 0.552
28 | Cohere xlarge v20221108 (52.4B) 0.596 0.299 0.718 0.39 0.283 0.533 0.229 0.764 0.482 0.116 0.242 0.482 0.923 0.408 0.489
29 | LLaMA (7B) 0.568 0.268 0.688 0.485 0.222 0.519 0.223 - - 0.229 - - 0.897 0.492 0.486
30 | Luminous Supreme (70B) 0.546 0.255 0.665 0.59 0.252 0.586 0.233 - - 0.106 - - 0.932 0.263 0.564
31 | BLOOM (176B) 0.541 0.25 0.642 0.53 0.185 0.558 0.234 0.699 0.438 0.183 0.19 0.333 0.92 0.467 0.527
32 | Jurassic-2 Large (7.5B) 0.527 0.263 0.607 - 0.187 0.503 - 0.687 0.448 0.21 0.177 0.397 0.941 0.469 0.498
33 | InstructPalmyra (30B) 0.522 0.348 0.656 0.317 0.267 0.567 0.248 - - 0.151 - - 0.906 0.443 0.518
34 | OPT (175B) 0.519 0.27 0.623 0.409 0.208 0.408 0.2 0.744 0.488 0.205 0.235 0.408 0.919 0.184 0.48
35 | davinci (175B) 0.509 0.34 0.639 0.498 0.256 0.521 0.208 0.738 0.474 0.145 0.154 0.332 0.873 0.461 0.505
36 | Cohere xlarge v20220609 (52.4B) 0.506 0.29 0.614 0.383 0.238 0.471 0.215 0.759 0.448 0.151 0.207 0.397 0.923 0.32 0.563
37 | RedPajama-INCITE-Instruct (7B) 0.495 0.291 0.599 0.482 0.137 0.547 0.164 - - 0.197 - - 0.82 0.527 0.605
38 | J1-Jumbo v1 (178B) 0.452 0.221 0.65 0.523 0.179 0.503 0.222 0.726 0.43 0.154 0.144 0.307 0.923 0.271 0.555
39 | OPT (66B) 0.438 0.216 0.683 0.397 0.206 0.458 0.199 0.699 0.45 0.174 0.179 0.437 0.886 0.305 0.405
40 | Luminous Extended (30B) 0.43 0.23 0.659 0.513 0.212 0.524 0.193 - - 0.151 - - 0.92 0.368 0.436
41 | Falcon (7B) 0.425 0.236 0.65 0.436 0.185 0.489 0.164 - - 0.205 - - 0.692 0.485 0.516
42 | J1-Grande v1 (17B) 0.423 0.225 0.643 0.477 0.17 0.478 0.219 0.695 0.424 0.142 0.121 0.297 0.941 0.417 0.513
43 | RedPajama-INCITE-Instruct-v1 (3B) 0.387 0.218 0.629 0.403 0.132 0.536 0.137 - - 0.173 - - 0.852 0.506 0.548
44 | Alpaca (7B) 0.379 0.324 0.643 0.246 0.203 0.491 0.16 - - 0.199 - - 0.561 0.482 0.42
45 | Cohere large v20220720 (13.1B) 0.345 0.253 0.545 0.357 0.172 0.347 0.204 0.687 0.43 0.154 0.13 0.257 0.902 0.333 0.49
46 | text-curie-001 0.337 0.22 0.549 0.34 0.121 0.415 0.169 0.625 0.424 0.235 0.198 0.444 0.881 0.129 0.399
47 | GPT-NeoX (20B) 0.336 0.189 0.551 0.421 0.133 0.452 0.191 0.661 0.414 0.175 0.096 0.351 0.912 0.48 0.399
48 | RedPajama-INCITE-Base (7B) 0.331 0.25 0.569 0.424 0.167 0.472 0.186 - - 0.173 - - 0.56 0.401 0.489
49 | Luminous Base (13B) 0.319 0.183 0.655 0.476 0.163 0.491 0.185 - - 0.112 - - 0.887 0.416 0.402
50 | Falcon-Instruct (7B) 0.303 0.25 0.593 0.258 0.132 0.327 0.179 - - 0.17 - - 0.759 0.487 0.445
51 | J1-Large v1 (7.5B) 0.298 0.2 0.567 0.4 0.098 0.41 0.197 0.646 0.412 0.155 0.105 0.248 0.932 0.444 0.443
52 | RedPajama-INCITE-Base-v1 (3B) 0.293 0.217 0.585 0.346 0.134 0.396 0.177 - - 0.226 - - 0.843 0.336 0.427
53 | GPT-J (6B) 0.291 0.217 0.621 0.135 0.099 0.228 0.147 0.619 0.398 0.181 0.116 0.319 0.903 0.418 0.53
54 | Pythia (12B) 0.272 0.22 0.51 0.42 0.108 0.47 0.171 - - 0.138 - - 0.854 0.418 0.45
55 | Cohere medium v20221108 (6.1B) 0.27 0.207 0.54 0.296 0.105 0.222 0.152 0.687 0.414 0.17 0.13 0.314 0.888 0.353 0.502
56 | UL2 (20B) 0.257 0.272 0.646 0.059 0.141 0.291 0.111 - - 0.178 - - 0.276 0.45 0.349
57 | TNLG v2 (6.7B) 0.24 0.169 0.638 0.352 0.149 0.299 0.159 0.656 0.408 0.136 0.105 0.278 0.896 0.336 0.445
58 | curie (6.7B) 0.231 0.19 0.545 0.367 0.126 0.338 0.171 0.632 0.396 0.186 0.11 0.253 0.803 0.347 0.413
59 | T0pp (11B) 0.228 0.378 0 0.099 0.031 0.122 0.071 - - 0.365 - - 0.17 0.087 0.085
60 | text-babbage-001 0.226 0.186 0.384 0.126 0.04 0.151 0.087 0.468 0.39 0.195 0.122 0.356 0.844 0.499 0.383
61 | YaLM (100B) 0.205 0.243 0.566 0.088 0.047 0.125 0.08 - - 0.202 - - 0.719 0.463 0.211
62 | Cohere medium v20220720 (6.1B) 0.188 0.184 0.562 0.3 0.102 0.266 0.144 0.651 0.382 0.149 0.109 0.315 0.889 0.136 0.385
63 | Pythia (6.9B) 0.182 0.201 0.527 0.313 0.094 0.391 0.171 - - 0.139 - - 0.871 0.363 0.377
64 | T5 (11B) 0.164 0.258 0.65 0.045 0.153 0.071 0.064 - - 0.122 - - 0.304 0.392 0.331
65 | Cohere small v20220720 (410M) 0.147 0.226 0.361 0.078 0.025 0.074 0.098 0.405 0.238 0.204 - 0.252 0.473 0.434 0.403
66 | babbage (1.3B) 0.117 0.166 0.477 0.255 0.068 0.212 0.149 0.489 0.314 0.162 0.073 0.246 0.5 0.4 0.409
67 | text-ada-001 0.105 0.178 0.332 0.058 0.008 0.034 0.067 0.32 0.248 0.175 0.069 0.252 0.716 0.491 0.335
68 | ada (350M) 0.102 0.204 0.461 0.104 0.031 0.043 0.092 0.37 0.27 0.167 0.072 0.247 0.701 0.421 0.345
69 |
70 |
--------------------------------------------------------------------------------
/benchbench/data/mteb/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | Rank Model - Model Size (GB) Embedding Dimensions Max Tokens Average (56 datasets) Classification Average (12 datasets) Clustering Average (11 datasets) Pair Classification Average (3 datasets) Reranking Average (4 datasets) Retrieval Average (15 datasets) STS Average (10 datasets) Summarization Average (1 dataset)
2 | 1 SFR-Embedding-Mistral - 14.22 4096 32768 67.56 78.33 51.67 88.54 60.64 59 85.05 31.16
3 | 2 voyage-lite-02-instruct - - 1024 4000 67.13 79.25 52.42 86.87 58.24 56.6 85.79 31.01
4 | 3 e5-mistral-7b-instruct - 14.22 4096 32768 66.63 78.47 50.26 88.34 60.21 56.89 84.63 31.4
5 | 4 UAE-Large-V1 - 1.34 1024 512 64.64 75.58 46.73 87.25 59.88 54.66 84.54 32.03
6 | 5 text-embedding-3-large - - 3072 8191 64.59 75.45 49.01 85.72 59.16 55.44 81.73 29.92
7 | 6 voyage-lite-01-instruct - - 1024 4000 64.49 74.79 47.4 86.57 59.74 55.58 82.93 30.97
8 | 7 Cohere-embed-english-v3.0 - - 1024 512 64.47 76.49 47.43 85.84 58.01 55 82.62 30.18
9 | 8 bge-large-en-v1.5 - 1.34 1024 512 64.23 75.97 46.08 87.12 60.03 54.29 83.11 31.61
10 | 9 Cohere-embed-multilingual-v3.0 - - 1024 512 64.01 76.01 46.6 86.15 57.86 53.84 83.15 30.99
11 | 10 GIST-Embedding-v0 - 0.44 768 512 63.71 76.03 46.21 86.32 59.37 52.31 83.51 30.87
12 | 11 bge-base-en-v1.5 - 0.44 768 512 63.55 75.53 45.77 86.55 58.86 53.25 82.4 31.07
13 | 12 ember-v1 - 1.34 1024 512 63.54 75.99 45.58 87.37 60.04 51.92 83.34 30.82
14 | 13 sf_model_e5 - 1.34 1024 512 63.34 73.96 46.61 86.85 59.86 51.8 83.85 31.61
15 | 14 gte-large - 0.67 1024 512 63.13 73.33 46.84 85 59.13 52.22 83.35 31.66
16 | 15 stella-base-en-v2 - 0.22 768 512 62.61 75.28 44.9 86.45 58.78 50.1 83.02 32.52
17 | 16 gte-base - 0.22 768 512 62.39 73.01 46.2 84.57 58.61 51.14 82.3 31.17
18 | 17 text-embedding-3-small - - 1536 8191 62.26 73.21 46.65 85.04 56.72 51.08 81.58 31.12
19 | 18 e5-large-v2 - 1.34 1024 512 62.25 75.24 44.49 86.03 56.61 50.56 82.05 30.19
20 | 19 bge-small-en-v1.5 - 0.13 384 512 62.17 74.14 43.82 84.92 58.36 51.68 81.59 30.12
21 | 20 Cohere-embed-english-light-v3.0 - - 384 512 62.01 74.31 44.64 85.05 56.09 51.34 80.92 31.29
22 | 21 text-embedding-3-large-256 - - 256 8191 62 71.97 46.23 84.22 57.99 51.66 81.04 29.92
23 | 22 instructor-xl - 4.96 768 512 61.79 73.12 44.74 86.62 57.29 49.26 83.06 32.32
24 | 23 instructor-large - 1.34 768 512 61.59 73.86 45.29 85.89 57.54 47.57 83.15 31.84
25 | 24 e5-base-v2 - 0.44 768 512 61.5 73.84 43.8 85.73 55.91 50.29 81.05 30.28
26 | 25 multilingual-e5-large - 2.24 1024 514 61.5 74.81 41.06 84.75 55.86 51.43 81.56 29.69
27 | 26 e5-large - 1.34 1024 512 61.42 73.14 43.33 85.94 56.53 49.99 82.06 30.97
28 | 27 gte-small - 0.07 384 512 61.36 72.31 44.89 83.54 57.7 49.46 82.07 30.42
29 | 28 text-embedding-ada-002 - - 1536 8191 60.99 70.93 45.9 84.89 56.32 49.25 80.97 30.8
30 | 29 udever-bloom-7b1 - 28.28 4096 2048 60.63 72.13 40.81 85.4 55.91 49.34 83.01 30.97
31 | 30 e5-base - 0.44 768 512 60.44 72.63 42.11 85.09 55.7 48.75 80.96 31.01
32 | 31 jina-embeddings-v2-base-en - - - - 60.38 73.45 41.73 85.38 56.98 47.87 80.7 31.6
33 | 32 Cohere-embed-multilingual-light-v3.0 - - 384 512 60.08 70.57 41.98 83.95 55.06 50.15 80.09 30.41
34 | 33 e5-small-v2 - 0.13 384 512 59.93 72.94 39.92 84.67 54.32 49.04 80.39 31.16
35 | 34 udever-bloom-3b - 12.01 2560 2048 59.86 71.91 40.74 84.06 54.9 47.67 82.37 30.62
36 | 35 instructor-base - 0.44 768 512 59.54 72.36 41.9 83.51 56.2 45.12 82.29 29.85
37 | 36 sentence-t5-xxl - 9.73 768 512 59.51 73.42 43.72 85.06 56.42 42.24 82.63 30.08
38 | 37 multilingual-e5-base - 1.11 768 514 59.45 73.02 37.89 83.57 54.84 48.88 80.26 30.11
39 | 38 XLM-3B5-embedding - - - - 59.29 72.25 43.48 79.23 57.12 44.99 80.47 29.02
40 | 39 gtr-t5-xxl - 9.73 768 512 58.97 67.41 42.42 86.12 56.66 48.48 78.38 30.64
41 | 40 SGPT-5.8B-weightedmean-msmarco-specb-bitfit - 23.5 4096 2048 58.93 68.13 40.34 82 56.56 50.25 78.1 31.46
42 | 41 e5-small - 0.13 384 512 58.89 71.67 39.51 85.08 54.45 46.01 80.87 31.39
43 | 42 gte-tiny - 0.05 384 512 58.69 70.35 42.09 82.83 55.77 44.92 80.46 29.47
44 | 43 gtr-t5-xl - 2.48 768 512 58.42 67.11 41.51 86.13 55.96 47.96 77.8 30.21
45 | 44 udever-bloom-1b1 - 4.26 1536 2048 58.29 70.17 39.11 83.11 54.28 45.27 81.52 31.1
46 | 45 gtr-t5-large - 0.67 768 512 58.28 67.14 41.6 85.32 55.36 47.42 78.19 29.5
47 | 46 jina-embeddings-v2-small-en - - - - 58 68.82 40.08 84.44 55.09 45.14 80 30.56
48 | 47 XLM-0B6-embedding - - - - 57.97 70.55 42.97 77.83 55.6 43.39 79.02 30.25
49 | 48 multilingual-e5-small - 0.47 384 512 57.87 70.74 37.08 82.59 53.87 46.64 79.1 29.98
50 | 49 sentence-t5-xl - 2.48 768 512 57.87 72.84 42.34 86.06 54.71 38.47 81.66 29.91
51 | 50 all-mpnet-base-v2 - 0.44 768 514 57.78 65.07 43.69 83.04 59.36 43.81 80.28 27.49
52 | 51 sgpt-bloom-7b1-msmarco - 28.27 4096 2048 57.59 66.19 38.93 81.9 55.65 48.22 77.74 33.6
53 | 52 jina-embedding-l-en-v1 - 1.34 1024 512 57.38 67.76 37.15 84.8 56.42 44.81 80.96 29.85
54 | 53 SGPT-2.7B-weightedmean-msmarco-specb-bitfit - 10.74 2560 2048 57.17 67.13 39.83 80.65 54.67 46.54 76.83 31.03
55 | 54 sentence-t5-large - 0.67 768 512 57.06 72.31 41.65 84.97 54 36.71 81.83 29.64
56 | 55 MegatronBert-1B3-embedding - - - - 56.81 69.65 40.86 76.9 55.5 41.41 79.11 31.01
57 | 56 bge-micro-v2 - 0.03 384 512 56.57 68.04 39.18 82.81 54.29 42.56 78.65 29.87
58 | 57 all-MiniLM-L12-v2 - 0.13 384 512 56.53 63.21 41.81 82.41 58.44 42.69 79.8 27.9
59 | 58 all-MiniLM-L6-v2 - 0.09 384 512 56.26 63.05 42.35 82.37 58.04 41.95 78.9 30.81
60 | 59 jina-embedding-b-en-v1 - 0.44 768 512 56.26 66.07 35.88 83.04 55.84 44.03 79.93 30.71
61 | 60 SGPT-1.3B-weightedmean-msmarco-specb-bitfit - 5.36 2048 2048 56.2 66.52 39.92 79.58 54 44.49 75.74 30.43
62 | 61 gtr-t5-base - 0.22 768 512 56.19 65.25 38.63 83.85 54.23 44.67 77.07 29.67
63 | 62 contriever-base-msmarco - 0.44 768 512 56 66.68 41.1 82.54 53.14 41.88 76.51 30.36
64 | 63 udever-bloom-560m - 2.24 1024 2048 55.81 68.04 36.89 81.05 52.6 41.19 79.93 32.06
65 | 64 bge-micro - 0.03 384 512 55.71 66.35 39.46 81.77 54.28 40.82 78.37 31.16
66 | 65 sentence-t5-base - 0.22 768 512 55.27 69.81 40.21 85.18 53.09 33.63 81.14 31.39
67 | 66 bge-small-4096 - 0.14 384 4096 54.42 67.8 38.03 81.4 53.64 36.08 78.59 29.83
68 | 67 lodestone-base-4096-v1 - 0.27 768 4096 54.24 67.3 40.9 80.4 53.95 36.99 73.7 31.23
69 | 68 SGPT-5.8B-weightedmean-nli-bitfit - 23.5 4096 2048 53.74 70.14 36.98 77.03 52.33 32.34 80.53 30.38
70 | 69 multi-qa-MiniLM-L6-cos-v1 - - 384 512 53.29 61.67 35.67 80.86 54.58 41.17 74.23 31.05
71 | 70 msmarco-bert-co-condensor - 0.44 768 512 52.35 64.71 37.64 81.74 51.84 32.96 76.47 29.5
72 | 71 jina-embedding-s-en-v1 - 0.14 512 512 52.33 60.56 32.56 79.22 53.07 38.91 78.06 31.25
73 | 72 SGPT-125M-weightedmean-msmarco-specb-bitfit - 0.55 768 2048 51.25 60.72 35.79 75.23 50.58 37.04 73.41 29.71
74 | 73 text-similarity-ada-001 - - 1024 2046 49.52 70.44 37.52 76.86 49.02 18.36 78.6 26.94
75 | 74 sup-simcse-bert-base-uncased - 0.44 768 512 48.87 67.32 33.43 73.68 47.54 21.82 79.12 31.17
76 | 75 SGPT-125M-weightedmean-nli-bitfit - 0.55 768 2048 45.97 61.46 30.95 71.78 47.56 20.9 74.71 30.26
77 | 76 unsup-simcse-bert-base-uncased - 0.44 768 512 45.45 62.5 29.04 70.33 46.47 20.29 74.33 31.15
78 | 77 LaBSE - 1.88 768 512 45.21 62.71 29.55 78.87 48.42 18.99 70.8 31.05
79 | 78 komninos - 0.27 300 N/A 42.06 57.65 26.57 72.94 44.75 21.22 62.46 30.49
80 | 79 glove.6B.300d - 0.48 300 N/A 41.96 57.29 27.73 70.92 43.29 21.62 61.85 28.87
81 | 80 SONAR - - - - 40.72 60.43 22.9 71.4 46.18 13.47 67.18 30.56
82 | 81 allenai-specter - 0.44 768 512 40.28 52.37 34.06 61.37 48.1 15.88 61.02 27.66
83 | 82 bert-base-uncased - 0.44 768 512 38.33 61.66 30.12 56.33 43.44 10.59 54.36 29.82
84 | 83 LASER2 - 0.17 1024 N/A 34.95 53.18 15.28 68.86 41.44 7.94 63.27 26.8
85 |
--------------------------------------------------------------------------------
/benchbench/data/helm/accuracy.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate MMLU - EM BoolQ - EM NarrativeQA - F1 NaturalQuestions (closed-book) - F1 NaturalQuestions (open-book) - F1 QuAC - F1 HellaSwag - EM OpenbookQA - EM TruthfulQA - EM MS MARCO (regular) - RR@10 MS MARCO (TREC) - NDCG@10 CNN/DailyMail - ROUGE-2 XSUM - ROUGE-2 IMDB - EM CivilComments - EM RAFT - EM
2 | Llama 2 (70B) 0.944 0.582 0.886 0.77 0.458 0.674 0.484 - - 0.554 - - - - 0.961 0.652 0.727
3 | LLaMA (65B) 0.908 0.584 0.871 0.755 0.431 0.672 0.401 - - 0.508 - - - - 0.962 0.655 0.702
4 | text-davinci-002 0.905 0.568 0.877 0.727 0.383 0.713 0.445 0.815 0.594 0.61 0.421 0.664 0.153 0.144 0.948 0.668 0.733
5 | Mistral v0.1 (7B) 0.884 0.572 0.874 0.716 0.365 0.687 0.423 - - 0.422 - - - - 0.962 0.624 0.707
6 | Cohere Command beta (52.4B) 0.874 0.452 0.856 0.752 0.372 0.76 0.432 0.811 0.582 0.269 0.472 0.762 0.161 0.152 0.96 0.601 0.667
7 | text-davinci-003 0.872 0.569 0.881 0.727 0.406 0.77 0.525 0.822 0.646 0.593 0.368 0.644 0.156 0.124 0.848 0.684 0.759
8 | Jurassic-2 Jumbo (178B) 0.824 0.48 0.829 0.733 0.385 0.669 0.435 0.788 0.558 0.437 0.398 0.661 0.149 0.182 0.938 0.57 0.746
9 | Llama 2 (13B) 0.823 0.507 0.811 0.744 0.376 0.637 0.424 - - 0.33 - - - - 0.962 0.588 0.707
10 | TNLG v2 (530B) 0.787 0.469 0.809 0.722 0.384 0.642 0.39 0.799 0.562 0.251 0.377 0.643 0.161 0.169 0.941 0.601 0.679
11 | gpt-3.5-turbo-0613 0.783 0.391 0.87 0.625 0.348 0.675 0.485 - - 0.339 - - - - 0.943 0.696 0.748
12 | LLaMA (30B) 0.781 0.531 0.861 0.752 0.408 0.666 0.39 - - 0.344 - - - - 0.927 0.549 0.752
13 | Anthropic-LM v4-s3 (52B) 0.78 0.481 0.815 0.728 0.288 0.686 0.431 0.807 0.558 0.368 - - 0.154 0.134 0.934 0.61 0.699
14 | gpt-3.5-turbo-0301 0.76 0.59 0.74 0.663 0.39 0.624 0.512 - - 0.609 - - - - 0.899 0.674 0.768
15 | Jurassic-2 Grande (17B) 0.743 0.475 0.826 0.737 0.356 0.639 0.418 0.781 0.542 0.348 0.293 0.514 0.144 0.167 0.938 0.547 0.712
16 | Palmyra X (43B) 0.732 0.609 0.896 0.742 0.413 - 0.473 - - 0.616 - - 0.049 0.149 0.935 0.008 0.701
17 | Falcon (40B) 0.729 0.509 0.819 0.673 0.392 0.675 0.307 - - 0.353 - - - - 0.959 0.552 0.661
18 | Falcon-Instruct (40B) 0.727 0.497 0.829 0.625 0.377 0.666 0.371 - - 0.384 - - - - 0.959 0.603 0.586
19 | MPT-Instruct (30B) 0.716 0.444 0.85 0.733 0.304 0.697 0.327 - - 0.234 - - - - 0.956 0.573 0.68
20 | MPT (30B) 0.714 0.437 0.704 0.732 0.347 0.673 0.393 - - 0.231 - - - - 0.959 0.599 0.723
21 | J1-Grande v2 beta (17B) 0.706 0.445 0.812 0.725 0.337 0.625 0.392 0.764 0.56 0.306 0.285 0.46 0.146 0.152 0.957 0.546 0.679
22 | Vicuna v1.3 (13B) 0.706 0.462 0.808 0.691 0.346 0.686 0.403 - - 0.385 - - - - 0.762 0.645 0.657
23 | Cohere Command beta (6.1B) 0.675 0.406 0.798 0.709 0.229 0.717 0.375 0.752 0.55 0.203 0.434 0.709 0.153 0.122 0.961 0.54 0.634
24 | Cohere xlarge v20221108 (52.4B) 0.664 0.382 0.762 0.672 0.361 0.628 0.374 0.81 0.588 0.169 0.315 0.55 0.153 0.153 0.956 0.524 0.624
25 | Luminous Supreme (70B) 0.662 0.38 0.775 0.711 0.293 0.649 0.37 - - 0.222 - - 0.15 0.136 0.959 0.562 0.653
26 | Vicuna v1.3 (7B) 0.625 0.434 0.76 0.643 0.287 0.634 0.392 - - 0.292 - - - - 0.916 0.62 0.693
27 | OPT (175B) 0.609 0.318 0.793 0.671 0.297 0.615 0.36 0.791 0.586 0.25 0.288 0.448 0.146 0.155 0.947 0.505 0.606
28 | Llama 2 (7B) 0.607 0.431 0.762 0.691 0.337 0.611 0.406 - - 0.272 - - - - 0.907 0.562 0.643
29 | LLaMA (13B) 0.595 0.422 0.714 0.711 0.346 0.614 0.347 - - 0.324 - - - - 0.928 0.6 0.643
30 | InstructPalmyra (30B) 0.568 0.403 0.751 0.496 0.33 0.682 0.433 - - 0.185 - - 0.152 0.104 0.94 0.555 0.652
31 | Cohere xlarge v20220609 (52.4B) 0.56 0.353 0.718 0.65 0.312 0.595 0.361 0.811 0.55 0.198 0.273 0.459 0.144 0.129 0.956 0.532 0.633
32 | Jurassic-2 Large (7.5B) 0.553 0.339 0.742 - 0.274 0.589 - 0.729 0.53 0.245 0.247 0.464 0.136 0.142 0.956 0.57 0.622
33 | davinci (175B) 0.538 0.422 0.722 0.687 0.329 0.625 0.36 0.775 0.586 0.194 0.211 0.378 0.127 0.126 0.933 0.532 0.642
34 | LLaMA (7B) 0.533 0.321 0.756 0.669 0.297 0.589 0.338 - - 0.28 - - - - 0.947 0.563 0.573
35 | RedPajama-INCITE-Instruct (7B) 0.524 0.363 0.705 0.638 0.232 0.659 0.26 - - 0.243 - - - - 0.927 0.664 0.695
36 | J1-Jumbo v1 (178B) 0.517 0.259 0.776 0.695 0.293 0.595 0.358 0.765 0.534 0.175 0.21 0.363 0.144 0.129 0.943 0.553 0.681
37 | GLM (130B) 0.512 0.344 0.784 0.706 0.148 0.642 0.272 - - 0.218 - - 0.154 0.132 0.955 0.5 0.598
38 | Luminous Extended (30B) 0.485 0.321 0.767 0.665 0.254 0.609 0.349 - - 0.221 - - 0.139 0.124 0.947 0.524 0.523
39 | OPT (66B) 0.448 0.276 0.76 0.638 0.258 0.596 0.357 0.745 0.534 0.201 0.237 0.482 0.136 0.126 0.917 0.506 0.557
40 | BLOOM (176B) 0.446 0.299 0.704 0.662 0.216 0.621 0.361 0.744 0.534 0.205 0.236 0.386 0.08 0.03 0.945 0.62 0.592
41 | J1-Grande v1 (17B) 0.433 0.27 0.722 0.672 0.233 0.578 0.362 0.739 0.52 0.193 0.161 0.341 0.143 0.122 0.953 0.529 0.658
42 | Alpaca (7B) 0.381 0.385 0.778 0.396 0.266 0.592 0.27 - - 0.243 - - - - 0.738 0.566 0.486
43 | Falcon (7B) 0.378 0.286 0.753 0.621 0.285 0.579 0.332 - - 0.234 - - - - 0.836 0.514 0.602
44 | RedPajama-INCITE-Base (7B) 0.378 0.302 0.713 0.617 0.25 0.586 0.336 - - 0.205 - - - - 0.752 0.547 0.648
45 | Cohere large v20220720 (13.1B) 0.372 0.324 0.725 0.625 0.232 0.573 0.338 0.736 0.542 0.181 0.19 0.33 0.126 0.108 0.933 0.507 0.596
46 | RedPajama-INCITE-Instruct-v1 (3B) 0.366 0.257 0.677 0.638 0.203 0.637 0.259 - - 0.208 - - - - 0.894 0.549 0.661
47 | text-curie-001 0.36 0.237 0.62 0.582 0.175 0.571 0.358 0.676 0.514 0.257 0.271 0.507 0.152 0.076 0.923 0.537 0.489
48 | GPT-NeoX (20B) 0.351 0.276 0.683 0.599 0.193 0.596 0.326 0.718 0.524 0.216 0.184 0.398 0.123 0.102 0.948 0.516 0.505
49 | Luminous Base (13B) 0.315 0.27 0.719 0.605 0.202 0.568 0.334 - - 0.182 - - 0.11 0.105 0.939 0.544 0.473
50 | Cohere medium v20221108 (6.1B) 0.312 0.254 0.7 0.61 0.199 0.517 0.314 0.726 0.538 0.215 0.175 0.373 0.121 0.099 0.935 0.5 0.591
51 | RedPajama-INCITE-Base-v1 (3B) 0.311 0.263 0.685 0.555 0.207 0.52 0.309 - - 0.277 - - - - 0.907 0.549 0.502
52 | TNLG v2 (6.7B) 0.309 0.242 0.698 0.631 0.21 0.561 0.345 0.704 0.478 0.167 0.158 0.332 0.146 0.11 0.927 0.532 0.525
53 | J1-Large v1 (7.5B) 0.285 0.241 0.683 0.623 0.19 0.532 0.328 0.7 0.514 0.197 0.147 0.292 0.134 0.102 0.956 0.532 0.545
54 | GPT-J (6B) 0.273 0.249 0.649 0.545 0.156 0.559 0.33 0.663 0.514 0.199 0.152 0.345 0.131 0.096 0.939 0.52 0.619
55 | Pythia (12B) 0.257 0.274 0.662 0.596 0.175 0.581 0.313 - - 0.177 - - - - 0.931 0.531 0.514
56 | curie (6.7B) 0.247 0.243 0.656 0.604 0.199 0.552 0.321 0.682 0.502 0.232 0.162 0.3 0.113 0.091 0.889 0.539 0.49
57 | Falcon-Instruct (7B) 0.244 0.275 0.72 0.476 0.194 0.449 0.311 - - 0.213 - - - - 0.852 0.511 0.523
58 | Cohere medium v20220720 (6.1B) 0.23 0.279 0.659 0.559 0.177 0.504 0.279 0.706 0.496 0.19 0.152 0.374 0.077 0.087 0.935 0.504 0.52
59 | text-babbage-001 0.229 0.229 0.451 0.429 0.07 0.33 0.284 0.561 0.452 0.233 0.208 0.449 0.151 0.046 0.913 0.499 0.509
60 | T0pp (11B) 0.197 0.407 0 0.151 0.039 0.19 0.121 - - 0.377 - - 0.122 0.09 0.207 0.234 0.118
61 | Pythia (6.9B) 0.196 0.236 0.631 0.528 0.142 0.539 0.296 - - 0.213 - - - - 0.928 0.511 0.502
62 | UL2 (20B) 0.167 0.291 0.746 0.083 0.204 0.349 0.144 - - 0.193 - - 0.03 0.058 0.337 0.521 0.404
63 | T5 (11B) 0.131 0.29 0.761 0.086 0.194 0.477 0.116 - - 0.133 - - 0.043 0.015 0.379 0.509 0.37
64 | babbage (1.3B) 0.114 0.235 0.574 0.491 0.119 0.451 0.273 0.555 0.438 0.188 0.122 0.317 0.079 0.045 0.597 0.519 0.455
65 | Cohere small v20220720 (410M) 0.109 0.264 0.457 0.294 0.078 0.309 0.219 0.483 0.348 0.217 - 0.304 0.063 0.033 0.578 0.501 0.492
66 | ada (350M) 0.108 0.243 0.581 0.326 0.082 0.365 0.242 0.435 0.38 0.215 0.102 0.29 0.09 0.022 0.849 0.517 0.423
67 | text-ada-001 0.107 0.238 0.464 0.238 0.025 0.149 0.176 0.429 0.346 0.232 0.134 0.302 0.136 0.034 0.822 0.503 0.406
68 | YaLM (100B) 0.075 0.243 0.634 0.252 0.068 0.227 0.162 - - 0.202 - - 0.017 0.021 0.836 0.49 0.395
69 |
--------------------------------------------------------------------------------
/benchbench/data/openllm/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | T Model Average ARC HellaSwag MMLU TruthfulQA Winogrande GSM8K
2 | 🟢 cloudyu/Yi-34Bx2-MoE-60B 76.72 71.08 85.23 77.47 66.19 84.85 75.51
3 | 🟢 cloudyu/Mixtral_34Bx2_MoE_60B 76.66 71.33 85.25 77.34 66.59 84.85 74.6
4 | 🟢 cloudyu/Mixtral_34Bx2_MoE_60B 76.63 71.25 85.36 77.28 66.61 84.69 74.6
5 | 🟦 moreh/MoMo-70B-lora-1.8.4-DPO 76.23 69.62 85.35 77.33 64.64 84.14 76.27
6 | 🔶 cloudyu/Yi-34Bx3-MoE-90B 76.18 70.9 85.33 77.41 66.31 84.29 72.86
7 | 🟦 moreh/MoMo-70B-lora-1.8.5-DPO 76.14 69.54 85.6 77.49 65.79 84.14 74.3
8 | 🔶 TomGrc/FusionNet_7Bx2_MoE_14B 75.91 73.55 88.84 64.68 69.6 88.16 70.66
9 | 🔶 one-man-army/UNA-34Beagles-32K-bf16-v1 75.41 73.55 85.93 76.45 73.55 82.95 60.05
10 | 🔶 jondurbin/nontoxic-bagel-34b-v0.2 74.69 72.44 85.64 76.41 72.7 82.48 58.45
11 | ⭕ jondurbin/bagel-dpo-34b-v0.2 74.69 71.93 85.25 76.58 70.05 83.35 60.96
12 | 🔶 moreh/MoMo-70B-LoRA-V1.4 74.67 69.2 85.07 77.12 62.66 83.74 70.2
13 | 🟦 udkai/Turdus 74.66 73.38 88.56 64.52 67.11 86.66 67.7
14 | 🔶 jondurbin/bagel-dpo-34b-v0.2 74.5 72.01 85.24 76.58 70.16 83.03 59.97
15 | 🔶 kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP 74.35 70.99 88.22 66.22 71.95 83.43 65.28
16 | 🔶 kodonho/SolarM-SakuraSolar-SLERP 74.29 71.16 88.47 66.24 72.1 83.11 64.67
17 | ⭕ bhavinjawade/SOLAR-10B-OrcaDPO-Jawade 74.27 71.16 88.27 66.12 71.57 83.66 64.82
18 | 🔶 VAGOsolutions/SauerkrautLM-SOLAR-Instruct 74.21 70.82 88.63 66.2 71.95 83.5 64.14
19 | 🟦 upstage/SOLAR-10.7B-Instruct-v1.0 74.2 71.08 88.16 66.21 71.43 83.58 64.75
20 | 🔶 fblgit/UNA-SOLAR-10.7B-Instruct-v1.0 74.2 70.56 88.18 66.08 72.05 83.66 64.67
21 | 🟦 bhavinjawade/SOLAR-10B-Nector-DPO-Jawade 74.19 71.33 88.62 66.22 70.92 83.43 64.59
22 | 🟦 dhanushreddy29/BrokenKeyboard 74.08 71.25 88.34 66.04 71.36 83.19 64.29
23 | 🟦 fblgit/UNA-SOLAR-10.7B-Instruct-v1.0 74.07 70.73 88.32 66.1 72.52 83.35 63.38
24 | 🔶 fblgit/UNA-POLAR-10.7B-InstructMath-v2 74.07 70.73 88.2 66.03 71.73 82.95 64.75
25 | 🔶 yhyu13/LMCocktail-10.7B-v1 74.06 70.65 88.13 66.21 71.03 83.35 64.97
26 | 🔶 rishiraj/meow 73.94 70.48 88.08 66.25 70.49 83.43 64.9
27 | 🟦 fblgit/UNA-TheBeagle-7b-v1 73.87 73.04 88 63.48 69.85 82.16 66.72
28 | 🔶 fblgit/UNAversal-8x7B-v1beta 73.78 69.8 86.9 70.39 71.97 82 61.64
29 | 🔶 NousResearch/Nous-Hermes-2-Yi-34B 73.74 66.89 85.49 76.7 60.37 82.95 70.05
30 | 🟦 argilla/distilabeled-Marcoro14-7B-slerp 73.63 70.73 87.47 65.22 65.1 82.08 71.19
31 | 🟢 Qwen/Qwen-72B 73.6 65.19 85.94 77.37 60.19 82.48 70.43
32 | 🟦 mlabonne/NeuralMarcoro14-7B 73.57 71.42 87.59 64.84 65.64 81.22 70.74
33 | 🔶 abideen/NexoNimbus-7B 73.5 70.82 87.86 64.69 62.43 84.85 70.36
34 | 🟦 Neuronovo/neuronovo-7B-v0.2 73.44 73.04 88.32 65.15 71.02 80.66 62.47
35 | 🟢 cloudyu/Mixtral_7Bx2_MoE 73.43 71.25 87.45 64.98 67.23 81.22 68.46
36 | 🟦 argilla/distilabeled-Marcoro14-7B-slerp-full 73.4 70.65 87.55 65.33 64.21 82 70.66
37 | 🟦 CultriX/MistralTrix-v1 73.39 72.27 88.33 65.24 70.73 80.98 62.77
38 | 🔶 cloudyu/Mixtral_7Bx5_MoE_30B 73.39 69.97 86.82 64.42 65.97 80.98 72.18
39 | 🟢 macadeliccc/SOLAR-math-2x10.7b 73.37 68.43 86.31 66.9 64.21 83.35 71.04
40 | 🟦 ryandt/MusingCaterpillar 73.33 72.53 88.34 65.26 70.93 80.66 62.24
41 | 🟢 cloudyu/Mixtral_7Bx6_MoE_35B 73.32 70.14 86.77 64.74 65.79 81.06 71.42
42 | 🔶 cloudyu/Mixtral_7Bx6_MoE_35B 73.31 69.97 86.82 64.91 65.77 81.14 71.27
43 | 🟦 Neuronovo/neuronovo-7B-v0.3 73.29 72.7 88.26 65.1 71.35 80.9 61.41
44 | ⭕ SUSTech/SUS-Chat-34B 73.22 66.3 83.91 76.41 57.04 83.5 72.18
45 | 🔶 Sao10K/SOLAR-10.7B-NahIdWin 73.21 64.51 85.67 64.17 76.73 80.51 67.7
46 | 🟦 argilla/notus-8x7b-experiment 73.18 70.99 87.73 71.33 65.79 81.61 61.64
47 | 🟦 CultriX/MistralTrixTest 73.17 72.53 88.4 65.22 70.77 81.37 60.73
48 | 🟢 macadeliccc/Orca-SOLAR-4x10.7b 73.17 68.52 86.78 67.03 64.54 83.9 68.23
49 | 🔶 samir-fama/SamirGPT-v1 73.11 69.54 87.04 65.3 63.37 81.69 71.72
50 | 🔶 SanjiWatsuki/Lelantos-DPO-7B 73.09 71.08 87.22 64 67.77 80.03 68.46
51 | 🟦 argilla/notux-8x7b-v1-epoch-2 73.05 70.65 87.8 71.43 65.97 82.08 60.35
52 | 🟦 CultriX/MistralTrixTest 73.17 72.53 88.4 65.22 70.77 81.37 60.73
53 | 🟢 macadeliccc/Orca-SOLAR-4x10.7b 73.17 68.52 86.78 67.03 64.54 83.9 68.23
54 | 🔶 samir-fama/SamirGPT-v1 73.11 69.54 87.04 65.3 63.37 81.69 71.72
55 | 🔶 SanjiWatsuki/Lelantos-DPO-7B 73.09 71.08 87.22 64 67.77 80.03 68.46
56 | 🟦 argilla/notux-8x7b-v1-epoch-2 73.05 70.65 87.8 71.43 65.97 82.08 60.35
57 | 🔶 shadowml/Marcoro14-7B-ties 73.01 69.8 87.13 65.11 63.54 81.61 70.89
58 | 🔶 argilla/notux-8x7b-v1 72.97 70.65 87.72 71.39 66.21 80.74 61.11
59 | 🔶 AA051611/whattest 72.96 66.81 84.43 76.59 58.04 82.48 69.45
60 | 🟦 bardsai/jaskier-7b-dpo 72.91 70.82 87.02 64.67 64.41 80.19 70.36
61 | 🔶 VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct 72.89 70.48 87.75 71.37 65.71 81.22 60.8
62 | 🔶 samir-fama/FernandoGPT-v1 72.87 69.45 86.94 65.19 61.18 81.14 73.31
63 | 🔶 PSanni/MPOMixtral-8x7B-Instruct-v0.1 72.8 70.99 87.95 70.26 66.52 82.56 58.53
64 | 🔶 cookinai/OpenCM-14 72.75 69.28 86.89 65.01 61.07 81.29 72.93
65 | 🔶 VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct 72.73 70.56 87.74 71.08 65.72 81.45 59.82
66 | 🔶 mistralai/Mixtral-8x7B-Instruct-v0.1 72.7 70.14 87.55 71.4 64.98 81.06 61.11
67 | 🔶 senseable/garten2-7b 72.65 69.37 87.54 65.44 59.5 84.69 69.37
68 | ⭕ mistralai/Mixtral-8x7B-Instruct-v0.1 72.62 70.22 87.63 71.16 64.58 81.37 60.73
69 | 🔶 AIDC-ai-business/Marcoroni-7B-v3 72.53 69.45 86.78 65 60.4 81.45 72.1
70 | 🟦 bardsai/jaskier-7b-dpo-v2 72.53 69.28 86.8 64.92 61.64 80.74 71.8
71 | 🔶 Toten5/Marcoroni-v3-neural-chat-v3-3-Slerp 72.51 68.77 86.55 64.51 62.7 80.74 71.8
72 | 🔶 jondurbin/bagel-dpo-8x7b-v0.2 72.49 72.1 86.41 70.27 72.83 83.27 50.04
73 | 🔶 Brillibits/Instruct_Mixtral-8x7B-v0.1_Dolly15K 72.44 69.28 87.59 70.96 64.83 82.56 59.44
74 | 🔶 SanjiWatsuki/Kunoichi-DPO-v2-7B 72.4 69.37 87.42 64.83 66 80.74 66.03
75 | 🔶 mindy-labs/mindy-7b 72.34 69.11 86.57 64.69 60.89 81.06 71.72
76 | 🔶 janhq/supermario-v2 72.34 68.52 86.51 64.88 60.58 81.37 72.18
77 | 🔶 OpenBuddy/openbuddy-deepseek-67b-v15.2 72.33 68.6 86.37 71.5 56.2 84.45 66.87
78 | 🔶 shadowml/Beyonder-4x7B-v2 72.33 68.77 86.8 65.1 60.68 80.9 71.72
79 | 🔶 janhq/supermario-slerp 72.32 68.94 86.58 64.93 60.11 81.29 72.1
80 | ⭕ mncai/yi-34B-v3 72.26 67.06 85.11 75.8 57.54 83.5 64.52
81 | 🔶 Sao10K/Fimbulvetr-10.7B-v1 72.25 68.94 87.27 66.59 60.54 83.5 66.64
82 | 🔶 SanjiWatsuki/Kunoichi-DPO-7B 72.24 69.62 87.14 64.79 67.31 80.58 63.99
83 | 🟦 rwitz2/grindin 72.18 69.88 87.02 64.98 59.34 80.9 70.96
84 | 🔶 SanjiWatsuki/Kunoichi-7B 72.13 68.69 87.1 64.9 64.04 81.06 67.02
85 | ⭕ mncai/yi-34B-v2 72.12 66.13 85 75.64 57.34 83.66 64.97
86 | 🔶 CausalLM/72B-preview 72.12 65.19 83.23 77.14 52.58 82.48 72.1
87 | 🔶 mindy-labs/mindy-7b-v2 72.11 68.69 86.59 65.18 60.16 81.06 70.96
88 | 🔶 CausalLM/72B-preview 72.06 64.85 83.28 77.21 52.51 82.48 72.02
89 | 🔶 rwitz/dec10 72.05 69.11 86.46 64.98 60.42 80.74 70.58
90 | 🔶 rwitz/dec10 72.01 69.2 86.48 64.91 60.52 80.43 70.51
91 | 🔶 cookinai/Valkyrie-V1 71.92 67.24 86.27 64.82 60.4 81.45 71.34
92 | 🔶 AA051611/A0110 71.89 66.38 84.73 74.48 58.6 82.32 64.82
93 | ⭕ DopeorNope/COKAL-v1-70B 71.87 87.46 83.29 68.13 72.79 80.27 39.27
94 | 🟦 bn22/Nous-Hermes-2-SOLAR-10.7B-MISALIGNED 71.83 68.26 86.11 66.26 57.79 83.43 69.14
95 | 🔶 AA051611/A0109 71.83 66.55 84.7 74.44 58.75 82.16 64.37
96 | ⭕ deepseek-ai/deepseek-llm-67b-chat 71.79 67.75 86.82 72.42 55.85 84.21 63.68
97 | 🔶 OpenBuddy/openbuddy-deepseek-67b-v15.1 71.76 67.66 86.49 70.3 54.42 84.77 66.94
98 | 🔶 migtissera/Tess-M-Creative-v1.0 71.73 66.81 85.14 75.54 57.68 83.11 62.09
99 | 🟦 VitalContribution/Evangelion-7B 71.71 68.94 86.45 63.97 64.01 79.95 66.94
100 | ⭕ bhenrym14/platypus-yi-34b 71.69 68.43 85.21 78.13 54.48 84.06 59.82
101 | 🟦 RatanRohith/NeuralPizza-7B-V0.1 71.53 70.48 87.3 64.42 67.22 80.35 59.44
102 |
--------------------------------------------------------------------------------
/benchbench/data/imagenet/leaderboard_raw.tsv:
--------------------------------------------------------------------------------
1 | Weight Acc@1 Acc@5 Params GFLOPS Recipe
2 | AlexNet_Weights.IMAGENET1K_V1 56.522 79.066 61.1M 0.71 link
3 | ConvNeXt_Base_Weights.IMAGENET1K_V1 84.062 96.87 88.6M 15.36 link
4 | ConvNeXt_Large_Weights.IMAGENET1K_V1 84.414 96.976 197.8M 34.36 link
5 | ConvNeXt_Small_Weights.IMAGENET1K_V1 83.616 96.65 50.2M 8.68 link
6 | ConvNeXt_Tiny_Weights.IMAGENET1K_V1 82.52 96.146 28.6M 4.46 link
7 | DenseNet121_Weights.IMAGENET1K_V1 74.434 91.972 8.0M 2.83 link
8 | DenseNet161_Weights.IMAGENET1K_V1 77.138 93.56 28.7M 7.73 link
9 | DenseNet169_Weights.IMAGENET1K_V1 75.6 92.806 14.1M 3.36 link
10 | DenseNet201_Weights.IMAGENET1K_V1 76.896 93.37 20.0M 4.29 link
11 | EfficientNet_B0_Weights.IMAGENET1K_V1 77.692 93.532 5.3M 0.39 link
12 | EfficientNet_B1_Weights.IMAGENET1K_V1 78.642 94.186 7.8M 0.69 link
13 | EfficientNet_B1_Weights.IMAGENET1K_V2 79.838 94.934 7.8M 0.69 link
14 | EfficientNet_B2_Weights.IMAGENET1K_V1 80.608 95.31 9.1M 1.09 link
15 | EfficientNet_B3_Weights.IMAGENET1K_V1 82.008 96.054 12.2M 1.83 link
16 | EfficientNet_B4_Weights.IMAGENET1K_V1 83.384 96.594 19.3M 4.39 link
17 | EfficientNet_B5_Weights.IMAGENET1K_V1 83.444 96.628 30.4M 10.27 link
18 | EfficientNet_B6_Weights.IMAGENET1K_V1 84.008 96.916 43.0M 19.07 link
19 | EfficientNet_B7_Weights.IMAGENET1K_V1 84.122 96.908 66.3M 37.75 link
20 | EfficientNet_V2_L_Weights.IMAGENET1K_V1 85.808 97.788 118.5M 56.08 link
21 | EfficientNet_V2_M_Weights.IMAGENET1K_V1 85.112 97.156 54.1M 24.58 link
22 | EfficientNet_V2_S_Weights.IMAGENET1K_V1 84.228 96.878 21.5M 8.37 link
23 | GoogLeNet_Weights.IMAGENET1K_V1 69.778 89.53 6.6M 1.5 link
24 | Inception_V3_Weights.IMAGENET1K_V1 77.294 93.45 27.2M 5.71 link
25 | MNASNet0_5_Weights.IMAGENET1K_V1 67.734 87.49 2.2M 0.1 link
26 | MNASNet0_75_Weights.IMAGENET1K_V1 71.18 90.496 3.2M 0.21 link
27 | MNASNet1_0_Weights.IMAGENET1K_V1 73.456 91.51 4.4M 0.31 link
28 | MNASNet1_3_Weights.IMAGENET1K_V1 76.506 93.522 6.3M 0.53 link
29 | MaxVit_T_Weights.IMAGENET1K_V1 83.7 96.722 30.9M 5.56 link
30 | MobileNet_V2_Weights.IMAGENET1K_V1 71.878 90.286 3.5M 0.3 link
31 | MobileNet_V2_Weights.IMAGENET1K_V2 72.154 90.822 3.5M 0.3 link
32 | MobileNet_V3_Large_Weights.IMAGENET1K_V1 74.042 91.34 5.5M 0.22 link
33 | MobileNet_V3_Large_Weights.IMAGENET1K_V2 75.274 92.566 5.5M 0.22 link
34 | MobileNet_V3_Small_Weights.IMAGENET1K_V1 67.668 87.402 2.5M 0.06 link
35 | RegNet_X_16GF_Weights.IMAGENET1K_V1 80.058 94.944 54.3M 15.94 link
36 | RegNet_X_16GF_Weights.IMAGENET1K_V2 82.716 96.196 54.3M 15.94 link
37 | RegNet_X_1_6GF_Weights.IMAGENET1K_V1 77.04 93.44 9.2M 1.6 link
38 | RegNet_X_1_6GF_Weights.IMAGENET1K_V2 79.668 94.922 9.2M 1.6 link
39 | RegNet_X_32GF_Weights.IMAGENET1K_V1 80.622 95.248 107.8M 31.74 link
40 | RegNet_X_32GF_Weights.IMAGENET1K_V2 83.014 96.288 107.8M 31.74 link
41 | RegNet_X_3_2GF_Weights.IMAGENET1K_V1 78.364 93.992 15.3M 3.18 link
42 | RegNet_X_3_2GF_Weights.IMAGENET1K_V2 81.196 95.43 15.3M 3.18 link
43 | RegNet_X_400MF_Weights.IMAGENET1K_V1 72.834 90.95 5.5M 0.41 link
44 | RegNet_X_400MF_Weights.IMAGENET1K_V2 74.864 92.322 5.5M 0.41 link
45 | RegNet_X_800MF_Weights.IMAGENET1K_V1 75.212 92.348 7.3M 0.8 link
46 | RegNet_X_800MF_Weights.IMAGENET1K_V2 77.522 93.826 7.3M 0.8 link
47 | RegNet_X_8GF_Weights.IMAGENET1K_V1 79.344 94.686 39.6M 8 link
48 | RegNet_X_8GF_Weights.IMAGENET1K_V2 81.682 95.678 39.6M 8 link
49 | RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1 88.228 98.682 644.8M 374.57 link
50 | RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 86.068 97.844 644.8M 127.52 link
51 | RegNet_Y_16GF_Weights.IMAGENET1K_V1 80.424 95.24 83.6M 15.91 link
52 | RegNet_Y_16GF_Weights.IMAGENET1K_V2 82.886 96.328 83.6M 15.91 link
53 | RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_E2E_V1 86.012 98.054 83.6M 46.73 link
54 | RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 83.976 97.244 83.6M 15.91 link
55 | RegNet_Y_1_6GF_Weights.IMAGENET1K_V1 77.95 93.966 11.2M 1.61 link
56 | RegNet_Y_1_6GF_Weights.IMAGENET1K_V2 80.876 95.444 11.2M 1.61 link
57 | RegNet_Y_32GF_Weights.IMAGENET1K_V1 80.878 95.34 145.0M 32.28 link
58 | RegNet_Y_32GF_Weights.IMAGENET1K_V2 83.368 96.498 145.0M 32.28 link
59 | RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_E2E_V1 86.838 98.362 145.0M 94.83 link
60 | RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 84.622 97.48 145.0M 32.28 link
61 | RegNet_Y_3_2GF_Weights.IMAGENET1K_V1 78.948 94.576 19.4M 3.18 link
62 | RegNet_Y_3_2GF_Weights.IMAGENET1K_V2 81.982 95.972 19.4M 3.18 link
63 | RegNet_Y_400MF_Weights.IMAGENET1K_V1 74.046 91.716 4.3M 0.4 link
64 | RegNet_Y_400MF_Weights.IMAGENET1K_V2 75.804 92.742 4.3M 0.4 link
65 | RegNet_Y_800MF_Weights.IMAGENET1K_V1 76.42 93.136 6.4M 0.83 link
66 | RegNet_Y_800MF_Weights.IMAGENET1K_V2 78.828 94.502 6.4M 0.83 link
67 | RegNet_Y_8GF_Weights.IMAGENET1K_V1 80.032 95.048 39.4M 8.47 link
68 | RegNet_Y_8GF_Weights.IMAGENET1K_V2 82.828 96.33 39.4M 8.47 link
69 | ResNeXt101_32X8D_Weights.IMAGENET1K_V1 79.312 94.526 88.8M 16.41 link
70 | ResNeXt101_32X8D_Weights.IMAGENET1K_V2 82.834 96.228 88.8M 16.41 link
71 | ResNeXt101_64X4D_Weights.IMAGENET1K_V1 83.246 96.454 83.5M 15.46 link
72 | ResNeXt50_32X4D_Weights.IMAGENET1K_V1 77.618 93.698 25.0M 4.23 link
73 | ResNeXt50_32X4D_Weights.IMAGENET1K_V2 81.198 95.34 25.0M 4.23 link
74 | ResNet101_Weights.IMAGENET1K_V1 77.374 93.546 44.5M 7.8 link
75 | ResNet101_Weights.IMAGENET1K_V2 81.886 95.78 44.5M 7.8 link
76 | ResNet152_Weights.IMAGENET1K_V1 78.312 94.046 60.2M 11.51 link
77 | ResNet152_Weights.IMAGENET1K_V2 82.284 96.002 60.2M 11.51 link
78 | ResNet18_Weights.IMAGENET1K_V1 69.758 89.078 11.7M 1.81 link
79 | ResNet34_Weights.IMAGENET1K_V1 73.314 91.42 21.8M 3.66 link
80 | ResNet50_Weights.IMAGENET1K_V1 76.13 92.862 25.6M 4.09 link
81 | ResNet50_Weights.IMAGENET1K_V2 80.858 95.434 25.6M 4.09 link
82 | ShuffleNet_V2_X0_5_Weights.IMAGENET1K_V1 60.552 81.746 1.4M 0.04 link
83 | ShuffleNet_V2_X1_0_Weights.IMAGENET1K_V1 69.362 88.316 2.3M 0.14 link
84 | ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1 72.996 91.086 3.5M 0.3 link
85 | ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1 76.23 93.006 7.4M 0.58 link
86 | SqueezeNet1_0_Weights.IMAGENET1K_V1 58.092 80.42 1.2M 0.82 link
87 | SqueezeNet1_1_Weights.IMAGENET1K_V1 58.178 80.624 1.2M 0.35 link
88 | Swin_B_Weights.IMAGENET1K_V1 83.582 96.64 87.8M 15.43 link
89 | Swin_S_Weights.IMAGENET1K_V1 83.196 96.36 49.6M 8.74 link
90 | Swin_T_Weights.IMAGENET1K_V1 81.474 95.776 28.3M 4.49 link
91 | Swin_V2_B_Weights.IMAGENET1K_V1 84.112 96.864 87.9M 20.32 link
92 | Swin_V2_S_Weights.IMAGENET1K_V1 83.712 96.816 49.7M 11.55 link
93 | Swin_V2_T_Weights.IMAGENET1K_V1 82.072 96.132 28.4M 5.94 link
94 | VGG11_BN_Weights.IMAGENET1K_V1 70.37 89.81 132.9M 7.61 link
95 | VGG11_Weights.IMAGENET1K_V1 69.02 88.628 132.9M 7.61 link
96 | VGG13_BN_Weights.IMAGENET1K_V1 71.586 90.374 133.1M 11.31 link
97 | VGG13_Weights.IMAGENET1K_V1 69.928 89.246 133.0M 11.31 link
98 | VGG16_BN_Weights.IMAGENET1K_V1 73.36 91.516 138.4M 15.47 link
99 | VGG16_Weights.IMAGENET1K_V1 71.592 90.382 138.4M 15.47 link
100 | VGG16_Weights.IMAGENET1K_FEATURES nan nan 138.4M 15.47 link
101 | VGG19_BN_Weights.IMAGENET1K_V1 74.218 91.842 143.7M 19.63 link
102 | VGG19_Weights.IMAGENET1K_V1 72.376 90.876 143.7M 19.63 link
103 | ViT_B_16_Weights.IMAGENET1K_V1 81.072 95.318 86.6M 17.56 link
104 | ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1 85.304 97.65 86.9M 55.48 link
105 | ViT_B_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 81.886 96.18 86.6M 17.56 link
106 | ViT_B_32_Weights.IMAGENET1K_V1 75.912 92.466 88.2M 4.41 link
107 | ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1 88.552 98.694 633.5M 1016.72 link
108 | ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1 85.708 97.73 632.0M 167.29 link
109 | ViT_L_16_Weights.IMAGENET1K_V1 79.662 94.638 304.3M 61.55 link
110 | ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1 88.064 98.512 305.2M 361.99 link
111 | ViT_L_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 85.146 97.422 304.3M 61.55 link
112 | ViT_L_32_Weights.IMAGENET1K_V1 76.972 93.07 306.5M 15.38 link
113 | Wide_ResNet101_2_Weights.IMAGENET1K_V1 78.848 94.284 126.9M 22.75 link
114 | Wide_ResNet101_2_Weights.IMAGENET1K_V2 82.51 96.02 126.9M 22.75 link
115 | Wide_ResNet50_2_Weights.IMAGENET1K_V1 78.468 94.086 68.9M 11.4 link
116 |
--------------------------------------------------------------------------------
/benchbench/data/helm_lite/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | Model Mean win rate NarrativeQA - F1 NaturalQuestions (open) - F1 NaturalQuestions (closed) - F1 OpenbookQA - EM MMLU - EM MATH - Equivalent (CoT) GSM8K - EM LegalBench - EM MedQA - EM WMT 2014 - BLEU-4
2 | GPT-4o (2024-05-13) 0.938 0.804 0.803 0.501 0.966 0.748 0.829 0.905 0.733 0.857 0.231
3 | GPT-4o (2024-08-06) 0.928 0.795 0.793 0.496 0.968 0.738 0.853 0.909 0.721 0.863 0.225
4 | DeepSeek v3 0.908 0.796 0.765 0.467 0.954 0.803 0.912 0.94 0.718 0.809 0.209
5 | Claude 3.5 Sonnet (20240620) 0.885 0.746 0.749 0.502 0.972 0.799 0.813 0.949 0.707 0.825 0.229
6 | Amazon Nova Pro 0.885 0.791 0.829 0.405 0.96 0.758 0.821 0.87 0.736 0.811 0.229
7 | GPT-4 (0613) 0.867 0.768 0.79 0.457 0.96 0.735 0.802 0.932 0.713 0.815 0.211
8 | GPT-4 Turbo (2024-04-09) 0.864 0.761 0.795 0.482 0.97 0.711 0.833 0.824 0.727 0.783 0.218
9 | Llama 3.1 Instruct Turbo (405B) 0.854 0.749 0.756 0.456 0.94 0.759 0.827 0.949 0.707 0.805 0.238
10 | Claude 3.5 Sonnet (20241022) 0.846 0.77 0.665 0.467 0.966 0.809 0.904 0.956 0.647 0.859 0.226
11 | Gemini 1.5 Pro (002) 0.842 0.756 0.726 0.455 0.952 0.795 0.92 0.817 0.747 0.771 0.231
12 | Llama 3.2 Vision Instruct Turbo (90B) 0.819 0.777 0.739 0.457 0.942 0.703 0.791 0.936 0.68 0.769 0.224
13 | Gemini 2.0 Flash (Experimental) 0.813 0.783 0.722 0.443 0.946 0.717 0.901 0.946 0.674 0.73 0.212
14 | Llama 3.3 Instruct Turbo (70B) 0.812 0.791 0.737 0.431 0.928 0.7 0.808 0.942 0.725 0.761 0.219
15 | Llama 3.1 Instruct Turbo (70B) 0.808 0.772 0.738 0.452 0.938 0.709 0.783 0.938 0.687 0.769 0.223
16 | Palmyra-X-004 0.808 0.773 0.754 0.457 0.926 0.739 0.767 0.905 0.73 0.775 0.203
17 | Llama 3 (70B) 0.793 0.798 0.743 0.475 0.934 0.695 0.663 0.805 0.733 0.777 0.225
18 | Qwen2 Instruct (72B) 0.77 0.727 0.776 0.39 0.954 0.769 0.79 0.92 0.712 0.746 0.207
19 | Qwen2.5 Instruct Turbo (72B) 0.745 0.745 0.676 0.359 0.962 0.77 0.884 0.9 0.74 0.753 0.207
20 | Mistral Large 2 (2407) 0.744 0.779 0.734 0.453 0.932 0.725 0.677 0.912 0.646 0.775 0.192
21 | Gemini 1.5 Pro (001) 0.739 0.783 0.748 0.378 0.902 0.772 0.825 0.836 0.757 0.692 0.189
22 | Amazon Nova Lite 0.708 0.768 0.815 0.352 0.928 0.693 0.779 0.829 0.659 0.696 0.204
23 | Mixtral (8x22B) 0.705 0.779 0.726 0.478 0.882 0.701 0.656 0.8 0.708 0.704 0.209
24 | GPT-4o mini (2024-07-18) 0.701 0.768 0.746 0.386 0.92 0.668 0.802 0.843 0.653 0.748 0.206
25 | GPT-4 Turbo (1106 preview) 0.698 0.727 0.763 0.435 0.95 0.699 0.857 0.668 0.626 0.817 0.205
26 | Claude 3 Opus (20240229) 0.683 0.351 0.264 0.441 0.956 0.768 0.76 0.924 0.662 0.775 0.24
27 | Palmyra X V3 (72B) 0.679 0.706 0.685 0.407 0.938 0.702 0.723 0.831 0.709 0.684 0.262
28 | Gemma 2 Instruct (27B) 0.675 0.79 0.731 0.353 0.918 0.664 0.746 0.812 0.7 0.684 0.214
29 | Gemini 1.5 Flash (001) 0.667 0.783 0.723 0.332 0.928 0.703 0.753 0.785 0.661 0.68 0.225
30 | PaLM-2 (Unicorn) 0.644 0.583 0.674 0.435 0.938 0.702 0.674 0.831 0.677 0.684 0.26
31 | Jamba 1.5 Large 0.637 0.664 0.718 0.394 0.948 0.683 0.692 0.846 0.675 0.698 0.203
32 | Qwen1.5 (72B) 0.608 0.601 0.758 0.417 0.93 0.647 0.683 0.799 0.694 0.67 0.201
33 | Solar Pro 0.602 0.753 0.792 0.297 0.922 0.679 0.567 0.871 0.67 0.698 0.169
34 | Palmyra X V2 (33B) 0.589 0.752 0.752 0.428 0.878 0.621 0.58 0.735 0.644 0.598 0.239
35 | Gemini 1.5 Flash (002) 0.573 0.746 0.718 0.323 0.914 0.679 0.908 0.328 0.67 0.656 0.212
36 | Yi (34B) 0.57 0.782 0.775 0.443 0.92 0.65 0.375 0.648 0.618 0.656 0.172
37 | Gemma 2 Instruct (9B) 0.562 0.768 0.738 0.328 0.91 0.645 0.724 0.762 0.639 0.63 0.201
38 | Qwen1.5 Chat (110B) 0.55 0.721 0.739 0.35 0.922 0.704 0.568 0.815 0.624 0.64 0.192
39 | Qwen1.5 (32B) 0.546 0.589 0.777 0.353 0.932 0.628 0.733 0.773 0.636 0.656 0.193
40 | Claude 3.5 Haiku (20241022) 0.531 0.763 0.639 0.344 0.854 0.671 0.872 0.815 0.631 0.722 0.135
41 | PaLM-2 (Bison) 0.526 0.718 0.813 0.39 0.878 0.608 0.421 0.61 0.645 0.547 0.241
42 | Amazon Nova Micro 0.524 0.744 0.779 0.285 0.888 0.64 0.76 0.794 0.615 0.608 0.192
43 | Claude v1.3 0.518 0.723 0.699 0.409 0.908 0.631 0.54 0.784 0.629 0.618 0.219
44 | Mixtral (8x7B 32K seqlen) 0.51 0.767 0.699 0.427 0.868 0.649 0.494 0.622 0.63 0.652 0.19
45 | Phi-3 (14B) 0.509 0.724 0.729 0.278 0.916 0.675 0.611 0.878 0.593 0.696 0.17
46 | Claude 2.0 0.489 0.718 0.67 0.428 0.862 0.639 0.603 0.583 0.643 0.652 0.219
47 | DeepSeek LLM Chat (67B) 0.488 0.581 0.733 0.412 0.88 0.641 0.615 0.795 0.637 0.628 0.186
48 | Qwen2.5 Instruct Turbo (7B) 0.488 0.742 0.725 0.205 0.862 0.658 0.835 0.83 0.632 0.6 0.155
49 | Llama 2 (70B) 0.482 0.763 0.674 0.46 0.838 0.58 0.323 0.567 0.673 0.618 0.196
50 | Phi-3 (7B) 0.473 0.754 0.675 0.324 0.912 0.659 0.703 - 0.584 0.672 0.154
51 | Yi Large (Preview) 0.471 0.373 0.586 0.428 0.946 0.712 0.712 0.69 0.519 0.66 0.176
52 | Command R Plus 0.441 0.735 0.711 0.343 0.828 0.59 0.403 0.738 0.672 0.567 0.203
53 | GPT-3.5 (text-davinci-003) 0.439 0.731 0.77 0.413 0.828 0.555 0.449 0.615 0.622 0.531 0.191
54 | Claude 2.1 0.437 0.677 0.611 0.375 0.872 0.643 0.632 0.604 0.643 0.644 0.204
55 | Qwen1.5 (14B) 0.425 0.711 0.772 0.3 0.862 0.626 0.686 0.693 0.593 0.515 0.178
56 | Gemini 1.0 Pro (002) 0.422 0.751 0.714 0.391 0.788 0.534 0.665 0.816 0.475 0.483 0.194
57 | Jamba 1.5 Mini 0.414 0.746 0.71 0.388 0.89 0.582 0.318 0.691 0.503 0.632 0.179
58 | Claude Instant 1.2 0.399 0.616 0.731 0.343 0.844 0.631 0.499 0.721 0.586 0.559 0.194
59 | Llama 3 (8B) 0.387 0.754 0.681 0.378 0.766 0.602 0.391 0.499 0.637 0.581 0.183
60 | Claude 3 Sonnet (20240229) 0.377 0.111 0.072 0.028 0.918 0.652 0.084 0.907 0.49 0.684 0.218
61 | GPT-3.5 Turbo (0613) 0.358 0.655 0.678 0.335 0.838 0.614 0.667 0.501 0.528 0.622 0.187
62 | LLaMA (65B) 0.345 0.755 0.672 0.433 0.754 0.584 0.257 0.489 0.48 0.507 0.189
63 | Arctic Instruct 0.338 0.654 0.586 0.39 0.828 0.575 0.519 0.768 0.588 0.581 0.172
64 | Gemma (7B) 0.336 0.752 0.665 0.336 0.808 0.571 0.5 0.559 0.581 0.513 0.187
65 | GPT-3.5 (text-davinci-002) 0.336 0.719 0.71 0.394 0.796 0.568 0.428 0.479 0.58 0.525 0.174
66 | Mistral NeMo (2402) 0.333 0.731 0.65 0.265 0.822 0.604 0.668 0.782 0.415 0.59 0.177
67 | Mistral Large (2402) 0.328 0.454 0.485 0.311 0.894 0.638 0.75 0.694 0.479 0.499 0.182
68 | Command 0.327 0.749 0.777 0.391 0.774 0.525 0.236 0.452 0.578 0.445 0.088
69 | Llama 3.2 Vision Instruct Turbo (11B) 0.325 0.756 0.671 0.234 0.724 0.511 0.739 0.823 0.435 0.27 0.179
70 | Llama 3.1 Instruct Turbo (8B) 0.303 0.756 0.677 0.209 0.74 0.5 0.703 0.798 0.342 0.245 0.181
71 | Command R 0.299 0.742 0.72 0.352 0.782 0.567 0.266 0.551 0.507 0.555 0.149
72 | Mistral v0.1 (7B) 0.292 0.716 0.687 0.367 0.776 0.584 0.297 0.377 0.58 0.525 0.16
73 | DBRX Instruct 0.289 0.488 0.55 0.284 0.91 0.643 0.358 0.671 0.426 0.694 0.131
74 | Mistral Small (2402) 0.288 0.519 0.587 0.304 0.862 0.593 0.621 0.734 0.389 0.616 0.169
75 | Jamba Instruct 0.287 0.658 0.636 0.384 0.796 0.582 0.38 0.67 0.54 0.519 0.164
76 | Qwen1.5 (7B) 0.275 0.448 0.749 0.27 0.806 0.569 0.561 0.6 0.523 0.479 0.153
77 | Mistral Medium (2312) 0.268 0.449 0.468 0.29 0.83 0.618 0.565 0.706 0.452 0.61 0.169
78 | Claude 3 Haiku (20240307) 0.263 0.244 0.252 0.144 0.838 0.662 0.131 0.699 0.46 0.702 0.148
79 | Yi (6B) 0.253 0.702 0.748 0.31 0.8 0.53 0.126 0.375 0.519 0.497 0.117
80 | Llama 2 (13B) 0.233 0.741 0.64 0.371 0.634 0.505 0.102 0.266 0.591 0.392 0.167
81 | Falcon (40B) 0.217 0.671 0.676 0.392 0.662 0.507 0.128 0.267 0.442 0.419 0.162
82 | Jurassic-2 Jumbo (178B) 0.215 0.728 0.65 0.385 0.688 0.483 0.103 0.239 0.533 0.431 0.114
83 | Mistral Instruct v0.3 (7B) 0.196 0.716 0.68 0.253 0.79 0.51 0.289 0.538 0.331 0.517 0.142
84 | Jurassic-2 Grande (17B) 0.172 0.744 0.627 0.35 0.614 0.471 0.064 0.159 0.468 0.39 0.102
85 | Phi-2 0.169 0.703 0.68 0.155 0.798 0.518 0.255 0.581 0.334 0.41 0.038
86 | Llama 2 (7B) 0.152 0.686 0.612 0.333 0.544 0.425 0.097 0.154 0.502 0.392 0.144
87 | Luminous Supreme (70B) 0.145 0.743 0.656 0.299 0.284 0.316 0.078 0.137 0.452 0.276 0.102
88 | Command Light 0.105 0.629 0.686 0.195 0.398 0.386 0.098 0.149 0.397 0.312 0.023
89 | Luminous Extended (30B) 0.078 0.684 0.611 0.253 0.272 0.248 0.04 0.075 0.421 0.276 0.083
90 | Falcon (7B) 0.064 0.621 0.58 0.285 0.26 0.288 0.044 0.055 0.346 0.254 0.094
91 | OLMo (7B) 0.052 0.597 0.603 0.259 0.222 0.305 0.029 0.044 0.341 0.229 0.097
92 | Luminous Base (13B) 0.041 0.633 0.577 0.197 0.286 0.243 0.026 0.028 0.332 0.26 0.066
93 |
--------------------------------------------------------------------------------
/benchbench/data/glue/leaderboard.tsv:
--------------------------------------------------------------------------------
1 | Rank Name Model URL Score CoLA SST-2 MRPC STS-B QQP MNLI-m MNLI-mm QNLI RTE WNLI AX
2 | 1 Microsoft Alexander v-team Turing ULR v6 91.3 73.3 97.5 94.2/92.3 93.5/93.1 76.4/90.9 92.5 92.1 96.7 93.6 97.9 55.4
3 | 2 JDExplore d-team Vega v1 91.3 73.8 97.9 94.5/92.6 93.5/93.1 76.7/91.1 92.1 91.9 96.7 92.4 97.9 51.4
4 | 3 Microsoft Alexander v-team Turing NLR v5 91.2 72.6 97.6 93.8/91.7 93.7/93.3 76.4/91.1 92.6 92.4 97.9 94.1 95.9 57.0
5 | 4 DIRL Team DeBERTa + CLEVER 91.1 74.7 97.6 93.3/91.1 93.4/93.1 76.5/91.0 92.1 91.8 96.7 93.2 96.6 53.3
6 | 5 ERNIE Team - Baidu ERNIE 91.1 75.5 97.8 93.9/91.8 93.0/92.6 75.2/90.9 92.3 91.7 97.3 92.6 95.9 51.7
7 | 6 AliceMind & DIRL StructBERT + CLEVER 91.0 75.3 97.7 93.9/91.9 93.5/93.1 75.6/90.8 91.7 91.5 97.4 92.5 95.2 49.1
8 | 7 DeBERTa Team - Microsoft DeBERTa / TuringNLRv4 90.8 71.5 97.5 94.0/92.0 92.9/92.6 76.2/90.8 91.9 91.6 99.2 93.2 94.5 53.2
9 | 8 HFL iFLYTEK MacALBERT + DKM 90.7 74.8 97.0 94.5/92.6 92.8/92.6 74.7/90.6 91.3 91.1 97.8 92.0 94.5 52.6
10 | 9 PING-AN Omni-Sinitic ALBERT + DAAF + NAS 90.6 73.5 97.2 94.0/92.0 93.0/92.4 76.1/91.0 91.6 91.3 97.5 91.7 94.5 51.2
11 | 10 T5 Team - Google T5 90.3 71.6 97.5 92.8/90.4 93.1/92.8 75.1/90.6 92.2 91.9 96.9 92.8 94.5 53.1
12 | 11 Microsoft D365 AI & MSR AI & GATECH MT-DNN-SMART 89.9 69.5 97.5 93.7/91.6 92.9/92.5 73.9/90.2 91.0 90.8 99.2 89.7 94.5 50.2
13 | 12 Huawei Noah's Ark Lab NEZHA-Large 89.8 71.7 97.3 93.3/91.0 92.4/91.9 75.2/90.7 91.5 91.3 96.2 90.3 94.5 47.9
14 | 13 LG AI Research ANNA 89.8 68.7 97.0 92.7/90.1 93.0/92.8 75.3/90.5 91.8 91.6 96.0 91.8 95.9 51.8
15 | 14 Zihang Dai Funnel-Transformer (Ensemble B10-10-10H1024) 89.7 70.5 97.5 93.4/91.2 92.6/92.3 75.4/90.7 91.4 91.1 95.8 90.0 94.5 51.6
16 | 15 ELECTRA Team ELECTRA-Large + Standard Tricks 89.4 71.7 97.1 93.1/90.7 92.9/92.5 75.6/90.8 91.3 90.8 95.8 89.8 91.8 50.7
17 | 16 David Kim 2digit LANet 89.3 71.8 97.3 92.4/89.6 93.0/92.7 75.5/90.5 91.8 91.6 96.4 91.1 88.4 54.6
18 | 17 倪仕文 DropAttack-RoBERTa-large 88.8 70.3 96.7 92.6/90.1 92.1/91.8 75.1/90.5 91.1 90.9 95.3 89.9 89.7 48.2
19 | 18 Microsoft D365 AI & UMD FreeLB-RoBERTa (ensemble) 88.4 68.0 96.8 93.1/90.8 92.3/92.1 74.8/90.3 91.1 90.7 95.6 88.7 89.0 50.1
20 | 19 Junjie Yang HIRE-RoBERTa 88.3 68.6 97.1 93.0/90.7 92.4/92.0 74.3/90.2 90.7 90.4 95.5 87.9 89.0 49.3
21 | 20 Shiwen Ni ELECTRA-large-M (bert4keras) 88.3 69.3 95.8 92.2/89.6 91.2/91.1 75.1/90.5 91.1 90.9 93.8 87.9 91.8 48.2
22 | 21 Facebook AI RoBERTa 88.1 67.8 96.7 92.3/89.8 92.2/91.9 74.3/90.2 90.8 90.2 95.4 88.2 89.0 48.7
23 | 22 Microsoft D365 AI & MSR AI MT-DNN-ensemble 87.6 68.4 96.5 92.7/90.3 91.1/90.7 73.7/89.9 87.9 87.4 96.0 86.3 89.0 42.8
24 | 23 GLUE Human Baselines GLUE Human Baselines 87.1 66.4 97.8 86.3/80.8 92.7/92.6 59.5/80.4 92.0 92.8 91.2 93.6 95.9 -
25 | 24 kk xx ELECTRA-Large-NewSCL(single) 85.6 73.3 97.2 92.7/90.2 92.0/91.7 75.3/90.6 90.8 90.3 95.6 86.9 60.3 50.0
26 | 25 Adrian de Wynter Bort (Alexa AI) 83.6 63.9 96.2 94.1/92.3 89.2/88.3 66.0/85.9 88.1 87.8 92.3 82.7 71.2 51.9
27 | 26 Lab LV ConvBERT base 83.2 67.8 95.7 91.4/88.3 90.4/89.7 73.0/90.0 88.3 87.4 93.2 77.9 65.1 42.9
28 | 27 Stanford Hazy Research Snorkel MeTaL 83.2 63.8 96.2 91.5/88.5 90.1/89.7 73.1/89.9 87.6 87.2 93.9 80.9 65.1 39.9
29 | 28 XLM Systems XLM (English only) 83.1 62.9 95.6 90.7/87.1 88.8/88.2 73.2/89.8 89.1 88.5 94.0 76.0 71.9 44.7
30 | 29 WATCH ME ConvBERT-base-paddle-v1.1 83.1 66.3 95.4 91.6/88.6 90.0/89.2 73.9/90.0 88.2 87.7 93.3 78.2 65.1 9.2
31 | 30 Zhuosheng Zhang SemBERT 82.9 62.3 94.6 91.2/88.3 87.8/86.7 72.8/89.8 87.6 86.3 94.6 84.5 65.1 42.4
32 | 31 Jun Yu mpnet-base-paddle 82.9 60.5 95.9 91.6/88.9 90.8/90.3 72.5/89.7 87.6 86.6 93.3 82.4 65.1 9.2
33 | 32 Danqi Chen SpanBERT (single-task training) 82.8 64.3 94.8 90.9/87.9 89.9/89.1 71.9/89.5 88.1 87.7 94.3 79.0 65.1 45.1
34 | 33 GAL team distilRoBERTa+GAL (6-layer transformer single model) 82.6 60.0 95.3 91.9/89.2 90.0/89.6 73.3/90.0 87.4 86.5 92.7 81.8 65.1 0.0
35 | 34 Kevin Clark BERT + BAM 82.3 61.5 95.2 91.3/88.3 88.6/87.9 72.5/89.7 86.6 85.8 93.1 80.4 65.1 40.7
36 | 35 Nitish Shirish Keskar Span-Extractive BERT on STILTs 82.3 63.2 94.5 90.6/87.6 89.4/89.2 72.2/89.4 86.5 85.8 92.5 79.8 65.1 28.3
37 | 36 LV NUS LV-BERT-base 82.1 64.0 94.7 90.9/87.9 89.4/88.8 72.3/89.5 86.6 86.1 92.6 77.0 65.1 39.5
38 | 37 Jason Phang BERT on STILTs 82.0 62.1 94.3 90.2/86.6 88.7/88.3 71.9/89.4 86.4 85.6 92.7 80.1 65.1 28.3
39 | 38 gao jie 1 82.0 66.8 96.5 90.9/87.2 91.4/90.8 72.9/89.6 90.2 56.4 94.7 82.8 62.3 9.2
40 | 39 Gino Tesei RobustRoBERTa 81.9 63.6 96.8 91.6/88.6 90.3/89.6 73.2/89.7 90.0 89.4 95.1 50.3 80.1 50.5
41 | 40 Karen Hambardzumyan WARP with RoBERTa 81.6 53.9 96.3 88.2/83.9 89.5/88.8 68.6/87.7 88.0 88.2 93.5 84.3 65.1 41.2
42 | 41 Junxiong Wang Bigs-128-1000k 81.5 64.4 94.9 88.7/84.2 87.8/87.5 71.2/89.2 86.1 85.0 91.6 77.6 65.1 36.2
43 | 42 Huawei Noah's Ark Lab MTL CombinedKD-TinyRoBERTa (6 layer 82M parameters, MATE-KD + AnnealingKD) 81.5 58.6 95.1 91.2/88.1 88.5/88.4 73.0/89.7 86.2 85.6 92.4 76.6 65.1 20.2
44 | 43 Richard Bai segaBERT-large 81.4 62.6 94.8 89.7/86.1 88.6/87.7 72.5/89.4 87.9 87.7 94.0 71.6 65.1 0.0
45 | 44 廖亿 u-PMLM-R (Huawei Noah's Ark Lab) 81.3 56.9 94.2 90.7/87.7 89.7/89.1 72.2/89.4 86.1 85.4 92.1 78.5 65.1 40.0
46 | 45 Xinsong Zhang AMBERT-BASE 81.0 60.0 95.2 90.6/87.1 86.3/88.2 72.2/89.5 87.2 86.5 92.6 72.6 65.1 39.4
47 | 46 Mikita Sazanovich Routed BERTs 80.7 56.1 93.6 88.6/84.7 88.0/87.6 71.0/88.8 85.2 84.5 92.6 80.0 65.1 9.2
48 | 47 USCD-AI4Health Team CERT 80.7 58.9 94.6 89.8/85.9 87.9/86.8 72.5/90.3 87.2 86.4 93.0 71.2 65.1 39.6
49 | 48 Jacob Devlin BERT: 24-layers, 16-heads, 1024-hidden 80.5 60.5 94.9 89.3/85.4 87.6/86.5 72.1/89.3 86.7 85.9 92.7 70.1 65.1 39.6
50 | 49 Chen Qian KerasNLP XLM-R 80.4 56.3 96.1 89.8/86.3 88.4/87.7 72.3/89.0 87.7 87.1 92.8 69.2 65.1 40.6
51 | 50 Chen Qian KerasNLP RoBERTa 80.4 56.3 96.1 89.8/86.3 88.4/87.7 72.3/89.0 87.7 87.1 92.8 69.2 65.1 40.6
52 | 51 Jinliang LU MULTIPLE_ADAPTER_T5_BASE 80.3 54.1 93.8 90.1/86.8 87.9/87.6 71.8/88.9 86.1 85.7 93.5 76.8 62.3 9.2
53 | 52 Yoshitomo Matsubara HF bert-large-uncased (default fine-tuning) 80.2 61.5 94.6 89.2/85.2 86.4/85.0 72.2/89.3 86.4 85.7 92.4 68.9 65.1 36.9
54 | 53 Neil Houlsby BERT + Single-task Adapters 80.2 59.2 94.3 88.7/84.3 87.3/86.1 71.5/89.4 85.4 85.0 92.4 71.6 65.1 9.2
55 | 54 KI BERT KI-BERT 80.0 55.6 94.5 88.2/83.9 86.3/85.1 71.5/88.9 85.2 83.7 91.2 69.3 73.3 35.6
56 | 55 Xiangyang Liu elasticbert-large-12L 79.9 57.0 92.9 89.4/86.0 89.7/88.6 72.7/89.6 85.4 84.9 92.3 71.8 62.3 9.2
57 | 56 刘向阳 roberta-large-12L 79.8 59.4 94.6 89.1/85.8 89.8/89.1 71.5/89.4 86.4 85.2 91.6 67.3 62.3 9.2
58 | 57 Zhuohan Li Macaron Net-base 79.7 57.6 94.0 88.4/84.4 87.5/86.3 70.8/89.0 85.4 84.5 91.6 70.5 65.1 38.7
59 | 58 shi To GAT-bert-base 79.6 56.8 94.0 89.4/85.3 87.9/86.8 72.4/89.4 85.7 84.5 91.8 70.5 62.3 9.2
60 | 59 teerapong saelim WT-VAT-BERT (Base) 79.5 56.0 94.4 89.2/85.5 87.3/86.2 72.9/89.8 85.5 84.8 91.4 70.4 62.3 9.2
61 | 60 Anshuman Singh Bert-n-Pals 79.1 52.2 93.4 89.5/85.6 86.6/85.9 71.4/89.0 84.1 83.5 90.6 75.4 62.3 33.8
62 | 61 ANSHUMAN SINGH (RA1811003010460) DeepPavlov Multitask PalBert 78.8 48.1 93.4 88.9/85.6 87.0/86.7 71.4/89.0 83.9 83.4 90.8 76.7 62.3 33.8
63 | 62 xiaok Liu BERT-EMD(6-layer; Single model; No DA) 78.7 47.5 93.3 89.8/86.4 87.6/86.8 72.0/89.3 84.7 83.5 90.7 71.7 65.1 9.2
64 | 63 蘇大鈞 SesameBERT-Base 78.6 52.7 94.2 88.9/84.8 86.5/85.5 70.8/88.8 83.7 83.6 91.0 67.6 65.1 35.8
65 | 64 xinge ma ReptileDistil 78.5 47.9 92.8 89.2/85.4 87.1/85.9 71.0/89.0 83.6 82.9 90.4 73.5 65.1 33.2
66 | 65 MobileBERT Team MobileBERT 78.5 51.1 92.6 88.8/84.5 86.2/84.8 70.5/88.3 84.3 83.4 91.6 70.4 65.1 34.3
67 | 66 Linyuan Gong StackingBERT-Base 78.4 56.2 93.9 88.2/83.9 84.2/82.5 70.4/88.7 84.4 84.2 90.1 67.0 65.1 36.6
68 | 67 TinyBERT Team TinyBERT (6-layer; Single model) 78.1 51.1 93.1 87.3/82.6 85.0/83.7 71.6/89.1 84.6 83.2 90.4 70.0 65.1 9.2
69 | 68 SqueezeBERT Team SqueezeBERT (4.3x faster than BERT-base on smartphone) 78.1 46.5 91.4 89.5/86.0 87.0/86.3 71.5/89.0 82.0 81.1 90.1 73.2 65.1 35.3
70 | 69 Anshuman Singh CAMTL 77.9 53.0 92.6 88.3/84.4 86.6/85.9 70.0/88.5 82.3 82.0 90.5 72.8 58.2 33.8
71 | 70 傅薛林 KRISFU 77.8 52.4 92.5 89.0/84.8 83.7/82.2 70.4/88.6 84.3 83.4 90.9 65.9 65.1 36.1
72 | 71 王上 s0 77.8 46.8 92.9 88.9/84.8 87.2/86.5 71.9/89.1 84.5 83.4 90.8 70.9 60.3 35.3
73 | 72 Stark Tony Pocket GLUE 77.6 49.3 92.4 89.0/84.6 84.9/84.0 70.1/88.7 84.0 82.8 90.1 67.2 65.1 36.1
74 | 73 Pavan Kalyan Reddy Neerudu Pavan Neerudu - BERT 77.6 56.1 93.5 87.6/83.2 85.3/83.8 70.6/88.8 84.0 83.4 90.8 64.0 60.3 34.6
75 | 74 NLC MSR Asia BERT-of-Theseus (6-layer; single model) 77.1 47.8 92.2 87.6/83.2 85.6/84.1 71.6/89.3 82.4 82.1 89.6 66.2 65.1 9.2
76 | 75 Hanxiong Huang Hanxiong Huang 75.9 49.3 93.3 87.1/81.9 83.3/81.7 71.5/89.1 84.8 83.8 91.0 64.1 53.4 9.2
77 | 76 YeonTaek Oh EL-BERT(6-Layer, Single model) 75.6 47.7 91.0 87.8/83.0 81.2/80.2 69.9/88.1 81.8 81.0 90.2 59.9 65.1 31.8
78 | 77 EVS Team Anonymous 74.7 52.6 93.4 87.6/83.2 61.2/59.1 71.8/89.3 83.7 83.2 89.9 65.0 62.3 35.6
79 | 78 Chen Money KerasNLP 12/05/2022 Trial 2 74.6 52.2 93.5 87.8/82.6 84.5/83.1 71.3/89.3 82.3 81.6 89.3 61.7 43.8 32.9
80 | 79 Sinx ZHIYUAN 74.1 57.0 95.2 91.4/88.4 91.1/90.8 24.2/23.7 87.7 87.3 92.5 81.7 47.9 0.3
81 | 80 Tirana Noor Fatyanosa distilbert-base-uncased 73.6 45.8 92.3 87.6/83.1 71.0/71.0 69.6/88.2 81.6 81.3 88.8 54.1 65.1 31.8
82 | 81 Haiqin YANG RefBERT 73.1 47.9 92.9 86.9/81.9 75.0/76.3 61.6/84.4 80.9 80.3 87.3 61.7 54.8 -10.3
83 | 82 Haiqin Yang RefBERT 73.1 47.9 92.9 86.9/81.9 75.0/76.3 61.4/84.2 80.9 80.3 87.3 61.7 54.8 -10.3
84 | 83 Haiqin Yang RefBERT 71.8 36.3 92.9 86.9/81.9 75.0/76.3 61.6/83.8 80.9 80.3 87.3 61.7 54.8 -10.3
85 | 84 Haiqin Yang RefBERT 71.8 36.3 92.9 86.9/81.9 75.0/76.3 61.3/83.6 80.9 80.3 87.3 61.7 54.8 -10.3
86 | 85 公能公能 1111 71.4 35.8 90.1 83.2/75.7 81.0/79.3 68.5/87.5 77.5 77.1 86.7 58.0 56.8 9.2
87 | 86 Jack Hessel Bag-of-words only BoW-BERT (Base) 70.0 14.3 86.7 82.9/75.2 81.8/80.3 68.3/87.5 79.8 79.7 86.2 60.4 65.1 31.0
88 | 87 GLUE Baselines BiLSTM+ELMo+Attn 70.0 33.6 90.4 84.4/78.0 74.2/72.3 63.1/84.3 74.1 74.5 79.8 58.9 65.1 21.7
89 |
--------------------------------------------------------------------------------
/benchbench/data/heim/alignment_auto.tsv:
--------------------------------------------------------------------------------
1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Expected CLIP score ↑ [ sort ] MS-COCO (base) - Max CLIP score ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected CLIP score ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Max CLIP score ↑ [ sort ] DrawBench (image quality categories) - Expected CLIP score ↑ [ sort ] DrawBench (image quality categories) - Max CLIP score ↑ [ sort ] PartiPrompts (image quality categories) - Expected CLIP score ↑ [ sort ] PartiPrompts (image quality categories) - Max CLIP score ↑ [ sort ] dailydall.e - Expected CLIP score ↑ [ sort ] dailydall.e - Max CLIP score ↑ [ sort ] Landing Page - Expected CLIP score ↑ [ sort ] Landing Page - Max CLIP score ↑ [ sort ] Logos - Expected CLIP score ↑ [ sort ] Logos - Max CLIP score ↑ [ sort ] Magazine Cover Photos - Expected CLIP score ↑ [ sort ] Magazine Cover Photos - Max CLIP score ↑ [ sort ] Common Syntactic Processes - Expected CLIP score ↑ [ sort ] Common Syntactic Processes - Max CLIP score ↑ [ sort ] DrawBench (reasoning categories) - Expected CLIP score ↑ [ sort ] DrawBench (reasoning categories) - Max CLIP score ↑ [ sort ] PartiPrompts (reasoning categories) - Expected CLIP score ↑ [ sort ] PartiPrompts (reasoning categories) - Max CLIP score ↑ [ sort ] Relational Understanding - Expected CLIP score ↑ [ sort ] Relational Understanding - Max CLIP score ↑ [ sort ] Detection (PaintSkills) - Expected CLIP score ↑ [ sort ] Detection (PaintSkills) - Max CLIP score ↑ [ sort ] Winoground - Expected CLIP score ↑ [ sort ] Winoground - Max CLIP score ↑ [ sort ] PartiPrompts (knowledge categories) - Expected CLIP score ↑ [ sort ] PartiPrompts (knowledge categories) - Max CLIP score ↑ [ sort ] DrawBench (knowledge categories) - Expected CLIP score ↑ [ sort ] DrawBench (knowledge categories) - Max CLIP score ↑ [ sort ] TIME's most significant historical figures - Expected CLIP score ↑ [ sort ] TIME's most significant historical figures - Max CLIP score ↑ [ sort ] Demographic Stereotypes - Expected CLIP score ↑ [ sort ] Demographic Stereotypes - Max CLIP score ↑ [ sort ] Mental Disorders - Expected CLIP score ↑ [ sort ] Mental Disorders - Max CLIP score ↑ [ sort ] Inappropriate Image Prompts (I2P) - Expected CLIP score ↑ [ sort ] Inappropriate Image Prompts (I2P) - Max CLIP score ↑ [ sort ]
2 | Dreamlike Diffusion v1.0 (1B) 0.958 27.06 28.593 27.071 27.071 29.446 31.775 28.071 29.793 30.507 32.424 27.751 29.513 26.027 29.048 28.419 30.538 26.146 27.745 27.875 29.624 27.998 30.088 26.46 28.525 25.603 27.195 25.381 27.033 29.205 31.118 30.656 32.961 26.17 27.584 23.22 24.839 22.425 24.541 28.473 31.283
3 | Vintedois (22h) Diffusion model v0.1 (1B) 0.806 26.402 28.169 26.209 26.209 27.42 29.887 27.471 29.319 29.901 31.69 27.1 29.07 23.608 26.569 26.279 28.531 25.379 27.467 27.171 28.953 27.299 29.323 26.099 28.261 25.403 26.972 24.409 26.255 28.728 30.602 29.354 32.072 26.653 28.169 22.62 24.868 22.258 24.71 27.149 30.318
4 | Dreamlike Photoreal v2.0 (1B) 0.779 26.104 27.733 26.597 26.597 28.186 31.06 27.392 29.238 30.289 32.345 26.549 28.668 24.582 27.419 27.462 29.855 24.975 26.689 26.843 28.887 27.163 29.322 26.184 28.312 25.123 26.785 24.136 25.983 28.727 30.62 29.421 32.221 25.907 27.373 22.358 24.193 21.809 24.028 28.009 31.036
5 | Stable Diffusion v2 base (1B) 0.777 26.255 28.052 26.089 26.089 28.923 31.806 27.421 29.388 29.246 31.576 26.535 29.116 24.19 27.284 28.292 31.275 24.731 26.907 27.107 29.492 27.281 29.653 25.839 28.282 25.194 26.976 24.643 27.083 28.105 30.331 29.243 32.155 25.443 27.379 21.82 24.385 21.53 23.717 26.509 30.134
6 | Stable Diffusion v1.5 (1B) 0.767 26.376 28.147 26.699 26.699 27.843 30.343 27.165 29.34 29.81 32.473 26.553 28.714 23.975 26.931 27.09 29.75 24.978 27.21 26.899 29.078 27.103 29.477 25.272 27.929 24.999 26.977 24.372 26.542 28.248 30.414 28.55 31.554 26.033 27.922 22.134 24.162 22.352 24.366 26.867 30.201
7 | DeepFloyd IF X-Large (4.3B) 0.758 25.791 27.653 26.126 26.126 29.691 32.408 27.328 29.176 29.021 31.105 27.448 29.421 25.388 28.423 29.366 32.274 24.795 26.862 26.936 29.1 27.852 29.997 26 28.446 25.404 27.136 23.926 25.873 28.175 30.135 30.038 32.743 24.892 26.678 21.659 23.637 21.497 23.484 25.486 28.971
8 | Stable Diffusion v1.4 (1B) 0.749 26.425 28.28 26.433 26.433 27.713 30.556 27.228 29.406 29.542 32.076 26.881 29.496 23.582 26.615 26.944 29.566 24.719 26.789 27.01 29.074 26.792 29.278 25.399 28.07 25.069 26.812 24.442 26.347 28.135 30.448 28.325 31.246 26.303 28.057 21.926 24.358 22.637 24.595 26.608 29.841
9 | Safe Stable Diffusion weak (1B) 0.742 26.196 27.885 26.68 26.68 27.551 30.098 27.078 29.233 29.577 32.034 26.642 29.103 24.008 26.615 27.201 29.71 24.917 27.028 27.033 29.242 27.208 29.69 25.188 27.827 24.964 26.882 24.337 26.332 28.325 30.628 27.982 30.974 26.24 27.882 22.022 24.128 22.259 24.347 26.63 29.975
10 | DALL-E 2 (3.5B) 0.696 27.102 28.714 25.323 25.323 28.56 30.857 27.841 29.623 29.89 31.886 26.35 28.28 25.798 28.257 22.522 25.246 25.301 26.9 27.909 29.722 28.696 30.6 26.839 28.994 26.79 28.382 24.96 26.679 28.628 30.597 30.613 32.92 22.177 24.177 21.529 23.555 15.062 16.36 20.186 22.735
11 | DALL-E mega (2.6B) 0.695 27.193 29.205 26.752 26.752 26.866 29.347 27.925 30.074 28.761 31.018 21.37 23.327 24.489 26.732 19.366 21.312 25.246 27.135 26.734 29.346 27.289 29.251 25.698 28.074 26.744 28.508 23.913 25.853 28.1 30.427 27.971 30.817 26.743 28.378 22.849 25.288 21.634 24.123 24.502 27.814
12 | DeepFloyd IF Large (0.9B) 0.628 25.504 27.046 25.881 25.881 28.705 32.045 27.06 28.906 28.786 30.828 26.825 28.923 24.417 27.176 28.412 31.832 24.757 26.614 26.461 28.569 27.423 29.715 25.657 27.965 25.421 27.154 23.733 25.577 28.07 29.954 29.009 31.848 23.527 25.425 21.5 23.537 21.097 23.593 25.025 28.5
13 | Stable Diffusion v2.1 base (1B) 0.609 25.861 27.507 26.065 26.065 28.135 30.718 27.205 29.136 29.028 31.617 25.658 27.805 22.697 25.8 25.989 28.949 24.566 26.543 26.575 28.608 26.311 28.921 25.754 27.992 24.851 26.826 23.753 25.658 27.773 29.757 28.658 31.35 25.898 27.535 21.778 24.588 21.329 23.619 26.266 29.768
14 | DeepFloyd IF Medium (0.4B) 0.56 25.517 27.116 25.692 25.692 28.541 31.596 26.739 28.63 28.531 30.875 26.338 28.054 24.225 27.369 27.657 30.839 24.709 26.782 26.251 28.357 27.21 29.424 25.315 27.951 25.52 27.223 23.63 25.592 27.525 29.557 27.928 30.694 21.873 24.283 21.387 23.902 21.487 23.921 24.562 28.175
15 | Openjourney v2 (1B) 0.506 26.807 28.61 25.661 25.661 26.317 29.183 26.448 28.682 28.956 31.465 26.097 28.122 24.803 27.362 24.812 27.158 23.831 26.053 26.398 28.555 25.795 28.251 24.316 27.084 24.932 26.726 22.811 25.188 27.209 29.476 27.328 30.312 24.25 26.373 20.996 23.57 21.123 24.108 25.056 28.469
16 | Safe Stable Diffusion medium (1B) 0.493 25.671 27.676 26.003 26.003 26.563 29.467 26.536 28.739 28.687 31.451 26.207 28.47 23.355 26.011 25.746 28.411 24.023 26.136 26.271 28.558 26.528 29.02 24.358 26.831 24.4 26.331 23.498 25.557 27.725 30.182 27.619 30.716 25.521 27.165 20.949 23.289 19.615 21.402 25.803 29.444
17 | GigaGAN (1B) 0.4 25.722 27.645 26.569 26.569 25.668 27.828 26.589 28.678 28.154 30.582 25.199 27.185 20.775 23.154 20.637 23.247 24.301 26.324 26.328 28.668 26.145 28.237 24.996 27.004 24.391 26.205 23.28 25.362 27.245 29.449 27.433 30.121 23.746 26.019 20.94 23.343 19.091 22.136 24.886 28.073
18 | Promptist + Stable Diffusion v1.4 (1B) 0.369 25.245 27.209 25.207 25.207 24.786 27.488 26.213 28.384 28.776 31.238 24.525 26.693 22.608 25.516 26.105 28.509 23.599 25.233 25.449 27.704 25.462 27.918 23.745 26.363 23.872 25.834 21.741 23.811 27.209 29.446 28.214 30.657 24.663 26.595 20.877 23.337 20.785 23.185 25.051 28.182
19 | Safe Stable Diffusion strong (1B) 0.344 24.787 26.974 25.769 25.769 25.704 28.244 25.758 28.148 27.727 30.772 25.476 27.77 22.65 25.527 24.476 27.406 23.074 25.367 25.522 27.833 25.722 28.394 23.246 25.788 23.898 25.93 22.48 24.761 26.754 29.467 26.842 29.558 24.763 26.864 20.029 22.566 17.414 19.736 23.836 28.104
20 | Redshift Diffusion (1B) 0.244 24.837 26.695 25.407 25.407 25.15 27.975 25.494 27.792 27.753 30.215 24.306 26.406 20.97 23.405 22.523 25.576 22.733 25.258 25.032 27.432 23.822 27.015 23.125 25.863 23.58 25.667 21.284 23.548 26.755 29.104 26.408 28.721 23.341 25.651 20.01 22.322 19.156 20.985 23.825 27.389
21 | DALL-E mini (0.4B) 0.226 25.012 27.029 25.648 25.648 22.838 25.168 25.366 27.615 25.483 27.974 19.796 21.853 22.677 24.914 17.046 19.148 21.89 24.111 23.495 26.067 25.2 27.45 23.145 25.743 24.982 26.838 21.337 23.457 25.301 27.538 24.023 26.368 22.972 24.798 21.042 23.494 21.046 22.96 21.367 24.746
22 | Safe Stable Diffusion max (1B) 0.223 23.859 26.086 25.562 25.562 24.708 27.835 24.852 27.384 26.671 29.919 24.982 27.168 20.657 24.167 22.995 25.918 22.188 24.81 24.662 27.251 24.651 27.675 22.673 25.617 23.494 25.718 21.663 24.438 25.726 28.466 25.257 28.386 24.043 26.374 19.46 22.258 16.703 18.993 21.666 26.659
23 | Openjourney v1 (1B) 0.198 24.894 27.025 24.611 24.611 21.407 24.437 24.868 27.346 27.697 30.206 25.33 27.495 19.74 22.446 22.989 25.714 22.254 24.603 24.223 26.86 23.298 25.935 20.85 23.903 23.859 25.846 20.444 23.165 26.339 28.686 26.85 29.452 24.038 25.885 19.712 22.125 19.291 22.07 23.823 27.315
24 | Lexica Search with Stable Diffusion v1.5 (1B) 0.18 21.961 24.592 22.964 22.964 22.862 25.672 22.769 25.521 23.685 26.341 22.429 25.009 21.818 24.365 21.709 23.701 21.728 24.083 22.618 25.314 22.602 25.234 22.138 24.929 22.894 25.118 21.659 23.818 23.01 25.779 23.298 26.011 22.926 25.289 21.601 24.112 21.228 23.99 24.632 28.751
25 | MultiFusion (13B) 0.163 24.236 26.55 24.601 24.601 23.061 26.272 24.036 26.566 25.655 29.027 22.637 24.586 19.342 22.464 19.955 22.778 22.315 24.757 24.156 26.968 24.073 27.183 22.967 26.243 22.849 24.997 21.331 23.92 24.127 26.652 23.666 27.558 17.701 20.452 19.768 22.997 19.846 23.121 19.239 23.436
26 | CogView2 (6B) 0.085 23.082 25.896 23.656 23.656 22.005 25.267 23.157 26.143 23.247 26.783 18.952 21.609 19.135 21.655 16.365 19.004 21.808 24.546 22.251 25.46 22.854 26.096 22.743 26.103 22.804 25.286 20.928 23.91 21.91 25.171 19.703 23.339 13.897 16.125 20.031 24.113 18.803 21.807 16.534 20.931
27 | minDALL-E (1.3B) 0.045 21.596 25.119 23.908 23.908 22.67 25.628 22.25 25.615 21.644 25.391 17.916 20.329 20.343 23.082 16.728 19.315 19.798 22.602 21.283 24.54 22.225 25.359 20.573 23.678 21.334 24.398 18.825 21.861 22.089 25.715 20.741 24.123 14.138 16.652 19.439 23.03 19.168 21.523 16.836 21.259
28 |
--------------------------------------------------------------------------------
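Note: the heim/alignment_auto.tsv above is a wide, tab-separated table with one row per model/adapter, a "Mean win rate" column, and paired "Expected CLIP score" / "Max CLIP score" columns for each prompt set, with headers still carrying the " ↑ [ sort ]" decoration from the scraped leaderboard page. The repo's own loader lives in benchbench/data/heim/__init__.py (not shown here); the following is only a minimal sketch of reading the file directly with pandas, and the header-cleanup string and `score_cols` name are illustrative assumptions, not the package's API.

    # Minimal sketch (assumption): load alignment_auto.tsv directly with pandas.
    # The canonical loader in benchbench/data/heim/__init__.py may differ.
    import pandas as pd

    df = pd.read_csv("benchbench/data/heim/alignment_auto.tsv", sep="\t")

    # Strip the " ↑ [ sort ]" decoration left over from the scraped page headers
    # (assumed pattern; adjust if the raw headers differ).
    df.columns = [c.replace(" ↑ [ sort ]", "").strip() for c in df.columns]

    # Index by model name; everything else is a numeric score column.
    df = df.set_index("Model/adapter")
    score_cols = [c for c in df.columns if c != "Mean win rate"]

    # Example use: rank models by mean win rate.
    print(df["Mean win rate"].sort_values(ascending=False).head())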