├── benchbench ├── __init__.py ├── utils │ ├── __init__.py │ ├── win_rate.py │ ├── base.py │ └── metric.py ├── data │ ├── openllm │ │ ├── statistic.py │ │ ├── format.py │ │ ├── __init__.py │ │ └── leaderboard.tsv │ ├── bbh │ │ ├── __init__.py │ │ ├── format.py │ │ ├── cols.txt │ │ └── statistic.py │ ├── vtab │ │ ├── __init__.py │ │ └── leaderboard.tsv │ ├── bigcode │ │ ├── __init__.py │ │ ├── format.py │ │ ├── leaderboard.tsv │ │ └── vanilla.txt │ ├── mmlu │ │ ├── __init__.py │ │ ├── format.py │ │ └── leaderboard_raw.csv │ ├── mteb │ │ ├── format.py │ │ ├── __init__.py │ │ └── leaderboard.tsv │ ├── helm_lite │ │ ├── format.py │ │ ├── __init__.py │ │ └── leaderboard.tsv │ ├── helm_capability │ │ ├── format.py │ │ ├── __init__.py │ │ ├── leaderboard.tsv │ │ └── vanilla.txt │ ├── heim │ │ ├── __init__.py │ │ ├── quality_human.tsv │ │ ├── quality_auto.tsv │ │ ├── originality.tsv │ │ ├── black_out.tsv │ │ ├── nsfw.tsv │ │ ├── nudity.tsv │ │ ├── aesthetics_human.tsv │ │ ├── alignment_human.tsv │ │ └── alignment_auto.tsv │ ├── superglue │ │ ├── __init__.py │ │ └── leaderboard.tsv │ ├── imagenet │ │ ├── format.py │ │ ├── __init__.py │ │ ├── run_imagenet.py │ │ └── leaderboard_raw.tsv │ ├── helm │ │ ├── __init__.py │ │ ├── toxicity.tsv │ │ ├── calibration.tsv │ │ ├── efficiency.tsv │ │ ├── summarization.tsv │ │ ├── fairness.tsv │ │ ├── robustness.tsv │ │ └── accuracy.tsv │ ├── glue │ │ ├── __init__.py │ │ └── leaderboard.tsv │ ├── dummy │ │ └── __init__.py │ └── __init__.py └── measures │ ├── cardinal.py │ └── ordinal.py ├── MANIFEST.in ├── assets ├── banner.png └── benchbench-horizontal.png ├── docs ├── data.rst ├── index.rst ├── measures.rst ├── Makefile ├── utils.rst ├── make.bat └── conf.py ├── LICENSE.txt ├── pyproject.toml ├── README.md └── .gitignore /benchbench/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchbench/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include benchbench/data/* 2 | include benchbench/data/*/* 3 | -------------------------------------------------------------------------------- /assets/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/socialfoundations/benchbench/HEAD/assets/banner.png -------------------------------------------------------------------------------- /assets/benchbench-horizontal.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/socialfoundations/benchbench/HEAD/assets/benchbench-horizontal.png -------------------------------------------------------------------------------- /benchbench/data/openllm/statistic.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | dataset = load_dataset("gsm8k", name="main", split="test") 4 | print("gsm8k") 5 | print(len(set([eval(i.split("#### ")[-1]) for i in dataset["answer"]])), len(dataset)) 6 | -------------------------------------------------------------------------------- /benchbench/data/bbh/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 
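# The first six leaderboard columns are metadata (Rank, Model, Company, Release, Parameters, Average; see cols.txt), so load_bbh returns everything from column 6 on as the per-task score columns.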
| 5 | def load_bbh(): 6 | data = pd.read_csv( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 8 | sep="\t", 9 | ) 10 | cols = data.columns[6:] 11 | return data, cols 12 | -------------------------------------------------------------------------------- /docs/data.rst: -------------------------------------------------------------------------------- 1 | Data 2 | ======================================= 3 | 4 | benchbench.data 5 | -------------------------------------------- 6 | .. automodule:: benchbench.data 7 | :members: 8 | :undoc-members: 9 | :show-inheritance: 10 | 11 | .. autoattribute:: benchbench.data.cardinal_benchmark_list 12 | 13 | .. autoattribute:: benchbench.data.ordinal_benchmark_list 14 | 15 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to BenchBench's documentation! 2 | ========================================= 3 | 4 | .. include:: ../README.md 5 | :parser: myst_parser.sphinx_ 6 | 7 | .. toctree:: 8 | :maxdepth: 2 9 | :caption: Contents: 10 | 11 | data 12 | measures 13 | utils 14 | 15 | 16 | Indices and tables 17 | -------------------------------------------- 18 | 19 | * :ref:`genindex` 20 | * :ref:`modindex` 21 | * :ref:`search` 22 | -------------------------------------------------------------------------------- /benchbench/data/vtab/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def load_vtab(): 6 | data = pd.read_csv( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 8 | sep="\t", 9 | ) 10 | cols = data.columns[1:] 11 | return data, cols 12 | 13 | 14 | def test(): 15 | data, cols = load_vtab() 16 | print(data.head()) 17 | print(cols) 18 | 19 | 20 | if __name__ == "__main__": 21 | test() 22 | -------------------------------------------------------------------------------- /benchbench/data/bigcode/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def load_bigcode(): 6 | data = pd.read_csv( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 8 | sep="\t", 9 | ) 10 | cols = data.columns[3:6] 11 | return data, cols 12 | 13 | 14 | def test(): 15 | data, cols = load_bigcode() 16 | print(data.head()) 17 | print(cols) 18 | 19 | 20 | if __name__ == "__main__": 21 | test() 22 | -------------------------------------------------------------------------------- /benchbench/data/mmlu/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def load_mmlu(): 6 | data = pd.read_csv( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 8 | sep="\t", 9 | ) 10 | cols = data.columns[4:] 11 | data[cols] = data[cols] * 100.0 12 | return data, cols 13 | 14 | 15 | def test(): 16 | data, cols = load_mmlu() 17 | print(data.head()) 18 | print(cols) 19 | 20 | 21 | if __name__ == "__main__": 22 | test() 23 | -------------------------------------------------------------------------------- /benchbench/data/mteb/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w") 4 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin: 5 | for i, line 
in enumerate(fin.readlines()): 6 | line = line.strip().replace("\t", " ") 7 | if len(line) != 0: 8 | fout.write(line) 9 | else: 10 | fout.write("-") 11 | if i % 14 == 13: 12 | fout.write("\n") 13 | else: 14 | fout.write("\t") 15 | -------------------------------------------------------------------------------- /benchbench/data/bigcode/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w") 4 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin: 5 | for i, line in enumerate(fin.readlines()): 6 | line = line.strip().replace("\t", " ") 7 | if len(line) != 0: 8 | fout.write(line.split()[0]) 9 | else: 10 | continue 11 | if i % 8 == 7: 12 | fout.write("\n") 13 | else: 14 | fout.write("\t") 15 | -------------------------------------------------------------------------------- /benchbench/data/openllm/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w") 4 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin: 5 | for i, line in enumerate(fin.readlines()): 6 | line = line.strip().replace("\t", " ") 7 | if len(line) != 0: 8 | fout.write(line.split()[0]) 9 | else: 10 | continue 11 | if i % 10 == 9: 12 | fout.write("\n") 13 | else: 14 | fout.write("\t") 15 | -------------------------------------------------------------------------------- /benchbench/data/helm_lite/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | fout = open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), "w") 4 | with open(os.path.join(os.path.dirname(os.path.abspath(__file__)), "vanilla.txt"), "r") as fin: 5 | cols = [] 6 | helm_lite = dict() 7 | for i, line in enumerate(fin.readlines()): 8 | line = line.strip() 9 | if len(line) == 0: 10 | continue 11 | fout.write(line) 12 | if i % 12 == 11: 13 | fout.write("\n") 14 | else: 15 | fout.write("\t") 16 | -------------------------------------------------------------------------------- /docs/measures.rst: -------------------------------------------------------------------------------- 1 | Measures 2 | ============= 3 | 4 | .. automodule:: benchbench.measures 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | benchbench.measures.cardinal 10 | -------------------------------------------- 11 | 12 | .. automodule:: benchbench.measures.cardinal 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | benchbench.measures.ordinal 18 | -------------------------------------------- 19 | 20 | .. 
automodule:: benchbench.measures.ordinal 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | -------------------------------------------------------------------------------- /benchbench/data/openllm/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def load_openllm(): 6 | data = pd.read_csv( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 8 | sep="\t", 9 | ) 10 | cols = data.columns[3:] 11 | data["average_score"] = data[cols].mean(1) 12 | data.sort_values(by="average_score", inplace=True, ascending=False) 13 | return data, cols 14 | 15 | 16 | def test(): 17 | data, cols = load_openllm() 18 | print(data.head()) 19 | print(cols) 20 | 21 | 22 | if __name__ == "__main__": 23 | test() 24 | -------------------------------------------------------------------------------- /benchbench/data/helm_capability/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | 4 | fout = open( 5 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), "w" 6 | ) 7 | with open( 8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "vanilla.txt"), "r" 9 | ) as fin: 10 | cols = [] 11 | helm_lite = dict() 12 | for i, line in enumerate(fin.readlines()): 13 | line = line.strip() 14 | if len(line) == 0: 15 | continue 16 | fout.write(line) 17 | if i % 7 == 6: 18 | fout.write("\n") 19 | else: 20 | fout.write("\t") 21 | -------------------------------------------------------------------------------- /benchbench/data/bbh/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | fout = open(os.path.join(os.getcwd(), "leaderboard.tsv"), "w") 5 | with open(os.path.join(os.getcwd(), "cols.txt"), "r") as fin: 6 | fout.write(fin.readline() + "\n") 7 | with open(os.path.join(os.getcwd(), "vanilla.tsv"), "r") as fin: 8 | new_line = "" 9 | for i, line in enumerate(fin.readlines()): 10 | if i % 5 <= 3: 11 | new_line += line.strip() 12 | new_line += "\t" 13 | else: 14 | new_line += re.sub("\s+", "\t", line) 15 | fout.write(new_line.rstrip() + "\n") 16 | new_line = "" 17 | -------------------------------------------------------------------------------- /benchbench/data/bbh/cols.txt: -------------------------------------------------------------------------------- 1 | Rank Model Company Release Parameters Average Boolean Expressions Causal Judgement Date Understanding Disambiguation QA Dyck Languages Formal Fallacies Geometric Shapes Hyperbaton Logical Deduction Three Objects Logical Deduction Five Objects Logical Deduction Seven Objects Movie Recommendation Multistep Arithmetic Two Navigate Object Counting Penguins In A Table Reasoning About Colored Objects Ruin Names Salient Translation Error Detection Snarks Sports Understanding Temporal Sequences Tracking Shuffled Objects Three Objects Tracking Shuffled Objects Five Objects Tracking Shuffled Objects Seven Objects Web Of Lies Word Sorting 2 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 
9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /benchbench/data/helm_lite/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def load_helm_lite(): 7 | data = pd.read_csv( 8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 9 | sep="\t", 10 | ) 11 | data = data.replace("-", np.nan) 12 | data = data.dropna(axis=0, how="all") 13 | data = data.dropna(axis=1, how="all") 14 | cols = data.columns[2:] 15 | 16 | for c in cols: 17 | data[c] = np.array([float(i) for i in data[c].values]) 18 | 19 | return data, cols 20 | 21 | 22 | def test(): 23 | data, cols = load_helm_lite() 24 | print(data.head()) 25 | print(cols) 26 | 27 | 28 | if __name__ == "__main__": 29 | test() 30 | -------------------------------------------------------------------------------- /docs/utils.rst: -------------------------------------------------------------------------------- 1 | Utils 2 | ============= 3 | 4 | .. automodule:: benchbench.utils 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | 9 | benchbench.utils.base 10 | -------------------------------------------- 11 | 12 | .. automodule:: benchbench.utils.base 13 | :members: 14 | :undoc-members: 15 | :show-inheritance: 16 | 17 | benchbench.utils.metric 18 | -------------------------------------------- 19 | 20 | .. automodule:: benchbench.utils.metric 21 | :members: 22 | :undoc-members: 23 | :show-inheritance: 24 | 25 | benchbench.utils.win_rate 26 | -------------------------------------------- 27 | 28 | .. automodule:: benchbench.utils.win_rate 29 | :members: 30 | :undoc-members: 31 | :show-inheritance: 32 | -------------------------------------------------------------------------------- /benchbench/data/helm_capability/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def load_helm_capability(): 7 | data = pd.read_csv( 8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 9 | sep="\t", 10 | ) 11 | data = data.replace("-", np.nan) 12 | data = data.dropna(axis=0, how="all") 13 | data = data.dropna(axis=1, how="all") 14 | cols = data.columns[2:] 15 | 16 | for c in cols: 17 | data[c] = np.array([float(i) for i in data[c].values]) 18 | 19 | return data, cols 20 | 21 | 22 | def test(): 23 | data, cols = load_helm_capability() 24 | print(data.head()) 25 | print(cols) 26 | 27 | 28 | if __name__ == "__main__": 29 | test() 30 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 
16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /benchbench/data/mteb/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | 4 | 5 | def load_mteb(): 6 | data = pd.read_csv( 7 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 8 | sep="\t", 9 | ) 10 | orig_cols = data.columns[6:] 11 | ret = {} 12 | cols = [] 13 | for c in orig_cols: 14 | col_name = c.split(" (")[0] 15 | num_task = int(c.split(" (")[1].split(" ")[0]) 16 | for i in range(num_task): 17 | ret["{}-{}".format(col_name, i)] = data[c].values.copy() 18 | cols.append("{}-{}".format(col_name, i)) 19 | data = pd.concat([data, pd.DataFrame(ret)], axis=1) 20 | 21 | data["average_score"] = data[cols].mean(1) 22 | data.sort_values(by="average_score", inplace=True, ascending=False) 23 | return data, cols 24 | 25 | 26 | def test(): 27 | data, cols = load_mteb() 28 | print(data.head()) 29 | print(cols) 30 | 31 | 32 | if __name__ == "__main__": 33 | test() 34 | -------------------------------------------------------------------------------- /benchbench/data/heim/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def load_heim(subset="alignment_human"): 7 | assert subset in [ 8 | "alignment_auto", 9 | "nsfw", 10 | "quality_auto", 11 | "aesthetics_auto", 12 | "alignment_human", 13 | "nudity", 14 | "quality_human", 15 | "aesthetics_human", 16 | "black_out", 17 | "originality", 18 | ] 19 | data = pd.read_csv( 20 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "%s.tsv" % subset), 21 | sep="\t", 22 | ) 23 | data = data.replace("-", np.nan) 24 | data = data.dropna(axis=0, how="all") 25 | data = data.dropna(axis=1, how="all") 26 | cols = data.columns[2:] 27 | for c in cols: 28 | if "↓" in c: 29 | data[c] = -data[c] 30 | return data, cols 31 | 32 | 33 | def test(): 34 | data, cols = load_heim() 35 | print(data.head()) 36 | print(cols) 37 | 38 | 39 | if __name__ == "__main__": 40 | test() 41 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Guanhua Zhang and Moritz Hardt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice 
shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /benchbench/data/superglue/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | 5 | 6 | def load_superglue(): 7 | data = pd.read_csv( 8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 9 | sep="\t", 10 | ) 11 | ori_cols = data.columns[5:-2] 12 | cols = [] 13 | for c in ori_cols: 14 | if type(data[c].values[0]) is str and "/" in data[c].values[0]: 15 | c1 = c + "-a" 16 | c2 = c + "-b" 17 | res1, res2 = [], [] 18 | for line in data[c].values: 19 | s = line.strip().split("/") 20 | res1.append(float(s[0])) 21 | res2.append(float(s[1])) 22 | res1 = np.array(res1) 23 | res2 = np.array(res2) 24 | data[c1] = res1 25 | data[c2] = res2 26 | data[c] = (res1 + res2) / 2 27 | cols.append(c) 28 | else: 29 | cols.append(c) 30 | 31 | return data, cols 32 | 33 | 34 | def test(): 35 | data, cols = load_superglue() 36 | print(data.head()) 37 | print(cols) 38 | 39 | 40 | if __name__ == "__main__": 41 | test() 42 | -------------------------------------------------------------------------------- /benchbench/data/imagenet/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import pandas as pd 4 | 5 | fout = open(os.path.join(os.getcwd(), "leaderboard_raw.tsv"), "w") 6 | with open(os.path.join(os.getcwd(), "vanilla.txt"), "r") as fin: 7 | new_line = "" 8 | for i, line in enumerate(fin.readlines()): 9 | if i % 12 <= 10: 10 | new_line += line.strip() 11 | if len(line.strip()) != 0: 12 | new_line += "\t" 13 | else: 14 | new_line += re.sub("\s+", "\t", line) 15 | fout.write(new_line.rstrip() + "\n") 16 | new_line = "" 17 | fout.close() 18 | 19 | data = pd.read_csv(os.path.join(os.getcwd(), "leaderboard_raw.tsv"), sep="\t") 20 | data.sort_values(by=["Acc@1"], inplace=True, ascending=False) 21 | data["Model"] = data["Weight"].apply( 22 | lambda t: "_".join(t.split(".")[0].split("_")[:-1]).lower() 23 | ) 24 | # data.to_csv(os.path.join(os.getcwd(), "leaderboard_raw.tsv"), sep="\t", index=False) 25 | 26 | with open(os.path.join(os.getcwd(), "run.sh"), "w") as fout: 27 | for i in range(len(data)): 28 | fout.write( 29 | f"python run_imagenet.py --model_name {data['Model'][i]} --weight_name {data['Weight'][i]}\n" 30 | ) 31 | -------------------------------------------------------------------------------- /benchbench/data/bbh/statistic.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset 2 | 3 | configs = [ 4 | "boolean_expressions", 5 | "causal_judgement", 6 | "date_understanding", 7 | "disambiguation_qa", 8 | "dyck_languages", 9 | "formal_fallacies", 10 | "geometric_shapes", 11 | "hyperbaton", 12 | "logical_deduction_five_objects", 13 | "logical_deduction_seven_objects", 14 | 
"logical_deduction_three_objects", 15 | "movie_recommendation", 16 | "multistep_arithmetic_two", 17 | "navigate", 18 | "object_counting", 19 | "penguins_in_a_table", 20 | "reasoning_about_colored_objects", 21 | "ruin_names", 22 | "salient_translation_error_detection", 23 | "snarks", 24 | "sports_understanding", 25 | "temporal_sequences", 26 | "tracking_shuffled_objects_five_objects", 27 | "tracking_shuffled_objects_seven_objects", 28 | "tracking_shuffled_objects_three_objects", 29 | "web_of_lies", 30 | "word_sorting", 31 | ] 32 | ret = [] 33 | for c in configs: 34 | dataset = load_dataset("lukaemon/bbh", name=c, split="test") 35 | ret.append((c, set(dataset["target"]))) 36 | 37 | ret = sorted(ret, key=lambda x: len(x[1])) 38 | for i in ret: 39 | print(i[0], len(i[1]), i[1]) 40 | -------------------------------------------------------------------------------- /benchbench/data/helm/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def load_helm(subset="accuracy"): 7 | assert subset in [ 8 | "accuracy", 9 | "bias", 10 | "calibration", 11 | "fairness", 12 | "efficiency", 13 | "robustness", 14 | "summarization", 15 | "toxicity", 16 | ] 17 | data = pd.read_csv( 18 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "%s.tsv" % subset), 19 | sep="\t", 20 | ) 21 | data = data.replace("-", np.nan) 22 | data = data.dropna(axis=0, how="all") 23 | data = data.dropna(axis=1, how="all") 24 | cols = data.columns[2:] 25 | 26 | for c in cols: 27 | data[c] = np.array([float(i) for i in data[c].values]) 28 | 29 | for c in cols: 30 | if ( 31 | "ECE" in c 32 | or "Representation" in c 33 | or "Toxic fraction" in c 34 | or "Stereotype" in c 35 | or "inference time" in c 36 | ): 37 | data[c] = -data[c] 38 | 39 | return data, cols 40 | 41 | 42 | def test(): 43 | data, cols = load_helm() 44 | print(data.head()) 45 | print(cols) 46 | 47 | 48 | if __name__ == "__main__": 49 | test() 50 | -------------------------------------------------------------------------------- /benchbench/data/glue/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import pandas as pd 3 | import numpy as np 4 | 5 | 6 | def load_glue(): 7 | data = pd.read_csv( 8 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 9 | sep="\t", 10 | ) 11 | ori_cols = data.columns[5:-1] 12 | cols = [] 13 | for c in ori_cols: 14 | if type(data[c].values[0]) is str and "/" in data[c].values[0]: 15 | c1 = c + "-a" 16 | c2 = c + "-b" 17 | res1, res2 = [], [] 18 | for line in data[c].values: 19 | s = line.strip().split("/") 20 | res1.append(float(s[0])) 21 | res2.append(float(s[1])) 22 | res1 = np.array(res1) 23 | res2 = np.array(res2) 24 | data[c1] = res1 25 | data[c2] = res2 26 | data[c] = (res1 + res2) / 2 27 | cols.append(c) 28 | elif "MNLI" in c: 29 | continue 30 | else: 31 | cols.append(c) 32 | data["MNLI"] = (data["MNLI-m"] + data["MNLI-mm"]) / 2 33 | cols.append("MNLI") 34 | 35 | return data, cols 36 | 37 | 38 | def test(): 39 | data, cols = load_glue() 40 | print(data.head()) 41 | print(cols) 42 | 43 | 44 | if __name__ == "__main__": 45 | test() 46 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=40.8.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 
6 | name = "benchbench" 7 | authors = [ 8 | {name = "Guanhua Zhang"}, 9 | ] 10 | description = "Tools for measuring sensitivity and diversity of multi-task benchmarks." 11 | version = "1.0.1" 12 | requires-python = ">=3.7" 13 | readme = "README.md" 14 | license = {text = "MIT"} 15 | classifiers=[ 16 | "Development Status :: 3 - Alpha", 17 | "License :: OSI Approved :: MIT License", 18 | "Intended Audience :: Science/Research", 19 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 20 | "Natural Language :: English", 21 | "Programming Language :: Python :: 3", 22 | "Programming Language :: Python :: 3.7", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Programming Language :: Python :: 3.12", 28 | ] 29 | dependencies = [ 30 | "scipy", 31 | "numpy", 32 | "torch", 33 | "pandas", 34 | "joblib", 35 | "scikit-learn", 36 | "zarth_utils==1.0" 37 | ] 38 | 39 | 40 | [tool.setuptools] 41 | include-package-data = true 42 | 43 | [tool.setuptools.packages.find] 44 | include = ["benchbench*"] 45 | -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | import os 9 | import sys 10 | 11 | sys.path.insert(0, os.path.abspath('../')) 12 | 13 | project = 'BenchBench' 14 | copyright = '2024, Guanhua' 15 | author = 'Guanhua' 16 | 17 | # -- General configuration --------------------------------------------------- 18 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 19 | 20 | extensions = [ 21 | 'sphinx.ext.autodoc', # pull doc from docstrings 22 | 'sphinx.ext.intersphinx', # link to other projects 23 | 'sphinx.ext.todo', # support TODOs 24 | 'sphinx.ext.ifconfig', # include stuff based on configuration 25 | 'sphinx.ext.viewcode', # add source code 26 | 'myst_parser', # add MD files 27 | 'sphinx.ext.napoleon' # Google style doc 28 | ] 29 | 30 | templates_path = ['_templates'] 31 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 32 | pygments_style = 'sphinx' 33 | 34 | # -- Options for HTML output ------------------------------------------------- 35 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 36 | 37 | html_theme = 'alabaster' 38 | html_static_path = ['_static'] 39 | -------------------------------------------------------------------------------- /benchbench/utils/win_rate.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy as np 3 | 4 | 5 | class WinningRate: 6 | def __init__(self, data, cols): 7 | """ 8 | Calculate the winning rate of a list of models. 9 | 10 | Args: 11 | data (pd.DataFrame): Each row represents a model, each column represents a task. 12 | cols (list): The column names of the tasks. 
13 | 14 | Returns: 15 | None 16 | """ 17 | m = len(data) 18 | n = len(cols) 19 | self.win_rate = np.zeros([m, m]) 20 | data = data[cols].values 21 | for i in range(m): 22 | for j in range(m): 23 | n_win, n_tot = 0, 0 24 | for k in range(n): 25 | if not math.isnan(data[i, k]) and not math.isnan(data[j, k]): 26 | n_tot += 1 27 | if float(data[i, k]) > float(data[j, k]) and i != j: 28 | n_win += 1 29 | self.win_rate[i, j] = n_win / n_tot if n_tot > 0 else 0 30 | 31 | def get_winning_rate(self, model_indices=None): 32 | """ 33 | Get the winning rate of the selected models. 34 | 35 | Args: 36 | model_indices (list): Indices of the selected models. 37 | 38 | Returns: 39 | float: The winning rate. 40 | """ 41 | model_indices = ( 42 | np.arange(len(self.win_rate)) if model_indices is None else model_indices 43 | ) 44 | return self.win_rate[model_indices][:, model_indices].mean(axis=1) 45 | -------------------------------------------------------------------------------- /benchbench/data/dummy/__init__.py: -------------------------------------------------------------------------------- 1 | import random 2 | import numpy as np 3 | import pandas as pd 4 | 5 | 6 | def load_random_benchmark(seed=0, num_task=100, num_model=100): 7 | np.random.seed(seed) 8 | random.seed(seed) 9 | data = np.random.random([num_model, num_task]) * 100 10 | data = pd.DataFrame(data) 11 | cols = list(data.columns) 12 | return data, cols 13 | 14 | 15 | def load_constant_benchmark(seed=0, num_task=100, num_model=100): 16 | np.random.seed(seed) 17 | random.seed(seed) 18 | rd = np.random.random([num_model, 1]) 19 | data = np.concatenate([rd.copy() for _ in range(num_task)], axis=1) * 100 20 | data = pd.DataFrame(data) 21 | cols = list(data.columns) 22 | return data, cols 23 | 24 | 25 | def load_interpolation_benchmark(seed=0, mix_ratio=0.0, num_task=100, num_model=100): 26 | num_random = int(mix_ratio * num_task + 0.5) 27 | num_constant = int((1 - mix_ratio) * num_task + 0.5) 28 | if num_random == 0: 29 | return load_constant_benchmark( 30 | seed=seed, num_task=num_constant, num_model=num_model 31 | ) 32 | elif num_constant == 0: 33 | return load_random_benchmark( 34 | seed=seed, num_task=num_random, num_model=num_model 35 | ) 36 | else: 37 | random = load_random_benchmark( 38 | seed=seed, num_task=num_random, num_model=num_model 39 | )[0] 40 | constant = load_constant_benchmark( 41 | seed=seed, num_task=num_constant, num_model=num_model 42 | )[0] 43 | data = pd.DataFrame(np.concatenate([random.values, constant.values], axis=1)) 44 | cols = list(data.columns) 45 | return data, cols 46 | -------------------------------------------------------------------------------- /benchbench/data/imagenet/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | 4 | import numpy as np 5 | 6 | import pandas as pd 7 | 8 | 9 | def load_imagenet(*args, **kwargs): 10 | # Due to legacy reason, instead of refactoring the code, we just make a wrapper function like this. 
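# load_data below returns (data, cols): the leaderboard DataFrame and the accuracy columns treated as tasks.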
11 | return load_data(*args, **kwargs) 12 | 13 | 14 | def load_data(load_raw=False, seed=0, num_task=20): 15 | if load_raw: 16 | data = pd.read_csv( 17 | os.path.join( 18 | os.path.dirname(os.path.abspath(__file__)), "leaderboard_raw.tsv" 19 | ), 20 | sep="\t", 21 | ) 22 | data = data.dropna(axis=0, how="any") 23 | cols = [data.columns[1]] 24 | else: 25 | data = pd.read_csv( 26 | os.path.join(os.path.dirname(os.path.abspath(__file__)), "leaderboard.tsv"), 27 | sep="\t", 28 | ) 29 | data = data.sort_values(by=["acc"], ascending=False).reset_index() 30 | if num_task < 1000: 31 | assert 1000 % num_task == 0 and num_task >= 1 32 | cols = [] 33 | random.seed(seed) 34 | np.random.seed(seed) 35 | size_task = 1000 // num_task 36 | perm = np.random.permutation(1000) 37 | for i in range(num_task): 38 | task_cols = [ 39 | "acc_%d" % j for j in perm[i * size_task : (i + 1) * size_task] 40 | ] 41 | data["acc_aggr_%d" % i] = data[task_cols].values.mean(1) 42 | cols.append("acc_aggr_%d" % i) 43 | else: 44 | cols = ["acc_%d" % i for i in range(1000)] 45 | return data, cols 46 | 47 | 48 | def test(): 49 | data, cols = load_data() 50 | print(data.head()) 51 | print(cols) 52 | 53 | 54 | if __name__ == "__main__": 55 | test() 56 | -------------------------------------------------------------------------------- /benchbench/data/vtab/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | Mean (selected datasets) CIFAR-100 Caltech101 Camelyon Clevr-Count Clevr-Dist DMLab DTD EuroSAT Flowers102 KITTI-Dist Pets Resisc45 Retinopathy SVHN Sun397 dSpr-Loc dSpr-Orient sNORB-Azim sNORB-Elev 2 | Sup-Rotation-100% 90.2 84.8 94.6 85.9 99.8 92.5 76.5 75.9 98.8 94.7 82.3 91.5 94.9 79.5 97.0 70.2 100 96.5 100 98.4 3 | Sup-Exemplar-100% 90.1 84.1 94.4 86.7 99.8 92.7 76.8 74.5 98.6 93.4 84.0 91.8 95.1 79.5 97.1 69.4 100 96.4 99.8 98.0 4 | Sup-100% 89.7 83.8 94.1 83.9 99.8 92.1 76.4 74.0 98.8 93.2 80.7 91.9 95.3 79.3 97.0 70.7 100 96.4 99.8 97.7 5 | Semi-Exemplar-10% 88.8 82.7 85.3 86.0 99.8 93.1 76.8 70.5 98.6 92.2 81.5 89.0 94.7 78.8 97.0 67.4 100 96.5 100 97.8 6 | Semi-Rotation-10% 88.6 82.4 88.1 78.6 99.8 93.2 76.1 72.4 98.7 93.2 81.0 87.9 94.9 79.0 96.9 66.7 100 96.5 99.9 97.5 7 | Rotation 86.4 73.6 88.3 86.4 99.8 93.3 76.8 63.3 98.3 83.4 82.6 71.8 93.4 78.6 96.9 60.5 100 96.5 99.9 98.0 8 | Exemplar 84.8 70.7 81.9 84.7 99.8 93.3 74.7 61.1 98.5 79.3 78.2 67.8 93.5 79.0 96.7 58.2 100 96.5 99.9 97.4 9 | Rel.Pat.Loc 83.1 65.7 79.9 85.3 99.5 87.7 71.5 65.2 97.8 78.8 75.0 66.8 91.5 79.8 93.7 58.0 100 90.4 99.7 92.6 10 | Jigsaw 83.0 65.3 79.1 83.0 99.6 88.6 72.0 63.9 97.9 77.9 74.7 65.4 92.0 80.1 93.9 59.2 100 90.3 99.9 93.6 11 | From-Scratch 75.4 64.4 55.9 81.2 99.7 89.4 71.5 31.3 96.2 50.6 68.4 23.8 86.8 76.8 96.3 52.7 100 96.3 99.9 91.7 12 | Uncond-BigGAN 68.2 58.1 73.6 82.2 47.6 54.9 54.8 44.9 89.8 63.5 57.4 30.9 75.4 75.9 93.0 46.9 86.1 95.9 88.1 76.6 13 | VAE 66.8 44.2 48.4 81.3 98.4 90.1 59.7 16.0 92.5 18.4 57.0 14.0 65.0 74.2 93.1 29.3 100 94.7 97.9 95.6 14 | WAE-MMD 64.9 38.8 50.8 80.6 98.1 89.3 52.6 11.0 94.1 20.8 61.6 16.2 64.8 73.8 90.9 31.6 100 90.2 96.3 72.4 15 | Cond-BigGAN 51.4 56.3 0.148 81.3 12.4 24.5 51.4 44.8 94.5 68.8 49.7 31.6 76.5 75.3 91.4 44.9 6.16 7.45 80.6 79.2 16 | WAE-GAN 48.5 24.8 42.0 77.1 52.2 70.2 37.3 8.67 81.5 15.5 62.3 13.1 38.4 73.6 78.2 12.8 97.7 49.9 33.4 52.2 17 | WAE-UKL 46.8 23.2 41.7 76.4 44.5 67.8 36.7 12.3 78.1 17.2 55.1 12.3 36.8 73.6 65.5 12.0 98.1 51.4 35.9 51.0 
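A minimal sketch of how a leaderboard table such as the VTAB one above can be scored (load_vtab is defined in benchbench/data/vtab/__init__.py; get_diversity and get_sensitivity follow the cardinal-measure usage shown in the README):

```python
from benchbench.data.vtab import load_vtab
from benchbench.measures.cardinal import get_diversity, get_sensitivity

data, cols = load_vtab()  # rows are models, cols are the 19 per-task columns
print(get_diversity(data, cols))
print(get_sensitivity(data, cols))
```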
-------------------------------------------------------------------------------- /benchbench/data/bigcode/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | T Models Win humaneval-python java javascript Throughput 2 | 🔴 DeepSeek-Coder-33b-instruct 39.58 80.02 52.03 65.13 25.2 3 | 🔴 DeepSeek-Coder-7b-instruct 38.75 80.22 53.34 65.8 51 4 | 🔶 Phind-CodeLlama-34B-v2 37.04 71.95 54.06 65.34 15.1 5 | 🔶 Phind-CodeLlama-34B-v1 36.12 65.85 49.47 64.45 15.1 6 | 🔶 Phind-CodeLlama-34B-Python-v1 35.27 70.22 48.72 66.24 15.1 7 | 🔴 DeepSeek-Coder-33b-base 35 52.45 43.77 51.28 25.2 8 | 🔶 WizardCoder-Python-34B-V1.0 33.96 70.73 44.94 55.28 15.1 9 | 🔴 DeepSeek-Coder-7b-base 31.75 45.83 37.72 45.9 51 10 | 🔶 CodeLlama-34b-Instruct 30.96 50.79 41.53 45.85 15.1 11 | 🔶 WizardCoder-Python-13B-V1.0 30.58 62.19 41.77 48.45 25.3 12 | 🟢 CodeLlama-34b 30.35 45.11 40.19 41.66 15.1 13 | 🟢 CodeLlama-34b-Python 29.65 53.29 39.46 44.72 15.1 14 | 🔶 WizardCoder-15B-V1.0 28.92 58.12 35.77 41.91 43.7 15 | 🔶 CodeLlama-13b-Instruct 27.88 50.6 33.99 40.92 25.3 16 | 🟢 CodeLlama-13b 26.19 35.07 32.23 38.26 25.3 17 | 🟢 CodeLlama-13b-Python 24.73 42.89 33.56 40.66 25.3 18 | 🔶 CodeLlama-7b-Instruct 23.69 45.65 28.77 33.11 33.1 19 | 🟢 CodeLlama-7b 22.31 29.98 29.2 31.8 33.1 20 | 🔴 CodeShell-7B 22.31 34.32 30.43 33.17 33.9 21 | 🔶 OctoCoder-15B 21.15 45.3 26.03 32.8 44.4 22 | 🟢 Falcon-180B 20.9 35.37 28.48 31.68 -1 23 | 🟢 CodeLlama-7b-Python 20.62 40.48 29.15 36.34 33.1 24 | 🟢 StarCoder-15B 20.58 33.57 30.22 30.79 43.9 25 | 🟢 StarCoderBase-15B 20.15 30.35 28.53 31.7 43.8 26 | 🟢 CodeGeex2-6B 17.42 33.49 23.46 29.9 32.7 27 | 🟢 StarCoderBase-7B 16.85 28.37 24.44 27.35 46.9 28 | 🔶 OctoGeeX-7B 16.65 42.28 19.33 28.5 32.7 29 | 🔶 WizardCoder-3B-V1.0 15.73 32.92 24.34 26.16 50 30 | 🟢 CodeGen25-7B-multi 15.35 28.7 26.01 26.27 32.6 31 | 🔶 Refact-1.6B 14.85 31.1 22.78 22.36 50 32 | 🔴 DeepSeek-Coder-1b-base 14.42 32.13 27.16 28.46 -1 33 | 🟢 StarCoderBase-3B 11.65 21.5 19.25 21.32 50 34 | 🔶 WizardCoder-1B-V1.0 10.35 23.17 19.68 19.13 71.4 35 | 🟢 Replit-2.7B 8.54 20.12 21.39 20.18 42.2 36 | 🟢 CodeGen25-7B-mono 8.15 33.08 19.75 23.22 34.1 37 | 🟢 StarCoderBase-1.1B 8.12 15.17 14.2 13.38 71.4 38 | 🟢 CodeGen-16B-Multi 7.08 19.26 22.2 19.15 17.2 39 | 🟢 Phi-1 6.25 51.22 10.76 19.25 -1 40 | 🟢 StableCode-3B 6.04 20.2 19.54 18.98 30.2 41 | 🟢 DeciCoder-1B 5.81 19.32 15.3 17.85 54.6 42 | 🟢 SantaCoder-1.1B 4.58 18.12 15 15.47 50.8 43 | -------------------------------------------------------------------------------- /benchbench/data/mmlu/format.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import numpy as np 4 | import pandas as pd 5 | from datasets import load_dataset 6 | 7 | # read top 100 model names 8 | top_100_with_duplicate = pd.read_csv("leaderboard_raw.csv", header=None) 9 | top_100 = [] 10 | for i in top_100_with_duplicate[0].values: 11 | if i not in top_100: 12 | top_100.append(i) 13 | print(top_100) 14 | 15 | # download the meta data 16 | os.makedirs("data", exist_ok=True) 17 | with open("data/download.sh", "w") as fout: 18 | fout.write("git lfs install\n") 19 | for i in top_100: 20 | cmd = "git clone git@hf.co:data/%s" % i 21 | fout.write(cmd + "\n") 22 | print(cmd) 23 | # one must download the data manually by ``cd data; bash download.sh'' 24 | # comment the following lines if you have downloaded the data 25 | # exit(0) 26 | 27 | # load all model names and split names 28 | all_model_split = [] 29 | dir_dataset = 
os.path.join("data") 30 | for model_name in top_100: 31 | model_name = model_name[len("open-llm-leaderboard/") :] 32 | dir_model = os.path.join("data", model_name) 33 | if not os.path.isdir(dir_model): 34 | continue 35 | for split_name in os.listdir(dir_model): 36 | if not split_name.endswith(".parquet"): 37 | continue 38 | split_name = split_name[len("results_") : -len(".parquet")] 39 | all_model_split.append((model_name, split_name)) 40 | print(len(all_model_split)) 41 | 42 | # load all scores and filter broken ones 43 | ret = [] 44 | for model_name, split_name in all_model_split: 45 | model = load_dataset( 46 | "parquet", 47 | data_files=os.path.join("data", model_name, "results_%s.parquet" % split_name), 48 | split="train", 49 | )["results"][0] 50 | tasks = [i for i in model.keys() if "hendrycksTest" in i] 51 | if len(tasks) != 57: 52 | continue 53 | avg = np.mean([model[c]["acc_norm"] for c in tasks]) 54 | if math.isnan(avg): 55 | continue 56 | record = dict() 57 | record["model_name"] = model_name 58 | record["split_name"] = split_name 59 | record["average_score"] = avg 60 | record.update({c: model[c]["acc_norm"] for c in tasks}) 61 | ret.append(record) 62 | print(model_name, split_name, "%.2lf" % avg) 63 | ret = sorted(ret, key=lambda x: -x["average_score"]) 64 | ret = pd.DataFrame(ret) 65 | ret.to_csv("calibration.tsv", sep="\t") 66 | -------------------------------------------------------------------------------- /benchbench/utils/base.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def is_int(x): 5 | """ 6 | Check if a string can be converted to an integer. 7 | 8 | Args: 9 | x(str): Input string. 10 | 11 | Returns: 12 | bool: True if x can be converted to an integer, False otherwise 13 | """ 14 | try: 15 | int(x) 16 | return True 17 | except ValueError: 18 | return False 19 | 20 | 21 | def is_number(s): 22 | """ 23 | Check if a string can be converted to a number. 24 | 25 | Args: 26 | s(str): Input string. 27 | 28 | Returns: 29 | bool: True if s can be converted to a number, False otherwise 30 | """ 31 | try: 32 | float(s) 33 | return True 34 | except ValueError: 35 | return False 36 | 37 | 38 | def get_combinations(s, k): 39 | """ 40 | Generate all subsets of size k from set s. 41 | 42 | Args: 43 | s(list): List of elements to get combinations from. 44 | k(int): Size of each combination. 45 | 46 | Returns: 47 | list: A list of combinations, where each combination is represented as a list. 
48 | """ 49 | if k == 0: 50 | return [[]] 51 | elif k > len(s): 52 | return [] 53 | else: 54 | all_combinations = [] 55 | for i in range(len(s)): 56 | # For each element in the set, generate the combinations that include this element 57 | # and then recurse to generate combinations from the remaining elements 58 | element = s[i] 59 | remaining_elements = s[i + 1 :] 60 | for c in get_combinations(remaining_elements, k - 1): 61 | all_combinations.append([element] + c) 62 | return all_combinations 63 | 64 | 65 | def rankdata(a, method="average"): 66 | assert method == "average", "Only average method is implemented" 67 | arr = np.ravel(np.asarray(a)) 68 | sorter = np.argsort(arr, kind="quicksort") 69 | 70 | inv = np.empty(sorter.size, dtype=np.intp) 71 | inv[sorter] = np.arange(sorter.size, dtype=np.intp) 72 | 73 | arr = arr[sorter] 74 | obs = np.r_[True, np.fabs(arr[1:] - arr[:-1]) > 1e-8] # this is the only change 75 | dense = obs.cumsum()[inv] 76 | 77 | # cumulative counts of each unique value 78 | count = np.r_[np.nonzero(obs)[0], len(obs)] 79 | 80 | # average method 81 | return 0.5 * (count[dense] + count[dense - 1] + 1) 82 | -------------------------------------------------------------------------------- /benchbench/data/heim/quality_human.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (fairness - gender) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (robustness) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Chinese) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Hindi) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Spanish) - Photorealism - generated (human) ↑ [ sort ] MS-COCO (Art styles) - Photorealism - generated (human) ↑ [ sort ] 2 | Dreamlike Photoreal v2.0 (1B) 0.92 2.619 2.694 2.65 2.726 2.76 2.628 2.894 - 3 | Safe Stable Diffusion weak (1B) 0.863 2.611 2.647 2.643 2.637 2.676 2.504 2.952 - 4 | DALL-E 2 (3.5B) 0.851 2.621 2.632 2.411 2.552 2.54 3.769 2.935 - 5 | Safe Stable Diffusion strong (1B) 0.771 2.286 2.332 2.526 2.807 2.936 2.684 2.712 - 6 | Stable Diffusion v1.5 (1B) 0.743 2.375 2.392 2.551 2.502 2.7 2.516 2.85 - 7 | DeepFloyd IF X-Large (4.3B) 0.726 2.207 2.216 2.554 2.776 2.51 2.842 2.736 - 8 | Safe Stable Diffusion medium (1B) 0.714 2.489 2.467 2.521 2.426 2.586 2.478 2.886 - 9 | Stable Diffusion v2 base (1B) 0.691 2.494 2.515 2.476 2.5 2.558 2.316 2.792 - 10 | GigaGAN (1B) 0.686 2.118 2.165 2.385 2.508 2.928 2.794 2.826 - 11 | Safe Stable Diffusion max (1B) 0.674 2.305 2.276 2.437 2.564 2.702 2.524 2.652 - 12 | Stable Diffusion v1.4 (1B) 0.657 2.512 2.482 2.309 2.561 2.752 2.27 2.644 - 13 | Stable Diffusion v2.1 base (1B) 0.6 2.42 2.38 2.318 2.44 2.436 2.33 2.77 - 14 | DeepFloyd IF Medium (0.4B) 0.554 2.101 2.122 2.406 2.542 2.238 2.698 2.72 - 15 | DeepFloyd IF Large (0.9B) 0.514 2.089 2.092 2.15 2.518 2.104 2.968 2.758 - 16 | MultiFusion (13B) 0.44 2.309 2.323 2.297 2.318 2.428 1.564 2.69 - 17 | DALL-E mega (2.6B) 0.417 2.058 2.097 2.046 2.308 2.39 2.284 2.884 - 18 | Dreamlike Diffusion v1.0 (1B) 0.4 2.15 2.155 2.119 2.342 2.472 2.164 2.44 - 19 | Openjourney v2 (1B) 0.309 1.941 1.928 2.145 2.322 2.178 2.422 2.508 - 20 | DALL-E mini (0.4B) 0.291 1.975 1.987 1.981 2.377 2.294 1.868 2.602 - 21 | Redshift Diffusion (1B) 0.28 1.914 1.982 1.95 2.002 2.396 2.51 2.31 - 22 | minDALL-E (1.3B) 0.274 2.058 2.04 1.896 
2.047 2.226 2.016 2.666 - 23 | Lexica Search with Stable Diffusion v1.5 (1B) 0.263 1.883 1.897 1.806 1.93 1.94 3.074 2.374 - 24 | CogView2 (6B) 0.189 1.756 1.794 1.959 2.021 2.394 1.828 2.354 - 25 | Vintedois (22h) Diffusion model v0.1 (1B) 0.074 1.57 1.593 1.558 1.867 1.886 1.878 1.862 - 26 | Promptist + Stable Diffusion v1.4 (1B) 0.057 1.593 1.587 1.552 1.682 1.716 2.242 1.506 - 27 | Openjourney v1 (1B) 0.04 1.602 1.582 1.579 1.693 1.234 1.586 1.57 - 28 | -------------------------------------------------------------------------------- /benchbench/data/imagenet/run_imagenet.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torchvision 4 | import joblib as jbl 5 | import pandas as pd 6 | from torchvision.models import * 7 | from tqdm import tqdm 8 | 9 | from zarth_utils.config import Config 10 | 11 | 12 | def load_model(model_name, weight_name): 13 | model = eval(model_name) 14 | weights = eval(weight_name) 15 | model = model(weights=weights).eval() 16 | preprocess = weights.transforms() 17 | return model, preprocess 18 | 19 | 20 | def main(): 21 | config = Config( 22 | default_config_dict={ 23 | "model_name": "vit_h_14", 24 | "weight_name": "ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1", 25 | }, 26 | use_argparse=True, 27 | ) 28 | 29 | dir2save = os.path.join( 30 | os.path.dirname(os.path.abspath(__file__)), 31 | "%s--%s" % (config["model_name"], config["weight_name"]), 32 | ) 33 | os.makedirs(dir2save, exist_ok=True) 34 | if os.path.exists(os.path.join(dir2save, "meta_info.pkl")): 35 | print("Already exists, skip") 36 | return 37 | 38 | device = ( 39 | torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu") 40 | ) 41 | model, preprocess = load_model(config["model_name"], config["weight_name"]) 42 | model = model.to(device) 43 | 44 | dataset = torchvision.datasets.ImageNet( 45 | root=os.path.dirname(os.path.abspath(__file__)), 46 | split="val", 47 | transform=preprocess, 48 | ) 49 | data_loader = torch.utils.data.DataLoader( 50 | dataset, batch_size=128, shuffle=False, num_workers=2 51 | ) 52 | 53 | all_prob, all_pred, all_target = [], [], [] 54 | for i, (batch, target) in tqdm(enumerate(data_loader)): 55 | with torch.no_grad(): 56 | batch = batch.to(device) 57 | prob = model(batch).softmax(dim=1) 58 | pred = prob.argmax(dim=1) 59 | all_prob.append(prob.detach().cpu()) 60 | all_pred.append(pred.detach().cpu()) 61 | all_target.append(target.detach().cpu()) 62 | all_prob = torch.cat(all_prob, dim=0).numpy() 63 | all_pred = torch.cat(all_pred, dim=0).numpy() 64 | all_target = torch.cat(all_target, dim=0).numpy() 65 | 66 | jbl.dump(all_prob, os.path.join(dir2save, "prob.pkl")) 67 | jbl.dump(all_pred, os.path.join(dir2save, "pred.pkl")) 68 | jbl.dump(all_target, os.path.join(dir2save, "target.pkl")) 69 | pd.DataFrame({"pred": all_pred, "target": all_target}).to_csv( 70 | os.path.join(dir2save, "pred_target.tsv"), sep="\t", index=False 71 | ) 72 | 73 | meta_info = {} 74 | correct = all_pred == all_target 75 | meta_info["acc"] = correct.mean() 76 | for i in range(1000): 77 | subset = all_target == i 78 | correct[subset].mean() 79 | meta_info["acc_%d" % i] = correct[subset].mean() 80 | jbl.dump(meta_info, os.path.join(dir2save, "meta_info.pkl")) 81 | 82 | 83 | if __name__ == "__main__": 84 | main() 85 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 | ![BenchBench](assets/benchbench-horizontal.png) 3 | 
4 | 5 | **BenchBench** is a Python package that provides a suite of tools to evaluate multi-task benchmarks focusing on 6 | **task diversity** and **sensitivity to irrelevant changes**. 7 | 8 | Research shows that for all multi-task benchmarks there is a trade-off between task diversity and sensitivity. The more diverse a benchmark, the more sensitive its ranking is to irrelevant changes. Irrelevant changes 9 | are things like introducing weak models, or changing the metric in ways that shouldn't matter. 10 | 11 | Based on BenchBench, we're maintaining a living [benchmark of multi-task benchmarks](https://socialfoundations.github.io/benchbench/). Visit the project page to see the results or contribute your own benchmark. 12 | 13 | Please see [our paper](https://arxiv.org/pdf/2405.01719) for all relevant background and scientific results. Cite as: 14 | 15 | ``` 16 | @inproceedings{zhang2024inherent, 17 | title={Inherent Trade-Offs between Diversity and Stability in Multi-Task Benchmarks}, 18 | author={Guanhua Zhang and Moritz Hardt}, 19 | booktitle={International Conference on Machine Learning}, 20 | year={2024} 21 | } 22 | ``` 23 | 24 | ## Quick Start 25 | 26 | To install the package, simply run: 27 | 28 | ```bash 29 | pip install benchbench 30 | ``` 31 | 32 | ## Example Usage 33 | 34 | To evaluate a cardinal benchmark, you can use the following code: 35 | 36 | ```python 37 | from benchbench.data import load_cardinal_benchmark 38 | from benchbench.measures.cardinal import get_diversity, get_sensitivity 39 | 40 | data, cols = load_cardinal_benchmark('GLUE') 41 | diversity = get_diversity(data, cols) 42 | sensitivity = get_sensitivity(data, cols) 43 | ``` 44 | 45 | To evaluate an ordinal benchmark, you can use the following code: 46 | 47 | ```python 48 | from benchbench.data import load_ordinal_benchmark 49 | from benchbench.measures.ordinal import get_diversity, get_sensitivity 50 | 51 | data, cols = load_ordinal_benchmark('HELM-accuracy') 52 | diversity = get_diversity(data, cols) 53 | sensitivity = get_sensitivity(data, cols) 54 | ``` 55 | 56 | To use your own benchmark, you just need to provide a pandas DataFrame and a list of columns indicating the tasks. 57 | Check the [documentation](https://socialfoundations.github.io/benchbench) for more details. 58 | 59 | ## Reproduce the results from our paper 60 | 61 |
62 | ![BenchBench results](assets/banner.png) 63 | 
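As noted in the Example Usage section above, the bundled loaders are optional: any benchmark can be scored by passing a pandas DataFrame together with the list of task columns. A minimal sketch (the models, tasks, and scores below are made up for illustration):

```python
import pandas as pd
from benchbench.measures.cardinal import get_diversity, get_sensitivity

# rows are models; the task columns hold one score per model per task
data = pd.DataFrame({
    "model": ["model-a", "model-b", "model-c"],
    "task-1": [90.1, 85.3, 70.2],
    "task-2": [64.0, 71.5, 55.8],
})
cols = ["task-1", "task-2"]
print(get_diversity(data, cols), get_sensitivity(data, cols))
```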
64 | 65 | You can reproduce the figures from our paper using the following Colabs: 66 | 67 | * [cardinal.ipynb](https://githubtocolab.com/socialfoundations/benchbench/blob/main/examples/cardinal.ipynb) 68 | * [ordinal.ipynb](https://githubtocolab.com/socialfoundations/benchbench/blob/main/examples/ordinal.ipynb) 69 | * [banner.ipynb](https://githubtocolab.com/socialfoundations/benchbench/blob/main/examples/banner.ipynb) 70 | -------------------------------------------------------------------------------- /benchbench/data/superglue/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | Rank Name Model URL Score BoolQ CB COPA MultiRC ReCoRD RTE WiC WSC AX-b AX-g 2 | 1 JDExplore d-team Vega v2 91.3 90.5 98.6/99.2 99.4 88.2/62.4 94.4/93.9 96.0 77.4 98.6 -0.4 100.0/50.0 3 | 2 Liam Fedus ST-MoE-32B 91.2 92.4 96.9/98.0 99.2 89.6/65.8 95.1/94.4 93.5 77.7 96.6 72.3 96.1/94.1 4 | 3 Microsoft Alexander v-team Turing NLR v5 90.9 92.0 95.9/97.6 98.2 88.4/63.0 96.4/95.9 94.1 77.1 97.3 67.8 93.3/95.5 5 | 4 ERNIE Team - Baidu ERNIE 3.0 90.6 91.0 98.6/99.2 97.4 88.6/63.2 94.7/94.2 92.6 77.4 97.3 68.6 92.7/94.7 6 | 5 Yi Tay PaLM 540B 90.4 91.9 94.4/96.0 99.0 88.7/63.6 94.2/93.3 94.1 77.4 95.9 72.9 95.5/90.4 7 | 6 Zirui Wang T5 + UDG, Single Model (Google Brain) 90.4 91.4 95.8/97.6 98.0 88.3/63.0 94.2/93.5 93.0 77.9 96.6 69.1 92.7/91.9 8 | 7 DeBERTa Team - Microsoft DeBERTa / TuringNLRv4 90.3 90.4 95.7/97.6 98.4 88.2/63.7 94.5/94.1 93.2 77.5 95.9 66.7 93.3/93.8 9 | 8 SuperGLUE Human Baselines SuperGLUE Human Baselines 89.8 89.0 95.8/98.9 100.0 81.8/51.9 91.7/91.3 93.6 80.0 100.0 76.6 99.3/99.7 10 | 9 T5 Team - Google T5 89.3 91.2 93.9/96.8 94.8 88.1/63.3 94.1/93.4 92.5 76.9 93.8 65.6 92.7/91.9 11 | 10 SPoT Team - Google Frozen T5 1.1 + SPoT 89.2 91.1 95.8/97.6 95.6 87.9/61.9 93.3/92.4 92.9 75.8 93.8 66.9 83.1/82.6 12 | 11 Huawei Noah's Ark Lab NEZHA-Plus 86.7 87.8 94.4/96.0 93.6 84.6/55.1 90.1/89.6 89.1 74.6 93.2 58.0 87.1/74.4 13 | 12 Alibaba PAI&ICBU PAI Albert 86.1 88.1 92.4/96.4 91.8 84.6/54.7 89.0/88.3 88.8 74.1 93.2 75.6 98.3/99.2 14 | 13 Infosys : DAWN : AI Research RoBERTa-iCETS 86.0 88.5 93.2/95.2 91.2 86.4/58.2 89.9/89.3 89.9 72.9 89.0 61.8 88.8/81.5 15 | 14 Tencent Jarvis Lab RoBERTa (ensemble) 85.9 88.2 92.5/95.6 90.8 84.4/53.4 91.5/91.0 87.9 74.1 91.8 57.6 89.3/75.6 16 | 15 Zhuiyi Technology RoBERTa-mtl-adv 85.7 87.1 92.4/95.6 91.2 85.1/54.3 91.7/91.3 88.1 72.1 91.8 58.5 91.0/78.1 17 | 16 Facebook AI RoBERTa 84.6 87.1 90.5/95.2 90.6 84.4/52.5 90.6/90.0 88.2 69.9 89.0 57.9 91.0/78.1 18 | 17 Anuar Sharafudinov AILabs Team, Transformers 82.6 88.1 91.6/94.8 86.8 85.1/54.7 82.8/79.8 88.9 74.1 78.8 100.0 100.0/100.0 19 | 18 Ying Luo FSL++(ALBERT)-Few-Shot(32 Examples) 77.7 81.1 87.8/92.0 87.0 77.3/38.4 81.9/81.1 75.1 60.5 88.4 35.9 94.4/63.5 20 | 19 Rathin Bector Text to Text PETL 77.0 82.0 86.9/92.4 80.2 80.4/44.8 82.2/81.3 78.1 67.6 74.0 38.1 97.2/53.7 21 | 20 CASIA INSTALL(ALBERT)-few-shot 76.6 78.4 85.9/92.0 85.6 75.9/35.1 84.3/83.5 74.9 60.9 84.9 -0.4 100.0/50.0 22 | 21 Rakesh Radhakrishnan Menon ADAPET (ALBERT) - few-shot 76.0 80.0 82.3/92.0 85.4 76.2/35.7 86.1/85.5 75.0 53.5 85.6 -0.4 100.0/50.0 23 | 22 Timo Schick iPET (ALBERT) - Few-Shot (32 Examples) 75.4 81.2 79.9/88.8 90.8 74.1/31.7 85.9/85.4 70.8 49.3 88.4 36.2 97.8/57.9 24 | 23 Adrian de Wynter Bort (Alexa AI) 74.1 83.7 81.9/86.4 89.6 83.7/54.1 49.8/49.0 81.2 70.1 65.8 48.0 96.1/61.5 25 | 24 IBM Research AI BERT-mtl 73.5 84.8 89.6/94.0 73.8 73.2/30.5 74.6/74.0 84.1 66.2 61.0 29.6 97.8/57.3 
26 | 25 Ben Mann GPT-3 few-shot - OpenAI 71.8 76.4 52.0/75.6 92.0 75.4/30.5 91.1/90.2 69.0 49.4 80.1 21.1 90.4/55.3 27 | 26 SuperGLUE Baselines BERT++ 71.5 79.0 84.8/90.4 73.8 70.0/24.1 72.0/71.3 79.0 69.6 64.4 38.0 99.4/51.4 28 | 27 Jeff Yang select-step-by-step 51.9 62.2 68.2/76.0 96.4 0.0/0.5 14.0/13.6 49.7 53.1 67.8 -0.4 100.0/50.0 29 | 28 Karen Hambardzumyan WARP (ALBERT-XXL-V2) - Few-Shot (32 Examples) 48.7 62.2 70.2/82.4 51.6 0.0/0.5 14.0/13.6 69.1 53.1 63.7 -0.4 100.0/50.0 30 | -------------------------------------------------------------------------------- /benchbench/data/heim/quality_auto.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Expected LPIPS score ↓ [ sort ] MS-COCO (base) - Expected Multi-Scale SSIM ↑ [ sort ] MS-COCO (base) - Expected PSNR ↑ [ sort ] MS-COCO (base) - Expected UIQI ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected LPIPS score ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Expected Multi-Scale SSIM ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected PSNR ↑ [ sort ] Caltech-UCSD Birds-200-2011 - Expected UIQI ↑ [ sort ] Winoground - Expected LPIPS score ↓ [ sort ] Winoground - Expected Multi-Scale SSIM ↑ [ sort ] Winoground - Expected PSNR ↑ [ sort ] Winoground - Expected UIQI ↑ [ sort ] 2 | Redshift Diffusion (1B) 0.863 0.739 0.07 9.386 0.003 0.765 0.108 10.695 0.002 0.752 0.082 8.727 0.005 3 | Dreamlike Photoreal v2.0 (1B) 0.7 0.733 0.054 8.757 0.003 0.774 0.093 10.868 0.001 0.75 0.058 8.441 0.003 4 | DALL-E mini (0.4B) 0.663 0.741 0.084 8.677 0.003 0.776 0.112 10.649 0.001 0.774 0.085 7.953 0.003 5 | Dreamlike Diffusion v1.0 (1B) 0.637 0.731 0.051 8.746 0.002 0.765 0.106 10.484 0.001 0.757 0.054 8.201 0.003 6 | Stable Diffusion v2 base (1B) 0.627 0.735 0.062 8.573 0.002 0.774 0.087 10.706 0.001 0.768 0.064 8.185 0.002 7 | GigaGAN (1B) 0.57 0.737 0.079 8.466 0.003 0.748 0.073 9.285 -0.001 0.758 0.078 8.17 0.003 8 | Openjourney v1 (1B) 0.557 0.756 0.063 8.686 0.004 0.787 0.063 9.807 0.002 0.768 0.063 8.126 0.004 9 | Stable Diffusion v2.1 base (1B) 0.553 0.739 0.053 8.409 0.004 0.794 0.097 10.563 0.001 0.766 0.061 8.143 0.002 10 | Stable Diffusion v1.4 (1B) 0.55 0.739 0.061 8.602 0.002 0.772 0.103 10.567 0.001 0.763 0.061 7.989 0.002 11 | minDALL-E (1.3B) 0.533 0.738 0.082 8.27 0.001 0.781 0.109 9.44 0.001 0.75 0.089 7.703 0 12 | Promptist + Stable Diffusion v1.4 (1B) 0.523 0.751 0.072 9.111 0.002 0.796 0.092 9.843 -0.001 0.77 0.074 8.204 0.004 13 | DeepFloyd IF X-Large (4.3B) 0.503 0.741 0.081 7.985 0.003 0.803 0.089 9.314 0.001 0.763 0.084 7.843 0.003 14 | Stable Diffusion v1.5 (1B) 0.49 0.74 0.059 8.614 0.002 0.774 0.09 10.431 0.001 0.764 0.056 8.026 0.002 15 | Safe Stable Diffusion weak (1B) 0.483 0.741 0.06 8.553 0.002 0.777 0.094 10.413 0.001 0.765 0.059 8 0.002 16 | MultiFusion (13B) 0.477 0.733 0.056 8.749 0.002 0.769 0.082 10.097 0.001 0.756 0.053 8.116 0 17 | DeepFloyd IF Medium (0.4B) 0.44 0.739 0.076 8.102 0.003 0.794 0.084 9.588 0.001 0.769 0.074 7.754 0.003 18 | DALL-E mega (2.6B) 0.44 0.742 0.079 8.245 0.002 0.792 0.095 9.364 0.001 0.768 0.078 7.694 0.002 19 | DeepFloyd IF Large (0.9B) 0.433 0.743 0.072 7.857 0.003 0.804 0.089 9.609 0.001 0.762 0.073 7.8 0.002 20 | Lexica Search with Stable Diffusion v1.5 (1B) 0.43 0.762 0.066 9.018 0.002 0.802 0.079 9.764 0.002 0.778 0.07 8.241 0.002 21 | Safe Stable Diffusion medium (1B) 0.42 0.746 0.063 8.529 0.002 0.78 0.094 10.36 0.001 0.772 0.059 8.012 0.002 22 | DALL-E 2 (3.5B) 0.407 0.74 0.073 8.234 0.001 
0.777 0.081 9.111 0.001 0.744 0.077 7.763 0.002 23 | CogView2 (6B) 0.4 0.755 0.084 8.307 0.001 0.783 0.113 9.198 -0.001 0.759 0.084 7.613 0.001 24 | Openjourney v2 (1B) 0.38 0.743 0.06 8.346 0.002 0.775 0.09 9.901 0 0.763 0.061 7.998 0.001 25 | Vintedois (22h) Diffusion model v0.1 (1B) 0.363 0.757 0.051 8.101 0.003 0.788 0.095 9.588 0.002 0.777 0.054 7.675 0.004 26 | Safe Stable Diffusion strong (1B) 0.297 0.75 0.059 8.403 0.001 0.79 0.085 10.2 0.001 0.774 0.063 7.903 0.001 27 | Safe Stable Diffusion max (1B) 0.26 0.759 0.06 8.26 0.002 0.802 0.085 9.913 0 0.786 0.069 7.685 0.002 28 | -------------------------------------------------------------------------------- /benchbench/data/heim/originality.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Watermark frac ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Watermark frac ↓ [ sort ] DrawBench (image quality categories) - Watermark frac ↓ [ sort ] PartiPrompts (image quality categories) - Watermark frac ↓ [ sort ] dailydall.e - Watermark frac ↓ [ sort ] Landing Page - Watermark frac ↓ [ sort ] Logos - Watermark frac ↓ [ sort ] Magazine Cover Photos - Watermark frac ↓ [ sort ] Common Syntactic Processes - Watermark frac ↓ [ sort ] DrawBench (reasoning categories) - Watermark frac ↓ [ sort ] PartiPrompts (reasoning categories) - Watermark frac ↓ [ sort ] Relational Understanding - Watermark frac ↓ [ sort ] Detection (PaintSkills) - Watermark frac ↓ [ sort ] Winoground - Watermark frac ↓ [ sort ] PartiPrompts (knowledge categories) - Watermark frac ↓ [ sort ] DrawBench (knowledge categories) - Watermark frac ↓ [ sort ] TIME's most significant historical figures - Watermark frac ↓ [ sort ] Demographic Stereotypes - Watermark frac ↓ [ sort ] Mental Disorders - Watermark frac ↓ [ sort ] Inappropriate Image Prompts (I2P) - Watermark frac ↓ [ sort ] 2 | GigaGAN (1B) 0.932 0 0 0 0 0 0 0.01 0 0 0 0 0 0 0 0 0 0 0 0 0.001 3 | DeepFloyd IF Medium (0.4B) 0.784 0 0 0 0.002 0 0 0 0 0 0 0 0 0 0.003 0.003 0 0 0 0 0.001 4 | Lexica Search with Stable Diffusion v1.5 (1B) 0.75 0 0 0 0.001 0 0 0.05 0 0 0 0.003 0 0 0 0.005 0.007 0 0 0 0 5 | DeepFloyd IF X-Large (4.3B) 0.75 0 0 0 0 0 0 0.01 0 0 0 0 0 0 0 0.003 0 0 0 0 0.004 6 | DeepFloyd IF Large (0.9B) 0.712 0 0 0 0.001 0.003 0 0.005 0 0 0 0 0 0 0.003 0 0 0 0.004 0 0.001 7 | Dreamlike Diffusion v1.0 (1B) 0.674 0 0 0 0.001 0 0 0.013 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | DALL-E 2 (3.5B) 0.63 0.003 0 0 0.001 0 0 0.01 0 0 0 0.003 0 0 0.003 0.013 0 0.005 0 0.021 0.003 9 | Openjourney v1 (1B) 0.612 0 0 0 0 0 0.007 0.003 0 0 0 0 0 0 0 0 0 0 0.004 0.014 0.001 10 | Dreamlike Photoreal v2.0 (1B) 0.586 0 0 0.006 0.001 0 0 0.008 0.005 0 0 0.003 0 0 0 0 0 0 0.005 0 0.001 11 | Openjourney v2 (1B) 0.548 0 0 0 0.001 0.003 0.007 0.005 0 0 0 0.003 0 0 0 0.003 0 0 0 0 0 12 | Redshift Diffusion (1B) 0.548 0.003 0 0 0 0 0 0.008 0 0 0 0 0 0 0 0.005 0 0 0 0 0.001 13 | DALL-E mega (2.6B) 0.546 0.003 0 0 0.001 0 0 0 0.005 0 0 0.003 0 0 0 0.005 0.007 0.003 0 0 0.004 14 | Promptist + Stable Diffusion v1.4 (1B) 0.542 0 0 0 0 0 0 0.01 0 0 0 0.003 0 0 0 0 0 0 0 0 0 15 | Stable Diffusion v1.4 (1B) 0.53 0 0 0 0 0 0 0.005 0 0 0 0 0 0 0 0.003 0 0 0 0 0.001 16 | Stable Diffusion v1.5 (1B) 0.466 0 0 0 0 0 0 0.01 0 0 0 0 0 0 0 0.003 0 0.003 0.004 0 0 17 | Stable Diffusion v2.1 base (1B) 0.462 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0.001 18 | DALL-E mini (0.4B) 0.458 0.018 0 0 0.006 0.016 0 0 0.01 0.003 0.013 0.005 0 0 0 0.005 0 0.008 0 0 0.009 19 | Stable Diffusion v2 
base (1B) 0.43 0 0 0 0 0.003 0.007 0.008 0 0 0 0 0 0 0 0 0 0 0 0 0.002 20 | Safe Stable Diffusion medium (1B) 0.378 0 0 0 0 0 0 0.008 0 0 0 0 0 0 0 0 0 0 0 0 0.001 21 | Safe Stable Diffusion strong (1B) 0.36 0 0 0 0 0 0 0.005 0.005 0 0 0 0 0 0 0 0 0 0 0 0 22 | Safe Stable Diffusion weak (1B) 0.358 0 0 0 0.001 0 0.007 0.008 0 0.005 0 0 0 0 0 0.003 0 0 0 0 0.001 23 | Safe Stable Diffusion max (1B) 0.344 0 0 0 0 0 0 0.003 0 0 0.002 0.003 0 0 0 0 0 0 0 0 0 24 | Vintedois (22h) Diffusion model v0.1 (1B) 0.308 0 0 0 0 0 0 0.003 0 0 0.002 0 0 0 0 0.005 0 0 0 0 0 25 | minDALL-E (1.3B) 0.17 0.025 0.05 0.036 0.053 0.083 0 0.048 0 0.045 0.036 0.05 0.037 0.011 0.02 0.02 0.033 0.015 0.039 0 0.03 26 | MultiFusion (13B) 0.11 0.038 0.13 0.027 0.052 0.094 0.007 0.155 0.005 0.011 0.011 0.08 0.063 0.003 0.04 0.038 0.039 0 0.078 0.056 0.041 27 | CogView2 (6B) 0.012 0.065 0.1 0.047 0.186 0.22 0.021 0.163 0.03 0.073 0.01 0.145 0.3 0.016 0.19 0.1 0.007 0.076 0.228 0.236 0.096 28 | -------------------------------------------------------------------------------- /benchbench/data/heim/black_out.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Blacked out frac ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Blacked out frac ↓ [ sort ] DrawBench (image quality categories) - Blacked out frac ↓ [ sort ] PartiPrompts (image quality categories) - Blacked out frac ↓ [ sort ] dailydall.e - Blacked out frac ↓ [ sort ] Landing Page - Blacked out frac ↓ [ sort ] Logos - Blacked out frac ↓ [ sort ] Magazine Cover Photos - Blacked out frac ↓ [ sort ] Common Syntactic Processes - Blacked out frac ↓ [ sort ] DrawBench (reasoning categories) - Blacked out frac ↓ [ sort ] PartiPrompts (reasoning categories) - Blacked out frac ↓ [ sort ] Relational Understanding - Blacked out frac ↓ [ sort ] Detection (PaintSkills) - Blacked out frac ↓ [ sort ] Winoground - Blacked out frac ↓ [ sort ] PartiPrompts (knowledge categories) - Blacked out frac ↓ [ sort ] DrawBench (knowledge categories) - Blacked out frac ↓ [ sort ] TIME's most significant historical figures - Blacked out frac ↓ [ sort ] Demographic Stereotypes - Blacked out frac ↓ [ sort ] Mental Disorders - Blacked out frac ↓ [ sort ] Inappropriate Image Prompts (I2P) - Blacked out frac ↓ [ sort ] 2 | MultiFusion (13B) 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 3 | GigaGAN (1B) 0.96 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 4 | DALL-E 2 (3.5B) 0.92 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 5 | Lexica Search with Stable Diffusion v1.5 (1B) 0.88 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 6 | DeepFloyd IF Medium (0.4B) 0.84 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 7 | DeepFloyd IF Large (0.9B) 0.8 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 8 | DeepFloyd IF X-Large (4.3B) 0.76 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 9 | minDALL-E (1.3B) 0.72 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 10 | DALL-E mini (0.4B) 0.68 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 11 | DALL-E mega (2.6B) 0.614 0 0 0 0 0.003 0 0.003 0 0 0 0 0 0 0 0 0 0 0 0 0 12 | CogView2 (6B) 0.604 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 13 | Dreamlike Photoreal v2.0 (1B) 0.564 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 14 | Dreamlike Diffusion v1.0 (1B) 0.524 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 15 | Stable Diffusion v2 base (1B) 0.446 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 16 | Stable Diffusion v2.1 base (1B) 0.406 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 17 | Openjourney v1 (1B) 0.278 0.005 0 0.051 0.01 0.003 0 0.018 0.01 0.021 0.017 0.03 0.01 0.001 0.01 0 
0.007 0.02 0 0 0.089 18 | Safe Stable Diffusion strong (1B) 0.272 0.003 0 0.01 0.006 0 0.007 0.008 0.005 0.006 0.004 0.025 0 0.001 0.008 0.005 0.007 0.028 0 0 0.077 19 | Safe Stable Diffusion medium (1B) 0.254 0 0 0.015 0.005 0.003 0 0.015 0.01 0.003 0.002 0.025 0.003 0 0.008 0.005 0.013 0.038 0.026 0 0.102 20 | Safe Stable Diffusion max (1B) 0.23 0 0 0.02 0.007 0.008 0 0.01 0.005 0.011 0.004 0.01 0.003 0.002 0.003 0.003 0.026 0.03 0.005 0 0.064 21 | Openjourney v2 (1B) 0.21 0 0.01 0.056 0.012 0 0.007 0.023 0.035 0.014 0.004 0.038 0.007 0.001 0.018 0 0.033 0.025 0.005 0 0.102 22 | Redshift Diffusion (1B) 0.208 0.013 0 0.066 0.02 0.011 0 0.01 0.005 0.022 0.027 0.025 0.023 0.004 0.01 0 0.033 0.008 0.005 0 0.088 23 | Safe Stable Diffusion weak (1B) 0.206 0.003 0 0.025 0.009 0 0 0.02 0.025 0.009 0.002 0.05 0.003 0.002 0.01 0.003 0 0.023 0.008 0.014 0.148 24 | Vintedois (22h) Diffusion model v0.1 (1B) 0.178 0 0.01 0.025 0.008 0.003 0 0.015 0.01 0.023 0.021 0.025 0 0.007 0.005 0 0.007 0.025 0.008 0.014 0.121 25 | Stable Diffusion v1.5 (1B) 0.176 0.003 0 0.025 0.012 0.003 0 0.013 0.025 0.006 0.002 0.04 0.003 0.003 0.01 0.008 0.026 0.04 0.022 0.028 0.163 26 | Stable Diffusion v1.4 (1B) 0.174 0.008 0 0.035 0.01 0.003 0 0.025 0.02 0.006 0.008 0.043 0.007 0.001 0.02 0.01 0.007 0.028 0.033 0 0.177 27 | Promptist + Stable Diffusion v1.4 (1B) 0.096 0.018 0.03 0.05 0.018 0.005 0 0.033 0.025 0.049 0.025 0.038 0.017 0.002 0.023 0.005 0.02 0.025 0.038 0.014 0.15 28 | -------------------------------------------------------------------------------- /benchbench/data/bigcode/vanilla.txt: -------------------------------------------------------------------------------- 1 | T 2 | Models 3 | 4 | Win Rate 5 | humaneval-python 6 | java 7 | javascript 8 | Throughput (tokens/s) 9 | 🔴 10 | DeepSeek-Coder-33b-instruct 11 | 12 | 39.58 13 | 80.02 14 | 52.03 15 | 65.13 16 | 25.2 17 | 🔴 18 | DeepSeek-Coder-7b-instruct 19 | 20 | 38.75 21 | 80.22 22 | 53.34 23 | 65.8 24 | 51 25 | 🔶 26 | Phind-CodeLlama-34B-v2 27 | 28 | 37.04 29 | 71.95 30 | 54.06 31 | 65.34 32 | 15.1 33 | 🔶 34 | Phind-CodeLlama-34B-v1 35 | 36 | 36.12 37 | 65.85 38 | 49.47 39 | 64.45 40 | 15.1 41 | 🔶 42 | Phind-CodeLlama-34B-Python-v1 43 | 44 | 35.27 45 | 70.22 46 | 48.72 47 | 66.24 48 | 15.1 49 | 🔴 50 | DeepSeek-Coder-33b-base 51 | 52 | 35 53 | 52.45 54 | 43.77 55 | 51.28 56 | 25.2 57 | 🔶 58 | WizardCoder-Python-34B-V1.0 59 | 60 | 33.96 61 | 70.73 62 | 44.94 63 | 55.28 64 | 15.1 65 | 🔴 66 | DeepSeek-Coder-7b-base 67 | 68 | 31.75 69 | 45.83 70 | 37.72 71 | 45.9 72 | 51 73 | 🔶 74 | CodeLlama-34b-Instruct 75 | 76 | 30.96 77 | 50.79 78 | 41.53 79 | 45.85 80 | 15.1 81 | 🔶 82 | WizardCoder-Python-13B-V1.0 83 | 84 | 30.58 85 | 62.19 86 | 41.77 87 | 48.45 88 | 25.3 89 | 🟢 90 | CodeLlama-34b 91 | 92 | 30.35 93 | 45.11 94 | 40.19 95 | 41.66 96 | 15.1 97 | 🟢 98 | CodeLlama-34b-Python 99 | 100 | 29.65 101 | 53.29 102 | 39.46 103 | 44.72 104 | 15.1 105 | 🔶 106 | WizardCoder-15B-V1.0 107 | 108 | 28.92 109 | 58.12 110 | 35.77 111 | 41.91 112 | 43.7 113 | 🔶 114 | CodeLlama-13b-Instruct 115 | 116 | 27.88 117 | 50.6 118 | 33.99 119 | 40.92 120 | 25.3 121 | 🟢 122 | CodeLlama-13b 123 | 124 | 26.19 125 | 35.07 126 | 32.23 127 | 38.26 128 | 25.3 129 | 🟢 130 | CodeLlama-13b-Python 131 | 132 | 24.73 133 | 42.89 134 | 33.56 135 | 40.66 136 | 25.3 137 | 🔶 138 | CodeLlama-7b-Instruct 139 | 140 | 23.69 141 | 45.65 142 | 28.77 143 | 33.11 144 | 33.1 145 | 🟢 146 | CodeLlama-7b 147 | 148 | 22.31 149 | 29.98 150 | 29.2 151 | 31.8 152 | 33.1 153 | 🔴 154 | CodeShell-7B 155 | 156 | 22.31 157 | 
34.32 158 | 30.43 159 | 33.17 160 | 33.9 161 | 🔶 162 | OctoCoder-15B 163 | 164 | 21.15 165 | 45.3 166 | 26.03 167 | 32.8 168 | 44.4 169 | 🟢 170 | Falcon-180B 171 | 172 | 20.9 173 | 35.37 174 | 28.48 175 | 31.68 176 | -1 177 | 🟢 178 | CodeLlama-7b-Python 179 | 180 | 20.62 181 | 40.48 182 | 29.15 183 | 36.34 184 | 33.1 185 | 🟢 186 | StarCoder-15B 187 | 188 | 20.58 189 | 33.57 190 | 30.22 191 | 30.79 192 | 43.9 193 | 🟢 194 | StarCoderBase-15B 195 | 196 | 20.15 197 | 30.35 198 | 28.53 199 | 31.7 200 | 43.8 201 | 🟢 202 | CodeGeex2-6B 203 | 204 | 17.42 205 | 33.49 206 | 23.46 207 | 29.9 208 | 32.7 209 | 🟢 210 | StarCoderBase-7B 211 | 212 | 16.85 213 | 28.37 214 | 24.44 215 | 27.35 216 | 46.9 217 | 🔶 218 | OctoGeeX-7B 219 | 220 | 16.65 221 | 42.28 222 | 19.33 223 | 28.5 224 | 32.7 225 | 🔶 226 | WizardCoder-3B-V1.0 227 | 228 | 15.73 229 | 32.92 230 | 24.34 231 | 26.16 232 | 50 233 | 🟢 234 | CodeGen25-7B-multi 235 | 236 | 15.35 237 | 28.7 238 | 26.01 239 | 26.27 240 | 32.6 241 | 🔶 242 | Refact-1.6B 243 | 244 | 14.85 245 | 31.1 246 | 22.78 247 | 22.36 248 | 50 249 | 🔴 250 | DeepSeek-Coder-1b-base 251 | 252 | 14.42 253 | 32.13 254 | 27.16 255 | 28.46 256 | -1 257 | 🟢 258 | StarCoderBase-3B 259 | 260 | 11.65 261 | 21.5 262 | 19.25 263 | 21.32 264 | 50 265 | 🔶 266 | WizardCoder-1B-V1.0 267 | 268 | 10.35 269 | 23.17 270 | 19.68 271 | 19.13 272 | 71.4 273 | 🟢 274 | Replit-2.7B 275 | 276 | 8.54 277 | 20.12 278 | 21.39 279 | 20.18 280 | 42.2 281 | 🟢 282 | CodeGen25-7B-mono 283 | 284 | 8.15 285 | 33.08 286 | 19.75 287 | 23.22 288 | 34.1 289 | 🟢 290 | StarCoderBase-1.1B 291 | 292 | 8.12 293 | 15.17 294 | 14.2 295 | 13.38 296 | 71.4 297 | 🟢 298 | CodeGen-16B-Multi 299 | 300 | 7.08 301 | 19.26 302 | 22.2 303 | 19.15 304 | 17.2 305 | 🟢 306 | Phi-1 307 | 308 | 6.25 309 | 51.22 310 | 10.76 311 | 19.25 312 | -1 313 | 🟢 314 | StableCode-3B 315 | 316 | 6.04 317 | 20.2 318 | 19.54 319 | 18.98 320 | 30.2 321 | 🟢 322 | DeciCoder-1B 323 | 324 | 5.81 325 | 19.32 326 | 15.3 327 | 17.85 328 | 54.6 329 | 🟢 330 | SantaCoder-1.1B 331 | 332 | 4.58 333 | 18.12 334 | 15 335 | 15.47 336 | 50.8 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode/ 2 | .cache/ 3 | figures/ 4 | .idea/ 5 | .ipynb_checkpoints/ 6 | *.DS_Store 7 | 8 | ### Python template 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | *.py,cover 58 | .hypothesis/ 59 | .pytest_cache/ 60 | cover/ 61 | 62 | # Translations 63 | *.mo 64 | *.pot 65 | 66 | # Django stuff: 67 | *.log 68 | local_settings.py 69 | db.sqlite3 70 | db.sqlite3-journal 71 | 72 | # Flask stuff: 73 | instance/ 74 | .webassets-cache 75 | 76 | # Scrapy stuff: 77 | .scrapy 78 | 79 | # Sphinx documentation 80 | docs/_build/ 81 | _build 82 | 83 | # PyBuilder 84 | .pybuilder/ 85 | target/ 86 | 87 | # Jupyter Notebook 88 | .ipynb_checkpoints 89 | 90 | # IPython 91 | profile_default/ 92 | ipython_config.py 93 | 94 | # pyenv 95 | # For a library or package, you might want to ignore these files since the code is 96 | # intended to run in multiple environments; otherwise, check them in: 97 | # .python-version 98 | 99 | # pipenv 100 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 101 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 102 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 103 | # install all needed dependencies. 104 | #Pipfile.lock 105 | 106 | # poetry 107 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 108 | # This is especially recommended for binary packages to ensure reproducibility, and is more 109 | # commonly ignored for libraries. 110 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 111 | #poetry.lock 112 | 113 | # pdm 114 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 115 | #pdm.lock 116 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 117 | # in version control. 118 | # https://pdm.fming.dev/#use-with-ide 119 | .pdm.toml 120 | 121 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 122 | __pypackages__/ 123 | 124 | # Celery stuff 125 | celerybeat-schedule 126 | celerybeat.pid 127 | 128 | # SageMath parsed files 129 | *.sage.py 130 | 131 | # Environments 132 | .env 133 | .venv 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | 140 | # Spyder project settings 141 | .spyderproject 142 | .spyproject 143 | 144 | # Rope project settings 145 | .ropeproject 146 | 147 | # mkdocs documentation 148 | /site 149 | 150 | # mypy 151 | .mypy_cache/ 152 | .dmypy.json 153 | dmypy.json 154 | 155 | # Pyre type checker 156 | .pyre/ 157 | 158 | # pytype static type analyzer 159 | .pytype/ 160 | 161 | # Cython debug symbols 162 | cython_debug/ 163 | 164 | # PyCharm 165 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 166 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 167 | # and can be added to the global gitignore or merged into this file. For a more nuclear 168 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
169 | #.idea/ 170 | -------------------------------------------------------------------------------- /benchbench/data/helm_capability/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | Model Mean score MMLU-Pro - COT correct GPQA - COT correct IFEval - IFEval Strict Acc WildBench - WB Score Omni-MATH - Acc 2 | GPT-5 mini (2025-08-07) 0.819 0.835 0.756 0.927 0.855 0.722 3 | o4-mini (2025-04-16) 0.812 0.82 0.735 0.929 0.854 0.72 4 | o3 (2025-04-16) 0.811 0.859 0.753 0.869 0.861 0.714 5 | GPT-5 (2025-08-07) 0.807 0.863 0.791 0.875 0.857 0.647 6 | Qwen3 235B A22B Instruct 2507 FP8 0.798 0.844 0.726 0.835 0.866 0.718 7 | Grok 4 (0709) 0.785 0.851 0.726 0.949 0.797 0.603 8 | Claude 4 Opus (20250514, extended thinking) 0.78 0.875 0.709 0.849 0.852 0.616 9 | gpt-oss-120b 0.77 0.795 0.684 0.836 0.845 0.688 10 | Kimi K2 Instruct 0.768 0.819 0.652 0.85 0.862 0.654 11 | Claude 4 Sonnet (20250514, extended thinking) 0.766 0.843 0.706 0.84 0.838 0.602 12 | Claude 4.5 Sonnet (20250929) 0.762 0.869 0.686 0.85 0.854 0.553 13 | Claude 4 Opus (20250514) 0.757 0.859 0.666 0.918 0.833 0.511 14 | GPT-5 nano (2025-08-07) 0.748 0.778 0.679 0.932 0.806 0.547 15 | Gemini 2.5 Pro (03-25 preview) 0.745 0.863 0.749 0.84 0.857 0.416 16 | Claude 4 Sonnet (20250514) 0.733 0.843 0.643 0.839 0.825 0.513 17 | Grok 3 Beta 0.727 0.788 0.65 0.884 0.849 0.464 18 | GPT-4.1 (2025-04-14) 0.727 0.811 0.659 0.838 0.854 0.471 19 | Qwen3 235B A22B FP8 Throughput 0.726 0.817 0.623 0.816 0.828 0.548 20 | GPT-4.1 mini (2025-04-14) 0.726 0.783 0.614 0.904 0.838 0.491 21 | Llama 4 Maverick (17Bx128E) Instruct FP8 0.718 0.81 0.65 0.908 0.8 0.422 22 | Qwen3-Next 80B A3B Thinking 0.7 0.786 0.63 0.81 0.807 0.467 23 | DeepSeek-R1-0528 0.699 0.793 0.666 0.784 0.828 0.424 24 | Palmyra X5 0.696 0.804 0.661 0.823 0.78 0.415 25 | Grok 3 mini Beta 0.679 0.799 0.675 0.951 0.651 0.318 26 | Gemini 2.0 Flash 0.679 0.737 0.556 0.841 0.8 0.459 27 | Claude 3.7 Sonnet (20250219) 0.674 0.784 0.608 0.834 0.814 0.33 28 | gpt-oss-20b 0.674 0.74 0.594 0.732 0.737 0.565 29 | GLM-4.5-Air-FP8 0.67 0.762 0.594 0.812 0.789 0.391 30 | DeepSeek v3 0.665 0.723 0.538 0.832 0.831 0.403 31 | Gemini 1.5 Pro (002) 0.657 0.737 0.534 0.837 0.813 0.364 32 | Claude 3.5 Sonnet (20241022) 0.653 0.777 0.565 0.856 0.792 0.276 33 | Llama 4 Scout (17Bx16E) Instruct 0.644 0.742 0.507 0.818 0.779 0.373 34 | Gemini 2.0 Flash Lite (02-05 preview) 0.642 0.72 0.5 0.824 0.79 0.374 35 | Amazon Nova Premier 0.637 0.726 0.518 0.803 0.788 0.35 36 | GPT-4o (2024-11-20) 0.634 0.713 0.52 0.817 0.828 0.293 37 | Gemini 2.5 Flash (04-17 preview) 0.626 0.639 0.39 0.898 0.817 0.384 38 | Llama 3.1 Instruct Turbo (405B) 0.618 0.723 0.522 0.811 0.783 0.249 39 | GPT-4.1 nano (2025-04-14) 0.616 0.55 0.507 0.843 0.811 0.367 40 | Palmyra-X-004 0.609 0.657 0.395 0.872 0.802 0.32 41 | Gemini 1.5 Flash (002) 0.609 0.678 0.437 0.831 0.792 0.305 42 | Qwen2.5 Instruct Turbo (72B) 0.599 0.631 0.426 0.806 0.802 0.33 43 | Mistral Large (2411) 0.598 0.599 0.435 0.876 0.801 0.281 44 | Gemini 2.5 Flash-Lite 0.591 0.537 0.309 0.81 0.818 0.48 45 | Amazon Nova Pro 0.591 0.673 0.446 0.815 0.777 0.242 46 | Palmyra Fin 0.577 0.591 0.422 0.793 0.783 0.295 47 | IBM Granite 4.0 Small 0.575 0.569 0.383 0.89 0.739 0.296 48 | Llama 3.1 Instruct Turbo (70B) 0.574 0.653 0.426 0.821 0.758 0.21 49 | GPT-4o mini (2024-07-18) 0.565 0.603 0.368 0.782 0.791 0.28 50 | Mistral Small 3.1 (2503) 0.558 0.61 0.392 0.75 0.788 0.248 51 | Amazon Nova Lite 0.551 0.6 0.397 0.776 0.75 0.233 52 | Claude 
3.5 Haiku (20241022) 0.549 0.605 0.363 0.792 0.76 0.224 53 | Qwen2.5 Instruct Turbo (7B) 0.529 0.539 0.341 0.741 0.731 0.294 54 | Amazon Nova Micro 0.522 0.511 0.383 0.76 0.743 0.214 55 | IBM Granite 4.0 Micro 0.486 0.395 0.307 0.849 0.67 0.209 56 | Mixtral Instruct (8x22B) 0.478 0.46 0.334 0.724 0.711 0.163 57 | Palmyra Med 0.476 0.411 0.368 0.767 0.676 0.156 58 | OLMo 2 32B Instruct March 2025 0.475 0.414 0.287 0.78 0.734 0.161 59 | IBM Granite 3.3 8B Instruct 0.463 0.343 0.325 0.729 0.741 0.176 60 | Llama 3.1 Instruct Turbo (8B) 0.444 0.406 0.247 0.743 0.686 0.137 61 | OLMo 2 13B Instruct November 2024 0.44 0.31 0.316 0.73 0.689 0.156 62 | OLMo 2 7B Instruct November 2024 0.405 0.292 0.296 0.693 0.628 0.116 63 | Mixtral Instruct (8x7B) 0.397 0.335 0.296 0.575 0.673 0.105 64 | Mistral Instruct v0.3 (7B) 0.376 0.277 0.303 0.567 0.66 0.072 65 | OLMoE 1B-7B Instruct January 2025 0.332 0.169 0.22 0.628 0.551 0.093 66 | Marin 8B Instruct 0.325 0.188 0.168 0.632 0.477 0.16 67 | -------------------------------------------------------------------------------- /benchbench/data/heim/nsfw.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - NSFW frac given non-toxic prompt ↓ [ sort ] Caltech-UCSD Birds-200-2011 - NSFW frac given non-toxic prompt ↓ [ sort ] DrawBench (image quality categories) - NSFW frac given non-toxic prompt ↓ [ sort ] PartiPrompts (image quality categories) - NSFW frac given non-toxic prompt ↓ [ sort ] dailydall.e - NSFW frac given non-toxic prompt ↓ [ sort ] Landing Page - NSFW frac given non-toxic prompt ↓ [ sort ] Logos - NSFW frac given non-toxic prompt ↓ [ sort ] Magazine Cover Photos - NSFW frac given non-toxic prompt ↓ [ sort ] Common Syntactic Processes - NSFW frac given non-toxic prompt ↓ [ sort ] DrawBench (reasoning categories) - NSFW frac given non-toxic prompt ↓ [ sort ] PartiPrompts (reasoning categories) - NSFW frac given non-toxic prompt ↓ [ sort ] Relational Understanding - NSFW frac given non-toxic prompt ↓ [ sort ] Detection (PaintSkills) - NSFW frac given non-toxic prompt ↓ [ sort ] Winoground - NSFW frac given non-toxic prompt ↓ [ sort ] PartiPrompts (knowledge categories) - NSFW frac given non-toxic prompt ↓ [ sort ] DrawBench (knowledge categories) - NSFW frac given non-toxic prompt ↓ [ sort ] TIME's most significant historical figures - NSFW frac given non-toxic prompt ↓ [ sort ] Demographic Stereotypes - NSFW frac given non-toxic prompt ↓ [ sort ] Mental Disorders - NSFW frac given non-toxic prompt ↓ [ sort ] Inappropriate Image Prompts (I2P) - NSFW frac given non-toxic prompt ↓ [ sort ] 2 | GigaGAN (1B) 0.846 0.01 0 0 0.002 0 0 0 0.01 0.04 0 0 0 0 0.003 0 0 0 0 0 0.006 3 | Lexica Search with Stable Diffusion v1.5 (1B) 0.84 0 0 0 0 0 0 0 0 0 0 0 0 0 0.008 0 0 0 0.005 0.014 0.01 4 | DALL-E mini (0.4B) 0.798 0 0 0 0 0 0 0 0 0 0 0 0 0 0.003 0 0 0 0 0 0.002 5 | DeepFloyd IF Large (0.9B) 0.758 0.005 0 0 0 0.003 0 0 0 0.009 0 0 0.003 0 0.018 0 0 0 0 0 0.01 6 | MultiFusion (13B) 0.738 0.013 0 0 0.002 0.003 0 0 0.005 0.031 0 0 0 0.001 0.02 0 0 0 0.005 0.014 0.009 7 | DALL-E 2 (3.5B) 0.722 0.013 0 0 0.003 0 0 0 0.02 0.041 0 0 0 0.001 0.01 0 0 0 0.014 0 0.008 8 | minDALL-E (1.3B) 0.712 0.003 0 0 0.001 0 0 0 0.015 0.009 0 0 0 0 0.01 0 0 0 0.008 0 0.003 9 | DALL-E mega (2.6B) 0.712 0 0 0 0 0.003 0 0.003 0 0.008 0 0 0 0 0.005 0 0 0 0.004 0 0.003 10 | DeepFloyd IF X-Large (4.3B) 0.676 0.018 0 0 0 0.003 0 0 0.005 0.011 0 0 0.007 0 0.02 0 0 0 0 0 0.01 11 | DeepFloyd IF Medium 
(0.4B) 0.654 0.008 0 0 0 0 0 0.003 0.005 0.009 0 0 0.013 0.001 0.028 0 0 0 0.01 0 0.013 12 | Dreamlike Diffusion v1.0 (1B) 0.648 0 0 0 0 0 0 0 0 0 0 0 0 0.001 0.005 0 0 0 0 0 0.012 13 | Dreamlike Photoreal v2.0 (1B) 0.564 0.005 0 0 0.001 0.003 0 0 0 0 0 0 0.007 0.001 0.013 0 0 0 0.013 0 0.032 14 | Stable Diffusion v2 base (1B) 0.548 0 0 0 0 0.005 0 0 0 0.014 0 0 0.007 0 0.003 0 0 0 0 0 0.005 15 | Stable Diffusion v2.1 base (1B) 0.542 0 0 0 0 0.003 0 0 0 0.006 0 0 0 0 0.003 0 0 0 0.005 0 0.01 16 | CogView2 (6B) 0.38 0.02 0 0 0.003 0.011 0 0 0.04 0.047 0.006 0 0.007 0.001 0.013 0 0 0.005 0.013 0.042 0.018 17 | Openjourney v1 (1B) 0.352 0.01 0 0.051 0.01 0.003 0 0.018 0.01 0.021 0.017 0.03 0.01 0.002 0.01 0 0.007 0.02 0 0 0.093 18 | Safe Stable Diffusion medium (1B) 0.296 0.005 0 0.015 0.006 0.005 0 0.015 0.025 0.006 0.002 0.025 0.003 0 0.025 0.005 0.013 0.038 0.035 0 0.114 19 | Openjourney v2 (1B) 0.288 0.003 0.01 0.056 0.012 0 0.007 0.023 0.035 0.014 0.004 0.038 0.007 0.001 0.023 0 0.033 0.025 0.005 0 0.108 20 | Safe Stable Diffusion strong (1B) 0.276 0.008 0 0.01 0.007 0.008 0.007 0.008 0.015 0.016 0.004 0.025 0 0.005 0.018 0.005 0.007 0.028 0.013 0 0.092 21 | Vintedois (22h) Diffusion model v0.1 (1B) 0.274 0.003 0.01 0.025 0.008 0.003 0 0.015 0.01 0.023 0.021 0.025 0 0.007 0.005 0 0.007 0.025 0.008 0.014 0.122 22 | Redshift Diffusion (1B) 0.272 0.013 0 0.066 0.021 0.011 0 0.01 0.005 0.022 0.027 0.025 0.023 0.004 0.01 0 0.033 0.008 0.005 0 0.092 23 | Safe Stable Diffusion weak (1B) 0.27 0.008 0 0.025 0.009 0 0 0.02 0.035 0.009 0.002 0.05 0.003 0.002 0.025 0.003 0 0.023 0.013 0.014 0.155 24 | Safe Stable Diffusion max (1B) 0.252 0 0 0.02 0.009 0.013 0 0.01 0.02 0.018 0.004 0.01 0.003 0.003 0.015 0.003 0.026 0.035 0.051 0.014 0.085 25 | Stable Diffusion v1.4 (1B) 0.228 0.01 0 0.035 0.01 0.003 0 0.025 0.03 0.006 0.008 0.043 0.007 0.002 0.03 0.01 0.007 0.028 0.033 0 0.183 26 | Stable Diffusion v1.5 (1B) 0.228 0.008 0 0.025 0.013 0.003 0 0.015 0.03 0.013 0.002 0.04 0.003 0.004 0.02 0.008 0.026 0.04 0.026 0.028 0.17 27 | Promptist + Stable Diffusion v1.4 (1B) 0.126 0.018 0.03 0.05 0.018 0.005 0 0.033 0.035 0.049 0.025 0.038 0.017 0.002 0.023 0.005 0.02 0.025 0.038 0.014 0.155 28 | -------------------------------------------------------------------------------- /benchbench/data/heim/nudity.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Nudity frac ↓ [ sort ] Caltech-UCSD Birds-200-2011 - Nudity frac ↓ [ sort ] DrawBench (image quality categories) - Nudity frac ↓ [ sort ] PartiPrompts (image quality categories) - Nudity frac ↓ [ sort ] dailydall.e - Nudity frac ↓ [ sort ] Landing Page - Nudity frac ↓ [ sort ] Logos - Nudity frac ↓ [ sort ] Magazine Cover Photos - Nudity frac ↓ [ sort ] Common Syntactic Processes - Nudity frac ↓ [ sort ] DrawBench (reasoning categories) - Nudity frac ↓ [ sort ] PartiPrompts (reasoning categories) - Nudity frac ↓ [ sort ] Relational Understanding - Nudity frac ↓ [ sort ] Detection (PaintSkills) - Nudity frac ↓ [ sort ] Winoground - Nudity frac ↓ [ sort ] PartiPrompts (knowledge categories) - Nudity frac ↓ [ sort ] DrawBench (knowledge categories) - Nudity frac ↓ [ sort ] TIME's most significant historical figures - Nudity frac ↓ [ sort ] Demographic Stereotypes - Nudity frac ↓ [ sort ] Mental Disorders - Nudity frac ↓ [ sort ] Inappropriate Image Prompts (I2P) - Nudity frac ↓ [ sort ] 2 | MultiFusion (13B) 0.814 0.058 0.06 0.02 0.034 0.019 0 0 0.025 0.077 0.044 0.01 
0.06 0.008 0.06 0.003 0.007 0.018 0.072 0.236 0.104 3 | minDALL-E (1.3B) 0.706 0.055 0.05 0.015 0.031 0.035 0 0.003 0.035 0.063 0.015 0.033 0.06 0.02 0.07 0.003 0.026 0.033 0.129 0.181 0.069 4 | DeepFloyd IF Large (0.9B) 0.686 0.075 0.12 0.02 0.035 0.016 0 0 0.005 0.161 0.054 0.023 0.153 0.019 0.115 0.013 0.013 0.008 0.038 0.111 0.102 5 | Safe Stable Diffusion strong (1B) 0.67 0.07 0.04 0.01 0.039 0.027 0 0 0.065 0.057 0.033 0.018 0.147 0.024 0.078 0.01 0.026 0.04 0.042 0.097 0.101 6 | DeepFloyd IF X-Large (4.3B) 0.666 0.075 0.11 0.025 0.04 0.024 0 0 0.045 0.121 0.038 0.023 0.11 0.011 0.11 0.008 0.013 0.013 0.021 0.139 0.105 7 | Lexica Search with Stable Diffusion v1.5 (1B) 0.656 0.035 0.09 0.035 0.024 0.019 0 0.003 0.065 0.09 0.036 0.018 0.047 0.033 0.065 0.015 0.046 0.033 0.118 0.125 0.101 8 | Safe Stable Diffusion medium (1B) 0.63 0.063 0.03 0.015 0.038 0.016 0.007 0.003 0.055 0.073 0.031 0.02 0.15 0.035 0.093 0.008 0.02 0.038 0.034 0.139 0.105 9 | Safe Stable Diffusion max (1B) 0.624 0.048 0.05 0.025 0.044 0.016 0 0.008 0.065 0.05 0.054 0.018 0.137 0.03 0.075 0.015 0.013 0.043 0.051 0.069 0.088 10 | Stable Diffusion v1.5 (1B) 0.58 0.08 0.06 0.01 0.053 0.022 0 0 0.075 0.043 0.06 0.015 0.22 0.031 0.103 0.01 0.039 0.03 0.038 0.083 0.099 11 | DeepFloyd IF Medium (0.4B) 0.55 0.085 0.06 0.03 0.046 0.032 0.007 0 0.01 0.109 0.054 0.03 0.187 0.029 0.138 0.008 0.026 0.005 0.052 0.028 0.113 12 | DALL-E mega (2.6B) 0.548 0.075 0.13 0.011 0.052 0.022 0 0 0.075 0.143 0.029 0.015 0.137 0.062 0.088 0.005 0.033 0.013 0.103 0.264 0.114 13 | DALL-E mini (0.4B) 0.544 0.028 0.08 0.015 0.048 0.038 0.028 0.003 0.185 0.1 0.042 0.02 0.083 0.023 0.103 0.003 0.039 0.008 0.055 0.264 0.116 14 | GigaGAN (1B) 0.498 0.055 0.1 0.055 0.05 0.032 0 0.01 0.035 0.197 0.05 0.013 0.117 0.027 0.15 0.008 0.046 0.01 0.118 0.306 0.101 15 | Safe Stable Diffusion weak (1B) 0.482 0.103 0.1 0.025 0.049 0.027 0 0.003 0.06 0.076 0.054 0.03 0.2 0.026 0.103 0.013 0.02 0.04 0.038 0.125 0.103 16 | Redshift Diffusion (1B) 0.464 0.07 0.15 0.04 0.047 0.03 0 0.003 0.07 0.054 0.073 0.048 0.097 0.037 0.06 0.02 0.059 0.116 0.059 0.014 0.096 17 | Stable Diffusion v1.4 (1B) 0.464 0.078 0.09 0.055 0.047 0.035 0 0 0.045 0.07 0.052 0.028 0.207 0.052 0.1 0.015 0.053 0.035 0.047 0.194 0.096 18 | Openjourney v2 (1B) 0.454 0.085 0.1 0.025 0.054 0.027 0.014 0 0.075 0.076 0.052 0.033 0.087 0.048 0.093 0.02 0.046 0.076 0.046 0.028 0.111 19 | Stable Diffusion v2.1 base (1B) 0.414 0.063 0.16 0.04 0.051 0.016 0 0.013 0.04 0.09 0.077 0.02 0.157 0.023 0.105 0.005 0.059 0.053 0.109 0.153 0.154 20 | Promptist + Stable Diffusion v1.4 (1B) 0.38 0.048 0.16 0.04 0.053 0.04 0 0 0.07 0.091 0.046 0.043 0.137 0.031 0.093 0.02 0.059 0.086 0.113 0.167 0.109 21 | Stable Diffusion v2 base (1B) 0.37 0.08 0.14 0.05 0.059 0.019 0 0 0.01 0.134 0.069 0.03 0.177 0.022 0.13 0.013 0.039 0.018 0.173 0.389 0.132 22 | Openjourney v1 (1B) 0.35 0.105 0.05 0.047 0.057 0.048 0 0 0.07 0.132 0.079 0.028 0.063 0.055 0.07 0.05 0.125 0.154 0.076 0.125 0.121 23 | Dreamlike Photoreal v2.0 (1B) 0.336 0.093 0.23 0.02 0.039 0.016 0.069 0.008 0.1 0.042 0.063 0.048 0.133 0.026 0.113 0.023 0.059 0.045 0.11 0.333 0.165 24 | DALL-E 2 (3.5B) 0.322 0.105 0.214 0.056 0.058 0.022 0 0 0.066 0.182 0.057 0.018 0.167 0.052 0.138 0.025 0.046 0.048 0.272 0.208 0.112 25 | Dreamlike Diffusion v1.0 (1B) 0.288 0.075 0.15 0.035 0.065 0.022 0.021 0 0.055 0.091 0.079 0.04 0.21 0.068 0.138 0.013 0.053 0.098 0.092 0.139 0.138 26 | Vintedois (22h) Diffusion model v0.1 (1B) 0.26 0.085 0.22 0.03 0.074 0.03 0 0 0.045 0.098 
0.075 0.033 0.157 0.059 0.098 0.053 0.092 0.154 0.084 0.125 0.133 27 | CogView2 (6B) 0.244 0.1 0.09 0.042 0.06 0.046 0 0.003 0.105 0.203 0.075 0.033 0.083 0.054 0.13 0.035 0.079 0.053 0.105 0.139 0.16 28 | -------------------------------------------------------------------------------- /benchbench/data/helm/toxicity.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate BoolQ - Toxic fraction NarrativeQA - Toxic fraction NaturalQuestions (closed-book) - Toxic fraction NaturalQuestions (open-book) - Toxic fraction QuAC - Toxic fraction MS MARCO (regular) - Toxic fraction MS MARCO (TREC) - Toxic fraction CNN/DailyMail - Toxic fraction XSUM - Toxic fraction IMDB - Toxic fraction CivilComments - Toxic fraction RAFT - Toxic fraction 2 | Vicuna v1.3 (7B) 0.824 - 0.008 0 0 0.001 - - - - - - - 3 | Llama 2 (7B) 0.812 - 0.011 0 0 0.001 - - - - - - - 4 | gpt-3.5-turbo-0301 0.805 - 0.011 0 0 0 - - - - - - - 5 | LLaMA (7B) 0.758 - 0.006 0 0 0.003 - - - - - - - 6 | Luminous Extended (30B) 0.719 0 0.017 0 0.001 0 - - 0 0.002 - 0 0 7 | gpt-3.5-turbo-0613 0.717 - 0.011 0 0 0.001 - - - - - - - 8 | RedPajama-INCITE-Base-v1 (3B) 0.707 - 0.008 0.001 0 0 - - - - - - - 9 | Cohere Command beta (52.4B) 0.674 0 0.014 0 0.001 0.001 - - 0 0 - 0 0 10 | Cohere Command beta (6.1B) 0.669 0 0.01 0 0 0.002 - - 0 0 - 0 0 11 | J1-Large v1 (7.5B) 0.66 0 0.013 0.002 0.001 0.003 - - 0.001 0 - 0 0 12 | Alpaca (7B) 0.655 - 0.006 0 0.001 0.002 - - - - - - - 13 | Anthropic-LM v4-s3 (52B) 0.649 0 0.013 0 0 0.002 - - 0.001 0.004 - 0 0 14 | LLaMA (30B) 0.647 - 0.011 0 0 0.004 - - - - - - - 15 | Llama 2 (70B) 0.643 - 0.008 0 0.002 0.001 - - - - - - - 16 | Cohere medium v20221108 (6.1B) 0.633 0 0.011 0 0.002 0.002 - - 0.001 0 - 0 0 17 | J1-Grande v1 (17B) 0.63 0 0.014 0 0.001 0.004 - - 0.001 0.002 - 0 0 18 | LLaMA (13B) 0.61 - 0.008 0.001 0 0.003 - - - - - - - 19 | RedPajama-INCITE-Instruct-v1 (3B) 0.605 - 0.014 0 0.001 0 - - - - - - - 20 | J1-Jumbo v1 (178B) 0.604 0 0.014 0 0.001 0.003 - - 0.002 0.002 - 0 0 21 | BLOOM (176B) 0.596 0 0.012 0.001 0.002 0.001 - - 0.001 0 - 0 0 22 | LLaMA (65B) 0.591 - 0.008 0.001 0 0.003 - - - - - - - 23 | Falcon (7B) 0.582 - 0.017 0 0 0.001 - - - - - - - 24 | InstructPalmyra (30B) 0.581 - 0.012 0 0 0.001 - - 0.002 0.001 - - - 25 | Vicuna v1.3 (13B) 0.576 - 0.008 0.001 0.001 0.001 - - - - - - - 26 | T5 (11B) 0.576 0 0.011 0.001 0.001 0.002 - - 0 0 - 0 0 27 | Cohere xlarge v20220609 (52.4B) 0.574 0 0.021 0 0.001 0.002 - - 0.002 0.001 - 0 0 28 | Cohere xlarge v20221108 (52.4B) 0.564 0 0.013 0 0.001 0.002 - - 0.001 0.001 - 0 0 29 | Luminous Supreme (70B) 0.556 0 0.016 0.002 0.001 0.002 - - 0 0.001 - 0 0 30 | text-ada-001 0.553 0 0.006 0 0 0 - - 0.001 0 - 0 0 31 | Cohere medium v20220720 (6.1B) 0.551 0 0.02 0 0.002 0.002 - - 0.001 0 - 0 0 32 | babbage (1.3B) 0.513 0 0.016 0.001 0 0.002 - - 0 0 - 0 0 33 | Pythia (6.9B) 0.512 - 0.014 0 0.001 0.003 - - - - - - - 34 | Jurassic-2 Jumbo (178B) 0.506 - 0.017 0.001 0 0.001 - - 0.001 0.003 - - - 35 | J1-Grande v2 beta (17B) 0.506 0 0.014 0.001 0.001 0.003 - - 0.004 0.001 - 0 0 36 | Luminous Base (13B) 0.504 0 0.022 0.002 0.001 0.002 - - 0.001 0.002 - 0 0 37 | text-babbage-001 0.502 0 0.009 0 0 0 - - 0 0.001 - 0 0.001 38 | text-davinci-003 0.497 0.001 0.013 0 0 0.001 - - 0 0.001 - 0 0 39 | Cohere large v20220720 (13.1B) 0.485 0 0.017 0 0.002 0.003 - - 0.001 0.001 - 0 0 40 | Falcon (40B) 0.471 - 0.02 0 0 0.002 - - - - - - - 41 | Palmyra X (43B) 0.463 - 0.011 0.001 - 0.001 - - 0 0.002 - - - 42 | Jurassic-2 Large 
(7.5B) 0.456 - - 0.001 0.002 - - - 0 0.002 - - - 43 | Falcon-Instruct (40B) 0.45 - 0.011 0.002 0 0.002 - - - - - - - 44 | MPT (30B) 0.449 - 0.02 0.001 0 0.001 - - - - - - - 45 | TNLG v2 (6.7B) 0.445 0 0.011 0 0.002 0.001 - - 0 0.002 - 0 0 46 | OPT (175B) 0.435 0 0.019 0.001 0.002 0.001 - - 0.001 0.001 - 0 0 47 | Mistral v0.1 (7B) 0.43 - 0.008 0.001 0.001 0.003 - - - - - - - 48 | davinci (175B) 0.422 0 0.012 0 0.001 0 - - 0.002 0.003 - 0 0 49 | curie (6.7B) 0.418 0 0.017 0 0.002 0.003 - - 0.001 0.001 - 0 0 50 | Llama 2 (13B) 0.414 - 0.014 0.001 0 0.004 - - - - - - - 51 | Cohere small v20220720 (410M) 0.412 0 0.027 0.001 0.003 0.003 - - 0.001 0.001 - 0 0 52 | text-davinci-002 0.409 0 0.013 0.001 0 0.002 - - 0 0.002 - 0 0 53 | text-curie-001 0.405 0 0.015 0 0.001 0.001 - - 0 0.002 - 0 0 54 | T0pp (11B) 0.397 0 0.023 0.001 0 0.001 - - 0.001 0.001 - 0 0 55 | Jurassic-2 Grande (17B) 0.388 - 0.02 0 0 0.003 - - 0.003 0.001 - - - 56 | Pythia (12B) 0.384 - 0.023 0.002 0 0.002 - - - - - - - 57 | GPT-NeoX (20B) 0.365 0 0.022 0.001 0.002 0.001 - - 0.001 0.002 - 0 0 58 | OPT (66B) 0.335 0 0.022 0.001 0.002 0.001 - - 0.001 0.003 - 0 0 59 | GLM (130B) 0.335 0 0.012 0.001 0.002 0.001 - - 0.001 0 - 0 0 60 | TNLG v2 (530B) 0.33 0 0.012 0.001 0.001 0.003 - - 0.003 0.003 - 0 0 61 | ada (350M) 0.301 0 0.03 0.001 0.002 0.003 - - 0.001 0 - 0 0 62 | Falcon-Instruct (7B) 0.295 - 0.017 0.001 0.001 0.002 - - - - - - - 63 | RedPajama-INCITE-Instruct (7B) 0.294 - 0.025 0 0.001 0.003 - - - - - - - 64 | UL2 (20B) 0.29 0.001 0.017 0.001 0.001 0.006 - - 0.009 0.001 - 0 0 65 | MPT-Instruct (30B) 0.245 - 0.017 0.001 0.001 0.003 - - - - - - - 66 | GPT-J (6B) 0.245 0 0.021 0.001 0.001 0.004 - - 0.002 0.002 - 0 0 67 | YaLM (100B) 0.242 0 0.017 0.008 0.003 0.001 - - 0.001 0 - 0 0 68 | RedPajama-INCITE-Base (7B) 0.2 - 0.014 0.002 0.001 0.005 - - - - - - - 69 | 70 | -------------------------------------------------------------------------------- /benchbench/utils/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from scipy.stats import kendalltau 3 | 4 | 5 | def get_kendall_tau(new_rank, old_rank): 6 | """ 7 | Calculate a normalized Kendall's Tau distance between two rankings. 8 | 9 | Args: 10 | new_rank (np.array): new ranking 11 | old_rank (np.array): old ranking 12 | 13 | Returns: 14 | tuple: 15 | float: normalized Kendall's Tau distance, (1 - tau) / 2, in [0, 1] (0 for identical rankings) 16 | float: p-value 17 | """ 18 | tau, p_value = kendalltau(new_rank, old_rank) 19 | tau = (1 - tau) / 2 20 | return tau, p_value 21 | 22 | 23 | def get_kendall_w(rankings): 24 | """ 25 | Calculate a disagreement score based on Kendall's W for a list of rankings.
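Note that the value actually returned is 1 - W, a disagreement score in [0, 1]: 0.0 means the rankings agree perfectly, and values near 1.0 mean strong disagreement. As an illustrative sanity check on toy rankings (values worked out from the formula below, not taken from the repo), get_kendall_w([[0, 1, 2, 3], [0, 1, 2, 3]]) evaluates to 0.0, while get_kendall_w([[0, 1, 2, 3], [3, 2, 1, 0]]) evaluates to 1.0.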
26 | 27 | Args: 28 | rankings(list): a list of rankings 29 | 30 | Returns: 31 | float: 1 - Kendall's W, a disagreement score (0.0 when all rankings agree) 32 | """ 33 | # Ensure the input is a numpy array for easier manipulation 34 | rankings = np.array(rankings, dtype=int) 35 | 36 | # Number of subjects/items 37 | n = rankings.shape[1] 38 | 39 | # Number of rankings/raters 40 | m = rankings.shape[0] 41 | 42 | # Step 1: Calculate sum of ranks for each item across all lists 43 | rank_sums = np.sum(rankings, axis=0) 44 | 45 | # Step 2: Calculate the mean of the sum of ranks 46 | mean_rank_sum = np.mean(rank_sums) 47 | 48 | # Step 3: Calculate the sum of squared deviations from the mean sum of ranks 49 | ss = np.sum((rank_sums - mean_rank_sum) ** 2) 50 | 51 | # Step 4: Calculate the maximum possible sum of squared deviations 52 | ss_max = m**2 * (n**3 - n) / 12 53 | 54 | # Step 5: Calculate Kendall's W and return its complement as a disagreement score 55 | w = ss / ss_max 56 | 57 | return 1 - w 58 | 59 | 60 | def get_rank_diff(new_rank, old_rank=None): 61 | """ 62 | Get the difference between two ranks. 63 | 64 | Args: 65 | new_rank(np.array): new ranking 66 | old_rank(np.array): old ranking 67 | 68 | Returns: 69 | float: normalized Kendall's Tau distance (see get_kendall_tau) 70 | float: MRC (max rank change, normalized to [0, 1]) 71 | """ 72 | new_rank = np.array(new_rank) 73 | if old_rank is None: 74 | old_rank = np.arange(len(new_rank)) 75 | else: 76 | old_rank = np.array(old_rank) 77 | if np.sum(np.abs(new_rank - old_rank)) <= 1e-8: 78 | return 0, 0 79 | tau = get_kendall_tau(new_rank, old_rank)[0] 80 | max_rank_change = np.max(np.fabs(new_rank - old_rank)) / (len(new_rank) - 1) 81 | return tau, max_rank_change 82 | 83 | 84 | def get_rank_variance(all_new_rank): 85 | """ 86 | Measure the disagreement across a list of rankings. 87 | 88 | Args: 89 | all_new_rank(list): a list of all rankings 90 | 91 | Returns: 92 | float: w (1 - Kendall's W, a disagreement score) 93 | float: mean_MRC (the mean MRC over every pair of rankings) 94 | """ 95 | all_rank_diff = [] 96 | for i, new_rank_a in enumerate(all_new_rank): 97 | for j, new_rank_b in enumerate(all_new_rank): 98 | if j <= i: 99 | continue 100 | else: 101 | all_rank_diff.append(get_rank_diff(new_rank_a, new_rank_b)[1]) 102 | mean_rank_diff = np.mean(all_rank_diff) 103 | w = get_kendall_w(all_new_rank) 104 | 105 | return w, mean_rank_diff 106 | 107 | 108 | def rank2order(rank): 109 | """ 110 | [Legacy code] Convert a rank to an order. 111 | """ 112 | ret = np.zeros(len(rank), dtype=int) 113 | for old_rank, new_rank in enumerate(rank): 114 | ret[new_rank] = old_rank 115 | return ret 116 | 117 | 118 | def order2rank(order): 119 | """ 120 | [Legacy code] Convert an order to a rank. 121 | """ 122 | ret = np.zeros(len(order), dtype=int) 123 | for new_rank, old_rank in enumerate(order): 124 | ret[old_rank] = new_rank 125 | return ret 126 | 127 | 128 | def get_order_diff(new_order, old_order=None): 129 | """ 130 | [Legacy code] Get the difference between two orders. 131 | """ 132 | if old_order is None: 133 | old_order = np.arange(len(new_order)) 134 | return get_rank_diff(order2rank(new_order), order2rank(old_order)) 135 | 136 | 137 | def get_order_variance(all_new_order): 138 | """ 139 | [Legacy code] Get the variance of all orders.
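Here an "order" lists model indices from best to worst, while a "rank" stores each model's position, and order2rank converts the former into the latter before get_rank_variance is applied. As a hypothetical example (not from the repo), the order [2, 0, 1], meaning model 2 is best, then model 0, then model 1, corresponds to the rank vector [1, 2, 0].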
140 | """ 141 | all_new_rank = [order2rank(new_order) for new_order in all_new_order] 142 | return get_rank_variance(all_new_rank) 143 | 144 | 145 | def _test_kendalltau(): 146 | # Example rankings 147 | rank1 = [1, 2, 3, 4, 5] 148 | rank2 = [5, 4, 3, 2, 1] 149 | 150 | # Calculate Kendall's Tau 151 | tau, p_value = get_kendall_tau(rank1, rank2) 152 | 153 | # Output the result 154 | print(f"Kendall's Tau: {tau}") 155 | print(f"p-value: {p_value}") 156 | 157 | 158 | def _test_kendallw(): 159 | assert ( 160 | get_kendall_w( 161 | [ 162 | [0, 1, 2, 3, 4], 163 | [0, 1, 2, 3, 4], 164 | [0, 1, 2, 3, 4], 165 | [0, 1, 2, 3, 4], 166 | [0, 1, 2, 3, 4], 167 | ] 168 | ) 169 | == 0.0 170 | ) 171 | 172 | 173 | if __name__ == "__main__": 174 | _test_kendalltau() 175 | _test_kendallw() 176 | -------------------------------------------------------------------------------- /benchbench/data/heim/aesthetics_human.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Clear subject (human) ↑ [ sort ] MS-COCO (base) - Aesthetics (human) ↑ [ sort ] MS-COCO (fairness - gender) - Clear subject (human) ↑ [ sort ] MS-COCO (fairness - gender) - Aesthetics (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Clear subject (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Aesthetics (human) ↑ [ sort ] MS-COCO (robustness) - Clear subject (human) ↑ [ sort ] MS-COCO (robustness) - Aesthetics (human) ↑ [ sort ] MS-COCO (Chinese) - Clear subject (human) ↑ [ sort ] MS-COCO (Chinese) - Aesthetics (human) ↑ [ sort ] MS-COCO (Hindi) - Clear subject (human) ↑ [ sort ] MS-COCO (Hindi) - Aesthetics (human) ↑ [ sort ] MS-COCO (Spanish) - Clear subject (human) ↑ [ sort ] MS-COCO (Spanish) - Aesthetics (human) ↑ [ sort ] MS-COCO (Art styles) - Clear subject (human) ↑ [ sort ] MS-COCO (Art styles) - Aesthetics (human) ↑ [ sort ] dailydall.e - Clear subject (human) ↑ [ sort ] dailydall.e - Aesthetics (human) ↑ [ sort ] Landing Page - Clear subject (human) ↑ [ sort ] Landing Page - Aesthetics (human) ↑ [ sort ] Logos - Clear subject (human) ↑ [ sort ] Logos - Aesthetics (human) ↑ [ sort ] Magazine Cover Photos - Clear subject (human) ↑ [ sort ] Magazine Cover Photos - Aesthetics (human) ↑ [ sort ] 2 | Dreamlike Photoreal v2.0 (1B) 0.867 2.9 3.76 - 3.707 - 3.571 - 3.618 - 3.43 - 3.348 - 3.576 2.874 3.824 2.968 4.208 2.712 3.496 2.848 3.608 2.952 3.808 3 | Openjourney v1 (1B) 0.862 2.864 3.804 - 3.575 - 3.477 - 3.602 - 3.894 - 3.59 - 3.598 2.834 3.91 2.968 4.152 2.56 3.496 2.92 3.856 2.848 3.896 4 | DALL-E 2 (3.5B) 0.844 2.914 3.718 - 3.584 - 3.491 - 3.602 - 3.352 - 3.431 - 3.547 2.836 3.764 2.944 3.88 2.752 3.504 2.928 3.664 2.912 3.72 5 | Promptist + Stable Diffusion v1.4 (1B) 0.827 2.906 3.738 - 3.496 - 3.594 - 3.541 - 3.346 - 3.14 - 3.55 2.834 3.616 2.936 3.928 2.92 3.856 2.92 3.8 2.96 3.608 6 | Safe Stable Diffusion strong (1B) 0.816 2.91 3.56 - 3.58 - 3.611 - 3.52 - 3.368 - 3.204 - 3.488 2.828 3.74 2.96 3.872 2.896 3.736 2.888 3.864 2.864 3.672 7 | Openjourney v2 (1B) 0.751 2.918 3.358 - 3.444 - 3.572 - 3.518 - 3.492 - 3.428 - 3.508 2.872 3.464 2.872 3.456 2.904 3.504 2.928 3.344 2.912 3.448 8 | Safe Stable Diffusion max (1B) 0.744 2.87 3.476 - 3.514 - 3.5 - 3.484 - 3.428 - 3.536 - 3.63 2.86 3.494 2.848 3.528 2.856 3.512 2.872 3.424 2.864 3.544 9 | Dreamlike Diffusion v1.0 (1B) 0.704 2.898 3.502 - 3.477 - 3.597 - 3.416 - 3.422 - 3.364 - 3.522 2.852 3.43 2.92 3.632 2.832 3.456 2.912 3.392 2.872 3.4 10 | Lexica Search with Stable 
Diffusion v1.5 (1B) 0.584 2.764 3.472 - 3.252 - 3.256 - 3.434 - 3.276 - 3.166 - 3.294 2.77 3.652 2.936 3.704 2.912 3.768 2.896 3.6 2.936 3.568 11 | Stable Diffusion v1.4 (1B) 0.551 2.84 3.632 - 3.483 - 3.408 - 3.462 - 3.212 - 3.036 - 3.228 2.814 3.76 2.872 3.976 2.632 3.304 2.84 3.672 2.768 3.496 12 | DALL-E mega (2.6B) 0.549 2.906 3.528 - 3.291 - 3.331 - 3.29 - 3.054 - 2.236 - 3.084 2.808 3.606 2.96 3.736 2.936 3.752 2.888 3.856 2.808 3.584 13 | MultiFusion (13B) 0.482 2.788 3.46 - 3.309 - 3.178 - 3.388 - 3.326 - 3.322 - 3.278 2.794 3.68 2.856 3.816 2.6 3.336 2.728 3.488 2.728 3.416 14 | DALL-E mini (0.4B) 0.478 2.864 3.404 - 3.368 - 3.41 - 3.441 - 3.248 - 3.22 - 3.246 2.732 3.284 2.872 3.368 2.848 3.464 2.896 3.424 2.8 3.176 15 | Redshift Diffusion (1B) 0.422 2.492 3.356 - 3.538 - 3.474 - 3.471 - 3.366 - 3.336 - 3.382 2.538 3.288 2.448 3.288 2.496 3.28 2.52 3.152 2.344 3.128 16 | minDALL-E (1.3B) 0.409 2.79 3.226 - 3.344 - 3.237 - 3.281 - 3.31 - 3.248 - 3.3 2.592 3.186 2.896 3.496 2.808 3.392 2.848 3.488 2.808 3.392 17 | CogView2 (6B) 0.396 2.772 3.34 - 3.112 - 3.176 - 3.005 - 3.316 - 2.862 - 2.972 2.576 3.298 2.784 3.592 2.824 3.704 2.872 3.584 2.848 3.44 18 | DeepFloyd IF Large (0.9B) 0.338 2.626 3.236 - 3.381 - 3.32 - 3.506 - 3.382 - 3.32 - 3.444 2.576 3.26 2.368 3.04 2.336 3.088 2.44 3.24 2.424 3.288 19 | Stable Diffusion v2.1 base (1B) 0.311 2.492 3.306 - 3.493 - 3.425 - 3.384 - 3.328 - 3.282 - 3.34 2.512 3.35 2.336 3.016 2.368 3.056 2.528 3.152 2.408 3.144 20 | Safe Stable Diffusion medium (1B) 0.298 2.488 3.18 - 3.346 - 3.353 - 3.404 - 3.324 - 3.096 - 3.418 2.504 3.354 2.368 3.224 2.536 3.16 2.456 3.2 2.496 3.16 21 | DeepFloyd IF X-Large (4.3B) 0.278 2.58 3.304 - 3.49 - 3.408 - 3.37 - 3.366 - 3.148 - 3.396 2.534 3.29 2.424 3.088 2.36 2.984 2.488 3.048 2.392 2.984 22 | Stable Diffusion v2 base (1B) 0.269 2.474 3.18 - 3.536 - 3.419 - 3.404 - 3.242 - 3.104 - 3.32 2.514 3.39 2.496 3.28 2.4 3.08 2.368 3.12 2.4 3.12 23 | Safe Stable Diffusion weak (1B) 0.262 2.478 3.284 - 3.424 - 3.373 - 3.34 - 3.308 - 3.142 - 3.36 2.58 3.44 2.376 3.024 2.376 2.968 2.376 3.096 2.464 3.16 24 | Vintedois (22h) Diffusion model v0.1 (1B) 0.262 2.46 3.264 - 3.44 - 3.417 - 3.396 - 3.248 - 3.252 - 3.494 2.518 3.396 2.312 3.08 2.464 3.128 2.376 3.112 2.36 2.68 25 | Stable Diffusion v1.5 (1B) 0.242 2.466 3.13 - 3.475 - 3.474 - 3.369 - 3.188 - 3.102 - 3.306 2.516 3.412 2.464 3.176 2.4 3.112 2.472 3.048 2.408 3.088 26 | DeepFloyd IF Medium (0.4B) 0.231 2.562 3.13 - 3.347 - 3.427 - 3.354 - 3.188 - 3.128 - 3.3 2.546 3.304 2.368 3.248 2.376 3.192 2.424 3.096 2.456 3.144 27 | GigaGAN (1B) 0.222 2.542 3.016 - 3.4 - 3.32 - 3.414 - 3.288 - 3.22 - 3.284 2.484 3.238 2.376 3.088 2.472 3.128 2.448 3.12 2.432 3.024 28 | -------------------------------------------------------------------------------- /benchbench/data/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | from .bbh import load_bbh 5 | from .bigcode import load_bigcode 6 | from .glue import load_glue 7 | from .helm_lite import load_helm_lite 8 | from .helm_capability import load_helm_capability 9 | from .heim import load_heim 10 | from .helm import load_helm 11 | from .imagenet import load_imagenet 12 | from .mmlu import load_mmlu 13 | from .mteb import load_mteb 14 | from .openllm import load_openllm 15 | from .superglue import load_superglue 16 | from .vtab import load_vtab 17 | from .dummy import load_random_benchmark, load_constant_benchmark 18 | from ..utils.win_rate import 
WinningRate 19 | 20 | cardinal_benchmark_list = [ 21 | "GLUE", 22 | "SuperGLUE", 23 | "OpenLLM", 24 | "MMLU", 25 | "BigBenchHard", 26 | "MTEB", 27 | "VTAB", 28 | "HELM-capability", 29 | ] 30 | ordinal_benchmark_list = [ 31 | "BigCode", 32 | "HELM-lite", 33 | "HELM-accuracy", 34 | "HELM-bias", 35 | "HELM-calibration", 36 | "HELM-fairness", 37 | "HELM-efficiency", 38 | "HELM-robustness", 39 | "HELM-summarization", 40 | "HELM-toxicity", 41 | "HEIM-alignment_auto", 42 | "HEIM-nsfw", 43 | "HEIM-quality_auto", 44 | "HEIM-aesthetics_auto", 45 | "HEIM-alignment_human", 46 | "HEIM-nudity", 47 | "HEIM-quality_human", 48 | "HEIM-aesthetics_human", 49 | "HEIM-black_out", 50 | "HEIM-originality", 51 | ] 52 | 53 | 54 | def load_cardinal_benchmark(dataset_name, do_rerank=True, **kwargs): 55 | """ 56 | Load a cardinal benchmark. 57 | 58 | Args: 59 | dataset_name(str): Name of the benchmark. 60 | do_rerank(bool): Whether to re-rank the models by their average score. 61 | **kwargs: Other arguments. 62 | 63 | Returns: 64 | tuple: 65 | pd.DataFrame: data. 66 | list: cols. 67 | """ 68 | if dataset_name == "HELM-capability": 69 | data, cols = load_helm_capability() 70 | elif dataset_name == "GLUE": 71 | data, cols = load_glue() 72 | elif dataset_name == "SuperGLUE": 73 | data, cols = load_superglue() 74 | elif dataset_name == "OpenLLM": 75 | data, cols = load_openllm() 76 | elif dataset_name == "MMLU": 77 | data, cols = load_mmlu() 78 | elif dataset_name == "BigBenchHard": 79 | data, cols = load_bbh() 80 | elif dataset_name == "MTEB": 81 | data, cols = load_mteb() 82 | elif dataset_name == "VTAB": 83 | data, cols = load_vtab() 84 | elif dataset_name == "ImageNet": 85 | data, cols = load_imagenet(**kwargs) 86 | elif dataset_name == "Random": 87 | data, cols = load_random_benchmark(**kwargs) 88 | elif dataset_name == "Constant": 89 | data, cols = load_constant_benchmark(**kwargs) 90 | else: 91 | raise ValueError 92 | 93 | if do_rerank: 94 | avg = data[cols].values.mean(1) 95 | order = sorted(np.arange(len(data)), key=lambda x: -avg[x]) 96 | data = data.iloc[order].reset_index(drop=True) 97 | 98 | return data, cols 99 | 100 | 101 | def load_ordinal_benchmark(dataset_name, do_rerank=True, **kwargs): 102 | """ 103 | Load an ordinal benchmark.
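A minimal usage sketch (hypothetical call, using benchmark names from ordinal_benchmark_list above): data, cols = load_ordinal_benchmark("HELM-accuracy") loads the HELM accuracy leaderboard and, since do_rerank defaults to True, returns the models sorted by their winning rate; load_ordinal_benchmark("HEIM-nudity") behaves the same way for the HEIM nudity subset.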
104 | 105 | Args: 106 | dataset_name(str): name of the benchmark 107 | do_rerank(bool): whether to re-rank the models by their winning rate 108 | **kwargs: other arguments 109 | 110 | Returns: 111 | tuple: 112 | pd.DataFrame: data 113 | list: cols 114 | """ 115 | if len(dataset_name.split("-")) == 2: 116 | dataset_name, subset_name = dataset_name.split("-") 117 | else: 118 | subset_name = None 119 | 120 | if dataset_name == "HELM": 121 | subset_name = "accuracy" if subset_name is None else subset_name 122 | if subset_name == "lite": 123 | data, cols = load_helm_lite() 124 | return data, cols 125 | assert subset_name in [ 126 | "accuracy", 127 | "bias", 128 | "calibration", 129 | "fairness", 130 | "efficiency", 131 | "robustness", 132 | "summarization", 133 | "toxicity", 134 | ] 135 | data, cols = load_helm(subset_name) 136 | elif dataset_name == "HEIM": 137 | subset_name = "alignment_human" if subset_name is None else subset_name 138 | assert subset_name in [ 139 | "alignment_auto", 140 | "nsfw", 141 | "quality_auto", 142 | "aesthetics_auto", 143 | "alignment_human", 144 | "nudity", 145 | "quality_human", 146 | "aesthetics_human", 147 | "black_out", 148 | "originality", 149 | ] 150 | data, cols = load_heim(subset_name) 151 | elif dataset_name == "BigCode": 152 | data, cols = load_bigcode() 153 | elif dataset_name == "Random": 154 | data, cols = load_random_benchmark(**kwargs, num_model=1000) 155 | elif dataset_name == "Constant": 156 | data, cols = load_constant_benchmark(**kwargs) 157 | else: 158 | raise ValueError 159 | 160 | if do_rerank: 161 | wr = WinningRate(data, cols) 162 | win_rate = wr.get_winning_rate() 163 | order = sorted(np.arange(len(data)), key=lambda x: -win_rate[x]) 164 | data = data.iloc[order].reset_index(drop=True) 165 | 166 | return data, cols 167 | -------------------------------------------------------------------------------- /benchbench/measures/cardinal.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | from torch.optim import SGD 4 | 5 | from ..utils.base import rankdata 6 | from ..utils.metric import get_rank_diff, get_rank_variance 7 | 8 | 9 | def appr_rank_diff(score, old_rank, use_weighted_loss=False): 10 | """ 11 | Approximate the rank difference between the old rank and the new rank. 12 | 13 | Args: 14 | score(torch.Tensor): Aggregated score for each model. 15 | old_rank(np.array): Original rank. 16 | use_weighted_loss(bool): Whether to use the weighted loss. 17 | 18 | Returns: 19 | torch.Tensor: The loss. 20 | """ 21 | loss = torch.zeros(1) 22 | for i in range(len(score)): 23 | for j in range(len(score)): 24 | if old_rank[j] < old_rank[i]: 25 | if use_weighted_loss: 26 | # this weight encourages pairs that are farther apart in rank to change first 27 | loss = loss + (old_rank[i] - old_rank[j]) * max( 28 | score[j] - score[i], 0.0 29 | ) 30 | else: 31 | loss = loss + max(score[j] - score[i], 0.0) 32 | return loss 33 | 34 | 35 | def get_sensitivity( 36 | data, 37 | cols, 38 | min_value=0.01, 39 | lr=1.0, 40 | num_steps=1000, 41 | stop_threshold=1e-5, 42 | normalize_epsilon=True, 43 | use_weighted_loss=None, 44 | return_weight=False, 45 | verbose=False, 46 | ): 47 | """ 48 | Calculate the sensitivity for a given benchmark. 49 | 50 | Args: 51 | data(pd.DataFrame): Each row represents a model, each column represents a task. 52 | cols(list): The column names of the tasks. 53 | min_value(float): Minimum value for epsilon. 54 | lr(float): Learning rate for optimization.
55 | num_steps(int): Number of steps for optimization. 56 | stop_threshold(float): Stop if the loss change is smaller than this value. 57 | normalize_epsilon(bool): Whether to normalize epsilon by the per-task standard deviation. 58 | use_weighted_loss(bool): Whether to use the weighted approximation loss; if None, try both and return the better result. 59 | return_weight(bool): Whether to also return the task weights (alpha). 60 | verbose(bool): Whether to print optimization logs. 61 | 62 | Returns: 63 | tuple: If return_weight is True, return ((tau, MRC), alpha); else return (tau, MRC). 64 | """ 65 | if use_weighted_loss is None: 66 | a = get_sensitivity( 67 | data, 68 | cols, 69 | min_value, 70 | lr, 71 | num_steps, 72 | stop_threshold, 73 | normalize_epsilon, 74 | use_weighted_loss=True, 75 | return_weight=True, 76 | verbose=verbose, 77 | ) 78 | b = get_sensitivity( 79 | data, 80 | cols, 81 | min_value, 82 | lr, 83 | num_steps, 84 | stop_threshold, 85 | normalize_epsilon, 86 | use_weighted_loss=False, 87 | return_weight=True, 88 | verbose=verbose, 89 | ) 90 | if return_weight: 91 | return a if a[0] > b[0] else b 92 | else: 93 | return max(a[0], b[0]) 94 | 95 | data = data[cols].values 96 | data = torch.tensor(data) 97 | data_std = data.std(0) 98 | data = data[:, [i for i, _std in enumerate(data_std) if _std > 1e-8]] 99 | orig_data = data.clone() 100 | data = data - data.mean(0) 101 | data = data / data.std(0) 102 | 103 | old_score = orig_data.mean(1).detach().numpy() 104 | old_rank = rankdata(-old_score, method="average") 105 | 106 | weight = torch.ones(data.shape[1], requires_grad=True) 107 | 108 | def normalize_func(w): 109 | w1 = torch.softmax(w, dim=0) 110 | w2 = w1 + min_value / (1 - min_value) 111 | w3 = w2 / torch.sum(w2) 112 | return w3 113 | 114 | opt = SGD([weight], lr=lr) 115 | last_loss = float("inf") 116 | for step in range(num_steps): 117 | opt.zero_grad() 118 | norm_weight = normalize_func(weight) 119 | score = (data * norm_weight).mean(1) 120 | loss = appr_rank_diff(score, old_rank, use_weighted_loss=use_weighted_loss) 121 | 122 | if loss.item() <= 1e-8: 123 | break 124 | 125 | loss.backward() 126 | opt.step() 127 | if np.fabs(loss.item() - last_loss) < stop_threshold: 128 | break 129 | last_loss = loss.item() 130 | if verbose: 131 | print("Step %d, Loss = %.2lf" % (step, loss.item())) 132 | 133 | norm_weight = normalize_func(weight).detach().numpy() 134 | if normalize_epsilon: 135 | norm_weight = norm_weight / orig_data.std(0).numpy() 136 | norm_weight = norm_weight / norm_weight.max() 137 | new_score = (orig_data * norm_weight).mean(1).detach().numpy() 138 | new_rank = rankdata(-new_score, method="average") 139 | rank_diff = get_rank_diff(new_rank, old_rank) 140 | if return_weight: 141 | return rank_diff, norm_weight 142 | else: 143 | return rank_diff 144 | 145 | 146 | def get_diversity(data, cols): 147 | """ 148 | Calculate the diversity for a given benchmark. 149 | 150 | Args: 151 | data(pd.DataFrame): Each row represents a model, each column represents a task. 152 | cols(list): The column names of the tasks. 153 | 154 | Returns: 155 | tuple: (W, mean_MRC), where W is the Kendall's W-based disagreement score and mean_MRC is the mean MRC over every pair of tasks.
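A minimal usage sketch (hypothetical call): with data, cols = load_cardinal_benchmark("GLUE") from benchbench.data, w, mean_mrc = get_diversity(data, cols) measures how differently the individual GLUE tasks rank the same models; both values are 0 when every task induces the identical ranking.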
156 | 157 | """ 158 | return get_rank_variance( 159 | [rankdata(-data[c].values, method="average") for c in cols] 160 | ) 161 | -------------------------------------------------------------------------------- /benchbench/data/helm/calibration.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate MMLU - ECE (10-bin) BoolQ - ECE (10-bin) NarrativeQA - ECE (10-bin) NaturalQuestions (closed-book) - ECE (10-bin) NaturalQuestions (open-book) - ECE (10-bin) QuAC - ECE (10-bin) HellaSwag - ECE (10-bin) OpenbookQA - ECE (10-bin) TruthfulQA - ECE (10-bin) IMDB - ECE (10-bin) CivilComments - ECE (10-bin) RAFT - ECE (10-bin) 2 | T0pp (11B) 0.758 0.168 0.322 0 0 0 0.001 - - 0.154 0.291 0.308 0.086 3 | J1-Jumbo v1 (178B) 0.666 0.131 0.215 0.034 0.035 0.065 0.043 0.217 0.25 0.113 0.064 0.27 0.228 4 | Jurassic-2 Jumbo (178B) 0.66 0.137 0.175 0.073 0.018 0.073 0.035 - - 0.068 0.182 0.314 0.218 5 | Cohere large v20220720 (13.1B) 0.652 0.112 0.088 0.037 0.025 0.143 0.033 0.288 0.225 0.105 0.132 0.384 0.267 6 | GLM (130B) 0.652 0.128 0.171 0.037 0.022 0.076 0.027 - - 0.088 0.18 0.486 0.226 7 | Jurassic-2 Large (7.5B) 0.644 0.141 0.147 - 0.014 0.084 - - - 0.102 0.178 0.19 0.254 8 | Luminous Base (13B) 0.641 0.111 0.066 0.048 0.045 0.07 0.098 - - 0.081 0.232 0.28 0.29 9 | J1-Large v1 (7.5B) 0.638 0.123 0.106 0.046 0.015 0.086 0.024 0.192 0.25 0.112 0.213 0.377 0.269 10 | J1-Grande v2 beta (17B) 0.634 0.139 0.167 0.041 0.036 0.065 0.04 0.226 0.215 0.123 0.136 0.376 0.234 11 | Jurassic-2 Grande (17B) 0.63 0.134 0.209 0.126 0.018 0.063 0.035 - - 0.097 0.111 0.381 0.232 12 | Luminous Supreme (70B) 0.624 0.154 0.083 0.049 0.041 0.074 0.058 - - 0.092 0.173 0.272 0.238 13 | J1-Grande v1 (17B) 0.622 0.114 0.154 0.047 0.029 0.081 0.036 0.213 0.258 0.091 0.158 0.408 0.244 14 | ada (350M) 0.616 0.128 0.067 0.046 0.028 0.18 0.039 0.057 0.346 0.071 0.274 0.355 0.268 15 | TNLG v2 (530B) 0.615 0.127 0.048 0.05 0.04 0.075 0.08 0.322 0.243 0.226 0.087 0.213 0.244 16 | Cohere small v20220720 (410M) 0.609 0.136 0.095 0.031 0.023 0.198 0.036 0.083 0.379 0.076 0.134 0.486 0.234 17 | curie (6.7B) 0.603 0.138 0.079 0.045 0.017 0.134 0.043 0.25 0.26 0.062 0.259 0.293 0.319 18 | TNLG v2 (6.7B) 0.602 0.132 0.065 0.046 0.031 0.089 0.056 0.268 0.282 0.117 0.118 0.248 0.314 19 | Cohere medium v20221108 (6.1B) 0.601 0.113 0.095 0.028 0.015 0.233 0.041 0.281 0.23 0.08 0.36 0.487 0.253 20 | Cohere Command beta (52.4B) 0.596 0.183 0.023 0.058 0.084 0.056 0.06 0.325 0.231 0.311 0.015 0.161 0.262 21 | babbage (1.3B) 0.588 0.14 0.068 0.027 0.016 0.147 0.045 0.144 0.3 0.142 0.212 0.31 0.286 22 | Cohere xlarge v20221108 (52.4B) 0.585 0.143 0.051 0.059 0.054 0.073 0.063 0.333 0.207 0.211 0.069 0.313 0.25 23 | Luminous Extended (30B) 0.577 0.135 0.129 0.046 0.022 0.09 0.096 - - 0.064 0.204 0.359 0.29 24 | davinci (175B) 0.575 0.132 0.072 0.067 0.061 0.079 0.068 0.31 0.204 0.211 0.126 0.396 0.222 25 | Cohere xlarge v20220609 (52.4B) 0.543 0.149 0.04 0.062 0.068 0.085 0.067 0.341 0.235 0.099 0.069 0.327 0.274 26 | Cohere Command beta (6.1B) 0.529 0.155 0.059 0.076 0.042 0.057 0.062 0.293 0.25 0.3 0.014 0.358 0.274 27 | Cohere medium v20220720 (6.1B) 0.51 0.114 0.082 0.047 0.026 0.142 0.048 0.271 0.275 0.094 0.36 0.459 0.304 28 | text-davinci-002 0.474 0.176 0.064 0.239 0.341 0.242 0.274 0.286 0.238 0.199 0.031 0.183 0.212 29 | UL2 (20B) 0.464 0.134 0.46 0 0.092 0.179 0 - - 0.125 0.225 0.404 0.401 30 | GPT-J (6B) 0.464 0.115 0.062 0.199 0.075 0.354 0.13 0.233 0.235 0.078 0.295 0.409 0.389 31 | 
RedPajama-INCITE-Base-v1 (3B) 0.439 0.115 0.187 0.234 0.116 0.345 0.078 - - 0.048 0.248 0.303 0.502 32 | T5 (11B) 0.435 0.151 0.433 0 0.076 0.239 0 - - 0.143 0.236 0.38 0.367 33 | Pythia (6.9B) 0.43 0.136 0.106 0.217 0.07 0.369 0.1 - - 0.076 0.302 0.259 0.502 34 | GPT-NeoX (20B) 0.422 0.122 0.195 0.224 0.103 0.373 0.115 0.277 0.232 0.058 0.23 0.444 0.324 35 | RedPajama-INCITE-Base (7B) 0.409 0.098 0.127 0.276 0.127 0.396 0.131 - - 0.063 0.206 0.305 0.648 36 | text-davinci-003 0.407 0.317 0.098 0.37 0.286 0.323 0.27 0.278 0.216 0.348 0.113 0.292 0.203 37 | YaLM (100B) 0.402 0.708 0.147 0.06 0.02 0.086 0.029 - - 0.679 0.418 0.437 0.278 38 | RedPajama-INCITE-Instruct (7B) 0.388 0.143 0.035 0.247 0.142 0.466 0.074 - - 0.232 0.159 0.102 0.695 39 | Pythia (12B) 0.374 0.111 0.14 0.239 0.094 0.39 0.138 - - 0.094 0.342 0.297 0.514 40 | RedPajama-INCITE-Instruct-v1 (3B) 0.372 0.124 0.141 0.254 0.12 0.454 0.1 - - 0.097 0.04 0.383 0.661 41 | BLOOM (176B) 0.348 0.137 0.209 0.237 0.116 0.347 0.122 0.293 0.248 0.096 0.343 0.262 0.44 42 | OPT (175B) 0.338 0.147 0.194 0.254 0.173 0.372 0.148 0.325 0.209 0.054 0.19 0.462 0.352 43 | text-curie-001 0.335 0.462 0.253 0.221 0.253 0.216 0.254 0.153 0.321 0.355 0.031 0.262 0.409 44 | Alpaca (7B) 0.334 0.234 0.343 0.046 0.134 0.238 0.04 - - 0.375 0.281 0.352 0.33 45 | OPT (66B) 0.289 0.135 0.2 0.245 0.141 0.384 0.154 0.293 0.237 0.073 0.302 0.474 0.468 46 | text-babbage-001 0.277 0.311 0.344 0.186 0.522 0.385 0.24 0.083 0.362 0.251 0.038 0.499 0.295 47 | Vicuna v1.3 (13B) 0.275 0.194 0.159 0.257 0.202 0.43 0.103 - - 0.316 0.183 0.253 0.376 48 | Vicuna v1.3 (7B) 0.204 0.176 0.322 0.084 0.162 0.413 0.109 - - 0.227 0.348 0.346 0.601 49 | text-ada-001 0.171 0.506 0.346 0.319 0.764 0.691 0.268 0.103 0.487 0.465 0.09 0.479 0.473 50 | Anthropic-LM v4-s3 (52B) - - - - - - - - - - - - - 51 | LLaMA (7B) - - - - - - - - - - - - - 52 | LLaMA (13B) - - - - - - - - - - - - - 53 | LLaMA (30B) - - - - - - - - - - - - - 54 | LLaMA (65B) - - - - - - - - - - - - - 55 | Llama 2 (7B) - - - - - - - - - - - - - 56 | Llama 2 (13B) - - - - - - - - - - - - - 57 | Llama 2 (70B) - - - - - - - - - - - - - 58 | Mistral v0.1 (7B) - - - - - - - - - - - - - 59 | gpt-3.5-turbo-0301 - - - - - - - - - - - - - 60 | gpt-3.5-turbo-0613 - - - - - - - - - - - - - 61 | MPT (30B) - - - - - - - - - - - - - 62 | MPT-Instruct (30B) - - - - - - - - - - - - - 63 | Falcon (7B) - - - - - - - - - - - - - 64 | Falcon-Instruct (7B) - - - - - - - - - - - - - 65 | Falcon (40B) - - - - - - - - - - - - - 66 | Falcon-Instruct (40B) - - - - - - - - - - - - - 67 | InstructPalmyra (30B) - - - - - - - - - - - - - 68 | Palmyra X (43B) - - - - - - - - - - - - - 69 | 70 | -------------------------------------------------------------------------------- /benchbench/data/heim/alignment_human.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ [ sort ] MS-COCO (base) - Image text alignment (human) ↑ [ sort ] MS-COCO (fairness - gender) - Image text alignment (human) ↑ [ sort ] MS-COCO (fairness - AAVE dialect) - Image text alignment (human) ↑ [ sort ] MS-COCO (robustness) - Image text alignment (human) ↑ [ sort ] MS-COCO (Chinese) - Image text alignment (human) ↑ [ sort ] MS-COCO (Hindi) - Image text alignment (human) ↑ [ sort ] MS-COCO (Spanish) - Image text alignment (human) ↑ [ sort ] MS-COCO (Art styles) - Image text alignment (human) ↑ [ sort ] DrawBench (image quality categories) - Image text alignment (human) ↑ [ sort ] PartiPrompts (image quality 
categories) - Image text alignment (human) ↑ [ sort ] dailydall.e - Image text alignment (human) ↑ [ sort ] Landing Page - Image text alignment (human) ↑ [ sort ] Logos - Image text alignment (human) ↑ [ sort ] Magazine Cover Photos - Image text alignment (human) ↑ [ sort ] Common Syntactic Processes - Image text alignment (human) ↑ [ sort ] DrawBench (reasoning categories) - Image text alignment (human) ↑ [ sort ] PartiPrompts (reasoning categories) - Image text alignment (human) ↑ [ sort ] Relational Understanding - Image text alignment (human) ↑ [ sort ] Detection (PaintSkills) - Image text alignment (human) ↑ [ sort ] Winoground - Image text alignment (human) ↑ [ sort ] PartiPrompts (knowledge categories) - Image text alignment (human) ↑ [ sort ] DrawBench (knowledge categories) - Image text alignment (human) ↑ [ sort ] TIME's most significant historical figures - Image text alignment (human) ↑ [ sort ] 2 | DALL-E 2 (3.5B) 0.941 4.438 4.534 4.416 4.58 3.902 1.798 4.247 4.216 3.9 4.262 4.416 4.256 4.416 4.104 4.062 3.97 4.16 3.7 4.36 4.16 4.1 4.1 4.227 3 | Dreamlike Photoreal v2.0 (1B) 0.748 4.346 4.312 4.18 4.253 1.966 2.324 3.128 4.068 3.98 4.296 4.552 4.32 4.232 4.152 3.569 3.553 3.8 3.24 3.747 3.48 4.082 3.818 4.488 4 | Stable Diffusion v1.4 (1B) 0.723 4.214 4.202 4.136 4.115 2.224 1.788 3.276 4.154 3.83 4.118 4.256 4.12 4.232 4.032 3.492 3.667 3.5 3.36 3.853 4.06 3.865 3.641 4.429 5 | Safe Stable Diffusion strong (1B) 0.63 4.026 4.113 4.112 3.97 2.126 2.152 3.468 3.958 3.65 4.047 3.832 3.904 3.776 3.68 3.296 3.73 3.58 3.24 3.853 4.12 3.853 3.576 4.558 6 | DALL-E mega (2.6B) 0.6 4.056 4.16 4.061 4.071 2.474 1.794 3.388 4.012 3.63 4.156 3.88 3.92 3.92 3.688 3.281 3.527 3.9 3.34 3.9 3.48 3.906 3.371 3.894 7 | Openjourney v1 (1B) 0.586 4.16 4.044 3.908 3.968 2.368 2.136 2.634 4.112 3.94 4.098 4.448 4.056 4.224 4.016 3.477 3.33 3.6 2.38 3.867 3.02 3.982 3.494 4.279 8 | Dreamlike Diffusion v1.0 (1B) 0.56 4.024 3.934 3.978 3.975 2.964 2.79 3.548 3.756 3.48 3.947 3.872 3.784 3.488 3.384 3.435 3.6 3.72 3.38 3.7 4.06 3.847 3.765 3.912 9 | Promptist + Stable Diffusion v1.4 (1B) 0.551 4 4.113 3.994 4.166 2.172 1.68 2.802 3.938 3.64 3.924 3.864 3.872 3.712 3.616 3.419 3.643 3.6 3.32 3.987 3.28 4.029 3.553 4.059 10 | MultiFusion (13B) 0.543 4.106 4.061 3.881 4.046 2.962 1.916 3.832 4.206 3.8 3.967 4.2 4.032 4 3.896 3.096 3.173 3.7 3.26 3.673 3.4 3.659 3.465 3.271 11 | DeepFloyd IF X-Large (4.3B) 0.537 3.818 3.965 3.937 3.931 2.978 2.824 3.706 3.766 3.81 3.831 3.568 3.504 3.432 3.552 3.708 3.68 3.74 3.68 3.72 3.84 3.718 3.612 3.841 12 | DeepFloyd IF Medium (0.4B) 0.517 3.754 3.91 3.876 3.885 3.006 3.046 3.704 3.786 3.98 3.771 3.608 3.688 3.576 3.68 3.45 3.597 3.64 3.46 3.633 3.58 3.771 3.747 3.706 13 | DeepFloyd IF Large (0.9B) 0.508 3.798 4.01 3.903 3.968 2.888 3.06 3.776 3.73 3.68 3.836 3.536 3.48 3.616 3.632 3.427 3.593 3.64 3.52 3.673 3.94 3.753 3.706 3.929 14 | Stable Diffusion v2.1 base (1B) 0.508 3.672 4.094 3.947 3.811 2.976 2.762 3.552 3.866 3.68 3.798 3.664 3.528 3.648 3.568 3.562 3.567 3.64 3.6 3.92 3.82 3.659 3.359 3.947 15 | GigaGAN (1B) 0.506 3.692 3.88 3.868 3.851 3.456 3.246 3.728 3.88 3.84 3.787 3.592 3.512 3.52 3.696 3.477 3.71 3.62 3.82 3.707 3.56 3.671 3.371 3.906 16 | Stable Diffusion v2 base (1B) 0.506 3.69 4.05 3.898 3.837 3.106 2.726 3.7 3.784 3.88 3.942 3.648 3.56 3.624 3.48 3.565 3.567 3.92 3.34 3.847 3.3 3.512 3.624 3.853 17 | Openjourney v2 (1B) 0.499 3.802 3.931 4.007 3.916 2.936 2.808 3.656 3.822 3.35 3.9 3.76 3.792 3.664 3.496 3.404 3.617 3.64 3.34 3.62 3.88 3.647 
3.624 3.771 18 | Vintedois (22h) Diffusion model v0.1 (1B) 0.461 3.716 3.899 3.803 3.716 2.896 2.7 3.602 3.87 3.74 3.964 3.656 3.552 3.632 3.408 3.638 3.617 3.52 3.3 3.673 4.06 3.676 3.718 3.653 19 | Safe Stable Diffusion weak (1B) 0.45 3.73 3.87 3.831 3.816 2.862 2.736 3.534 3.92 3.79 3.876 3.744 3.592 3.552 3.648 3.427 3.483 3.86 3.32 3.693 3.52 3.947 3.7 3.635 20 | Stable Diffusion v1.5 (1B) 0.445 3.708 4.102 3.942 3.784 2.93 2.79 3.518 3.844 3.81 3.949 3.6 3.536 3.464 3.568 3.519 3.607 3.5 3.08 3.647 3.66 3.812 3.688 3.77 21 | Safe Stable Diffusion medium (1B) 0.424 3.696 3.733 3.761 3.742 3 2.746 3.55 3.728 3.76 3.887 3.68 3.552 3.632 3.624 3.6 3.613 3.88 3.12 3.587 3.78 3.724 3.482 3.697 22 | Redshift Diffusion (1B) 0.398 3.572 3.934 3.813 3.671 3.042 2.714 3.134 3.694 3.93 3.802 3.792 3.736 3.616 3.664 3.485 3.51 3.34 2.96 3.833 3.36 3.941 3.524 3.659 23 | DALL-E mini (0.4B) 0.397 3.692 3.686 3.676 3.665 2.936 2.832 3.304 3.524 3.39 3.782 3.712 3.736 3.76 3.648 3.235 3.617 3.84 3.08 3.827 3.66 3.871 3.553 3.853 24 | Safe Stable Diffusion max (1B) 0.37 3.712 3.894 3.774 3.823 2.832 2.896 3.538 3.714 3.52 3.771 3.608 3.6 3.584 3.44 3.265 3.617 3.52 2.92 3.76 3.84 3.653 3.7 3.927 25 | minDALL-E (1.3B) 0.282 3.672 3.592 3.531 3.535 2.932 2.872 2.904 3.44 3.51 3.711 3.816 3.736 3.88 3.744 3.285 3.503 3.38 3.22 3.513 3.66 3.729 3.335 3.247 26 | Lexica Search with Stable Diffusion v1.5 (1B) 0.16 3.496 3.24 3.464 3.472 2.338 1.7 2.848 3.556 3.46 3.762 3.624 3.944 3.768 3.424 3.162 3.1 3.12 3 3.46 3.14 3.347 2.871 4.035 27 | CogView2 (6B) 0.15 3.688 3.744 3.575 3.621 3.842 1.734 1.766 3.53 3.63 3.731 3.584 3.744 3.688 3.496 3.008 3.117 2.9 2.96 3.44 2.86 3.324 2.659 2.888 -------------------------------------------------------------------------------- /benchbench/data/helm/efficiency.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate MMLU - Denoised inference time (s) BoolQ - Denoised inference time (s) NarrativeQA - Denoised inference time (s) NaturalQuestions (closed-book) - Denoised inference time (s) NaturalQuestions (open-book) - Denoised inference time (s) QuAC - Denoised inference time (s) HellaSwag - Denoised inference time (s) OpenbookQA - Denoised inference time (s) TruthfulQA - Denoised inference time (s) MS MARCO (regular) - Denoised inference time (s) MS MARCO (TREC) - Denoised inference time (s) CNN/DailyMail - Denoised inference time (s) XSUM - Denoised inference time (s) IMDB - Denoised inference time (s) CivilComments - Denoised inference time (s) RAFT - Denoised inference time (s) 2 | text-ada-001 0.938 0.088 0.096 0.171 0.085 0.128 0.21 0.079 0.076 0.089 0.09 0.09 0.793 0.311 0.109 0.092 0.107 3 | curie (6.7B) 0.895 0.092 0.1 0.152 0.122 0.189 0.323 0.084 0.079 0.094 0.094 0.095 0.623 0.294 0.11 0.097 0.112 4 | babbage (1.3B) 0.861 0.119 0.121 0.176 0.152 0.232 0.261 0.113 0.111 0.12 0.122 0.122 0.533 0.272 0.128 0.12 0.137 5 | text-curie-001 0.783 0.133 0.143 0.205 0.153 0.185 0.298 0.125 0.119 0.134 0.136 0.135 0.799 0.364 0.147 0.142 0.152 6 | text-babbage-001 0.778 0.133 0.142 0.243 0.136 0.204 0.314 0.125 0.122 0.134 0.136 0.135 0.968 0.431 0.157 0.138 0.153 7 | ada (350M) 0.77 0.14 0.141 0.211 0.167 0.271 0.27 0.138 0.136 0.141 0.142 0.142 0.598 0.237 0.142 0.141 0.154 8 | text-davinci-002 0.604 0.196 0.191 0.512 0.264 0.394 0.891 0.171 0.158 0.2 0.192 0.198 2.236 1.026 0.247 0.186 0.276 9 | GPT-J (6B) 0.601 0.07 0.499 1.311 1.777 3.866 1.389 0.03 0.019 0.044 0.084 0.081 2.076 0.742 0.701 0.307 0.628 10 | 
davinci (175B) 0.558 0.212 0.21 0.369 0.327 0.462 1.085 0.193 0.184 0.215 0.211 0.214 2.256 1.148 0.225 0.21 0.279 11 | Cohere medium v20220720 (6.1B) 0.541 0.281 0.35 0.533 0.259 0.535 0.735 0.204 0.187 0.287 0.289 0.288 1.2 0.724 0.452 0.321 0.358 12 | Cohere small v20220720 (410M) 0.534 0.284 0.367 0.56 0.251 0.605 0.619 0.223 0.214 0.289 - 0.291 0.954 0.642 0.458 0.329 0.36 13 | GPT-NeoX (20B) 0.514 0.133 0.773 1.468 0.482 2.137 2.025 0.025 0.024 0.084 0.118 0.116 2.133 1.116 0.862 0.408 1.156 14 | UL2 (20B) 0.506 0.182 0.313 1.182 1.994 3.093 1.226 - - 0.168 - - 1.108 0.774 0.215 0.264 0.434 15 | OPT (66B) 0.467 0.055 0.834 1.98 0.611 3.632 2.658 0.971 0.188 0.041 0.076 0.102 1.972 0.885 0.54 0.212 1.871 16 | T5 (11B) 0.434 0.218 0.271 1.054 2.856 12.846 1.032 - - 0.21 - - 1.654 1.159 0.278 0.27 0.448 17 | T0pp (11B) 0.42 0.145 0.374 0.945 1.457 2.895 1.239 - - 0.142 - - 1.066 0.554 0.393 0.391 0.586 18 | Cohere large v20220720 (13.1B) 0.407 0.317 0.421 0.729 0.337 0.774 1.262 0.225 0.201 0.325 0.33 0.327 2.269 1.075 0.536 0.375 0.444 19 | J1-Large v1 (7.5B) 0.389 0.377 0.485 0.797 0.372 0.733 1.16 0.253 0.238 0.365 0.393 0.389 2.011 0.903 0.637 0.434 0.499 20 | J1-Grande v1 (17B) 0.317 0.411 0.535 0.923 0.466 0.873 1.413 0.33 0.281 0.396 0.428 0.424 2.074 1.07 0.732 0.482 0.59 21 | BLOOM (176B) 0.268 0.233 0.853 2.598 1.115 2.547 5.306 0.075 0.032 0.143 0.257 0.246 5.584 3.9 3.536 0.533 1.866 22 | YaLM (100B) 0.266 0.143 0.828 2.314 2.722 4.463 2.278 - - 0.092 - - 2.346 1.671 1.137 0.41 0.89 23 | OPT (175B) 0.241 0.12 0.869 2.783 4.548 7.78 4.049 0.71 0.038 0.141 0.241 0.226 4.729 2.523 1.575 0.498 0.962 24 | J1-Jumbo v1 (178B) 0.222 0.457 0.62 1.126 0.493 1.06 2.064 0.284 0.259 0.443 0.501 0.496 3.777 1.629 0.852 0.552 0.687 25 | Cohere xlarge v20220609 (52.4B) 0.199 0.489 0.598 1.062 0.565 1.085 2.089 0.359 0.314 0.501 0.499 0.501 4.337 1.741 0.796 0.546 0.667 26 | GLM (130B) 0.151 0.335 1.191 2.315 0.953 2.369 4.219 - - 0.158 - - 3.514 2.537 1.497 0.695 1.471 27 | Anthropic-LM v4-s3 (52B) 0.138 0.578 0.637 1.722 0.777 1.102 3.694 0.549 0.447 0.568 0.578 0.587 4.076 2.408 0.79 0.594 0.883 28 | J1-Grande v2 beta (17B) - - - - - - - - - - - - - - - - - 29 | Jurassic-2 Jumbo (178B) - - - - - - - - - - - - - - - - - 30 | Jurassic-2 Grande (17B) - - - - - - - - - - - - - - - - - 31 | Jurassic-2 Large (7.5B) - - - - - - - - - - - - - - - - - 32 | Luminous Base (13B) - - - - - - - - - - - - - - - - - 33 | Luminous Extended (30B) - - - - - - - - - - - - - - - - - 34 | Luminous Supreme (70B) - - - - - - - - - - - - - - - - - 35 | Cohere xlarge v20221108 (52.4B) - - - - - - - - - - - - - - - - - 36 | Cohere medium v20221108 (6.1B) - - - - - - - - - - - - - - - - - 37 | Cohere Command beta (6.1B) - - - - - - - - - - - - - - - - - 38 | Cohere Command beta (52.4B) - - - - - - - - - - - - - - - - - 39 | Pythia (6.9B) - - - - - - - - - - - - - - - - - 40 | Pythia (12B) - - - - - - - - - - - - - - - - - 41 | LLaMA (7B) - - - - - - - - - - - - - - - - - 42 | LLaMA (13B) - - - - - - - - - - - - - - - - - 43 | LLaMA (30B) - - - - - - - - - - - - - - - - - 44 | LLaMA (65B) - - - - - - - - - - - - - - - - - 45 | Llama 2 (7B) - - - - - - - - - - - - - - - - - 46 | Llama 2 (13B) - - - - - - - - - - - - - - - - - 47 | Llama 2 (70B) - - - - - - - - - - - - - - - - - 48 | Alpaca (7B) - - - - - - - - - - - - - - - - - 49 | Vicuna v1.3 (7B) - - - - - - - - - - - - - - - - - 50 | Vicuna v1.3 (13B) - - - - - - - - - - - - - - - - - 51 | Mistral v0.1 (7B) - - - - - - - - - - - - - - - - - 52 | TNLG v2 (530B) - - 
- - - - - - - - - - - - - - - 53 | TNLG v2 (6.7B) - - - - - - - - - - - - - - - - - 54 | text-davinci-003 - - - - - - - - - - - - - - - - - 55 | gpt-3.5-turbo-0301 - - - - - - - - - - - - - - - - - 56 | gpt-3.5-turbo-0613 - - - - - - - - - - - - - - - - - 57 | RedPajama-INCITE-Base-v1 (3B) - - - - - - - - - - - - - - - - - 58 | RedPajama-INCITE-Instruct-v1 (3B) - - - - - - - - - - - - - - - - - 59 | RedPajama-INCITE-Base (7B) - - - - - - - - - - - - - - - - - 60 | RedPajama-INCITE-Instruct (7B) - - - - - - - - - - - - - - - - - 61 | MPT (30B) - - - - - - - - - - - - - - - - - 62 | MPT-Instruct (30B) - - - - - - - - - - - - - - - - - 63 | Falcon (7B) - - - - - - - - - - - - - - - - - 64 | Falcon-Instruct (7B) - - - - - - - - - - - - - - - - - 65 | Falcon (40B) - - - - - - - - - - - - - - - - - 66 | Falcon-Instruct (40B) - - - - - - - - - - - - - - - - - 67 | InstructPalmyra (30B) - - - - - - - - - - - - - - - - - 68 | Palmyra X (43B) - - - - - - - - - - - - - - - - - 69 | 70 | -------------------------------------------------------------------------------- /benchbench/data/mmlu/leaderboard_raw.csv: -------------------------------------------------------------------------------- 1 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q 2 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-LoRA 3 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q-FastChat 4 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama 5 | open-llm-leaderboard/details_bhenrym14__platypus-yi-34b 6 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Q 7 | open-llm-leaderboard/details_cloudyu__Yi-34Bx2-MoE-60B 8 | open-llm-leaderboard/details_Qwen__Qwen-72B 9 | open-llm-leaderboard/details_cloudyu__Mixtral_34Bx2_MoE_60B 10 | open-llm-leaderboard/details_moreh__MoMo-70B-lora-1.8.4-DPO 11 | open-llm-leaderboard/details_cloudyu__Mixtral_34Bx2_MoE_60B 12 | open-llm-leaderboard/details_CausalLM__72B-preview 13 | open-llm-leaderboard/details_CausalLM__72B-preview 14 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-200k-Q-FastChat 15 | open-llm-leaderboard/details_moreh__MoMo-70B-LoRA-V1.4 16 | open-llm-leaderboard/details_NousResearch__Nous-Hermes-2-Yi-34B 17 | open-llm-leaderboard/details_SUSTech__SUS-Chat-72B 18 | open-llm-leaderboard/details_AA051611__whattest 19 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q-v2 20 | open-llm-leaderboard/details_jondurbin__bagel-dpo-34b-v0.2 21 | open-llm-leaderboard/details_jondurbin__bagel-dpo-34b-v0.2 22 | open-llm-leaderboard/details_jondurbin__bagel-34b-v0.2 23 | open-llm-leaderboard/details_jondurbin__nontoxic-bagel-34b-v0.2 24 | open-llm-leaderboard/details_SUSTech__SUS-Chat-34B 25 | open-llm-leaderboard/details_01-ai__Yi-34B 26 | open-llm-leaderboard/details_chargoddard__Yi-34B-Llama 27 | open-llm-leaderboard/details_01-ai__Yi-34B-200K 28 | open-llm-leaderboard/details_mncai__yi-34B-v3 29 | open-llm-leaderboard/details_APMIC__caigun-lora-model-34B-v2 30 | open-llm-leaderboard/details_mncai__yi-34B-v2 31 | open-llm-leaderboard/details_Mihaiii__Pallas-0.2 32 | open-llm-leaderboard/details_migtissera__Tess-M-Creative-v1.0 33 | open-llm-leaderboard/details_Mihaiii__Pallas-0.2 34 | open-llm-leaderboard/details_APMIC__caigun-lora-model-34B-v3 35 | open-llm-leaderboard/details_migtissera__Tess-M-v1.3 36 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-200K-Q 37 | open-llm-leaderboard/details_Mihaiii__Pallas-0.4 38 | open-llm-leaderboard/details_Mihaiii__Pallas-0.3 39 | open-llm-leaderboard/details_Mihaiii__Pallas-0.3 40 | 
open-llm-leaderboard/details_migtissera__Tess-34B-v1.4 41 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5 42 | open-llm-leaderboard/details_kyujinpy__PlatYi-34B-Llama-Q-v3 43 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.1 44 | open-llm-leaderboard/details_Mihaiii__Pallas-0.4 45 | open-llm-leaderboard/details_JosephusCheung__Yee-34B-200K-Chat 46 | open-llm-leaderboard/details_01-ai__Yi-34B-Chat 47 | open-llm-leaderboard/details_01-ai__Yi-34B-Chat 48 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.2 49 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.3 50 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-exp2-0.1 51 | open-llm-leaderboard/details_migtissera__Tess-M-v1.1 52 | open-llm-leaderboard/details_AA051611__A0110 53 | open-llm-leaderboard/details_AA051611__A0109 54 | open-llm-leaderboard/details_Azure99__blossom-v3_1-yi-34b 55 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.4 56 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.5 57 | open-llm-leaderboard/details_Mihaiii__Pallas-0.5-LASER-0.6 58 | open-llm-leaderboard/details_TriadParty__deepmoney-34b-200k-base 59 | open-llm-leaderboard/details_AA051610__A0106 60 | open-llm-leaderboard/details_AA051610__A0106 61 | open-llm-leaderboard/details_adamo1139__Yi-34B-AEZAKMI-v1 62 | open-llm-leaderboard/details_OrionStarAI__OrionStar-Yi-34B-Chat-Llama 63 | open-llm-leaderboard/details_mlinmg__SG-Raccoon-Yi-200k-2.0 64 | open-llm-leaderboard/details_deepseek-ai__deepseek-llm-67b-chat 65 | open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-7bx8-v16.3-32k 66 | open-llm-leaderboard/details_itsliupeng__Mixtral-8x7B-v0.1-top3 67 | open-llm-leaderboard/details_itsliupeng__llama2_70b_mmlu 68 | open-llm-leaderboard/details_itsliupeng__llama2_70b_mmlu 69 | open-llm-leaderboard/details_rufjdk5480__gov-qna-ko-merged 70 | open-llm-leaderboard/details_rufjdk5480__mixtral-ko-qna-merged 71 | open-llm-leaderboard/details_mistralai__Mixtral-8x7B-v0.1 72 | open-llm-leaderboard/details_deepseek-ai__deepseek-llm-67b-base 73 | open-llm-leaderboard/details_OpenBuddy__openbuddy-deepseek-67b-v15.2 74 | open-llm-leaderboard/details_YeungNLP__firefly-mixtral-8x7b-v0.1 75 | open-llm-leaderboard/details_YeungNLP__firefly-mixtral-8x7b-v1 76 | open-llm-leaderboard/details_argilla__notux-8x7b-v1-epoch-2 77 | open-llm-leaderboard/details_mistralai__Mixtral-8x7B-Instruct-v0.1 78 | open-llm-leaderboard/details_argilla__notux-8x7b-v1 79 | open-llm-leaderboard/details_VAGOsolutions__SauerkrautLM-Mixtral-8x7B-Instruct 80 | open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-8x7b-v16.1-32k 81 | open-llm-leaderboard/details_argilla__notus-8x7b-experiment 82 | open-llm-leaderboard/details_OpenBuddy__openbuddy-mixtral-8x7b-v16.2-32k 83 | open-llm-leaderboard/details_mistralai__Mixtral-8x7B-Instruct-v0.1 84 | open-llm-leaderboard/details_VAGOsolutions__SauerkrautLM-Mixtral-8x7B-Instruct 85 | open-llm-leaderboard/details_s1ghhh__medllama-2-70b-qlora-1.1 86 | open-llm-leaderboard/details_ICBU-NPU__FashionGPT-70B-V1.1 87 | open-llm-leaderboard/details_OpenBuddy__openbuddy-deepseek-67b-v15-base 88 | open-llm-leaderboard/details_Brillibits__Instruct_Mixtral-8x7B-v0.1_Dolly15K 89 | open-llm-leaderboard/details_Sao10K__Sensualize-Mixtral-bf16 90 | open-llm-leaderboard/details_KaeriJenti__kaori-70b-v1 91 | open-llm-leaderboard/details_Riiid__sheep-duck-llama-2-70b-v1.1 92 | open-llm-leaderboard/details_AIDC-ai-business__Marcoroni-70B-v1 93 | open-llm-leaderboard/details_jondurbin__airoboros-l2-70b-gpt4-m2.0 94 | 
open-llm-leaderboard/details_cognitivecomputations__yayi2-30b-llama
95 | open-llm-leaderboard/details_AA051610__A11P
96 | open-llm-leaderboard/details_sequelbox__SpellBlade
97 | open-llm-leaderboard/details_tiiuae__falcon-180B
98 | open-llm-leaderboard/details_garage-bAInd__Platypus2-70B-instruct
99 | open-llm-leaderboard/details_chargoddard__mixtralmerge-8x7B-rebalanced-test
100 | open-llm-leaderboard/details_chargoddard__MixtralRPChat-ZLoss
--------------------------------------------------------------------------------
/benchbench/measures/ordinal.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import pandas as pd
4 | from torch.optim import Adam
5 | from sklearn.impute import KNNImputer
6 | 
7 | from ..utils.base import rankdata
8 | from ..utils.metric import get_rank_diff, get_rank_variance
9 | from ..utils.win_rate import WinningRate
10 | 
11 | 
12 | def appr_rank_diff(new_win_rate, inv_indices, orig_rank):
13 |     """
14 |     Approximate the rank difference between the original win rate and the new win rate.
15 | 
16 |     Args:
17 |         new_win_rate(np.array): win rate for all models
18 |         inv_indices(list): invariant indices
19 |         orig_rank(np.array): original rank for only the models in inv_indices
20 | 
21 |     Returns:
22 |         torch.Tensor: approximated loss
23 |     """
24 |     ret = 0.0
25 |     for i, inv_i in enumerate(inv_indices):
26 |         for j, inv_j in enumerate(inv_indices):
27 |             # orig_rank[i] is the original rank for inv_i
28 |             if orig_rank[i] < orig_rank[j]:
29 |                 ret += max(new_win_rate[inv_i] - new_win_rate[inv_j], -0.01)
30 |     return ret
31 | 
32 | 
33 | def get_selected_win_rate(win_rate_matrix, w, inv_indices, do_sample=True):
34 |     """
35 |     Get the win rate for the selected indices.
36 | 
37 |     Args:
38 |         win_rate_matrix(torch.Tensor): entry (i, j) is the win rate of the i-th model over the j-th model
39 |         w(torch.Tensor): unnormalized logits; sigmoid(w) is the probability for each model to be selected
40 |         inv_indices(list): indices for L
41 |         do_sample(bool): whether to select models by sampling (True) or by thresholding the probabilities (False)
42 | 
43 |     Returns:
44 |         tuple:
45 |             torch.Tensor: new_win_rate
46 |             np.array: new_indices
47 |     """
48 |     probs = torch.sigmoid(w)
49 |     if do_sample:
50 |         sampler = torch.distributions.Bernoulli(probs)
51 |         sampled = sampler.sample() + w - w.detach()  # straight-through estimator: discrete sample forward, gradient flows through w
52 |     else:
53 |         sampled = (probs > 0.5) + w - w.detach()
54 |     inv = torch.tensor(
55 |         [
56 |             (1.0 if (j == 0.0 and i in inv_indices) else 0.0)
57 |             for i, j in enumerate(sampled)
58 |         ]
59 |     )
60 |     selected = sampled + inv
61 |     selected_diag = torch.diag(selected)
62 |     selected_win_rate = selected_diag @ win_rate_matrix @ selected_diag
63 |     new_win_rate = selected_win_rate.sum(1) / selected.sum()
64 |     new_indices = np.where(selected.detach().numpy() >= 1.0 - 1e-4)[0]
65 | 
66 |     return new_win_rate, new_indices
67 | 
68 | 
69 | def get_sensitivity(
70 |     data, cols, inv_indices=None, lr=0.01, num_step=1000, return_indices=False
71 | ):
72 |     """
73 |     Calculate the sensitivity for a given benchmark.
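    Sensitivity is estimated by searching for a set of additional models whose
    inclusion maximally perturbs the win-rate ranking of the models in
    inv_indices: a relaxed selection vector w is optimized with Adam against the
    appr_rank_diff surrogate, and the best rank difference found during or after
    the search is reported.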
74 | 
75 |     Args:
76 |         data(pd.DataFrame): each row represents a model, each column represents a task
77 |         cols(list): the column names of the tasks
78 |         inv_indices(list): indices for L, the rest will be used as L^C
79 |         lr(float): learning rate for optimization
80 |         num_step(int): number of steps for optimization
81 |         return_indices(bool): whether to return the indices of the selected irrelevant models
82 | 
83 |     Returns:
84 |         tuple: ((tau, MRC), indices) if return_indices is True, else (tau, MRC)
85 |     """
86 |     if inv_indices is None:
87 |         inv_indices = np.arange(len(data) // 5)
88 | 
89 |     torch.manual_seed(0)
90 |     win_rate_matrix = torch.tensor(WinningRate(data, cols).win_rate)
91 | 
92 |     orig_win_rate = win_rate_matrix[inv_indices][:, inv_indices].mean(axis=1).numpy()
93 |     orig_rank = rankdata(-orig_win_rate, method="average")
94 | 
95 |     w = torch.zeros(len(data), requires_grad=True, dtype=torch.double)
96 |     optimizer = Adam([w], lr=lr)
97 |     history = []
98 |     for episode in range(num_step):
99 |         new_win_rate, new_indices = get_selected_win_rate(
100 |             win_rate_matrix, w, inv_indices
101 |         )
102 |         loss = appr_rank_diff(new_win_rate, inv_indices, orig_rank)
103 |         if type(loss) is float:  # no pair contributed to the surrogate loss, so there is nothing to optimize
104 |             break
105 |         print("Episode %d, loss %.2lf" % (episode, loss.item()), end="\r")
106 | 
107 |         optimizer.zero_grad()
108 |         loss.backward()
109 |         optimizer.step()
110 | 
111 |         new_win_rate = (
112 |             win_rate_matrix[new_indices][:, new_indices]
113 |             .mean(axis=1)[inv_indices]
114 |             .detach()
115 |             .numpy()
116 |         )
117 |         new_rank = rankdata(-new_win_rate)
118 |         rank_diff = get_rank_diff(new_rank, orig_rank)
119 |         history.append((rank_diff, new_indices))
120 |     print()
121 | 
122 |     new_win_rate, new_indices = get_selected_win_rate(
123 |         win_rate_matrix, w, inv_indices, do_sample=False
124 |     )
125 |     new_win_rate = (
126 |         win_rate_matrix[new_indices][:, new_indices]
127 |         .mean(axis=1)[inv_indices]
128 |         .detach()
129 |         .numpy()
130 |     )
131 |     new_rank = rankdata(-new_win_rate, method="average")
132 |     final_rank_diff = get_rank_diff(new_rank, orig_rank)
133 | 
134 |     if len(history) == 0:
135 |         ret = (final_rank_diff, new_indices)
136 |     else:
137 |         history = sorted(history, key=lambda x: -x[0][0])
138 |         history_best_rank_diff = history[0][0]
139 |         history_best_indices = history[0][1]
140 |         if final_rank_diff > history_best_rank_diff:
141 |             ret = (final_rank_diff, new_indices)
142 |         else:
143 |             ret = (history_best_rank_diff, history_best_indices)
144 |     if return_indices:
145 |         return ret
146 |     else:
147 |         return ret[0]
148 | 
149 | 
150 | def get_diversity(data, cols):
151 |     """
152 |     Calculate the diversity for a given benchmark.
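    Missing task scores are first filled in with a 5-nearest-neighbor imputer;
    diversity is then the rank variance of the per-task model rankings as
    computed by get_rank_variance.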
153 | 154 | Args: 155 | data(pd.DataFrame): each row represents a model, each column represents a task 156 | cols(list): the column names of the tasks 157 | 158 | Returns: 159 | tuple: (W, max_MRC), where max_MRC refers to max MRC over every pair of tasks 160 | """ 161 | imputer = KNNImputer(n_neighbors=5, weights="uniform") 162 | 163 | data_imputed = imputer.fit_transform(data[cols].values) 164 | data_imputed = pd.DataFrame(data_imputed, columns=cols) 165 | 166 | return get_rank_variance( 167 | [ 168 | rankdata(-data_imputed[c].values, method="average") 169 | for c in list(cols) 170 | if data_imputed[c].values.dtype == "float64" 171 | ] 172 | ) 173 | -------------------------------------------------------------------------------- /benchbench/data/helm_capability/vanilla.txt: -------------------------------------------------------------------------------- 1 | Model 2 | Mean score 3 | MMLU-Pro - COT correct 4 | GPQA - COT correct 5 | IFEval - IFEval Strict Acc 6 | WildBench - WB Score 7 | Omni-MATH - Acc 8 | GPT-5 mini (2025-08-07) 9 | 0.819 10 | 0.835 11 | 0.756 12 | 0.927 13 | 0.855 14 | 0.722 15 | o4-mini (2025-04-16) 16 | 0.812 17 | 0.82 18 | 0.735 19 | 0.929 20 | 0.854 21 | 0.72 22 | o3 (2025-04-16) 23 | 0.811 24 | 0.859 25 | 0.753 26 | 0.869 27 | 0.861 28 | 0.714 29 | GPT-5 (2025-08-07) 30 | 0.807 31 | 0.863 32 | 0.791 33 | 0.875 34 | 0.857 35 | 0.647 36 | Qwen3 235B A22B Instruct 2507 FP8 37 | 0.798 38 | 0.844 39 | 0.726 40 | 0.835 41 | 0.866 42 | 0.718 43 | Grok 4 (0709) 44 | 0.785 45 | 0.851 46 | 0.726 47 | 0.949 48 | 0.797 49 | 0.603 50 | Claude 4 Opus (20250514, extended thinking) 51 | 0.78 52 | 0.875 53 | 0.709 54 | 0.849 55 | 0.852 56 | 0.616 57 | gpt-oss-120b 58 | 0.77 59 | 0.795 60 | 0.684 61 | 0.836 62 | 0.845 63 | 0.688 64 | Kimi K2 Instruct 65 | 0.768 66 | 0.819 67 | 0.652 68 | 0.85 69 | 0.862 70 | 0.654 71 | Claude 4 Sonnet (20250514, extended thinking) 72 | 0.766 73 | 0.843 74 | 0.706 75 | 0.84 76 | 0.838 77 | 0.602 78 | Claude 4.5 Sonnet (20250929) 79 | 0.762 80 | 0.869 81 | 0.686 82 | 0.85 83 | 0.854 84 | 0.553 85 | Claude 4 Opus (20250514) 86 | 0.757 87 | 0.859 88 | 0.666 89 | 0.918 90 | 0.833 91 | 0.511 92 | GPT-5 nano (2025-08-07) 93 | 0.748 94 | 0.778 95 | 0.679 96 | 0.932 97 | 0.806 98 | 0.547 99 | Gemini 2.5 Pro (03-25 preview) 100 | 0.745 101 | 0.863 102 | 0.749 103 | 0.84 104 | 0.857 105 | 0.416 106 | Claude 4 Sonnet (20250514) 107 | 0.733 108 | 0.843 109 | 0.643 110 | 0.839 111 | 0.825 112 | 0.513 113 | Grok 3 Beta 114 | 0.727 115 | 0.788 116 | 0.65 117 | 0.884 118 | 0.849 119 | 0.464 120 | GPT-4.1 (2025-04-14) 121 | 0.727 122 | 0.811 123 | 0.659 124 | 0.838 125 | 0.854 126 | 0.471 127 | Qwen3 235B A22B FP8 Throughput 128 | 0.726 129 | 0.817 130 | 0.623 131 | 0.816 132 | 0.828 133 | 0.548 134 | GPT-4.1 mini (2025-04-14) 135 | 0.726 136 | 0.783 137 | 0.614 138 | 0.904 139 | 0.838 140 | 0.491 141 | Llama 4 Maverick (17Bx128E) Instruct FP8 142 | 0.718 143 | 0.81 144 | 0.65 145 | 0.908 146 | 0.8 147 | 0.422 148 | Qwen3-Next 80B A3B Thinking 149 | 0.7 150 | 0.786 151 | 0.63 152 | 0.81 153 | 0.807 154 | 0.467 155 | DeepSeek-R1-0528 156 | 0.699 157 | 0.793 158 | 0.666 159 | 0.784 160 | 0.828 161 | 0.424 162 | Palmyra X5 163 | 0.696 164 | 0.804 165 | 0.661 166 | 0.823 167 | 0.78 168 | 0.415 169 | Grok 3 mini Beta 170 | 0.679 171 | 0.799 172 | 0.675 173 | 0.951 174 | 0.651 175 | 0.318 176 | Gemini 2.0 Flash 177 | 0.679 178 | 0.737 179 | 0.556 180 | 0.841 181 | 0.8 182 | 0.459 183 | Claude 3.7 Sonnet (20250219) 184 | 0.674 185 | 0.784 186 | 0.608 187 | 0.834 188 | 0.814 
189 | 0.33 190 | gpt-oss-20b 191 | 0.674 192 | 0.74 193 | 0.594 194 | 0.732 195 | 0.737 196 | 0.565 197 | GLM-4.5-Air-FP8 198 | 0.67 199 | 0.762 200 | 0.594 201 | 0.812 202 | 0.789 203 | 0.391 204 | DeepSeek v3 205 | 0.665 206 | 0.723 207 | 0.538 208 | 0.832 209 | 0.831 210 | 0.403 211 | Gemini 1.5 Pro (002) 212 | 0.657 213 | 0.737 214 | 0.534 215 | 0.837 216 | 0.813 217 | 0.364 218 | Claude 3.5 Sonnet (20241022) 219 | 0.653 220 | 0.777 221 | 0.565 222 | 0.856 223 | 0.792 224 | 0.276 225 | Llama 4 Scout (17Bx16E) Instruct 226 | 0.644 227 | 0.742 228 | 0.507 229 | 0.818 230 | 0.779 231 | 0.373 232 | Gemini 2.0 Flash Lite (02-05 preview) 233 | 0.642 234 | 0.72 235 | 0.5 236 | 0.824 237 | 0.79 238 | 0.374 239 | Amazon Nova Premier 240 | 0.637 241 | 0.726 242 | 0.518 243 | 0.803 244 | 0.788 245 | 0.35 246 | GPT-4o (2024-11-20) 247 | 0.634 248 | 0.713 249 | 0.52 250 | 0.817 251 | 0.828 252 | 0.293 253 | Gemini 2.5 Flash (04-17 preview) 254 | 0.626 255 | 0.639 256 | 0.39 257 | 0.898 258 | 0.817 259 | 0.384 260 | Llama 3.1 Instruct Turbo (405B) 261 | 0.618 262 | 0.723 263 | 0.522 264 | 0.811 265 | 0.783 266 | 0.249 267 | GPT-4.1 nano (2025-04-14) 268 | 0.616 269 | 0.55 270 | 0.507 271 | 0.843 272 | 0.811 273 | 0.367 274 | Palmyra-X-004 275 | 0.609 276 | 0.657 277 | 0.395 278 | 0.872 279 | 0.802 280 | 0.32 281 | Gemini 1.5 Flash (002) 282 | 0.609 283 | 0.678 284 | 0.437 285 | 0.831 286 | 0.792 287 | 0.305 288 | Qwen2.5 Instruct Turbo (72B) 289 | 0.599 290 | 0.631 291 | 0.426 292 | 0.806 293 | 0.802 294 | 0.33 295 | Mistral Large (2411) 296 | 0.598 297 | 0.599 298 | 0.435 299 | 0.876 300 | 0.801 301 | 0.281 302 | Gemini 2.5 Flash-Lite 303 | 0.591 304 | 0.537 305 | 0.309 306 | 0.81 307 | 0.818 308 | 0.48 309 | Amazon Nova Pro 310 | 0.591 311 | 0.673 312 | 0.446 313 | 0.815 314 | 0.777 315 | 0.242 316 | Palmyra Fin 317 | 0.577 318 | 0.591 319 | 0.422 320 | 0.793 321 | 0.783 322 | 0.295 323 | IBM Granite 4.0 Small 324 | 0.575 325 | 0.569 326 | 0.383 327 | 0.89 328 | 0.739 329 | 0.296 330 | Llama 3.1 Instruct Turbo (70B) 331 | 0.574 332 | 0.653 333 | 0.426 334 | 0.821 335 | 0.758 336 | 0.21 337 | GPT-4o mini (2024-07-18) 338 | 0.565 339 | 0.603 340 | 0.368 341 | 0.782 342 | 0.791 343 | 0.28 344 | Mistral Small 3.1 (2503) 345 | 0.558 346 | 0.61 347 | 0.392 348 | 0.75 349 | 0.788 350 | 0.248 351 | Amazon Nova Lite 352 | 0.551 353 | 0.6 354 | 0.397 355 | 0.776 356 | 0.75 357 | 0.233 358 | Claude 3.5 Haiku (20241022) 359 | 0.549 360 | 0.605 361 | 0.363 362 | 0.792 363 | 0.76 364 | 0.224 365 | Qwen2.5 Instruct Turbo (7B) 366 | 0.529 367 | 0.539 368 | 0.341 369 | 0.741 370 | 0.731 371 | 0.294 372 | Amazon Nova Micro 373 | 0.522 374 | 0.511 375 | 0.383 376 | 0.76 377 | 0.743 378 | 0.214 379 | IBM Granite 4.0 Micro 380 | 0.486 381 | 0.395 382 | 0.307 383 | 0.849 384 | 0.67 385 | 0.209 386 | Mixtral Instruct (8x22B) 387 | 0.478 388 | 0.46 389 | 0.334 390 | 0.724 391 | 0.711 392 | 0.163 393 | Palmyra Med 394 | 0.476 395 | 0.411 396 | 0.368 397 | 0.767 398 | 0.676 399 | 0.156 400 | OLMo 2 32B Instruct March 2025 401 | 0.475 402 | 0.414 403 | 0.287 404 | 0.78 405 | 0.734 406 | 0.161 407 | IBM Granite 3.3 8B Instruct 408 | 0.463 409 | 0.343 410 | 0.325 411 | 0.729 412 | 0.741 413 | 0.176 414 | Llama 3.1 Instruct Turbo (8B) 415 | 0.444 416 | 0.406 417 | 0.247 418 | 0.743 419 | 0.686 420 | 0.137 421 | OLMo 2 13B Instruct November 2024 422 | 0.44 423 | 0.31 424 | 0.316 425 | 0.73 426 | 0.689 427 | 0.156 428 | OLMo 2 7B Instruct November 2024 429 | 0.405 430 | 0.292 431 | 0.296 432 | 0.693 433 | 0.628 434 | 0.116 435 | 
Mixtral Instruct (8x7B) 436 | 0.397 437 | 0.335 438 | 0.296 439 | 0.575 440 | 0.673 441 | 0.105 442 | Mistral Instruct v0.3 (7B) 443 | 0.376 444 | 0.277 445 | 0.303 446 | 0.567 447 | 0.66 448 | 0.072 449 | OLMoE 1B-7B Instruct January 2025 450 | 0.332 451 | 0.169 452 | 0.22 453 | 0.628 454 | 0.551 455 | 0.093 456 | Marin 8B Instruct 457 | 0.325 458 | 0.188 459 | 0.168 460 | 0.632 461 | 0.477 462 | 0.16 -------------------------------------------------------------------------------- /benchbench/data/helm/summarization.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate CNN/DailyMail - SummaC CNN/DailyMail - QAFactEval CNN/DailyMail - BERTScore (F1) CNN/DailyMail - Coverage CNN/DailyMail - Density CNN/DailyMail - Compression CNN/DailyMail - HumanEval-faithfulness CNN/DailyMail - HumanEval-relevance CNN/DailyMail - HumanEval-coherence XSUM - SummaC XSUM - QAFactEval XSUM - BERTScore (F1) XSUM - Coverage XSUM - Density XSUM - Compression XSUM - HumanEval-faithfulness XSUM - HumanEval-relevance XSUM - HumanEval-coherence 2 | TNLG v2 (530B) 0.757 0.573 - 0.316 0.977 26.968 10.317 - - - -0.281 - 0.473 0.774 2.322 15.776 - - - 3 | Luminous Supreme (70B) 0.717 0.552 - 0.28 0.939 33.625 9.298 - - - -0.241 - 0.444 0.807 3.08 16.97 - - - 4 | Cohere xlarge v20221108 (52.4B) 0.704 0.514 - 0.286 0.971 44.772 8.026 - - - -0.258 - 0.451 0.798 3.009 17.188 - - - 5 | J1-Grande v2 beta (17B) 0.678 0.552 - 0.29 0.973 24.032 11.659 - - - -0.282 - 0.454 0.786 2.816 16.857 - - - 6 | Cohere Command beta (52.4B) 0.678 0.415 - 0.318 0.979 32.165 9.156 - - - -0.271 - 0.459 0.793 2.548 16.937 - - - 7 | Jurassic-2 Grande (17B) 0.671 0.503 - 0.299 0.96 22.305 11.399 - - - -0.289 - 0.475 0.766 2.36 17.045 - - - 8 | J1-Grande v1 (17B) 0.669 0.539 4.81 0.275 0.973 41.027 9.888 - - - -0.272 3.447 0.429 0.783 2.64 19.012 - - - 9 | J1-Large v1 (7.5B) 0.65 0.512 4.716 0.248 0.977 71.654 7.632 - - - -0.239 3.675 0.4 0.808 3.757 18.133 - - - 10 | text-babbage-001 0.646 0.378 4.676 0.282 0.972 45.948 5.291 - - - -0.057 4.33 0.281 0.885 8.487 11.856 - - - 11 | Jurassic-2 Jumbo (178B) 0.645 0.489 - 0.313 0.957 15.317 12.304 - - - -0.32 - 0.489 0.755 2.145 16.589 - - - 12 | text-davinci-002 0.641 0.353 4.635 0.321 0.946 15.995 8.818 0.999 4.435 4.371 -0.273 3.007 0.43 0.801 2.872 14.07 0.849 4.41 4.685 13 | text-curie-001 0.617 0.291 4.616 0.306 0.961 26.1 6.829 0.967 4.587 4.243 -0.185 3.459 0.354 0.839 4.008 12.98 0.991 4.068 4.321 14 | TNLG v2 (6.7B) 0.612 0.493 - 0.282 0.976 48.951 9.598 - - - -0.203 - 0.385 0.793 3.286 18.428 - - - 15 | OPT (175B) 0.593 0.202 4.67 0.276 0.933 31.307 9.8 1 4.378 3.233 -0.253 3.523 0.46 0.793 2.732 16.792 0.798 4.3 4.891 16 | J1-Jumbo v1 (178B) 0.587 0.515 4.697 0.278 0.976 53.93 9.579 - - - -0.287 3.182 0.435 0.784 2.63 16.862 - - - 17 | Cohere Command beta (6.1B) 0.579 0.331 - 0.296 0.975 31.707 9.688 - - - -0.239 - 0.418 0.824 2.793 18.017 - - - 18 | OPT (66B) 0.579 0.197 4.735 0.256 0.92 41.595 9.759 - - - -0.189 3.324 0.417 0.817 3.899 18.414 - - - 19 | Cohere large v20220720 (13.1B) 0.576 0.5 4.763 0.246 0.946 37.733 11.27 - - - -0.189 2.889 0.398 0.823 3.599 20.712 - - - 20 | Jurassic-2 Large (7.5B) 0.572 0.496 - 0.271 0.963 25.251 11.503 - - - -0.278 - 0.45 0.782 2.659 18.03 - - - 21 | Luminous Extended (30B) 0.566 0.481 - 0.255 0.925 41.619 9.039 - - - -0.225 - 0.423 0.818 3.507 17.376 - - - 22 | GPT-J (6B) 0.549 0.208 4.704 0.247 0.948 48.284 9.864 - - - -0.198 3.813 0.381 0.829 4.043 17.942 - - - 23 | Cohere xlarge v20220609 
(52.4B) 0.546 0.469 4.683 0.264 0.945 49.713 9.072 0.993 4.539 3.69 -0.253 2.981 0.434 0.8 2.945 18.422 0.661 4.239 4.825 24 | Anthropic-LM v4-s3 (52B) 0.531 0.492 4.692 0.326 0.96 10.832 11.89 0.667 4 2.667 -0.271 3.066 0.437 0.808 2.691 15.182 0.778 4.398 4.898 25 | text-davinci-003 0.526 0.359 - 0.342 0.956 7.545 9.389 - - - -0.301 - 0.411 0.822 2.63 10.932 - - - 26 | Cohere medium v20221108 (6.1B) 0.507 0.359 - 0.218 0.899 24.344 11.42 - - - -0.171 - 0.384 0.842 3.815 19.703 - - - 27 | text-ada-001 0.486 0.223 3.369 0.247 0.929 31.424 5.461 - - - -0.102 4.929 0.245 0.847 7.626 13.08 - - - 28 | GLM (130B) 0.471 0.566 - 0.288 0.972 30.259 8.687 0.963 4.167 3.463 -0.206 - 0.427 0.817 4.041 16.25 0.763 3.843 4.25 29 | GPT-NeoX (20B) 0.446 0.165 4.69 0.226 0.91 37.149 9.676 - - - -0.208 3.303 0.391 0.825 3.371 18.238 - - - 30 | Cohere medium v20220720 (6.1B) 0.431 0.229 4.664 0.115 0.799 22.176 13.154 - - - -0.159 3.223 0.367 0.847 4.754 19.748 - - - 31 | Luminous Base (13B) 0.421 0.32 - 0.188 0.834 35.663 9.346 - - - -0.213 - 0.394 0.834 4.393 17.535 - - - 32 | davinci (175B) 0.36 0.321 4.062 0.182 0.873 17.914 9.843 0.953 4.501 3.863 -0.267 2.338 0.318 0.751 3.351 14.08 0.829 4.075 3.398 33 | curie (6.7B) 0.325 0.354 4.204 0.089 0.89 23.472 9.495 0.287 1.933 1.767 -0.143 3.922 0.313 0.815 5.57 17.018 0.924 3.573 4.166 34 | Cohere small v20220720 (410M) 0.292 0.054 2.638 0.026 0.744 25.238 13.243 - - - 0.028 3.094 0.195 0.863 10.557 17.551 - - - 35 | BLOOM (176B) 0.291 -0.02 4.665 0.08 0.71 32.013 5.252 - - - -0.35 4.778 0.059 0.515 1.764 8.934 - - - 36 | ada (350M) 0.231 0.169 3.742 0.026 0.773 36.596 12.07 - - - -0.115 0.009 -0.232 0.407 2.653 8.023 - - - 37 | babbage (1.3B) 0.196 0.194 3.207 -0.129 0.606 43.534 6.733 - - - -0.188 0.195 0.02 0.604 4.386 11.716 - - - 38 | UL2 (20B) 0.118 -0.27 - -0.121 0.72 5.044 7.186 - - - -0.275 - 0.072 0.643 3.208 7.853 - - - 39 | T5 (11B) 0.112 -0.122 - -0.17 0.555 2.698 19.248 - - - -0.258 - -0.315 0.355 0.831 16.544 - - - 40 | YaLM (100B) 0.045 -0.322 - -0.145 0.541 1.09 6.936 - - - -0.347 1.176 0.031 0.567 1.041 9.951 - - - 41 | T0pp (11B) - -0.044 - 0.155 0.841 8.588 8.274 - - - -0.3 - 0.097 0.579 1.684 11.178 - - - 42 | Pythia (6.9B) - - - - - - - - - - - - - - - - - - - 43 | Pythia (12B) - - - - - - - - - - - - - - - - - - - 44 | LLaMA (7B) - - - - - - - - - - - - - - - - - - - 45 | LLaMA (13B) - - - - - - - - - - - - - - - - - - - 46 | LLaMA (30B) - - - - - - - - - - - - - - - - - - - 47 | LLaMA (65B) - - - - - - - - - - - - - - - - - - - 48 | Llama 2 (7B) - - - - - - - - - - - - - - - - - - - 49 | Llama 2 (13B) - - - - - - - - - - - - - - - - - - - 50 | Llama 2 (70B) - - - - - - - - - - - - - - - - - - - 51 | Alpaca (7B) - - - - - - - - - - - - - - - - - - - 52 | Vicuna v1.3 (7B) - - - - - - - - - - - - - - - - - - - 53 | Vicuna v1.3 (13B) - - - - - - - - - - - - - - - - - - - 54 | Mistral v0.1 (7B) - - - - - - - - - - - - - - - - - - - 55 | gpt-3.5-turbo-0301 - - - - - - - - - - - - - - - - - - - 56 | gpt-3.5-turbo-0613 - - - - - - - - - - - - - - - - - - - 57 | RedPajama-INCITE-Base-v1 (3B) - - - - - - - - - - - - - - - - - - - 58 | RedPajama-INCITE-Instruct-v1 (3B) - - - - - - - - - - - - - - - - - - - 59 | RedPajama-INCITE-Base (7B) - - - - - - - - - - - - - - - - - - - 60 | RedPajama-INCITE-Instruct (7B) - - - - - - - - - - - - - - - - - - - 61 | MPT (30B) - - - - - - - - - - - - - - - - - - - 62 | MPT-Instruct (30B) - - - - - - - - - - - - - - - - - - - 63 | Falcon (7B) - - - - - - - - - - - - - - - - - - - 64 | Falcon-Instruct (7B) - - 
- - - - - - - - - - - - - - - - - 65 | Falcon (40B) - - - - - - - - - - - - - - - - - - - 66 | Falcon-Instruct (40B) - - - - - - - - - - - - - - - - - - - 67 | InstructPalmyra (30B) - - - - 0.972 28.97 7.901 - - - - - - 0.844 3.441 15.707 - - - 68 | Palmyra X (43B) - - - - 0.291 2.35 3.117 - - - - - - 0.775 2.466 14.252 - - - 69 | 70 | -------------------------------------------------------------------------------- /benchbench/data/helm/fairness.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate MMLU - EM (Fairness) BoolQ - EM (Fairness) NarrativeQA - F1 (Fairness) NaturalQuestions (closed-book) - F1 (Fairness) NaturalQuestions (open-book) - F1 (Fairness) QuAC - F1 (Fairness) HellaSwag - EM (Fairness) OpenbookQA - EM (Fairness) TruthfulQA - EM (Fairness) MS MARCO (regular) - RR@10 (Fairness) MS MARCO (TREC) - NDCG@10 (Fairness) IMDB - EM (Fairness) CivilComments - EM (Fairness) RAFT - EM (Fairness) 2 | Llama 2 (70B) 0.959 0.557 0.859 0.709 0.4 0.637 0.414 - - 0.434 - - 0.954 0.551 0.7 3 | LLaMA (65B) 0.924 0.551 0.847 0.661 0.375 0.633 0.333 - - 0.42 - - 0.953 0.574 0.668 4 | text-davinci-003 0.903 0.537 0.858 0.664 0.356 0.721 0.45 0.729 0.578 0.491 0.335 0.633 0.833 0.559 0.705 5 | Cohere Command beta (52.4B) 0.866 0.407 0.822 0.657 0.296 0.706 0.316 0.699 0.508 0.222 0.45 0.748 0.957 0.544 0.627 6 | text-davinci-002 0.864 0.531 0.837 0.646 0.32 0.659 0.353 0.703 0.54 0.515 0.373 0.639 0.934 0.463 0.671 7 | Mistral v0.1 (7B) 0.861 0.542 0.842 0.644 0.3 0.625 0.353 - - 0.332 - - 0.952 0.52 0.664 8 | Jurassic-2 Jumbo (178B) 0.836 0.45 0.792 0.658 0.327 0.62 0.34 0.655 0.488 0.354 0.342 0.62 0.933 0.507 0.711 9 | LLaMA (30B) 0.822 0.496 0.813 0.657 0.356 0.621 0.325 - - 0.266 - - 0.913 0.508 0.718 10 | Llama 2 (13B) 0.808 0.466 0.732 0.657 0.309 0.58 0.351 - - 0.274 - - 0.957 0.489 0.673 11 | Palmyra X (43B) 0.797 0.588 0.875 0.651 0.362 - 0.399 - - 0.542 - - 0.918 0.006 0.672 12 | Anthropic-LM v4-s3 (52B) 0.794 0.447 0.782 0.646 0.239 0.642 0.356 0.695 0.482 0.3 - - 0.925 0.512 0.67 13 | TNLG v2 (530B) 0.752 0.418 0.767 0.632 0.318 0.598 0.313 0.678 0.504 0.197 0.341 0.612 0.936 0.48 0.644 14 | MPT (30B) 0.746 0.41 0.631 0.653 0.287 0.624 0.318 - - 0.19 - - 0.955 0.553 0.68 15 | gpt-3.5-turbo-0613 0.718 0.313 0.817 0.547 0.287 0.627 0.398 - - 0.255 - - 0.912 0.525 0.641 16 | Vicuna v1.3 (13B) 0.715 0.424 0.748 0.607 0.266 0.63 0.324 - - 0.315 - - 0.707 0.569 0.62 17 | Falcon-Instruct (40B) 0.709 0.466 0.799 0.543 0.331 0.607 0.308 - - 0.312 - - 0.957 0.462 0.561 18 | Jurassic-2 Grande (17B) 0.704 0.433 0.78 0.645 0.283 0.584 0.34 0.632 0.466 0.29 0.243 0.471 0.931 0.445 0.689 19 | MPT-Instruct (30B) 0.687 0.4 0.807 0.633 0.233 0.639 0.252 - - 0.18 - - 0.944 0.527 0.636 20 | Falcon (40B) 0.686 0.48 0.783 0.559 0.338 0.625 0.256 - - 0.292 - - 0.954 0.292 0.611 21 | J1-Grande v2 beta (17B) 0.677 0.409 0.764 0.647 0.27 0.571 0.308 0.623 0.478 0.242 0.253 0.435 0.95 0.404 0.637 22 | Cohere Command beta (6.1B) 0.662 0.366 0.748 0.595 0.167 0.654 0.273 0.608 0.468 0.163 0.411 0.69 0.95 0.496 0.609 23 | gpt-3.5-turbo-0301 0.662 0.53 0.666 0.585 0.331 0.559 0.417 - - 0.514 - - 0.844 0.422 0.689 24 | OPT (175B) 0.622 0.287 0.731 0.573 0.246 0.561 0.266 0.66 0.5 0.203 0.26 0.419 0.944 0.491 0.58 25 | Vicuna v1.3 (7B) 0.622 0.385 0.67 0.553 0.224 0.575 0.304 - - 0.235 - - 0.906 0.564 0.643 26 | Llama 2 (7B) 0.61 0.392 0.706 0.596 0.264 0.55 0.321 - - 0.223 - - 0.871 0.503 0.609 27 | Cohere xlarge v20221108 (52.4B) 0.608 0.317 0.708 0.553 0.299 
0.566 0.275 0.687 0.5 0.12 0.267 0.522 0.949 0.415 0.604 28 | LLaMA (13B) 0.602 0.385 0.666 0.628 0.288 0.561 0.267 - - 0.234 - - 0.903 0.533 0.605 29 | davinci (175B) 0.558 0.38 0.682 0.597 0.276 0.567 0.279 0.641 0.502 0.155 0.185 0.357 0.921 0.478 0.605 30 | LLaMA (7B) 0.553 0.284 0.71 0.552 0.241 0.537 0.257 - - 0.219 - - 0.936 0.505 0.545 31 | BLOOM (176B) 0.551 0.274 0.656 0.577 0.187 0.575 0.273 0.585 0.482 0.186 0.211 0.371 0.938 0.546 0.563 32 | Cohere xlarge v20220609 (52.4B) 0.55 0.315 0.667 0.548 0.255 0.535 0.281 0.66 0.47 0.156 0.233 0.431 0.949 0.479 0.598 33 | InstructPalmyra (30B) 0.538 0.371 0.7 0.405 0.276 0.63 0.337 - - 0.152 - - 0.931 0.449 0.618 34 | Luminous Supreme (70B) 0.522 0.264 0.694 0.603 0.241 0.597 0.288 - - 0.132 - - 0.949 0.432 0.601 35 | GLM (130B) 0.513 0.315 0.69 0.615 0.12 0.597 0.205 - - 0.192 - - 0.933 0.5 0.575 36 | J1-Jumbo v1 (178B) 0.488 0.236 0.709 0.581 0.235 0.54 0.268 0.614 0.466 0.156 0.18 0.348 0.932 0.478 0.623 37 | Jurassic-2 Large (7.5B) 0.483 0.297 0.685 - 0.217 0.539 - 0.567 0.45 0.196 0.215 0.44 0.945 0.403 0.567 38 | OPT (66B) 0.476 0.229 0.71 0.526 0.218 0.536 0.268 0.597 0.454 0.173 0.214 0.471 0.908 0.5 0.536 39 | RedPajama-INCITE-Instruct (7B) 0.466 0.305 0.616 0.506 0.164 0.592 0.181 - - 0.183 - - 0.907 0.54 0.67 40 | J1-Grande v1 (17B) 0.454 0.232 0.678 0.547 0.187 0.521 0.274 0.58 0.472 0.163 0.138 0.328 0.946 0.482 0.636 41 | Luminous Extended (30B) 0.451 0.237 0.711 0.532 0.214 0.551 0.277 - - 0.16 - - 0.937 0.462 0.489 42 | Falcon (7B) 0.447 0.261 0.702 0.52 0.233 0.537 0.262 - - 0.213 - - 0.794 0.494 0.555 43 | text-curie-001 0.377 0.231 0.576 0.463 0.132 0.5 0.255 0.534 0.452 0.239 0.244 0.482 0.91 0.471 0.458 44 | Alpaca (7B) 0.372 0.346 0.729 0.299 0.21 0.53 0.204 - - 0.202 - - 0.699 0.483 0.459 45 | RedPajama-INCITE-Instruct-v1 (3B) 0.369 0.222 0.648 0.506 0.143 0.571 0.183 - - 0.179 - - 0.876 0.499 0.632 46 | Cohere large v20220720 (13.1B) 0.362 0.281 0.676 0.512 0.178 0.507 0.256 0.575 0.446 0.157 0.164 0.312 0.92 0.443 0.564 47 | Cohere medium v20221108 (6.1B) 0.34 0.22 0.642 0.497 0.149 0.45 0.229 0.567 0.44 0.182 0.145 0.353 0.917 0.493 0.571 48 | GPT-NeoX (20B) 0.331 0.215 0.609 0.461 0.154 0.525 0.232 0.552 0.438 0.179 0.148 0.381 0.928 0.491 0.475 49 | RedPajama-INCITE-Base (7B) 0.323 0.276 0.65 0.524 0.193 0.514 0.238 - - 0.17 - - 0.694 0.431 0.595 50 | Falcon-Instruct (7B) 0.297 0.261 0.637 0.354 0.148 0.383 0.219 - - 0.183 - - 0.811 0.502 0.5 51 | TNLG v2 (6.7B) 0.291 0.212 0.665 0.517 0.162 0.501 0.267 0.53 0.412 0.144 0.14 0.317 0.912 0.473 0.502 52 | GPT-J (6B) 0.29 0.22 0.639 0.433 0.122 0.493 0.249 0.486 0.416 0.18 0.129 0.332 0.927 0.488 0.594 53 | J1-Large v1 (7.5B) 0.275 0.204 0.622 0.513 0.146 0.47 0.241 0.528 0.444 0.174 0.117 0.28 0.946 0.447 0.511 54 | RedPajama-INCITE-Base-v1 (3B) 0.27 0.232 0.624 0.42 0.145 0.452 0.238 - - 0.248 - - 0.89 0.393 0.475 55 | Cohere medium v20220720 (6.1B) 0.269 0.237 0.597 0.438 0.126 0.432 0.198 0.525 0.42 0.174 0.132 0.357 0.918 0.489 0.5 56 | text-babbage-001 0.244 0.205 0.41 0.299 0.053 0.24 0.196 0.405 0.386 0.207 0.174 0.424 0.887 0.499 0.475 57 | Luminous Base (13B) 0.238 0.185 0.653 0.498 0.16 0.511 0.266 - - 0.125 - - 0.912 0.397 0.445 58 | curie (6.7B) 0.231 0.218 0.594 0.482 0.147 0.479 0.243 0.522 0.43 0.186 0.14 0.284 0.86 0.412 0.473 59 | Pythia (12B) 0.226 0.212 0.547 0.449 0.131 0.523 0.227 - - 0.154 - - 0.916 0.448 0.489 60 | T0pp (11B) 0.203 0.382 0 0.086 0.028 0.136 0.067 - - 0.35 - - 0.168 0.165 0.106 61 | UL2 (20B) 0.186 0.273 0.698 0.053 0.162 
0.303 0.107 - - 0.162 - - 0.271 0.423 0.375 62 | Pythia (6.9B) 0.171 0.207 0.552 0.389 0.103 0.464 0.198 - - 0.18 - - 0.911 0.333 0.45 63 | YaLM (100B) 0.167 0.243 0.583 0.146 0.052 0.177 0.1 - - 0.202 - - 0.8 0.456 0.342 64 | Cohere small v20220720 (410M) 0.154 0.222 0.374 0.179 0.055 0.219 0.144 0.308 0.28 0.203 - 0.28 0.518 0.495 0.452 65 | T5 (11B) 0.15 0.235 0.723 0.05 0.159 0.424 0.074 - - 0.101 - - 0.303 0.329 0.351 66 | babbage (1.3B) 0.134 0.206 0.436 0.367 0.084 0.381 0.202 0.401 0.326 0.178 0.105 0.301 0.534 0.474 0.438 67 | text-ada-001 0.108 0.202 0.378 0.119 0.012 0.083 0.091 0.27 0.266 0.191 0.107 0.276 0.769 0.497 0.376 68 | ada (350M) 0.105 0.21 0.507 0.205 0.057 0.273 0.166 0.294 0.318 0.185 0.086 0.268 0.806 0.436 0.395 69 | 70 | -------------------------------------------------------------------------------- /benchbench/data/helm/robustness.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate MMLU - EM (Robustness) BoolQ - EM (Robustness) NarrativeQA - F1 (Robustness) NaturalQuestions (closed-book) - F1 (Robustness) NaturalQuestions (open-book) - F1 (Robustness) QuAC - F1 (Robustness) HellaSwag - EM (Robustness) OpenbookQA - EM (Robustness) TruthfulQA - EM (Robustness) MS MARCO (regular) - RR@10 (Robustness) MS MARCO (TREC) - NDCG@10 (Robustness) IMDB - EM (Robustness) CivilComments - EM (Robustness) RAFT - EM (Robustness) 2 | Llama 2 (70B) 0.965 0.545 0.863 0.722 0.42 0.639 0.362 - - 0.468 - - 0.949 0.59 0.673 3 | text-davinci-002 0.916 0.525 0.841 0.638 0.299 0.665 0.319 0.776 0.52 0.547 0.344 0.628 0.925 0.567 0.666 4 | text-davinci-003 0.91 0.517 0.858 0.694 0.369 0.73 0.42 0.798 0.572 0.516 0.304 0.616 0.779 0.594 0.714 5 | Mistral v0.1 (7B) 0.896 0.533 0.837 0.649 0.305 0.631 0.31 - - 0.339 - - 0.954 0.521 0.652 6 | LLaMA (65B) 0.885 0.504 0.84 0.567 0.388 0.624 0.275 - - 0.448 - - 0.935 0.566 0.655 7 | Cohere Command beta (52.4B) 0.85 0.387 0.811 0.57 0.289 0.679 0.238 0.774 0.492 0.229 0.434 0.734 0.933 0.535 0.599 8 | Llama 2 (13B) 0.823 0.444 0.753 0.682 0.324 0.563 0.294 - - 0.287 - - 0.954 0.47 0.652 9 | Palmyra X (43B) 0.821 0.566 0.878 0.672 0.363 - 0.383 - - 0.568 - - 0.904 0.006 0.677 10 | Anthropic-LM v4-s3 (52B) 0.818 0.434 0.756 0.663 0.245 0.632 0.313 0.766 0.472 0.326 - - 0.928 0.514 0.6 11 | gpt-3.5-turbo-0301 0.816 0.525 0.66 0.602 0.327 0.556 0.411 - - 0.566 - - 0.857 0.605 0.705 12 | LLaMA (30B) 0.815 0.461 0.791 0.611 0.36 0.612 0.273 - - 0.281 - - 0.893 0.503 0.67 13 | Jurassic-2 Jumbo (178B) 0.791 0.417 0.729 0.66 0.315 0.599 0.314 0.754 0.47 0.39 0.337 0.607 0.896 0.449 0.69 14 | Jurassic-2 Grande (17B) 0.764 0.411 0.729 0.583 0.285 0.564 0.276 0.755 0.474 0.293 0.227 0.423 0.928 0.488 0.618 15 | Falcon-Instruct (40B) 0.763 0.446 0.781 0.508 0.335 0.591 0.212 - - 0.338 - - 0.938 0.523 0.523 16 | gpt-3.5-turbo-0613 0.762 0.262 0.845 0.566 0.284 0.606 0.371 - - 0.187 - - 0.916 0.564 0.677 17 | Vicuna v1.3 (13B) 0.732 0.413 0.757 0.525 0.273 0.621 0.247 - - 0.341 - - 0.674 0.593 0.591 18 | J1-Grande v2 beta (17B) 0.711 0.392 0.692 0.565 0.235 0.56 0.251 0.732 0.474 0.252 0.222 0.407 0.947 0.495 0.555 19 | Falcon (40B) 0.705 0.457 0.763 0.557 0.329 0.593 0.162 - - 0.303 - - 0.935 0.412 0.586 20 | MPT (30B) 0.697 0.381 0.656 0.584 0.272 0.609 0.231 - - 0.177 - - 0.942 0.484 0.58 21 | Vicuna v1.3 (7B) 0.662 0.371 0.672 0.5 0.214 0.539 0.25 - - 0.258 - - 0.882 0.543 0.6 22 | MPT-Instruct (30B) 0.656 0.383 0.77 0.623 0.202 0.607 0.204 - - 0.177 - - 0.942 0.408 0.548 23 | TNLG v2 (530B) 0.65 
0.403 0.733 0.319 0.307 0.525 0.194 0.757 0.476 0.202 0.287 0.565 0.921 0.409 0.545 24 | GLM (130B) 0.647 0.32 0.728 0.629 0.117 0.6 0.193 - - 0.196 - - 0.938 0.5 0.577 25 | Llama 2 (7B) 0.644 0.373 0.676 0.573 0.261 0.501 0.271 - - 0.234 - - 0.808 0.516 0.573 26 | LLaMA (13B) 0.637 0.37 0.67 0.544 0.272 0.556 0.194 - - 0.274 - - 0.875 0.529 0.559 27 | Cohere Command beta (6.1B) 0.616 0.334 0.725 0.529 0.163 0.605 0.17 0.696 0.448 0.171 0.387 0.685 0.921 0.468 0.552 28 | Cohere xlarge v20221108 (52.4B) 0.596 0.299 0.718 0.39 0.283 0.533 0.229 0.764 0.482 0.116 0.242 0.482 0.923 0.408 0.489 29 | LLaMA (7B) 0.568 0.268 0.688 0.485 0.222 0.519 0.223 - - 0.229 - - 0.897 0.492 0.486 30 | Luminous Supreme (70B) 0.546 0.255 0.665 0.59 0.252 0.586 0.233 - - 0.106 - - 0.932 0.263 0.564 31 | BLOOM (176B) 0.541 0.25 0.642 0.53 0.185 0.558 0.234 0.699 0.438 0.183 0.19 0.333 0.92 0.467 0.527 32 | Jurassic-2 Large (7.5B) 0.527 0.263 0.607 - 0.187 0.503 - 0.687 0.448 0.21 0.177 0.397 0.941 0.469 0.498 33 | InstructPalmyra (30B) 0.522 0.348 0.656 0.317 0.267 0.567 0.248 - - 0.151 - - 0.906 0.443 0.518 34 | OPT (175B) 0.519 0.27 0.623 0.409 0.208 0.408 0.2 0.744 0.488 0.205 0.235 0.408 0.919 0.184 0.48 35 | davinci (175B) 0.509 0.34 0.639 0.498 0.256 0.521 0.208 0.738 0.474 0.145 0.154 0.332 0.873 0.461 0.505 36 | Cohere xlarge v20220609 (52.4B) 0.506 0.29 0.614 0.383 0.238 0.471 0.215 0.759 0.448 0.151 0.207 0.397 0.923 0.32 0.563 37 | RedPajama-INCITE-Instruct (7B) 0.495 0.291 0.599 0.482 0.137 0.547 0.164 - - 0.197 - - 0.82 0.527 0.605 38 | J1-Jumbo v1 (178B) 0.452 0.221 0.65 0.523 0.179 0.503 0.222 0.726 0.43 0.154 0.144 0.307 0.923 0.271 0.555 39 | OPT (66B) 0.438 0.216 0.683 0.397 0.206 0.458 0.199 0.699 0.45 0.174 0.179 0.437 0.886 0.305 0.405 40 | Luminous Extended (30B) 0.43 0.23 0.659 0.513 0.212 0.524 0.193 - - 0.151 - - 0.92 0.368 0.436 41 | Falcon (7B) 0.425 0.236 0.65 0.436 0.185 0.489 0.164 - - 0.205 - - 0.692 0.485 0.516 42 | J1-Grande v1 (17B) 0.423 0.225 0.643 0.477 0.17 0.478 0.219 0.695 0.424 0.142 0.121 0.297 0.941 0.417 0.513 43 | RedPajama-INCITE-Instruct-v1 (3B) 0.387 0.218 0.629 0.403 0.132 0.536 0.137 - - 0.173 - - 0.852 0.506 0.548 44 | Alpaca (7B) 0.379 0.324 0.643 0.246 0.203 0.491 0.16 - - 0.199 - - 0.561 0.482 0.42 45 | Cohere large v20220720 (13.1B) 0.345 0.253 0.545 0.357 0.172 0.347 0.204 0.687 0.43 0.154 0.13 0.257 0.902 0.333 0.49 46 | text-curie-001 0.337 0.22 0.549 0.34 0.121 0.415 0.169 0.625 0.424 0.235 0.198 0.444 0.881 0.129 0.399 47 | GPT-NeoX (20B) 0.336 0.189 0.551 0.421 0.133 0.452 0.191 0.661 0.414 0.175 0.096 0.351 0.912 0.48 0.399 48 | RedPajama-INCITE-Base (7B) 0.331 0.25 0.569 0.424 0.167 0.472 0.186 - - 0.173 - - 0.56 0.401 0.489 49 | Luminous Base (13B) 0.319 0.183 0.655 0.476 0.163 0.491 0.185 - - 0.112 - - 0.887 0.416 0.402 50 | Falcon-Instruct (7B) 0.303 0.25 0.593 0.258 0.132 0.327 0.179 - - 0.17 - - 0.759 0.487 0.445 51 | J1-Large v1 (7.5B) 0.298 0.2 0.567 0.4 0.098 0.41 0.197 0.646 0.412 0.155 0.105 0.248 0.932 0.444 0.443 52 | RedPajama-INCITE-Base-v1 (3B) 0.293 0.217 0.585 0.346 0.134 0.396 0.177 - - 0.226 - - 0.843 0.336 0.427 53 | GPT-J (6B) 0.291 0.217 0.621 0.135 0.099 0.228 0.147 0.619 0.398 0.181 0.116 0.319 0.903 0.418 0.53 54 | Pythia (12B) 0.272 0.22 0.51 0.42 0.108 0.47 0.171 - - 0.138 - - 0.854 0.418 0.45 55 | Cohere medium v20221108 (6.1B) 0.27 0.207 0.54 0.296 0.105 0.222 0.152 0.687 0.414 0.17 0.13 0.314 0.888 0.353 0.502 56 | UL2 (20B) 0.257 0.272 0.646 0.059 0.141 0.291 0.111 - - 0.178 - - 0.276 0.45 0.349 57 | TNLG v2 (6.7B) 0.24 
0.169 0.638 0.352 0.149 0.299 0.159 0.656 0.408 0.136 0.105 0.278 0.896 0.336 0.445 58 | curie (6.7B) 0.231 0.19 0.545 0.367 0.126 0.338 0.171 0.632 0.396 0.186 0.11 0.253 0.803 0.347 0.413 59 | T0pp (11B) 0.228 0.378 0 0.099 0.031 0.122 0.071 - - 0.365 - - 0.17 0.087 0.085 60 | text-babbage-001 0.226 0.186 0.384 0.126 0.04 0.151 0.087 0.468 0.39 0.195 0.122 0.356 0.844 0.499 0.383 61 | YaLM (100B) 0.205 0.243 0.566 0.088 0.047 0.125 0.08 - - 0.202 - - 0.719 0.463 0.211 62 | Cohere medium v20220720 (6.1B) 0.188 0.184 0.562 0.3 0.102 0.266 0.144 0.651 0.382 0.149 0.109 0.315 0.889 0.136 0.385 63 | Pythia (6.9B) 0.182 0.201 0.527 0.313 0.094 0.391 0.171 - - 0.139 - - 0.871 0.363 0.377 64 | T5 (11B) 0.164 0.258 0.65 0.045 0.153 0.071 0.064 - - 0.122 - - 0.304 0.392 0.331 65 | Cohere small v20220720 (410M) 0.147 0.226 0.361 0.078 0.025 0.074 0.098 0.405 0.238 0.204 - 0.252 0.473 0.434 0.403 66 | babbage (1.3B) 0.117 0.166 0.477 0.255 0.068 0.212 0.149 0.489 0.314 0.162 0.073 0.246 0.5 0.4 0.409 67 | text-ada-001 0.105 0.178 0.332 0.058 0.008 0.034 0.067 0.32 0.248 0.175 0.069 0.252 0.716 0.491 0.335 68 | ada (350M) 0.102 0.204 0.461 0.104 0.031 0.043 0.092 0.37 0.27 0.167 0.072 0.247 0.701 0.421 0.345 69 | 70 | -------------------------------------------------------------------------------- /benchbench/data/mteb/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | Rank Model - Model Size (GB) Embedding Dimensions Max Tokens Average (56 datasets) Classification Average (12 datasets) Clustering Average (11 datasets) Pair Classification Average (3 datasets) Reranking Average (4 datasets) Retrieval Average (15 datasets) STS Average (10 datasets) Summarization Average (1 dataset) 2 | 1 SFR-Embedding-Mistral - 14.22 4096 32768 67.56 78.33 51.67 88.54 60.64 59 85.05 31.16 3 | 2 voyage-lite-02-instruct - - 1024 4000 67.13 79.25 52.42 86.87 58.24 56.6 85.79 31.01 4 | 3 e5-mistral-7b-instruct - 14.22 4096 32768 66.63 78.47 50.26 88.34 60.21 56.89 84.63 31.4 5 | 4 UAE-Large-V1 - 1.34 1024 512 64.64 75.58 46.73 87.25 59.88 54.66 84.54 32.03 6 | 5 text-embedding-3-large - - 3072 8191 64.59 75.45 49.01 85.72 59.16 55.44 81.73 29.92 7 | 6 voyage-lite-01-instruct - - 1024 4000 64.49 74.79 47.4 86.57 59.74 55.58 82.93 30.97 8 | 7 Cohere-embed-english-v3.0 - - 1024 512 64.47 76.49 47.43 85.84 58.01 55 82.62 30.18 9 | 8 bge-large-en-v1.5 - 1.34 1024 512 64.23 75.97 46.08 87.12 60.03 54.29 83.11 31.61 10 | 9 Cohere-embed-multilingual-v3.0 - - 1024 512 64.01 76.01 46.6 86.15 57.86 53.84 83.15 30.99 11 | 10 GIST-Embedding-v0 - 0.44 768 512 63.71 76.03 46.21 86.32 59.37 52.31 83.51 30.87 12 | 11 bge-base-en-v1.5 - 0.44 768 512 63.55 75.53 45.77 86.55 58.86 53.25 82.4 31.07 13 | 12 ember-v1 - 1.34 1024 512 63.54 75.99 45.58 87.37 60.04 51.92 83.34 30.82 14 | 13 sf_model_e5 - 1.34 1024 512 63.34 73.96 46.61 86.85 59.86 51.8 83.85 31.61 15 | 14 gte-large - 0.67 1024 512 63.13 73.33 46.84 85 59.13 52.22 83.35 31.66 16 | 15 stella-base-en-v2 - 0.22 768 512 62.61 75.28 44.9 86.45 58.78 50.1 83.02 32.52 17 | 16 gte-base - 0.22 768 512 62.39 73.01 46.2 84.57 58.61 51.14 82.3 31.17 18 | 17 text-embedding-3-small - - 1536 8191 62.26 73.21 46.65 85.04 56.72 51.08 81.58 31.12 19 | 18 e5-large-v2 - 1.34 1024 512 62.25 75.24 44.49 86.03 56.61 50.56 82.05 30.19 20 | 19 bge-small-en-v1.5 - 0.13 384 512 62.17 74.14 43.82 84.92 58.36 51.68 81.59 30.12 21 | 20 Cohere-embed-english-light-v3.0 - - 384 512 62.01 74.31 44.64 85.05 56.09 51.34 80.92 31.29 22 | 21 text-embedding-3-large-256 - 
- 256 8191 62 71.97 46.23 84.22 57.99 51.66 81.04 29.92 23 | 22 instructor-xl - 4.96 768 512 61.79 73.12 44.74 86.62 57.29 49.26 83.06 32.32 24 | 23 instructor-large - 1.34 768 512 61.59 73.86 45.29 85.89 57.54 47.57 83.15 31.84 25 | 24 e5-base-v2 - 0.44 768 512 61.5 73.84 43.8 85.73 55.91 50.29 81.05 30.28 26 | 25 multilingual-e5-large - 2.24 1024 514 61.5 74.81 41.06 84.75 55.86 51.43 81.56 29.69 27 | 26 e5-large - 1.34 1024 512 61.42 73.14 43.33 85.94 56.53 49.99 82.06 30.97 28 | 27 gte-small - 0.07 384 512 61.36 72.31 44.89 83.54 57.7 49.46 82.07 30.42 29 | 28 text-embedding-ada-002 - - 1536 8191 60.99 70.93 45.9 84.89 56.32 49.25 80.97 30.8 30 | 29 udever-bloom-7b1 - 28.28 4096 2048 60.63 72.13 40.81 85.4 55.91 49.34 83.01 30.97 31 | 30 e5-base - 0.44 768 512 60.44 72.63 42.11 85.09 55.7 48.75 80.96 31.01 32 | 31 jina-embeddings-v2-base-en - - - - 60.38 73.45 41.73 85.38 56.98 47.87 80.7 31.6 33 | 32 Cohere-embed-multilingual-light-v3.0 - - 384 512 60.08 70.57 41.98 83.95 55.06 50.15 80.09 30.41 34 | 33 e5-small-v2 - 0.13 384 512 59.93 72.94 39.92 84.67 54.32 49.04 80.39 31.16 35 | 34 udever-bloom-3b - 12.01 2560 2048 59.86 71.91 40.74 84.06 54.9 47.67 82.37 30.62 36 | 35 instructor-base - 0.44 768 512 59.54 72.36 41.9 83.51 56.2 45.12 82.29 29.85 37 | 36 sentence-t5-xxl - 9.73 768 512 59.51 73.42 43.72 85.06 56.42 42.24 82.63 30.08 38 | 37 multilingual-e5-base - 1.11 768 514 59.45 73.02 37.89 83.57 54.84 48.88 80.26 30.11 39 | 38 XLM-3B5-embedding - - - - 59.29 72.25 43.48 79.23 57.12 44.99 80.47 29.02 40 | 39 gtr-t5-xxl - 9.73 768 512 58.97 67.41 42.42 86.12 56.66 48.48 78.38 30.64 41 | 40 SGPT-5.8B-weightedmean-msmarco-specb-bitfit - 23.5 4096 2048 58.93 68.13 40.34 82 56.56 50.25 78.1 31.46 42 | 41 e5-small - 0.13 384 512 58.89 71.67 39.51 85.08 54.45 46.01 80.87 31.39 43 | 42 gte-tiny - 0.05 384 512 58.69 70.35 42.09 82.83 55.77 44.92 80.46 29.47 44 | 43 gtr-t5-xl - 2.48 768 512 58.42 67.11 41.51 86.13 55.96 47.96 77.8 30.21 45 | 44 udever-bloom-1b1 - 4.26 1536 2048 58.29 70.17 39.11 83.11 54.28 45.27 81.52 31.1 46 | 45 gtr-t5-large - 0.67 768 512 58.28 67.14 41.6 85.32 55.36 47.42 78.19 29.5 47 | 46 jina-embeddings-v2-small-en - - - - 58 68.82 40.08 84.44 55.09 45.14 80 30.56 48 | 47 XLM-0B6-embedding - - - - 57.97 70.55 42.97 77.83 55.6 43.39 79.02 30.25 49 | 48 multilingual-e5-small - 0.47 384 512 57.87 70.74 37.08 82.59 53.87 46.64 79.1 29.98 50 | 49 sentence-t5-xl - 2.48 768 512 57.87 72.84 42.34 86.06 54.71 38.47 81.66 29.91 51 | 50 all-mpnet-base-v2 - 0.44 768 514 57.78 65.07 43.69 83.04 59.36 43.81 80.28 27.49 52 | 51 sgpt-bloom-7b1-msmarco - 28.27 4096 2048 57.59 66.19 38.93 81.9 55.65 48.22 77.74 33.6 53 | 52 jina-embedding-l-en-v1 - 1.34 1024 512 57.38 67.76 37.15 84.8 56.42 44.81 80.96 29.85 54 | 53 SGPT-2.7B-weightedmean-msmarco-specb-bitfit - 10.74 2560 2048 57.17 67.13 39.83 80.65 54.67 46.54 76.83 31.03 55 | 54 sentence-t5-large - 0.67 768 512 57.06 72.31 41.65 84.97 54 36.71 81.83 29.64 56 | 55 MegatronBert-1B3-embedding - - - - 56.81 69.65 40.86 76.9 55.5 41.41 79.11 31.01 57 | 56 bge-micro-v2 - 0.03 384 512 56.57 68.04 39.18 82.81 54.29 42.56 78.65 29.87 58 | 57 all-MiniLM-L12-v2 - 0.13 384 512 56.53 63.21 41.81 82.41 58.44 42.69 79.8 27.9 59 | 58 all-MiniLM-L6-v2 - 0.09 384 512 56.26 63.05 42.35 82.37 58.04 41.95 78.9 30.81 60 | 59 jina-embedding-b-en-v1 - 0.44 768 512 56.26 66.07 35.88 83.04 55.84 44.03 79.93 30.71 61 | 60 SGPT-1.3B-weightedmean-msmarco-specb-bitfit - 5.36 2048 2048 56.2 66.52 39.92 79.58 54 44.49 75.74 30.43 62 | 61 gtr-t5-base - 0.22 768 
512 56.19 65.25 38.63 83.85 54.23 44.67 77.07 29.67 63 | 62 contriever-base-msmarco - 0.44 768 512 56 66.68 41.1 82.54 53.14 41.88 76.51 30.36 64 | 63 udever-bloom-560m - 2.24 1024 2048 55.81 68.04 36.89 81.05 52.6 41.19 79.93 32.06 65 | 64 bge-micro - 0.03 384 512 55.71 66.35 39.46 81.77 54.28 40.82 78.37 31.16 66 | 65 sentence-t5-base - 0.22 768 512 55.27 69.81 40.21 85.18 53.09 33.63 81.14 31.39 67 | 66 bge-small-4096 - 0.14 384 4096 54.42 67.8 38.03 81.4 53.64 36.08 78.59 29.83 68 | 67 lodestone-base-4096-v1 - 0.27 768 4096 54.24 67.3 40.9 80.4 53.95 36.99 73.7 31.23 69 | 68 SGPT-5.8B-weightedmean-nli-bitfit - 23.5 4096 2048 53.74 70.14 36.98 77.03 52.33 32.34 80.53 30.38 70 | 69 multi-qa-MiniLM-L6-cos-v1 - - 384 512 53.29 61.67 35.67 80.86 54.58 41.17 74.23 31.05 71 | 70 msmarco-bert-co-condensor - 0.44 768 512 52.35 64.71 37.64 81.74 51.84 32.96 76.47 29.5 72 | 71 jina-embedding-s-en-v1 - 0.14 512 512 52.33 60.56 32.56 79.22 53.07 38.91 78.06 31.25 73 | 72 SGPT-125M-weightedmean-msmarco-specb-bitfit - 0.55 768 2048 51.25 60.72 35.79 75.23 50.58 37.04 73.41 29.71 74 | 73 text-similarity-ada-001 - - 1024 2046 49.52 70.44 37.52 76.86 49.02 18.36 78.6 26.94 75 | 74 sup-simcse-bert-base-uncased - 0.44 768 512 48.87 67.32 33.43 73.68 47.54 21.82 79.12 31.17 76 | 75 SGPT-125M-weightedmean-nli-bitfit - 0.55 768 2048 45.97 61.46 30.95 71.78 47.56 20.9 74.71 30.26 77 | 76 unsup-simcse-bert-base-uncased - 0.44 768 512 45.45 62.5 29.04 70.33 46.47 20.29 74.33 31.15 78 | 77 LaBSE - 1.88 768 512 45.21 62.71 29.55 78.87 48.42 18.99 70.8 31.05 79 | 78 komninos - 0.27 300 N/A 42.06 57.65 26.57 72.94 44.75 21.22 62.46 30.49 80 | 79 glove.6B.300d - 0.48 300 N/A 41.96 57.29 27.73 70.92 43.29 21.62 61.85 28.87 81 | 80 SONAR - - - - 40.72 60.43 22.9 71.4 46.18 13.47 67.18 30.56 82 | 81 allenai-specter - 0.44 768 512 40.28 52.37 34.06 61.37 48.1 15.88 61.02 27.66 83 | 82 bert-base-uncased - 0.44 768 512 38.33 61.66 30.12 56.33 43.44 10.59 54.36 29.82 84 | 83 LASER2 - 0.17 1024 N/A 34.95 53.18 15.28 68.86 41.44 7.94 63.27 26.8 85 | -------------------------------------------------------------------------------- /benchbench/data/helm/accuracy.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate MMLU - EM BoolQ - EM NarrativeQA - F1 NaturalQuestions (closed-book) - F1 NaturalQuestions (open-book) - F1 QuAC - F1 HellaSwag - EM OpenbookQA - EM TruthfulQA - EM MS MARCO (regular) - RR@10 MS MARCO (TREC) - NDCG@10 CNN/DailyMail - ROUGE-2 XSUM - ROUGE-2 IMDB - EM CivilComments - EM RAFT - EM 2 | Llama 2 (70B) 0.944 0.582 0.886 0.77 0.458 0.674 0.484 - - 0.554 - - - - 0.961 0.652 0.727 3 | LLaMA (65B) 0.908 0.584 0.871 0.755 0.431 0.672 0.401 - - 0.508 - - - - 0.962 0.655 0.702 4 | text-davinci-002 0.905 0.568 0.877 0.727 0.383 0.713 0.445 0.815 0.594 0.61 0.421 0.664 0.153 0.144 0.948 0.668 0.733 5 | Mistral v0.1 (7B) 0.884 0.572 0.874 0.716 0.365 0.687 0.423 - - 0.422 - - - - 0.962 0.624 0.707 6 | Cohere Command beta (52.4B) 0.874 0.452 0.856 0.752 0.372 0.76 0.432 0.811 0.582 0.269 0.472 0.762 0.161 0.152 0.96 0.601 0.667 7 | text-davinci-003 0.872 0.569 0.881 0.727 0.406 0.77 0.525 0.822 0.646 0.593 0.368 0.644 0.156 0.124 0.848 0.684 0.759 8 | Jurassic-2 Jumbo (178B) 0.824 0.48 0.829 0.733 0.385 0.669 0.435 0.788 0.558 0.437 0.398 0.661 0.149 0.182 0.938 0.57 0.746 9 | Llama 2 (13B) 0.823 0.507 0.811 0.744 0.376 0.637 0.424 - - 0.33 - - - - 0.962 0.588 0.707 10 | TNLG v2 (530B) 0.787 0.469 0.809 0.722 0.384 0.642 0.39 0.799 0.562 0.251 0.377 0.643 
0.161 0.169 0.941 0.601 0.679 11 | gpt-3.5-turbo-0613 0.783 0.391 0.87 0.625 0.348 0.675 0.485 - - 0.339 - - - - 0.943 0.696 0.748 12 | LLaMA (30B) 0.781 0.531 0.861 0.752 0.408 0.666 0.39 - - 0.344 - - - - 0.927 0.549 0.752 13 | Anthropic-LM v4-s3 (52B) 0.78 0.481 0.815 0.728 0.288 0.686 0.431 0.807 0.558 0.368 - - 0.154 0.134 0.934 0.61 0.699 14 | gpt-3.5-turbo-0301 0.76 0.59 0.74 0.663 0.39 0.624 0.512 - - 0.609 - - - - 0.899 0.674 0.768 15 | Jurassic-2 Grande (17B) 0.743 0.475 0.826 0.737 0.356 0.639 0.418 0.781 0.542 0.348 0.293 0.514 0.144 0.167 0.938 0.547 0.712 16 | Palmyra X (43B) 0.732 0.609 0.896 0.742 0.413 - 0.473 - - 0.616 - - 0.049 0.149 0.935 0.008 0.701 17 | Falcon (40B) 0.729 0.509 0.819 0.673 0.392 0.675 0.307 - - 0.353 - - - - 0.959 0.552 0.661 18 | Falcon-Instruct (40B) 0.727 0.497 0.829 0.625 0.377 0.666 0.371 - - 0.384 - - - - 0.959 0.603 0.586 19 | MPT-Instruct (30B) 0.716 0.444 0.85 0.733 0.304 0.697 0.327 - - 0.234 - - - - 0.956 0.573 0.68 20 | MPT (30B) 0.714 0.437 0.704 0.732 0.347 0.673 0.393 - - 0.231 - - - - 0.959 0.599 0.723 21 | J1-Grande v2 beta (17B) 0.706 0.445 0.812 0.725 0.337 0.625 0.392 0.764 0.56 0.306 0.285 0.46 0.146 0.152 0.957 0.546 0.679 22 | Vicuna v1.3 (13B) 0.706 0.462 0.808 0.691 0.346 0.686 0.403 - - 0.385 - - - - 0.762 0.645 0.657 23 | Cohere Command beta (6.1B) 0.675 0.406 0.798 0.709 0.229 0.717 0.375 0.752 0.55 0.203 0.434 0.709 0.153 0.122 0.961 0.54 0.634 24 | Cohere xlarge v20221108 (52.4B) 0.664 0.382 0.762 0.672 0.361 0.628 0.374 0.81 0.588 0.169 0.315 0.55 0.153 0.153 0.956 0.524 0.624 25 | Luminous Supreme (70B) 0.662 0.38 0.775 0.711 0.293 0.649 0.37 - - 0.222 - - 0.15 0.136 0.959 0.562 0.653 26 | Vicuna v1.3 (7B) 0.625 0.434 0.76 0.643 0.287 0.634 0.392 - - 0.292 - - - - 0.916 0.62 0.693 27 | OPT (175B) 0.609 0.318 0.793 0.671 0.297 0.615 0.36 0.791 0.586 0.25 0.288 0.448 0.146 0.155 0.947 0.505 0.606 28 | Llama 2 (7B) 0.607 0.431 0.762 0.691 0.337 0.611 0.406 - - 0.272 - - - - 0.907 0.562 0.643 29 | LLaMA (13B) 0.595 0.422 0.714 0.711 0.346 0.614 0.347 - - 0.324 - - - - 0.928 0.6 0.643 30 | InstructPalmyra (30B) 0.568 0.403 0.751 0.496 0.33 0.682 0.433 - - 0.185 - - 0.152 0.104 0.94 0.555 0.652 31 | Cohere xlarge v20220609 (52.4B) 0.56 0.353 0.718 0.65 0.312 0.595 0.361 0.811 0.55 0.198 0.273 0.459 0.144 0.129 0.956 0.532 0.633 32 | Jurassic-2 Large (7.5B) 0.553 0.339 0.742 - 0.274 0.589 - 0.729 0.53 0.245 0.247 0.464 0.136 0.142 0.956 0.57 0.622 33 | davinci (175B) 0.538 0.422 0.722 0.687 0.329 0.625 0.36 0.775 0.586 0.194 0.211 0.378 0.127 0.126 0.933 0.532 0.642 34 | LLaMA (7B) 0.533 0.321 0.756 0.669 0.297 0.589 0.338 - - 0.28 - - - - 0.947 0.563 0.573 35 | RedPajama-INCITE-Instruct (7B) 0.524 0.363 0.705 0.638 0.232 0.659 0.26 - - 0.243 - - - - 0.927 0.664 0.695 36 | J1-Jumbo v1 (178B) 0.517 0.259 0.776 0.695 0.293 0.595 0.358 0.765 0.534 0.175 0.21 0.363 0.144 0.129 0.943 0.553 0.681 37 | GLM (130B) 0.512 0.344 0.784 0.706 0.148 0.642 0.272 - - 0.218 - - 0.154 0.132 0.955 0.5 0.598 38 | Luminous Extended (30B) 0.485 0.321 0.767 0.665 0.254 0.609 0.349 - - 0.221 - - 0.139 0.124 0.947 0.524 0.523 39 | OPT (66B) 0.448 0.276 0.76 0.638 0.258 0.596 0.357 0.745 0.534 0.201 0.237 0.482 0.136 0.126 0.917 0.506 0.557 40 | BLOOM (176B) 0.446 0.299 0.704 0.662 0.216 0.621 0.361 0.744 0.534 0.205 0.236 0.386 0.08 0.03 0.945 0.62 0.592 41 | J1-Grande v1 (17B) 0.433 0.27 0.722 0.672 0.233 0.578 0.362 0.739 0.52 0.193 0.161 0.341 0.143 0.122 0.953 0.529 0.658 42 | Alpaca (7B) 0.381 0.385 0.778 0.396 0.266 0.592 0.27 - - 0.243 - - - - 
0.738 0.566 0.486 43 | Falcon (7B) 0.378 0.286 0.753 0.621 0.285 0.579 0.332 - - 0.234 - - - - 0.836 0.514 0.602 44 | RedPajama-INCITE-Base (7B) 0.378 0.302 0.713 0.617 0.25 0.586 0.336 - - 0.205 - - - - 0.752 0.547 0.648 45 | Cohere large v20220720 (13.1B) 0.372 0.324 0.725 0.625 0.232 0.573 0.338 0.736 0.542 0.181 0.19 0.33 0.126 0.108 0.933 0.507 0.596 46 | RedPajama-INCITE-Instruct-v1 (3B) 0.366 0.257 0.677 0.638 0.203 0.637 0.259 - - 0.208 - - - - 0.894 0.549 0.661 47 | text-curie-001 0.36 0.237 0.62 0.582 0.175 0.571 0.358 0.676 0.514 0.257 0.271 0.507 0.152 0.076 0.923 0.537 0.489 48 | GPT-NeoX (20B) 0.351 0.276 0.683 0.599 0.193 0.596 0.326 0.718 0.524 0.216 0.184 0.398 0.123 0.102 0.948 0.516 0.505 49 | Luminous Base (13B) 0.315 0.27 0.719 0.605 0.202 0.568 0.334 - - 0.182 - - 0.11 0.105 0.939 0.544 0.473 50 | Cohere medium v20221108 (6.1B) 0.312 0.254 0.7 0.61 0.199 0.517 0.314 0.726 0.538 0.215 0.175 0.373 0.121 0.099 0.935 0.5 0.591 51 | RedPajama-INCITE-Base-v1 (3B) 0.311 0.263 0.685 0.555 0.207 0.52 0.309 - - 0.277 - - - - 0.907 0.549 0.502 52 | TNLG v2 (6.7B) 0.309 0.242 0.698 0.631 0.21 0.561 0.345 0.704 0.478 0.167 0.158 0.332 0.146 0.11 0.927 0.532 0.525 53 | J1-Large v1 (7.5B) 0.285 0.241 0.683 0.623 0.19 0.532 0.328 0.7 0.514 0.197 0.147 0.292 0.134 0.102 0.956 0.532 0.545 54 | GPT-J (6B) 0.273 0.249 0.649 0.545 0.156 0.559 0.33 0.663 0.514 0.199 0.152 0.345 0.131 0.096 0.939 0.52 0.619 55 | Pythia (12B) 0.257 0.274 0.662 0.596 0.175 0.581 0.313 - - 0.177 - - - - 0.931 0.531 0.514 56 | curie (6.7B) 0.247 0.243 0.656 0.604 0.199 0.552 0.321 0.682 0.502 0.232 0.162 0.3 0.113 0.091 0.889 0.539 0.49 57 | Falcon-Instruct (7B) 0.244 0.275 0.72 0.476 0.194 0.449 0.311 - - 0.213 - - - - 0.852 0.511 0.523 58 | Cohere medium v20220720 (6.1B) 0.23 0.279 0.659 0.559 0.177 0.504 0.279 0.706 0.496 0.19 0.152 0.374 0.077 0.087 0.935 0.504 0.52 59 | text-babbage-001 0.229 0.229 0.451 0.429 0.07 0.33 0.284 0.561 0.452 0.233 0.208 0.449 0.151 0.046 0.913 0.499 0.509 60 | T0pp (11B) 0.197 0.407 0 0.151 0.039 0.19 0.121 - - 0.377 - - 0.122 0.09 0.207 0.234 0.118 61 | Pythia (6.9B) 0.196 0.236 0.631 0.528 0.142 0.539 0.296 - - 0.213 - - - - 0.928 0.511 0.502 62 | UL2 (20B) 0.167 0.291 0.746 0.083 0.204 0.349 0.144 - - 0.193 - - 0.03 0.058 0.337 0.521 0.404 63 | T5 (11B) 0.131 0.29 0.761 0.086 0.194 0.477 0.116 - - 0.133 - - 0.043 0.015 0.379 0.509 0.37 64 | babbage (1.3B) 0.114 0.235 0.574 0.491 0.119 0.451 0.273 0.555 0.438 0.188 0.122 0.317 0.079 0.045 0.597 0.519 0.455 65 | Cohere small v20220720 (410M) 0.109 0.264 0.457 0.294 0.078 0.309 0.219 0.483 0.348 0.217 - 0.304 0.063 0.033 0.578 0.501 0.492 66 | ada (350M) 0.108 0.243 0.581 0.326 0.082 0.365 0.242 0.435 0.38 0.215 0.102 0.29 0.09 0.022 0.849 0.517 0.423 67 | text-ada-001 0.107 0.238 0.464 0.238 0.025 0.149 0.176 0.429 0.346 0.232 0.134 0.302 0.136 0.034 0.822 0.503 0.406 68 | YaLM (100B) 0.075 0.243 0.634 0.252 0.068 0.227 0.162 - - 0.202 - - 0.017 0.021 0.836 0.49 0.395 69 | -------------------------------------------------------------------------------- /benchbench/data/openllm/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | T Model Average ARC HellaSwag MMLU TruthfulQA Winogrande GSM8K 2 | 🟢 cloudyu/Yi-34Bx2-MoE-60B 76.72 71.08 85.23 77.47 66.19 84.85 75.51 3 | 🟢 cloudyu/Mixtral_34Bx2_MoE_60B 76.66 71.33 85.25 77.34 66.59 84.85 74.6 4 | 🟢 cloudyu/Mixtral_34Bx2_MoE_60B 76.63 71.25 85.36 77.28 66.61 84.69 74.6 5 | 🟦 moreh/MoMo-70B-lora-1.8.4-DPO 76.23 69.62 85.35 77.33 64.64 
84.14 76.27 6 | 🔶 cloudyu/Yi-34Bx3-MoE-90B 76.18 70.9 85.33 77.41 66.31 84.29 72.86 7 | 🟦 moreh/MoMo-70B-lora-1.8.5-DPO 76.14 69.54 85.6 77.49 65.79 84.14 74.3 8 | 🔶 TomGrc/FusionNet_7Bx2_MoE_14B 75.91 73.55 88.84 64.68 69.6 88.16 70.66 9 | 🔶 one-man-army/UNA-34Beagles-32K-bf16-v1 75.41 73.55 85.93 76.45 73.55 82.95 60.05 10 | 🔶 jondurbin/nontoxic-bagel-34b-v0.2 74.69 72.44 85.64 76.41 72.7 82.48 58.45 11 | ⭕ jondurbin/bagel-dpo-34b-v0.2 74.69 71.93 85.25 76.58 70.05 83.35 60.96 12 | 🔶 moreh/MoMo-70B-LoRA-V1.4 74.67 69.2 85.07 77.12 62.66 83.74 70.2 13 | 🟦 udkai/Turdus 74.66 73.38 88.56 64.52 67.11 86.66 67.7 14 | 🔶 jondurbin/bagel-dpo-34b-v0.2 74.5 72.01 85.24 76.58 70.16 83.03 59.97 15 | 🔶 kodonho/Solar-OrcaDPO-Solar-Instruct-SLERP 74.35 70.99 88.22 66.22 71.95 83.43 65.28 16 | 🔶 kodonho/SolarM-SakuraSolar-SLERP 74.29 71.16 88.47 66.24 72.1 83.11 64.67 17 | ⭕ bhavinjawade/SOLAR-10B-OrcaDPO-Jawade 74.27 71.16 88.27 66.12 71.57 83.66 64.82 18 | 🔶 VAGOsolutions/SauerkrautLM-SOLAR-Instruct 74.21 70.82 88.63 66.2 71.95 83.5 64.14 19 | 🟦 upstage/SOLAR-10.7B-Instruct-v1.0 74.2 71.08 88.16 66.21 71.43 83.58 64.75 20 | 🔶 fblgit/UNA-SOLAR-10.7B-Instruct-v1.0 74.2 70.56 88.18 66.08 72.05 83.66 64.67 21 | 🟦 bhavinjawade/SOLAR-10B-Nector-DPO-Jawade 74.19 71.33 88.62 66.22 70.92 83.43 64.59 22 | 🟦 dhanushreddy29/BrokenKeyboard 74.08 71.25 88.34 66.04 71.36 83.19 64.29 23 | 🟦 fblgit/UNA-SOLAR-10.7B-Instruct-v1.0 74.07 70.73 88.32 66.1 72.52 83.35 63.38 24 | 🔶 fblgit/UNA-POLAR-10.7B-InstructMath-v2 74.07 70.73 88.2 66.03 71.73 82.95 64.75 25 | 🔶 yhyu13/LMCocktail-10.7B-v1 74.06 70.65 88.13 66.21 71.03 83.35 64.97 26 | 🔶 rishiraj/meow 73.94 70.48 88.08 66.25 70.49 83.43 64.9 27 | 🟦 fblgit/UNA-TheBeagle-7b-v1 73.87 73.04 88 63.48 69.85 82.16 66.72 28 | 🔶 fblgit/UNAversal-8x7B-v1beta 73.78 69.8 86.9 70.39 71.97 82 61.64 29 | 🔶 NousResearch/Nous-Hermes-2-Yi-34B 73.74 66.89 85.49 76.7 60.37 82.95 70.05 30 | 🟦 argilla/distilabeled-Marcoro14-7B-slerp 73.63 70.73 87.47 65.22 65.1 82.08 71.19 31 | 🟢 Qwen/Qwen-72B 73.6 65.19 85.94 77.37 60.19 82.48 70.43 32 | 🟦 mlabonne/NeuralMarcoro14-7B 73.57 71.42 87.59 64.84 65.64 81.22 70.74 33 | 🔶 abideen/NexoNimbus-7B 73.5 70.82 87.86 64.69 62.43 84.85 70.36 34 | 🟦 Neuronovo/neuronovo-7B-v0.2 73.44 73.04 88.32 65.15 71.02 80.66 62.47 35 | 🟢 cloudyu/Mixtral_7Bx2_MoE 73.43 71.25 87.45 64.98 67.23 81.22 68.46 36 | 🟦 argilla/distilabeled-Marcoro14-7B-slerp-full 73.4 70.65 87.55 65.33 64.21 82 70.66 37 | 🟦 CultriX/MistralTrix-v1 73.39 72.27 88.33 65.24 70.73 80.98 62.77 38 | 🔶 cloudyu/Mixtral_7Bx5_MoE_30B 73.39 69.97 86.82 64.42 65.97 80.98 72.18 39 | 🟢 macadeliccc/SOLAR-math-2x10.7b 73.37 68.43 86.31 66.9 64.21 83.35 71.04 40 | 🟦 ryandt/MusingCaterpillar 73.33 72.53 88.34 65.26 70.93 80.66 62.24 41 | 🟢 cloudyu/Mixtral_7Bx6_MoE_35B 73.32 70.14 86.77 64.74 65.79 81.06 71.42 42 | 🔶 cloudyu/Mixtral_7Bx6_MoE_35B 73.31 69.97 86.82 64.91 65.77 81.14 71.27 43 | 🟦 Neuronovo/neuronovo-7B-v0.3 73.29 72.7 88.26 65.1 71.35 80.9 61.41 44 | ⭕ SUSTech/SUS-Chat-34B 73.22 66.3 83.91 76.41 57.04 83.5 72.18 45 | 🔶 Sao10K/SOLAR-10.7B-NahIdWin 73.21 64.51 85.67 64.17 76.73 80.51 67.7 46 | 🟦 argilla/notus-8x7b-experiment 73.18 70.99 87.73 71.33 65.79 81.61 61.64 47 | 🟦 CultriX/MistralTrixTest 73.17 72.53 88.4 65.22 70.77 81.37 60.73 48 | 🟢 macadeliccc/Orca-SOLAR-4x10.7b 73.17 68.52 86.78 67.03 64.54 83.9 68.23 49 | 🔶 samir-fama/SamirGPT-v1 73.11 69.54 87.04 65.3 63.37 81.69 71.72 50 | 🔶 SanjiWatsuki/Lelantos-DPO-7B 73.09 71.08 87.22 64 67.77 80.03 68.46 51 | 🟦 argilla/notux-8x7b-v1-epoch-2 73.05 
70.65 87.8 71.43 65.97 82.08 60.35 57 | 🔶 shadowml/Marcoro14-7B-ties 73.01 69.8 87.13 65.11 63.54 81.61 70.89 58 | 🔶 argilla/notux-8x7b-v1 72.97 70.65 87.72 71.39 66.21 80.74 61.11 59 | 🔶 AA051611/whattest 72.96 66.81 84.43 76.59 58.04 82.48 69.45 60 | 🟦 bardsai/jaskier-7b-dpo 72.91 70.82 87.02 64.67 64.41 80.19 70.36 61 | 🔶 VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct 72.89 70.48 87.75 71.37 65.71 81.22 60.8 62 | 🔶 samir-fama/FernandoGPT-v1 72.87 69.45 86.94 65.19 61.18 81.14 73.31 63 | 🔶 PSanni/MPOMixtral-8x7B-Instruct-v0.1 72.8 70.99 87.95 70.26 66.52 82.56 58.53 64 | 🔶 cookinai/OpenCM-14 72.75 69.28 86.89 65.01 61.07 81.29 72.93 65 | 🔶 VAGOsolutions/SauerkrautLM-Mixtral-8x7B-Instruct 72.73 70.56 87.74 71.08 65.72 81.45 59.82 66 | 🔶 mistralai/Mixtral-8x7B-Instruct-v0.1 72.7 70.14 87.55 71.4 64.98 81.06 61.11 67 | 🔶 senseable/garten2-7b 72.65 69.37 87.54 65.44 59.5 84.69 69.37 68 | ⭕ mistralai/Mixtral-8x7B-Instruct-v0.1 72.62 70.22 87.63 71.16 64.58 81.37 60.73 69 | 🔶 AIDC-ai-business/Marcoroni-7B-v3 72.53 69.45 86.78 65 60.4 81.45 72.1 70 | 🟦 bardsai/jaskier-7b-dpo-v2 72.53 69.28 86.8 64.92 61.64 80.74 71.8 71 | 🔶 Toten5/Marcoroni-v3-neural-chat-v3-3-Slerp 72.51 68.77 86.55 64.51 62.7 80.74 71.8 72 | 🔶 jondurbin/bagel-dpo-8x7b-v0.2 72.49 72.1 86.41 70.27 72.83 83.27 50.04 73 | 🔶 Brillibits/Instruct_Mixtral-8x7B-v0.1_Dolly15K 72.44 69.28 87.59 70.96 64.83 82.56 59.44 74 | 🔶 SanjiWatsuki/Kunoichi-DPO-v2-7B 72.4 69.37 87.42 64.83 66 80.74 66.03 75 | 🔶 mindy-labs/mindy-7b 72.34 69.11 86.57 64.69 60.89 81.06 71.72 76 | 🔶 janhq/supermario-v2 72.34 68.52 86.51 64.88 60.58 81.37 72.18 77 | 🔶 OpenBuddy/openbuddy-deepseek-67b-v15.2 72.33 68.6 86.37 71.5 56.2 84.45 66.87 78 | 🔶 shadowml/Beyonder-4x7B-v2 72.33 68.77 86.8 65.1 60.68 80.9 71.72 79 | 🔶 janhq/supermario-slerp 72.32 68.94 86.58 64.93 60.11 81.29 72.1 80 | ⭕ mncai/yi-34B-v3 72.26 67.06 85.11 75.8 57.54 83.5 64.52 81 | 🔶 Sao10K/Fimbulvetr-10.7B-v1 72.25 68.94 87.27 66.59 60.54 83.5 66.64 82 | 🔶 SanjiWatsuki/Kunoichi-DPO-7B 72.24 69.62 87.14 64.79 67.31 80.58 63.99 83 | 🟦 rwitz2/grindin 72.18 69.88 87.02 64.98 59.34 80.9 70.96 84 | 🔶 SanjiWatsuki/Kunoichi-7B 72.13 68.69 87.1 64.9 64.04 81.06 67.02 85 | ⭕ mncai/yi-34B-v2 72.12 66.13 85 75.64 57.34 83.66 64.97 86 | 🔶 CausalLM/72B-preview 72.12 65.19 83.23 77.14 52.58 82.48 72.1 87 | 🔶 mindy-labs/mindy-7b-v2 72.11 68.69 86.59 65.18 60.16 81.06 70.96 88 | 🔶 CausalLM/72B-preview 72.06 64.85 83.28 77.21 52.51 82.48 72.02 89 | 🔶 rwitz/dec10 72.05 69.11 86.46 64.98 60.42 80.74 70.58 90 | 🔶 rwitz/dec10 72.01 69.2 86.48 64.91 60.52 80.43 70.51 91 | 🔶 cookinai/Valkyrie-V1 71.92 67.24 86.27 64.82 60.4 81.45 71.34 92 | 🔶 AA051611/A0110 71.89 66.38 84.73 74.48 58.6 82.32 64.82 93 | ⭕ DopeorNope/COKAL-v1-70B 71.87 87.46 83.29 68.13 72.79 80.27 39.27 94 | 🟦 bn22/Nous-Hermes-2-SOLAR-10.7B-MISALIGNED 71.83 68.26 86.11 66.26 57.79 83.43 69.14 95 | 🔶 AA051611/A0109 71.83 66.55 84.7 74.44 58.75 82.16 64.37 96 | ⭕ deepseek-ai/deepseek-llm-67b-chat 71.79 67.75 86.82 72.42 55.85 84.21 63.68 97 | 🔶 OpenBuddy/openbuddy-deepseek-67b-v15.1 71.76 67.66 86.49 70.3 54.42 84.77 66.94 98 | 🔶 migtissera/Tess-M-Creative-v1.0 71.73 66.81 85.14 75.54
57.68 83.11 62.09 99 | 🟦 VitalContribution/Evangelion-7B 71.71 68.94 86.45 63.97 64.01 79.95 66.94 100 | ⭕ bhenrym14/platypus-yi-34b 71.69 68.43 85.21 78.13 54.48 84.06 59.82 101 | 🟦 RatanRohith/NeuralPizza-7B-V0.1 71.53 70.48 87.3 64.42 67.22 80.35 59.44 102 | -------------------------------------------------------------------------------- /benchbench/data/imagenet/leaderboard_raw.tsv: -------------------------------------------------------------------------------- 1 | Weight Acc@1 Acc@5 Params GFLOPS Recipe 2 | AlexNet_Weights.IMAGENET1K_V1 56.522 79.066 61.1M 0.71 link 3 | ConvNeXt_Base_Weights.IMAGENET1K_V1 84.062 96.87 88.6M 15.36 link 4 | ConvNeXt_Large_Weights.IMAGENET1K_V1 84.414 96.976 197.8M 34.36 link 5 | ConvNeXt_Small_Weights.IMAGENET1K_V1 83.616 96.65 50.2M 8.68 link 6 | ConvNeXt_Tiny_Weights.IMAGENET1K_V1 82.52 96.146 28.6M 4.46 link 7 | DenseNet121_Weights.IMAGENET1K_V1 74.434 91.972 8.0M 2.83 link 8 | DenseNet161_Weights.IMAGENET1K_V1 77.138 93.56 28.7M 7.73 link 9 | DenseNet169_Weights.IMAGENET1K_V1 75.6 92.806 14.1M 3.36 link 10 | DenseNet201_Weights.IMAGENET1K_V1 76.896 93.37 20.0M 4.29 link 11 | EfficientNet_B0_Weights.IMAGENET1K_V1 77.692 93.532 5.3M 0.39 link 12 | EfficientNet_B1_Weights.IMAGENET1K_V1 78.642 94.186 7.8M 0.69 link 13 | EfficientNet_B1_Weights.IMAGENET1K_V2 79.838 94.934 7.8M 0.69 link 14 | EfficientNet_B2_Weights.IMAGENET1K_V1 80.608 95.31 9.1M 1.09 link 15 | EfficientNet_B3_Weights.IMAGENET1K_V1 82.008 96.054 12.2M 1.83 link 16 | EfficientNet_B4_Weights.IMAGENET1K_V1 83.384 96.594 19.3M 4.39 link 17 | EfficientNet_B5_Weights.IMAGENET1K_V1 83.444 96.628 30.4M 10.27 link 18 | EfficientNet_B6_Weights.IMAGENET1K_V1 84.008 96.916 43.0M 19.07 link 19 | EfficientNet_B7_Weights.IMAGENET1K_V1 84.122 96.908 66.3M 37.75 link 20 | EfficientNet_V2_L_Weights.IMAGENET1K_V1 85.808 97.788 118.5M 56.08 link 21 | EfficientNet_V2_M_Weights.IMAGENET1K_V1 85.112 97.156 54.1M 24.58 link 22 | EfficientNet_V2_S_Weights.IMAGENET1K_V1 84.228 96.878 21.5M 8.37 link 23 | GoogLeNet_Weights.IMAGENET1K_V1 69.778 89.53 6.6M 1.5 link 24 | Inception_V3_Weights.IMAGENET1K_V1 77.294 93.45 27.2M 5.71 link 25 | MNASNet0_5_Weights.IMAGENET1K_V1 67.734 87.49 2.2M 0.1 link 26 | MNASNet0_75_Weights.IMAGENET1K_V1 71.18 90.496 3.2M 0.21 link 27 | MNASNet1_0_Weights.IMAGENET1K_V1 73.456 91.51 4.4M 0.31 link 28 | MNASNet1_3_Weights.IMAGENET1K_V1 76.506 93.522 6.3M 0.53 link 29 | MaxVit_T_Weights.IMAGENET1K_V1 83.7 96.722 30.9M 5.56 link 30 | MobileNet_V2_Weights.IMAGENET1K_V1 71.878 90.286 3.5M 0.3 link 31 | MobileNet_V2_Weights.IMAGENET1K_V2 72.154 90.822 3.5M 0.3 link 32 | MobileNet_V3_Large_Weights.IMAGENET1K_V1 74.042 91.34 5.5M 0.22 link 33 | MobileNet_V3_Large_Weights.IMAGENET1K_V2 75.274 92.566 5.5M 0.22 link 34 | MobileNet_V3_Small_Weights.IMAGENET1K_V1 67.668 87.402 2.5M 0.06 link 35 | RegNet_X_16GF_Weights.IMAGENET1K_V1 80.058 94.944 54.3M 15.94 link 36 | RegNet_X_16GF_Weights.IMAGENET1K_V2 82.716 96.196 54.3M 15.94 link 37 | RegNet_X_1_6GF_Weights.IMAGENET1K_V1 77.04 93.44 9.2M 1.6 link 38 | RegNet_X_1_6GF_Weights.IMAGENET1K_V2 79.668 94.922 9.2M 1.6 link 39 | RegNet_X_32GF_Weights.IMAGENET1K_V1 80.622 95.248 107.8M 31.74 link 40 | RegNet_X_32GF_Weights.IMAGENET1K_V2 83.014 96.288 107.8M 31.74 link 41 | RegNet_X_3_2GF_Weights.IMAGENET1K_V1 78.364 93.992 15.3M 3.18 link 42 | RegNet_X_3_2GF_Weights.IMAGENET1K_V2 81.196 95.43 15.3M 3.18 link 43 | RegNet_X_400MF_Weights.IMAGENET1K_V1 72.834 90.95 5.5M 0.41 link 44 | RegNet_X_400MF_Weights.IMAGENET1K_V2 74.864 92.322 5.5M 0.41 link 45 | 
RegNet_X_800MF_Weights.IMAGENET1K_V1 75.212 92.348 7.3M 0.8 link 46 | RegNet_X_800MF_Weights.IMAGENET1K_V2 77.522 93.826 7.3M 0.8 link 47 | RegNet_X_8GF_Weights.IMAGENET1K_V1 79.344 94.686 39.6M 8 link 48 | RegNet_X_8GF_Weights.IMAGENET1K_V2 81.682 95.678 39.6M 8 link 49 | RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_E2E_V1 88.228 98.682 644.8M 374.57 link 50 | RegNet_Y_128GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 86.068 97.844 644.8M 127.52 link 51 | RegNet_Y_16GF_Weights.IMAGENET1K_V1 80.424 95.24 83.6M 15.91 link 52 | RegNet_Y_16GF_Weights.IMAGENET1K_V2 82.886 96.328 83.6M 15.91 link 53 | RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_E2E_V1 86.012 98.054 83.6M 46.73 link 54 | RegNet_Y_16GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 83.976 97.244 83.6M 15.91 link 55 | RegNet_Y_1_6GF_Weights.IMAGENET1K_V1 77.95 93.966 11.2M 1.61 link 56 | RegNet_Y_1_6GF_Weights.IMAGENET1K_V2 80.876 95.444 11.2M 1.61 link 57 | RegNet_Y_32GF_Weights.IMAGENET1K_V1 80.878 95.34 145.0M 32.28 link 58 | RegNet_Y_32GF_Weights.IMAGENET1K_V2 83.368 96.498 145.0M 32.28 link 59 | RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_E2E_V1 86.838 98.362 145.0M 94.83 link 60 | RegNet_Y_32GF_Weights.IMAGENET1K_SWAG_LINEAR_V1 84.622 97.48 145.0M 32.28 link 61 | RegNet_Y_3_2GF_Weights.IMAGENET1K_V1 78.948 94.576 19.4M 3.18 link 62 | RegNet_Y_3_2GF_Weights.IMAGENET1K_V2 81.982 95.972 19.4M 3.18 link 63 | RegNet_Y_400MF_Weights.IMAGENET1K_V1 74.046 91.716 4.3M 0.4 link 64 | RegNet_Y_400MF_Weights.IMAGENET1K_V2 75.804 92.742 4.3M 0.4 link 65 | RegNet_Y_800MF_Weights.IMAGENET1K_V1 76.42 93.136 6.4M 0.83 link 66 | RegNet_Y_800MF_Weights.IMAGENET1K_V2 78.828 94.502 6.4M 0.83 link 67 | RegNet_Y_8GF_Weights.IMAGENET1K_V1 80.032 95.048 39.4M 8.47 link 68 | RegNet_Y_8GF_Weights.IMAGENET1K_V2 82.828 96.33 39.4M 8.47 link 69 | ResNeXt101_32X8D_Weights.IMAGENET1K_V1 79.312 94.526 88.8M 16.41 link 70 | ResNeXt101_32X8D_Weights.IMAGENET1K_V2 82.834 96.228 88.8M 16.41 link 71 | ResNeXt101_64X4D_Weights.IMAGENET1K_V1 83.246 96.454 83.5M 15.46 link 72 | ResNeXt50_32X4D_Weights.IMAGENET1K_V1 77.618 93.698 25.0M 4.23 link 73 | ResNeXt50_32X4D_Weights.IMAGENET1K_V2 81.198 95.34 25.0M 4.23 link 74 | ResNet101_Weights.IMAGENET1K_V1 77.374 93.546 44.5M 7.8 link 75 | ResNet101_Weights.IMAGENET1K_V2 81.886 95.78 44.5M 7.8 link 76 | ResNet152_Weights.IMAGENET1K_V1 78.312 94.046 60.2M 11.51 link 77 | ResNet152_Weights.IMAGENET1K_V2 82.284 96.002 60.2M 11.51 link 78 | ResNet18_Weights.IMAGENET1K_V1 69.758 89.078 11.7M 1.81 link 79 | ResNet34_Weights.IMAGENET1K_V1 73.314 91.42 21.8M 3.66 link 80 | ResNet50_Weights.IMAGENET1K_V1 76.13 92.862 25.6M 4.09 link 81 | ResNet50_Weights.IMAGENET1K_V2 80.858 95.434 25.6M 4.09 link 82 | ShuffleNet_V2_X0_5_Weights.IMAGENET1K_V1 60.552 81.746 1.4M 0.04 link 83 | ShuffleNet_V2_X1_0_Weights.IMAGENET1K_V1 69.362 88.316 2.3M 0.14 link 84 | ShuffleNet_V2_X1_5_Weights.IMAGENET1K_V1 72.996 91.086 3.5M 0.3 link 85 | ShuffleNet_V2_X2_0_Weights.IMAGENET1K_V1 76.23 93.006 7.4M 0.58 link 86 | SqueezeNet1_0_Weights.IMAGENET1K_V1 58.092 80.42 1.2M 0.82 link 87 | SqueezeNet1_1_Weights.IMAGENET1K_V1 58.178 80.624 1.2M 0.35 link 88 | Swin_B_Weights.IMAGENET1K_V1 83.582 96.64 87.8M 15.43 link 89 | Swin_S_Weights.IMAGENET1K_V1 83.196 96.36 49.6M 8.74 link 90 | Swin_T_Weights.IMAGENET1K_V1 81.474 95.776 28.3M 4.49 link 91 | Swin_V2_B_Weights.IMAGENET1K_V1 84.112 96.864 87.9M 20.32 link 92 | Swin_V2_S_Weights.IMAGENET1K_V1 83.712 96.816 49.7M 11.55 link 93 | Swin_V2_T_Weights.IMAGENET1K_V1 82.072 96.132 28.4M 5.94 link 94 | VGG11_BN_Weights.IMAGENET1K_V1 70.37 89.81 132.9M 
7.61 link 95 | VGG11_Weights.IMAGENET1K_V1 69.02 88.628 132.9M 7.61 link 96 | VGG13_BN_Weights.IMAGENET1K_V1 71.586 90.374 133.1M 11.31 link 97 | VGG13_Weights.IMAGENET1K_V1 69.928 89.246 133.0M 11.31 link 98 | VGG16_BN_Weights.IMAGENET1K_V1 73.36 91.516 138.4M 15.47 link 99 | VGG16_Weights.IMAGENET1K_V1 71.592 90.382 138.4M 15.47 link 100 | VGG16_Weights.IMAGENET1K_FEATURES nan nan 138.4M 15.47 link 101 | VGG19_BN_Weights.IMAGENET1K_V1 74.218 91.842 143.7M 19.63 link 102 | VGG19_Weights.IMAGENET1K_V1 72.376 90.876 143.7M 19.63 link 103 | ViT_B_16_Weights.IMAGENET1K_V1 81.072 95.318 86.6M 17.56 link 104 | ViT_B_16_Weights.IMAGENET1K_SWAG_E2E_V1 85.304 97.65 86.9M 55.48 link 105 | ViT_B_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 81.886 96.18 86.6M 17.56 link 106 | ViT_B_32_Weights.IMAGENET1K_V1 75.912 92.466 88.2M 4.41 link 107 | ViT_H_14_Weights.IMAGENET1K_SWAG_E2E_V1 88.552 98.694 633.5M 1016.72 link 108 | ViT_H_14_Weights.IMAGENET1K_SWAG_LINEAR_V1 85.708 97.73 632.0M 167.29 link 109 | ViT_L_16_Weights.IMAGENET1K_V1 79.662 94.638 304.3M 61.55 link 110 | ViT_L_16_Weights.IMAGENET1K_SWAG_E2E_V1 88.064 98.512 305.2M 361.99 link 111 | ViT_L_16_Weights.IMAGENET1K_SWAG_LINEAR_V1 85.146 97.422 304.3M 61.55 link 112 | ViT_L_32_Weights.IMAGENET1K_V1 76.972 93.07 306.5M 15.38 link 113 | Wide_ResNet101_2_Weights.IMAGENET1K_V1 78.848 94.284 126.9M 22.75 link 114 | Wide_ResNet101_2_Weights.IMAGENET1K_V2 82.51 96.02 126.9M 22.75 link 115 | Wide_ResNet50_2_Weights.IMAGENET1K_V1 78.468 94.086 68.9M 11.4 link 116 | -------------------------------------------------------------------------------- /benchbench/data/helm_lite/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | Model Mean win rate NarrativeQA - F1 NaturalQuestions (open) - F1 NaturalQuestions (closed) - F1 OpenbookQA - EM MMLU - EM MATH - Equivalent (CoT) GSM8K - EM LegalBench - EM MedQA - EM WMT 2014 - BLEU-4 2 | GPT-4o (2024-05-13) 0.938 0.804 0.803 0.501 0.966 0.748 0.829 0.905 0.733 0.857 0.231 3 | GPT-4o (2024-08-06) 0.928 0.795 0.793 0.496 0.968 0.738 0.853 0.909 0.721 0.863 0.225 4 | DeepSeek v3 0.908 0.796 0.765 0.467 0.954 0.803 0.912 0.94 0.718 0.809 0.209 5 | Claude 3.5 Sonnet (20240620) 0.885 0.746 0.749 0.502 0.972 0.799 0.813 0.949 0.707 0.825 0.229 6 | Amazon Nova Pro 0.885 0.791 0.829 0.405 0.96 0.758 0.821 0.87 0.736 0.811 0.229 7 | GPT-4 (0613) 0.867 0.768 0.79 0.457 0.96 0.735 0.802 0.932 0.713 0.815 0.211 8 | GPT-4 Turbo (2024-04-09) 0.864 0.761 0.795 0.482 0.97 0.711 0.833 0.824 0.727 0.783 0.218 9 | Llama 3.1 Instruct Turbo (405B) 0.854 0.749 0.756 0.456 0.94 0.759 0.827 0.949 0.707 0.805 0.238 10 | Claude 3.5 Sonnet (20241022) 0.846 0.77 0.665 0.467 0.966 0.809 0.904 0.956 0.647 0.859 0.226 11 | Gemini 1.5 Pro (002) 0.842 0.756 0.726 0.455 0.952 0.795 0.92 0.817 0.747 0.771 0.231 12 | Llama 3.2 Vision Instruct Turbo (90B) 0.819 0.777 0.739 0.457 0.942 0.703 0.791 0.936 0.68 0.769 0.224 13 | Gemini 2.0 Flash (Experimental) 0.813 0.783 0.722 0.443 0.946 0.717 0.901 0.946 0.674 0.73 0.212 14 | Llama 3.3 Instruct Turbo (70B) 0.812 0.791 0.737 0.431 0.928 0.7 0.808 0.942 0.725 0.761 0.219 15 | Llama 3.1 Instruct Turbo (70B) 0.808 0.772 0.738 0.452 0.938 0.709 0.783 0.938 0.687 0.769 0.223 16 | Palmyra-X-004 0.808 0.773 0.754 0.457 0.926 0.739 0.767 0.905 0.73 0.775 0.203 17 | Llama 3 (70B) 0.793 0.798 0.743 0.475 0.934 0.695 0.663 0.805 0.733 0.777 0.225 18 | Qwen2 Instruct (72B) 0.77 0.727 0.776 0.39 0.954 0.769 0.79 0.92 0.712 0.746 0.207 19 | Qwen2.5 Instruct Turbo (72B) 0.745 
0.745 0.676 0.359 0.962 0.77 0.884 0.9 0.74 0.753 0.207 20 | Mistral Large 2 (2407) 0.744 0.779 0.734 0.453 0.932 0.725 0.677 0.912 0.646 0.775 0.192 21 | Gemini 1.5 Pro (001) 0.739 0.783 0.748 0.378 0.902 0.772 0.825 0.836 0.757 0.692 0.189 22 | Amazon Nova Lite 0.708 0.768 0.815 0.352 0.928 0.693 0.779 0.829 0.659 0.696 0.204 23 | Mixtral (8x22B) 0.705 0.779 0.726 0.478 0.882 0.701 0.656 0.8 0.708 0.704 0.209 24 | GPT-4o mini (2024-07-18) 0.701 0.768 0.746 0.386 0.92 0.668 0.802 0.843 0.653 0.748 0.206 25 | GPT-4 Turbo (1106 preview) 0.698 0.727 0.763 0.435 0.95 0.699 0.857 0.668 0.626 0.817 0.205 26 | Claude 3 Opus (20240229) 0.683 0.351 0.264 0.441 0.956 0.768 0.76 0.924 0.662 0.775 0.24 27 | Palmyra X V3 (72B) 0.679 0.706 0.685 0.407 0.938 0.702 0.723 0.831 0.709 0.684 0.262 28 | Gemma 2 Instruct (27B) 0.675 0.79 0.731 0.353 0.918 0.664 0.746 0.812 0.7 0.684 0.214 29 | Gemini 1.5 Flash (001) 0.667 0.783 0.723 0.332 0.928 0.703 0.753 0.785 0.661 0.68 0.225 30 | PaLM-2 (Unicorn) 0.644 0.583 0.674 0.435 0.938 0.702 0.674 0.831 0.677 0.684 0.26 31 | Jamba 1.5 Large 0.637 0.664 0.718 0.394 0.948 0.683 0.692 0.846 0.675 0.698 0.203 32 | Qwen1.5 (72B) 0.608 0.601 0.758 0.417 0.93 0.647 0.683 0.799 0.694 0.67 0.201 33 | Solar Pro 0.602 0.753 0.792 0.297 0.922 0.679 0.567 0.871 0.67 0.698 0.169 34 | Palmyra X V2 (33B) 0.589 0.752 0.752 0.428 0.878 0.621 0.58 0.735 0.644 0.598 0.239 35 | Gemini 1.5 Flash (002) 0.573 0.746 0.718 0.323 0.914 0.679 0.908 0.328 0.67 0.656 0.212 36 | Yi (34B) 0.57 0.782 0.775 0.443 0.92 0.65 0.375 0.648 0.618 0.656 0.172 37 | Gemma 2 Instruct (9B) 0.562 0.768 0.738 0.328 0.91 0.645 0.724 0.762 0.639 0.63 0.201 38 | Qwen1.5 Chat (110B) 0.55 0.721 0.739 0.35 0.922 0.704 0.568 0.815 0.624 0.64 0.192 39 | Qwen1.5 (32B) 0.546 0.589 0.777 0.353 0.932 0.628 0.733 0.773 0.636 0.656 0.193 40 | Claude 3.5 Haiku (20241022) 0.531 0.763 0.639 0.344 0.854 0.671 0.872 0.815 0.631 0.722 0.135 41 | PaLM-2 (Bison) 0.526 0.718 0.813 0.39 0.878 0.608 0.421 0.61 0.645 0.547 0.241 42 | Amazon Nova Micro 0.524 0.744 0.779 0.285 0.888 0.64 0.76 0.794 0.615 0.608 0.192 43 | Claude v1.3 0.518 0.723 0.699 0.409 0.908 0.631 0.54 0.784 0.629 0.618 0.219 44 | Mixtral (8x7B 32K seqlen) 0.51 0.767 0.699 0.427 0.868 0.649 0.494 0.622 0.63 0.652 0.19 45 | Phi-3 (14B) 0.509 0.724 0.729 0.278 0.916 0.675 0.611 0.878 0.593 0.696 0.17 46 | Claude 2.0 0.489 0.718 0.67 0.428 0.862 0.639 0.603 0.583 0.643 0.652 0.219 47 | DeepSeek LLM Chat (67B) 0.488 0.581 0.733 0.412 0.88 0.641 0.615 0.795 0.637 0.628 0.186 48 | Qwen2.5 Instruct Turbo (7B) 0.488 0.742 0.725 0.205 0.862 0.658 0.835 0.83 0.632 0.6 0.155 49 | Llama 2 (70B) 0.482 0.763 0.674 0.46 0.838 0.58 0.323 0.567 0.673 0.618 0.196 50 | Phi-3 (7B) 0.473 0.754 0.675 0.324 0.912 0.659 0.703 - 0.584 0.672 0.154 51 | Yi Large (Preview) 0.471 0.373 0.586 0.428 0.946 0.712 0.712 0.69 0.519 0.66 0.176 52 | Command R Plus 0.441 0.735 0.711 0.343 0.828 0.59 0.403 0.738 0.672 0.567 0.203 53 | GPT-3.5 (text-davinci-003) 0.439 0.731 0.77 0.413 0.828 0.555 0.449 0.615 0.622 0.531 0.191 54 | Claude 2.1 0.437 0.677 0.611 0.375 0.872 0.643 0.632 0.604 0.643 0.644 0.204 55 | Qwen1.5 (14B) 0.425 0.711 0.772 0.3 0.862 0.626 0.686 0.693 0.593 0.515 0.178 56 | Gemini 1.0 Pro (002) 0.422 0.751 0.714 0.391 0.788 0.534 0.665 0.816 0.475 0.483 0.194 57 | Jamba 1.5 Mini 0.414 0.746 0.71 0.388 0.89 0.582 0.318 0.691 0.503 0.632 0.179 58 | Claude Instant 1.2 0.399 0.616 0.731 0.343 0.844 0.631 0.499 0.721 0.586 0.559 0.194 59 | Llama 3 (8B) 0.387 0.754 0.681 0.378 0.766 0.602 
0.391 0.499 0.637 0.581 0.183 60 | Claude 3 Sonnet (20240229) 0.377 0.111 0.072 0.028 0.918 0.652 0.084 0.907 0.49 0.684 0.218 61 | GPT-3.5 Turbo (0613) 0.358 0.655 0.678 0.335 0.838 0.614 0.667 0.501 0.528 0.622 0.187 62 | LLaMA (65B) 0.345 0.755 0.672 0.433 0.754 0.584 0.257 0.489 0.48 0.507 0.189 63 | Arctic Instruct 0.338 0.654 0.586 0.39 0.828 0.575 0.519 0.768 0.588 0.581 0.172 64 | Gemma (7B) 0.336 0.752 0.665 0.336 0.808 0.571 0.5 0.559 0.581 0.513 0.187 65 | GPT-3.5 (text-davinci-002) 0.336 0.719 0.71 0.394 0.796 0.568 0.428 0.479 0.58 0.525 0.174 66 | Mistral NeMo (2402) 0.333 0.731 0.65 0.265 0.822 0.604 0.668 0.782 0.415 0.59 0.177 67 | Mistral Large (2402) 0.328 0.454 0.485 0.311 0.894 0.638 0.75 0.694 0.479 0.499 0.182 68 | Command 0.327 0.749 0.777 0.391 0.774 0.525 0.236 0.452 0.578 0.445 0.088 69 | Llama 3.2 Vision Instruct Turbo (11B) 0.325 0.756 0.671 0.234 0.724 0.511 0.739 0.823 0.435 0.27 0.179 70 | Llama 3.1 Instruct Turbo (8B) 0.303 0.756 0.677 0.209 0.74 0.5 0.703 0.798 0.342 0.245 0.181 71 | Command R 0.299 0.742 0.72 0.352 0.782 0.567 0.266 0.551 0.507 0.555 0.149 72 | Mistral v0.1 (7B) 0.292 0.716 0.687 0.367 0.776 0.584 0.297 0.377 0.58 0.525 0.16 73 | DBRX Instruct 0.289 0.488 0.55 0.284 0.91 0.643 0.358 0.671 0.426 0.694 0.131 74 | Mistral Small (2402) 0.288 0.519 0.587 0.304 0.862 0.593 0.621 0.734 0.389 0.616 0.169 75 | Jamba Instruct 0.287 0.658 0.636 0.384 0.796 0.582 0.38 0.67 0.54 0.519 0.164 76 | Qwen1.5 (7B) 0.275 0.448 0.749 0.27 0.806 0.569 0.561 0.6 0.523 0.479 0.153 77 | Mistral Medium (2312) 0.268 0.449 0.468 0.29 0.83 0.618 0.565 0.706 0.452 0.61 0.169 78 | Claude 3 Haiku (20240307) 0.263 0.244 0.252 0.144 0.838 0.662 0.131 0.699 0.46 0.702 0.148 79 | Yi (6B) 0.253 0.702 0.748 0.31 0.8 0.53 0.126 0.375 0.519 0.497 0.117 80 | Llama 2 (13B) 0.233 0.741 0.64 0.371 0.634 0.505 0.102 0.266 0.591 0.392 0.167 81 | Falcon (40B) 0.217 0.671 0.676 0.392 0.662 0.507 0.128 0.267 0.442 0.419 0.162 82 | Jurassic-2 Jumbo (178B) 0.215 0.728 0.65 0.385 0.688 0.483 0.103 0.239 0.533 0.431 0.114 83 | Mistral Instruct v0.3 (7B) 0.196 0.716 0.68 0.253 0.79 0.51 0.289 0.538 0.331 0.517 0.142 84 | Jurassic-2 Grande (17B) 0.172 0.744 0.627 0.35 0.614 0.471 0.064 0.159 0.468 0.39 0.102 85 | Phi-2 0.169 0.703 0.68 0.155 0.798 0.518 0.255 0.581 0.334 0.41 0.038 86 | Llama 2 (7B) 0.152 0.686 0.612 0.333 0.544 0.425 0.097 0.154 0.502 0.392 0.144 87 | Luminous Supreme (70B) 0.145 0.743 0.656 0.299 0.284 0.316 0.078 0.137 0.452 0.276 0.102 88 | Command Light 0.105 0.629 0.686 0.195 0.398 0.386 0.098 0.149 0.397 0.312 0.023 89 | Luminous Extended (30B) 0.078 0.684 0.611 0.253 0.272 0.248 0.04 0.075 0.421 0.276 0.083 90 | Falcon (7B) 0.064 0.621 0.58 0.285 0.26 0.288 0.044 0.055 0.346 0.254 0.094 91 | OLMo (7B) 0.052 0.597 0.603 0.259 0.222 0.305 0.029 0.044 0.341 0.229 0.097 92 | Luminous Base (13B) 0.041 0.633 0.577 0.197 0.286 0.243 0.026 0.028 0.332 0.26 0.066 93 | -------------------------------------------------------------------------------- /benchbench/data/glue/leaderboard.tsv: -------------------------------------------------------------------------------- 1 | Rank Name Model URL Score CoLA SST-2 MRPC STS-B QQP MNLI-m MNLI-mm QNLI RTE WNLI AX 2 | 1 Microsoft Alexander v-team Turing ULR v6 91.3 73.3 97.5 94.2/92.3 93.5/93.1 76.4/90.9 92.5 92.1 96.7 93.6 97.9 55.4 3 | 2 JDExplore d-team Vega v1 91.3 73.8 97.9 94.5/92.6 93.5/93.1 76.7/91.1 92.1 91.9 96.7 92.4 97.9 51.4 4 | 3 Microsoft Alexander v-team Turing NLR v5 91.2 72.6 97.6 93.8/91.7 93.7/93.3 76.4/91.1 92.6 92.4 
97.9 94.1 95.9 57.0 5 | 4 DIRL Team DeBERTa + CLEVER 91.1 74.7 97.6 93.3/91.1 93.4/93.1 76.5/91.0 92.1 91.8 96.7 93.2 96.6 53.3 6 | 5 ERNIE Team - Baidu ERNIE 91.1 75.5 97.8 93.9/91.8 93.0/92.6 75.2/90.9 92.3 91.7 97.3 92.6 95.9 51.7 7 | 6 AliceMind & DIRL StructBERT + CLEVER 91.0 75.3 97.7 93.9/91.9 93.5/93.1 75.6/90.8 91.7 91.5 97.4 92.5 95.2 49.1 8 | 7 DeBERTa Team - Microsoft DeBERTa / TuringNLRv4 90.8 71.5 97.5 94.0/92.0 92.9/92.6 76.2/90.8 91.9 91.6 99.2 93.2 94.5 53.2 9 | 8 HFL iFLYTEK MacALBERT + DKM 90.7 74.8 97.0 94.5/92.6 92.8/92.6 74.7/90.6 91.3 91.1 97.8 92.0 94.5 52.6 10 | 9 PING-AN Omni-Sinitic ALBERT + DAAF + NAS 90.6 73.5 97.2 94.0/92.0 93.0/92.4 76.1/91.0 91.6 91.3 97.5 91.7 94.5 51.2 11 | 10 T5 Team - Google T5 90.3 71.6 97.5 92.8/90.4 93.1/92.8 75.1/90.6 92.2 91.9 96.9 92.8 94.5 53.1 12 | 11 Microsoft D365 AI & MSR AI & GATECH MT-DNN-SMART 89.9 69.5 97.5 93.7/91.6 92.9/92.5 73.9/90.2 91.0 90.8 99.2 89.7 94.5 50.2 13 | 12 Huawei Noah's Ark Lab NEZHA-Large 89.8 71.7 97.3 93.3/91.0 92.4/91.9 75.2/90.7 91.5 91.3 96.2 90.3 94.5 47.9 14 | 13 LG AI Research ANNA 89.8 68.7 97.0 92.7/90.1 93.0/92.8 75.3/90.5 91.8 91.6 96.0 91.8 95.9 51.8 15 | 14 Zihang Dai Funnel-Transformer (Ensemble B10-10-10H1024) 89.7 70.5 97.5 93.4/91.2 92.6/92.3 75.4/90.7 91.4 91.1 95.8 90.0 94.5 51.6 16 | 15 ELECTRA Team ELECTRA-Large + Standard Tricks 89.4 71.7 97.1 93.1/90.7 92.9/92.5 75.6/90.8 91.3 90.8 95.8 89.8 91.8 50.7 17 | 16 David Kim 2digit LANet 89.3 71.8 97.3 92.4/89.6 93.0/92.7 75.5/90.5 91.8 91.6 96.4 91.1 88.4 54.6 18 | 17 倪仕文 DropAttack-RoBERTa-large 88.8 70.3 96.7 92.6/90.1 92.1/91.8 75.1/90.5 91.1 90.9 95.3 89.9 89.7 48.2 19 | 18 Microsoft D365 AI & UMD FreeLB-RoBERTa (ensemble) 88.4 68.0 96.8 93.1/90.8 92.3/92.1 74.8/90.3 91.1 90.7 95.6 88.7 89.0 50.1 20 | 19 Junjie Yang HIRE-RoBERTa 88.3 68.6 97.1 93.0/90.7 92.4/92.0 74.3/90.2 90.7 90.4 95.5 87.9 89.0 49.3 21 | 20 Shiwen Ni ELECTRA-large-M (bert4keras) 88.3 69.3 95.8 92.2/89.6 91.2/91.1 75.1/90.5 91.1 90.9 93.8 87.9 91.8 48.2 22 | 21 Facebook AI RoBERTa 88.1 67.8 96.7 92.3/89.8 92.2/91.9 74.3/90.2 90.8 90.2 95.4 88.2 89.0 48.7 23 | 22 Microsoft D365 AI & MSR AI MT-DNN-ensemble 87.6 68.4 96.5 92.7/90.3 91.1/90.7 73.7/89.9 87.9 87.4 96.0 86.3 89.0 42.8 24 | 23 GLUE Human Baselines GLUE Human Baselines 87.1 66.4 97.8 86.3/80.8 92.7/92.6 59.5/80.4 92.0 92.8 91.2 93.6 95.9 - 25 | 24 kk xx ELECTRA-Large-NewSCL(single) 85.6 73.3 97.2 92.7/90.2 92.0/91.7 75.3/90.6 90.8 90.3 95.6 86.9 60.3 50.0 26 | 25 Adrian de Wynter Bort (Alexa AI) 83.6 63.9 96.2 94.1/92.3 89.2/88.3 66.0/85.9 88.1 87.8 92.3 82.7 71.2 51.9 27 | 26 Lab LV ConvBERT base 83.2 67.8 95.7 91.4/88.3 90.4/89.7 73.0/90.0 88.3 87.4 93.2 77.9 65.1 42.9 28 | 27 Stanford Hazy Research Snorkel MeTaL 83.2 63.8 96.2 91.5/88.5 90.1/89.7 73.1/89.9 87.6 87.2 93.9 80.9 65.1 39.9 29 | 28 XLM Systems XLM (English only) 83.1 62.9 95.6 90.7/87.1 88.8/88.2 73.2/89.8 89.1 88.5 94.0 76.0 71.9 44.7 30 | 29 WATCH ME ConvBERT-base-paddle-v1.1 83.1 66.3 95.4 91.6/88.6 90.0/89.2 73.9/90.0 88.2 87.7 93.3 78.2 65.1 9.2 31 | 30 Zhuosheng Zhang SemBERT 82.9 62.3 94.6 91.2/88.3 87.8/86.7 72.8/89.8 87.6 86.3 94.6 84.5 65.1 42.4 32 | 31 Jun Yu mpnet-base-paddle 82.9 60.5 95.9 91.6/88.9 90.8/90.3 72.5/89.7 87.6 86.6 93.3 82.4 65.1 9.2 33 | 32 Danqi Chen SpanBERT (single-task training) 82.8 64.3 94.8 90.9/87.9 89.9/89.1 71.9/89.5 88.1 87.7 94.3 79.0 65.1 45.1 34 | 33 GAL team distilRoBERTa+GAL (6-layer transformer single model) 82.6 60.0 95.3 91.9/89.2 90.0/89.6 73.3/90.0 87.4 86.5 92.7 81.8 65.1 0.0 35 | 34 Kevin 
Clark BERT + BAM 82.3 61.5 95.2 91.3/88.3 88.6/87.9 72.5/89.7 86.6 85.8 93.1 80.4 65.1 40.7 36 | 35 Nitish Shirish Keskar Span-Extractive BERT on STILTs 82.3 63.2 94.5 90.6/87.6 89.4/89.2 72.2/89.4 86.5 85.8 92.5 79.8 65.1 28.3 37 | 36 LV NUS LV-BERT-base 82.1 64.0 94.7 90.9/87.9 89.4/88.8 72.3/89.5 86.6 86.1 92.6 77.0 65.1 39.5 38 | 37 Jason Phang BERT on STILTs 82.0 62.1 94.3 90.2/86.6 88.7/88.3 71.9/89.4 86.4 85.6 92.7 80.1 65.1 28.3 39 | 38 gao jie 1 82.0 66.8 96.5 90.9/87.2 91.4/90.8 72.9/89.6 90.2 56.4 94.7 82.8 62.3 9.2 40 | 39 Gino Tesei RobustRoBERTa 81.9 63.6 96.8 91.6/88.6 90.3/89.6 73.2/89.7 90.0 89.4 95.1 50.3 80.1 50.5 41 | 40 Karen Hambardzumyan WARP with RoBERTa 81.6 53.9 96.3 88.2/83.9 89.5/88.8 68.6/87.7 88.0 88.2 93.5 84.3 65.1 41.2 42 | 41 Junxiong Wang Bigs-128-1000k 81.5 64.4 94.9 88.7/84.2 87.8/87.5 71.2/89.2 86.1 85.0 91.6 77.6 65.1 36.2 43 | 42 Huawei Noah's Ark Lab MTL CombinedKD-TinyRoBERTa (6 layer 82M parameters, MATE-KD + AnnealingKD) 81.5 58.6 95.1 91.2/88.1 88.5/88.4 73.0/89.7 86.2 85.6 92.4 76.6 65.1 20.2 44 | 43 Richard Bai segaBERT-large 81.4 62.6 94.8 89.7/86.1 88.6/87.7 72.5/89.4 87.9 87.7 94.0 71.6 65.1 0.0 45 | 44 廖亿 u-PMLM-R (Huawei Noah's Ark Lab) 81.3 56.9 94.2 90.7/87.7 89.7/89.1 72.2/89.4 86.1 85.4 92.1 78.5 65.1 40.0 46 | 45 Xinsong Zhang AMBERT-BASE 81.0 60.0 95.2 90.6/87.1 86.3/88.2 72.2/89.5 87.2 86.5 92.6 72.6 65.1 39.4 47 | 46 Mikita Sazanovich Routed BERTs 80.7 56.1 93.6 88.6/84.7 88.0/87.6 71.0/88.8 85.2 84.5 92.6 80.0 65.1 9.2 48 | 47 USCD-AI4Health Team CERT 80.7 58.9 94.6 89.8/85.9 87.9/86.8 72.5/90.3 87.2 86.4 93.0 71.2 65.1 39.6 49 | 48 Jacob Devlin BERT: 24-layers, 16-heads, 1024-hidden 80.5 60.5 94.9 89.3/85.4 87.6/86.5 72.1/89.3 86.7 85.9 92.7 70.1 65.1 39.6 50 | 49 Chen Qian KerasNLP XLM-R 80.4 56.3 96.1 89.8/86.3 88.4/87.7 72.3/89.0 87.7 87.1 92.8 69.2 65.1 40.6 51 | 50 Chen Qian KerasNLP RoBERTa 80.4 56.3 96.1 89.8/86.3 88.4/87.7 72.3/89.0 87.7 87.1 92.8 69.2 65.1 40.6 52 | 51 Jinliang LU MULTIPLE_ADAPTER_T5_BASE 80.3 54.1 93.8 90.1/86.8 87.9/87.6 71.8/88.9 86.1 85.7 93.5 76.8 62.3 9.2 53 | 52 Yoshitomo Matsubara HF bert-large-uncased (default fine-tuning) 80.2 61.5 94.6 89.2/85.2 86.4/85.0 72.2/89.3 86.4 85.7 92.4 68.9 65.1 36.9 54 | 53 Neil Houlsby BERT + Single-task Adapters 80.2 59.2 94.3 88.7/84.3 87.3/86.1 71.5/89.4 85.4 85.0 92.4 71.6 65.1 9.2 55 | 54 KI BERT KI-BERT 80.0 55.6 94.5 88.2/83.9 86.3/85.1 71.5/88.9 85.2 83.7 91.2 69.3 73.3 35.6 56 | 55 Xiangyang Liu elasticbert-large-12L 79.9 57.0 92.9 89.4/86.0 89.7/88.6 72.7/89.6 85.4 84.9 92.3 71.8 62.3 9.2 57 | 56 刘向阳 roberta-large-12L 79.8 59.4 94.6 89.1/85.8 89.8/89.1 71.5/89.4 86.4 85.2 91.6 67.3 62.3 9.2 58 | 57 Zhuohan Li Macaron Net-base 79.7 57.6 94.0 88.4/84.4 87.5/86.3 70.8/89.0 85.4 84.5 91.6 70.5 65.1 38.7 59 | 58 shi To GAT-bert-base 79.6 56.8 94.0 89.4/85.3 87.9/86.8 72.4/89.4 85.7 84.5 91.8 70.5 62.3 9.2 60 | 59 teerapong saelim WT-VAT-BERT (Base) 79.5 56.0 94.4 89.2/85.5 87.3/86.2 72.9/89.8 85.5 84.8 91.4 70.4 62.3 9.2 61 | 60 Anshuman Singh Bert-n-Pals 79.1 52.2 93.4 89.5/85.6 86.6/85.9 71.4/89.0 84.1 83.5 90.6 75.4 62.3 33.8 62 | 61 ANSHUMAN SINGH (RA1811003010460) DeepPavlov Multitask PalBert 78.8 48.1 93.4 88.9/85.6 87.0/86.7 71.4/89.0 83.9 83.4 90.8 76.7 62.3 33.8 63 | 62 xiaok Liu BERT-EMD(6-layer; Single model; No DA) 78.7 47.5 93.3 89.8/86.4 87.6/86.8 72.0/89.3 84.7 83.5 90.7 71.7 65.1 9.2 64 | 63 蘇大鈞 SesameBERT-Base 78.6 52.7 94.2 88.9/84.8 86.5/85.5 70.8/88.8 83.7 83.6 91.0 67.6 65.1 35.8 65 | 64 xinge ma ReptileDistil 78.5 47.9 92.8 89.2/85.4 
87.1/85.9 71.0/89.0 83.6 82.9 90.4 73.5 65.1 33.2 66 | MobileBERT Team MobileBERT 78.5 51.1 92.6 88.8/84.5 86.2/84.8 70.5/88.3 84.3 83.4 91.6 70.4 65.1 34.3 67 | Linyuan Gong StackingBERT-Base 78.4 56.2 93.9 88.2/83.9 84.2/82.5 70.4/88.7 84.4 84.2 90.1 67.0 65.1 36.6 68 | TinyBERT Team TinyBERT (6-layer; Single model) 78.1 51.1 93.1 87.3/82.6 85.0/83.7 71.6/89.1 84.6 83.2 90.4 70.0 65.1 9.2 69 | SqueezeBERT Team SqueezeBERT (4.3x faster than BERT-base on smartphone) 78.1 46.5 91.4 89.5/86.0 87.0/86.3 71.5/89.0 82.0 81.1 90.1 73.2 65.1 35.3 70 | Anshuman Singh CAMTL 77.9 53.0 92.6 88.3/84.4 86.6/85.9 70.0/88.5 82.3 82.0 90.5 72.8 58.2 33.8 71 | 傅薛林 KRISFU 77.8 52.4 92.5 89.0/84.8 83.7/82.2 70.4/88.6 84.3 83.4 90.9 65.9 65.1 36.1 72 | 王上 s0 77.8 46.8 92.9 88.9/84.8 87.2/86.5 71.9/89.1 84.5 83.4 90.8 70.9 60.3 35.3 73 | Stark Tony Pocket GLUE 77.6 49.3 92.4 89.0/84.6 84.9/84.0 70.1/88.7 84.0 82.8 90.1 67.2 65.1 36.1 74 | Pavan Kalyan Reddy Neerudu Pavan Neerudu - BERT 77.6 56.1 93.5 87.6/83.2 85.3/83.8 70.6/88.8 84.0 83.4 90.8 64.0 60.3 34.6 75 | NLC MSR Asia BERT-of-Theseus (6-layer; single model) 77.1 47.8 92.2 87.6/83.2 85.6/84.1 71.6/89.3 82.4 82.1 89.6 66.2 65.1 9.2 76 | Hanxiong Huang Hanxiong Huang 75.9 49.3 93.3 87.1/81.9 83.3/81.7 71.5/89.1 84.8 83.8 91.0 64.1 53.4 9.2 77 | YeonTaek Oh EL-BERT(6-Layer, Single model) 75.6 47.7 91.0 87.8/83.0 81.2/80.2 69.9/88.1 81.8 81.0 90.2 59.9 65.1 31.8 78 | EVS Team Anonymous 74.7 52.6 93.4 87.6/83.2 61.2/59.1 71.8/89.3 83.7 83.2 89.9 65.0 62.3 35.6 79 | Chen Money KerasNLP 12/05/2022 Trial 2 74.6 52.2 93.5 87.8/82.6 84.5/83.1 71.3/89.3 82.3 81.6 89.3 61.7 43.8 32.9 80 | Sinx ZHIYUAN 74.1 57.0 95.2 91.4/88.4 91.1/90.8 24.2/23.7 87.7 87.3 92.5 81.7 47.9 0.3 81 | Tirana Noor Fatyanosa distilbert-base-uncased 73.6 45.8 92.3 87.6/83.1 71.0/71.0 69.6/88.2 81.6 81.3 88.8 54.1 65.1 31.8 82 | Haiqin YANG RefBERT 73.1 47.9 92.9 86.9/81.9 75.0/76.3 61.6/84.4 80.9 80.3 87.3 61.7 54.8 -10.3 83 | Haiqin Yang RefBERT 73.1 47.9 92.9 86.9/81.9 75.0/76.3 61.4/84.2 80.9 80.3 87.3 61.7 54.8 -10.3 84 | Haiqin Yang RefBERT 71.8 36.3 92.9 86.9/81.9 75.0/76.3 61.6/83.8 80.9 80.3 87.3 61.7 54.8 -10.3 85 | Haiqin Yang RefBERT 71.8 36.3 92.9 86.9/81.9 75.0/76.3 61.3/83.6 80.9 80.3 87.3 61.7 54.8 -10.3 86 | 公能公能 1111 71.4 35.8 90.1 83.2/75.7 81.0/79.3 68.5/87.5 77.5 77.1 86.7 58.0 56.8 9.2 87 | Jack Hessel Bag-of-words only BoW-BERT (Base) 70.0 14.3 86.7 82.9/75.2 81.8/80.3 68.3/87.5 79.8 79.7 86.2 60.4 65.1 31.0 88 | GLUE Baselines BiLSTM+ELMo+Attn 70.0 33.6 90.4 84.4/78.0 74.2/72.3 63.1/84.3 74.1 74.5 79.8 58.9 65.1 21.7 89 | -------------------------------------------------------------------------------- /benchbench/data/heim/alignment_auto.tsv: -------------------------------------------------------------------------------- 1 | Model/adapter Mean win rate ↑ MS-COCO (base) - Expected CLIP score ↑ MS-COCO (base) - Max CLIP score ↑ Caltech-UCSD Birds-200-2011 - Expected CLIP score ↑ Caltech-UCSD Birds-200-2011 - Max CLIP score ↑ DrawBench (image quality categories) - Expected CLIP score ↑ DrawBench (image quality categories) - Max CLIP score ↑ PartiPrompts (image quality categories) - Expected CLIP score ↑ PartiPrompts (image quality categories) - Max CLIP score ↑ dailydall.e - Expected CLIP score ↑ dailydall.e - Max CLIP score ↑ Landing Page - Expected CLIP score ↑ Landing Page - Max CLIP score ↑
Logos - Expected CLIP score ↑ Logos - Max CLIP score ↑ Magazine Cover Photos - Expected CLIP score ↑ Magazine Cover Photos - Max CLIP score ↑ Common Syntactic Processes - Expected CLIP score ↑ Common Syntactic Processes - Max CLIP score ↑ DrawBench (reasoning categories) - Expected CLIP score ↑ DrawBench (reasoning categories) - Max CLIP score ↑ PartiPrompts (reasoning categories) - Expected CLIP score ↑ PartiPrompts (reasoning categories) - Max CLIP score ↑ Relational Understanding - Expected CLIP score ↑ Relational Understanding - Max CLIP score ↑ Detection (PaintSkills) - Expected CLIP score ↑ Detection (PaintSkills) - Max CLIP score ↑ Winoground - Expected CLIP score ↑ Winoground - Max CLIP score ↑ PartiPrompts (knowledge categories) - Expected CLIP score ↑ PartiPrompts (knowledge categories) - Max CLIP score ↑ DrawBench (knowledge categories) - Expected CLIP score ↑ DrawBench (knowledge categories) - Max CLIP score ↑ TIME's most significant historical figures - Expected CLIP score ↑ TIME's most significant historical figures - Max CLIP score ↑ Demographic Stereotypes - Expected CLIP score ↑ Demographic Stereotypes - Max CLIP score ↑ Mental Disorders - Expected CLIP score ↑ Mental Disorders - Max CLIP score ↑ Inappropriate Image Prompts (I2P) - Expected CLIP score ↑ Inappropriate Image Prompts (I2P) - Max CLIP score ↑ 2 | Dreamlike Diffusion v1.0 (1B) 0.958 27.06 28.593 27.071 27.071 29.446 31.775 28.071 29.793 30.507 32.424 27.751 29.513 26.027 29.048 28.419 30.538 26.146 27.745 27.875 29.624 27.998 30.088 26.46 28.525 25.603 27.195 25.381 27.033 29.205 31.118 30.656 32.961 26.17 27.584 23.22 24.839 22.425 24.541 28.473 31.283 3 | Vintedois (22h) Diffusion model v0.1 (1B) 0.806 26.402 28.169 26.209 26.209 27.42 29.887 27.471 29.319 29.901 31.69 27.1 29.07 23.608 26.569 26.279 28.531 25.379 27.467 27.171 28.953 27.299 29.323 26.099 28.261 25.403 26.972 24.409 26.255 28.728 30.602 29.354 32.072 26.653 28.169 22.62 24.868 22.258 24.71 27.149 30.318 4 | Dreamlike Photoreal v2.0 (1B) 0.779 26.104 27.733 26.597 26.597 28.186 31.06 27.392 29.238 30.289 32.345 26.549 28.668 24.582 27.419 27.462 29.855 24.975 26.689 26.843 28.887 27.163 29.322 26.184 28.312 25.123 26.785 24.136 25.983 28.727 30.62 29.421 32.221 25.907 27.373 22.358 24.193 21.809 24.028 28.009 31.036 5 | Stable Diffusion v2 base (1B) 0.777 26.255 28.052 26.089 26.089 28.923 31.806 27.421 29.388 29.246 31.576 26.535 29.116 24.19 27.284 28.292 31.275 24.731 26.907 27.107 29.492 27.281 29.653 25.839 28.282 25.194 26.976 24.643 27.083 28.105 30.331 29.243 32.155 25.443 27.379 21.82 24.385 21.53 23.717 26.509 30.134 6 | Stable Diffusion v1.5 (1B) 0.767 26.376 28.147 26.699 26.699 27.843 30.343 27.165 29.34 29.81 32.473 26.553 28.714 23.975 26.931 27.09 29.75 24.978 27.21 26.899 29.078 27.103 29.477 25.272 27.929 24.999 26.977 24.372 26.542 28.248 30.414 28.55 31.554 26.033 27.922 22.134 24.162 22.352 24.366 26.867 30.201 7 | DeepFloyd IF X-Large (4.3B) 0.758 25.791 27.653 26.126 26.126 29.691 32.408 27.328 29.176 29.021 31.105 27.448 29.421 25.388 28.423 29.366 32.274 24.795 26.862 26.936 29.1 27.852 29.997 26 28.446 25.404 27.136 23.926 25.873 28.175 30.135 30.038 32.743 24.892 26.678 21.659 23.637 21.497 23.484 25.486 28.971 8 | Stable Diffusion v1.4 (1B) 0.749 26.425 28.28
26.433 26.433 27.713 30.556 27.228 29.406 29.542 32.076 26.881 29.496 23.582 26.615 26.944 29.566 24.719 26.789 27.01 29.074 26.792 29.278 25.399 28.07 25.069 26.812 24.442 26.347 28.135 30.448 28.325 31.246 26.303 28.057 21.926 24.358 22.637 24.595 26.608 29.841 9 | Safe Stable Diffusion weak (1B) 0.742 26.196 27.885 26.68 26.68 27.551 30.098 27.078 29.233 29.577 32.034 26.642 29.103 24.008 26.615 27.201 29.71 24.917 27.028 27.033 29.242 27.208 29.69 25.188 27.827 24.964 26.882 24.337 26.332 28.325 30.628 27.982 30.974 26.24 27.882 22.022 24.128 22.259 24.347 26.63 29.975 10 | DALL-E 2 (3.5B) 0.696 27.102 28.714 25.323 25.323 28.56 30.857 27.841 29.623 29.89 31.886 26.35 28.28 25.798 28.257 22.522 25.246 25.301 26.9 27.909 29.722 28.696 30.6 26.839 28.994 26.79 28.382 24.96 26.679 28.628 30.597 30.613 32.92 22.177 24.177 21.529 23.555 15.062 16.36 20.186 22.735 11 | DALL-E mega (2.6B) 0.695 27.193 29.205 26.752 26.752 26.866 29.347 27.925 30.074 28.761 31.018 21.37 23.327 24.489 26.732 19.366 21.312 25.246 27.135 26.734 29.346 27.289 29.251 25.698 28.074 26.744 28.508 23.913 25.853 28.1 30.427 27.971 30.817 26.743 28.378 22.849 25.288 21.634 24.123 24.502 27.814 12 | DeepFloyd IF Large (0.9B) 0.628 25.504 27.046 25.881 25.881 28.705 32.045 27.06 28.906 28.786 30.828 26.825 28.923 24.417 27.176 28.412 31.832 24.757 26.614 26.461 28.569 27.423 29.715 25.657 27.965 25.421 27.154 23.733 25.577 28.07 29.954 29.009 31.848 23.527 25.425 21.5 23.537 21.097 23.593 25.025 28.5 13 | Stable Diffusion v2.1 base (1B) 0.609 25.861 27.507 26.065 26.065 28.135 30.718 27.205 29.136 29.028 31.617 25.658 27.805 22.697 25.8 25.989 28.949 24.566 26.543 26.575 28.608 26.311 28.921 25.754 27.992 24.851 26.826 23.753 25.658 27.773 29.757 28.658 31.35 25.898 27.535 21.778 24.588 21.329 23.619 26.266 29.768 14 | DeepFloyd IF Medium (0.4B) 0.56 25.517 27.116 25.692 25.692 28.541 31.596 26.739 28.63 28.531 30.875 26.338 28.054 24.225 27.369 27.657 30.839 24.709 26.782 26.251 28.357 27.21 29.424 25.315 27.951 25.52 27.223 23.63 25.592 27.525 29.557 27.928 30.694 21.873 24.283 21.387 23.902 21.487 23.921 24.562 28.175 15 | Openjourney v2 (1B) 0.506 26.807 28.61 25.661 25.661 26.317 29.183 26.448 28.682 28.956 31.465 26.097 28.122 24.803 27.362 24.812 27.158 23.831 26.053 26.398 28.555 25.795 28.251 24.316 27.084 24.932 26.726 22.811 25.188 27.209 29.476 27.328 30.312 24.25 26.373 20.996 23.57 21.123 24.108 25.056 28.469 16 | Safe Stable Diffusion medium (1B) 0.493 25.671 27.676 26.003 26.003 26.563 29.467 26.536 28.739 28.687 31.451 26.207 28.47 23.355 26.011 25.746 28.411 24.023 26.136 26.271 28.558 26.528 29.02 24.358 26.831 24.4 26.331 23.498 25.557 27.725 30.182 27.619 30.716 25.521 27.165 20.949 23.289 19.615 21.402 25.803 29.444 17 | GigaGAN (1B) 0.4 25.722 27.645 26.569 26.569 25.668 27.828 26.589 28.678 28.154 30.582 25.199 27.185 20.775 23.154 20.637 23.247 24.301 26.324 26.328 28.668 26.145 28.237 24.996 27.004 24.391 26.205 23.28 25.362 27.245 29.449 27.433 30.121 23.746 26.019 20.94 23.343 19.091 22.136 24.886 28.073 18 | Promptist + Stable Diffusion v1.4 (1B) 0.369 25.245 27.209 25.207 25.207 24.786 27.488 26.213 28.384 28.776 31.238 24.525 26.693 22.608 25.516 26.105 28.509 23.599 25.233 25.449 27.704 25.462 27.918 23.745 26.363 23.872 25.834 21.741 23.811 27.209 29.446 28.214 30.657 24.663 26.595 20.877 23.337 20.785 23.185 25.051 28.182 19 | Safe Stable Diffusion strong (1B) 0.344 24.787 26.974 25.769 25.769 25.704 28.244 25.758 28.148 27.727 30.772 25.476 27.77 22.65 25.527 24.476 27.406 23.074 25.367 
25.522 27.833 25.722 28.394 23.246 25.788 23.898 25.93 22.48 24.761 26.754 29.467 26.842 29.558 24.763 26.864 20.029 22.566 17.414 19.736 23.836 28.104 20 | Redshift Diffusion (1B) 0.244 24.837 26.695 25.407 25.407 25.15 27.975 25.494 27.792 27.753 30.215 24.306 26.406 20.97 23.405 22.523 25.576 22.733 25.258 25.032 27.432 23.822 27.015 23.125 25.863 23.58 25.667 21.284 23.548 26.755 29.104 26.408 28.721 23.341 25.651 20.01 22.322 19.156 20.985 23.825 27.389 21 | DALL-E mini (0.4B) 0.226 25.012 27.029 25.648 25.648 22.838 25.168 25.366 27.615 25.483 27.974 19.796 21.853 22.677 24.914 17.046 19.148 21.89 24.111 23.495 26.067 25.2 27.45 23.145 25.743 24.982 26.838 21.337 23.457 25.301 27.538 24.023 26.368 22.972 24.798 21.042 23.494 21.046 22.96 21.367 24.746 22 | Safe Stable Diffusion max (1B) 0.223 23.859 26.086 25.562 25.562 24.708 27.835 24.852 27.384 26.671 29.919 24.982 27.168 20.657 24.167 22.995 25.918 22.188 24.81 24.662 27.251 24.651 27.675 22.673 25.617 23.494 25.718 21.663 24.438 25.726 28.466 25.257 28.386 24.043 26.374 19.46 22.258 16.703 18.993 21.666 26.659 23 | Openjourney v1 (1B) 0.198 24.894 27.025 24.611 24.611 21.407 24.437 24.868 27.346 27.697 30.206 25.33 27.495 19.74 22.446 22.989 25.714 22.254 24.603 24.223 26.86 23.298 25.935 20.85 23.903 23.859 25.846 20.444 23.165 26.339 28.686 26.85 29.452 24.038 25.885 19.712 22.125 19.291 22.07 23.823 27.315 24 | Lexica Search with Stable Diffusion v1.5 (1B) 0.18 21.961 24.592 22.964 22.964 22.862 25.672 22.769 25.521 23.685 26.341 22.429 25.009 21.818 24.365 21.709 23.701 21.728 24.083 22.618 25.314 22.602 25.234 22.138 24.929 22.894 25.118 21.659 23.818 23.01 25.779 23.298 26.011 22.926 25.289 21.601 24.112 21.228 23.99 24.632 28.751 25 | MultiFusion (13B) 0.163 24.236 26.55 24.601 24.601 23.061 26.272 24.036 26.566 25.655 29.027 22.637 24.586 19.342 22.464 19.955 22.778 22.315 24.757 24.156 26.968 24.073 27.183 22.967 26.243 22.849 24.997 21.331 23.92 24.127 26.652 23.666 27.558 17.701 20.452 19.768 22.997 19.846 23.121 19.239 23.436 26 | CogView2 (6B) 0.085 23.082 25.896 23.656 23.656 22.005 25.267 23.157 26.143 23.247 26.783 18.952 21.609 19.135 21.655 16.365 19.004 21.808 24.546 22.251 25.46 22.854 26.096 22.743 26.103 22.804 25.286 20.928 23.91 21.91 25.171 19.703 23.339 13.897 16.125 20.031 24.113 18.803 21.807 16.534 20.931 27 | minDALL-E (1.3B) 0.045 21.596 25.119 23.908 23.908 22.67 25.628 22.25 25.615 21.644 25.391 17.916 20.329 20.343 23.082 16.728 19.315 19.798 22.602 21.283 24.54 22.225 25.359 20.573 23.678 21.334 24.398 18.825 21.861 22.089 25.715 20.741 24.123 14.138 16.652 19.439 23.03 19.168 21.523 16.836 21.259 28 | --------------------------------------------------------------------------------