├── data └── .gitkeep ├── tests ├── __init__.py ├── testing_utils.py ├── test_evaluate.py ├── test_dummy_benchmark.py └── test_hub.py ├── benchmarks ├── dummy │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── gem │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── raft │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── generic_competition │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── __init__.py ├── README.md └── registration.py ├── src └── hf_benchmarks │ ├── __init__.py │ ├── file_utils.py │ ├── schemas.py │ └── hub.py ├── .env.example ├── Makefile ├── setup.cfg ├── .github └── workflows │ ├── run_gem_scoring.yml │ ├── test_benchmarks.yaml │ └── run_raft_evaluation.yaml ├── README.md ├── scripts ├── submission_table.py ├── run_evaluation_dummy.py ├── run_evaluation.py └── run_gem_scoring.py ├── .gitignore ├── setup.py └── LICENSE /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/dummy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/gem/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/raft/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/generic_competition/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/raft/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.24.2 -------------------------------------------------------------------------------- /benchmarks/generic_competition/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.24.2 -------------------------------------------------------------------------------- /benchmarks/dummy/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.24.2 2 | datasets<=2.2 3 | evaluate==0.1.2 -------------------------------------------------------------------------------- /benchmarks/gem/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==1.17.0 # DO NOT CHANGE! 
2 | gem-metrics @ git+https://github.com/GEM-benchmark/GEM-metrics.git -------------------------------------------------------------------------------- /src/hf_benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from .file_utils import load_json, save_json 2 | from .hub import get_benchmark_repos, get_model_index, http_get, http_post 3 | from .schemas import Evaluation, Metric, Result, Task 4 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | HF_TOKEN=hf_app_xxx # A "God" token to download private submission repos 2 | AUTOTRAIN_TOKEN=hf_xxx # A Hugging Face access token associated with a valid AutoTrain account 3 | AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co -------------------------------------------------------------------------------- /tests/testing_utils.py: -------------------------------------------------------------------------------- 1 | BOGUS_BENCHMARK_NAME = "bogus" 2 | DUMMY_BENCHMARK_NAME = "dummy" 3 | DUMMY_EVALUATION_ID = "lewtun/benchmarks-dummy-evaluation" 4 | DUMMY_PRIVATE_LABELS_ID = "lewtun/benchmarks-dummy-private-labels" 5 | DUMMY_SUBMISSION_ID = "lewtun/benchmarks-dummy-submission" 6 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from .registration import Benchmark, registry 2 | 3 | 4 | raft = Benchmark(name="raft") 5 | gem = Benchmark(name="gem") 6 | dummy = Benchmark(name="dummy") 7 | 8 | registry.register_benchmark(raft) 9 | registry.register_benchmark(gem) 10 | registry.register_benchmark(dummy) 11 | -------------------------------------------------------------------------------- /src/hf_benchmarks/file_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | 5 | def load_json(path): 6 | with open(path, "r") as f: 7 | return json.load(f) 8 | 9 | 10 | def save_json(path, data): 11 | Path(path).parent.mkdir(parents=True, exist_ok=True) 12 | with open(path, "w", encoding="utf-8") as f: 13 | json.dump(data, f) 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | style: 2 | python -m black --line-length 119 --target-version py39 . 3 | python -m isort . 4 | 5 | quality: 6 | python -m black --check --line-length 119 --target-version py39 . 7 | python -m isort --check-only . 
8 | python -m flake8 --max-line-length 119 9 | 10 | typecheck-benchmarks: 11 | python -m mypy ./benchmarks 12 | 13 | test: 14 | python -m pytest -sv tests/ -------------------------------------------------------------------------------- /src/hf_benchmarks/schemas.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, TypedDict, Union 2 | 3 | 4 | class Metric(TypedDict): 5 | name: str 6 | type: str 7 | value: Union[float, Optional[dict]] 8 | 9 | 10 | class Task(TypedDict): 11 | name: str 12 | type: str 13 | metrics: List[Metric] 14 | 15 | 16 | class Result(TypedDict): 17 | task: Task 18 | 19 | 20 | class Evaluation(TypedDict): 21 | results: List[Result] 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | [mypy-datasets] 3 | follow_imports = skip 4 | [mypy-hf_benchmarks] 5 | ignore_missing_imports = True 6 | 7 | [isort] 8 | multi_line_output = 3 9 | include_trailing_comma = True 10 | force_grid_wrap = 0 11 | use_parentheses = True 12 | ensure_newline_before_comments = True 13 | line_length = 119 14 | lines_after_imports = 2 15 | 16 | [flake8] 17 | ignore = E203, E501, W503 18 | max-line-length = 119 19 | per-file-ignores = 20 | # imported but unused 21 | __init__.py: F401 -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Benchmarks 2 | 3 | ## AutoTrain configuration details 4 | 5 | Benchmarks are evaluated by AutoTrain, with the payload sent to the `AUTOTRAIN_BACKEND_API` environment variable. The current configuration for the hosted benchmarks is shown in the table below. 6 | 7 | | Benchmark | Backend API | 8 | |:---------:|:----------------------------------------------:| 9 | | RAFT | `https://api.autotrain.huggingface.co` | 10 | | GEM | `https://api.autotrain.huggingface.co` | 11 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | 4 | from benchmarks import registry 5 | 6 | 7 | # TODO(lewtun): use common.evaluation.evaluate as reference? 
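# Every benchmark package must expose an `evaluation.compute_metrics` function whose signature
# uses exactly the parameter names below; `test_evaluate_signature` enforces this contract for all
# registered benchmarks. A minimal sketch of the expected shape (see e.g. benchmarks/dummy/evaluation.py):
#
#   def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> Evaluation:
#       ...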
8 | EVALUATE_ARGS = {"evaluation_dataset", "submission_dataset", "use_auth_token"} 9 | 10 | 11 | def test_evaluate_signature(): 12 | benchmarks = registry.list_benchmarks() 13 | for benchmark in benchmarks: 14 | evaluate_module = importlib.import_module(f"benchmarks.{benchmark.name}.evaluation") 15 | args = inspect.signature(evaluate_module.compute_metrics).parameters.keys() 16 | assert len(args) == len(EVALUATE_ARGS) and sorted(args) == sorted(EVALUATE_ARGS) 17 | -------------------------------------------------------------------------------- /.github/workflows/run_gem_scoring.yml: -------------------------------------------------------------------------------- 1 | name: Update GEM scores 2 | 3 | on: 4 | schedule: 5 | - cron: '0 * * * *' # Update score every hour 6 | 7 | jobs: 8 | 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v2 15 | 16 | - name: Setup Python Environment 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.8 20 | 21 | - name: Install requirements 22 | run: pip install '.[cron]' 23 | 24 | - name: Execute scoring script 25 | env: 26 | HF_GEM_TOKEN: ${{ secrets.HF_GEM_TOKEN }} 27 | run: | 28 | HF_GEM_TOKEN=$HF_GEM_TOKEN python scripts/run_gem_scoring.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Benchmarks 2 | 3 | > A toolkit for evaluating benchmarks on the [Hugging Face Hub](https://huggingface.co) 4 | 5 | ## Hosted benchmarks 6 | 7 | The list of hosted benchmarks is shown in the table below: 8 | 9 | | Benchmark | Description | Submission | Leaderboard | 10 | | :---: | :---: | :---: | :---: | 11 | | RAFT | A benchmark to test few-shot learning in NLP | [`ought/raft-submission`](https://huggingface.co/datasets/ought/raft-submission) | [`ought/raft-leaderboard`](https://huggingface.co/spaces/ought/raft-leaderboard) | 12 | | GEM | A large-scale benchmark for natural language generation | [`GEM/submission-form`](https://huggingface.co/spaces/GEM/submission-form) | [`GEM/results`](https://huggingface.co/spaces/GEM/results) | 13 | 14 | ## Developer installation 15 | 16 | Clone the repository and install the requirements: 17 | 18 | ``` 19 | git clone git@github.com:huggingface/hf_benchmarks.git 20 | cd hf_benchmarks 21 | pip install '.[dev]' 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /tests/test_dummy_benchmark.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from unittest import TestCase 4 | 5 | from huggingface_hub import HfFolder 6 | 7 | from .testing_utils import DUMMY_PRIVATE_LABELS_ID, DUMMY_SUBMISSION_ID 8 | 9 | 10 | class DummyBenchmarkTest(TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | """ 14 | Share this valid token in all tests below. 
Needed for CI 15 | """ 16 | token = os.getenv("HF_TOKEN") 17 | if token: 18 | HfFolder.save_token(token) 19 | 20 | def test_compute_metrics(self): 21 | eval_module = importlib.import_module("benchmarks.dummy.evaluation") 22 | token = HfFolder.get_token() 23 | results = eval_module.compute_metrics(DUMMY_PRIVATE_LABELS_ID, DUMMY_SUBMISSION_ID, use_auth_token=token) 24 | expected_results = { 25 | "results": [ 26 | { 27 | "task": { 28 | "name": "default", 29 | "type": "text-classification", 30 | "metrics": [{"name": "f1", "type": "f1", "value": 0.5}], 31 | } 32 | } 33 | ] 34 | } 35 | self.assertDictEqual(expected_results, results) 36 | -------------------------------------------------------------------------------- /.github/workflows/test_benchmarks.yaml: -------------------------------------------------------------------------------- 1 | name: Benchmark tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | 13 | test_library: 14 | name: Test library 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v2 19 | - name: Setup Python environment 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.9 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install ".[tests]" 27 | - name: Run unit tests 28 | env: 29 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 30 | run: pytest 31 | 32 | check_code_quality: 33 | name: Check code quality 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Checkout code 37 | uses: actions/checkout@v2 38 | - name: Setup Python environment 39 | uses: actions/setup-python@v2 40 | with: 41 | python-version: 3.9 42 | - name: Install dependencies 43 | run: | 44 | python -m pip install --upgrade pip 45 | python -m pip install ".[quality]" 46 | - name: Code quality 47 | run: | 48 | make quality 49 | make typecheck-benchmarks -------------------------------------------------------------------------------- /benchmarks/registration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class Benchmark: 6 | name: str 7 | 8 | 9 | class BenchmarkRegistry: 10 | """ 11 | Registry for all registered benchmarks. 12 | """ 13 | 14 | def __init__(self): 15 | self.benchmarks = {} 16 | 17 | def register_benchmark(self, benchmark): 18 | """ 19 | Register a benchmark. 20 | 21 | Args: 22 | benchmark: Benchmark to register. 23 | """ 24 | name = benchmark.name 25 | if name in self.benchmarks: 26 | raise ValueError(f"Benchmark with name {name} already registered.") 27 | self.benchmarks[name] = benchmark 28 | 29 | def get_benchmark(self, name): 30 | """ 31 | Get a registered benchmark. 32 | 33 | Args: 34 | name: Name of the benchmark. 35 | 36 | Returns: 37 | Benchmark with the given name. 38 | """ 39 | if name not in self.benchmarks: 40 | raise ValueError("Benchmark with name {} not registered.".format(name)) 41 | return self.benchmarks[name] 42 | 43 | def list_benchmarks(self): 44 | """ 45 | List all registered benchmarks. 46 | 47 | Returns: 48 | List of all registered benchmarks. 
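        Example (illustrative; reflects the benchmarks registered in ``benchmarks/__init__.py``):

            >>> registry.list_benchmarks()
            [Benchmark(name='raft'), Benchmark(name='gem'), Benchmark(name='dummy')]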
49 |         """
50 |         return list(self.benchmarks.values())
51 | 
52 | 
53 | registry = BenchmarkRegistry()
54 | 
--------------------------------------------------------------------------------
/scripts/submission_table.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import pandas as pd
4 | import typer
5 | 
6 | from hf_benchmarks import get_benchmark_repos
7 | 
8 | 
9 | app = typer.Typer()
10 | 
11 | 
12 | @app.command()
13 | def run(
14 |     benchmark: str,
15 |     repo_type: str = "prediction",
16 |     start_date: str = None,
17 |     end_date: str = None,
18 |     save_path: str = "./data",
19 | ):
20 |     if start_date is None or end_date is None:
21 |         default_start_time = pd.Timestamp.now() - pd.Timedelta(days=7)
22 |         default_end_time = pd.Timestamp.now()
23 |         typer.echo(
24 |             f"Submission window not provided, so using past week from {default_start_time.date()} as default window"
25 |         )
26 |         start_date = str(default_start_time.date())
27 |         end_date = str(default_end_time.date())
28 | 
29 |     submissions = get_benchmark_repos(
30 |         benchmark=benchmark,
31 |         use_auth_token=True,
32 |         repo_type=repo_type,
33 |         start_date=start_date,
34 |         end_date=end_date,
35 |     )
36 |     typer.echo(f"Found {len(submissions)} submissions for evaluation!")
37 |     df = pd.DataFrame(submissions)
38 |     file_path = os.path.join(save_path, f"{benchmark}_submissions_{start_date}_{end_date}.csv")
39 |     df.to_csv(file_path, index=False)
40 |     typer.echo(f"Saved submissions to {os.path.abspath(file_path)}")
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     app()
45 | 
--------------------------------------------------------------------------------
/.github/workflows/run_raft_evaluation.yaml:
--------------------------------------------------------------------------------
1 | name: Run RAFT evaluation
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   schedule:
6 |     - cron: '0 0 * * SUN' # Run evaluation at midnight every Sunday
7 | 
8 | jobs:
9 | 
10 |   build:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |     - name: Checkout code
15 |       uses: actions/checkout@v2
16 | 
17 |     - name: Setup Python Environment
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: 3.8
21 | 
22 |     - name: Install requirements
23 |       run: pip install '.[cron]'
24 | 
25 |     - name: Get current date
26 |       uses: 1466587594/get-current-time@v2
27 |       id: current-time
28 |       with:
29 |         format: YYYYMMDD-HH
30 |         utcOffset: "+08:00"
31 | 
32 |     - name: Execute evaluation script
33 |       env:
34 |         YEAR: "${{ steps.current-time.outputs.year }}"
35 |         MONTH: "${{ steps.current-time.outputs.month }}"
36 |         DAY: "${{ steps.current-time.outputs.day }}"
37 |         HF_TOKEN: ${{ secrets.HF_TOKEN }}
38 |         AUTOTRAIN_USERNAME: ${{ secrets.AUTOTRAIN_USERNAME }}
39 |         AUTOTRAIN_TOKEN: ${{ secrets.AUTOTRAIN_TOKEN }}
40 |         AUTOTRAIN_BACKEND_API: ${{ secrets.AUTOTRAIN_BACKEND_API }}
41 |       run: |
42 |         HF_TOKEN=$HF_TOKEN AUTOTRAIN_USERNAME=$AUTOTRAIN_USERNAME AUTOTRAIN_TOKEN=$AUTOTRAIN_TOKEN AUTOTRAIN_BACKEND_API=$AUTOTRAIN_BACKEND_API python scripts/run_evaluation.py raft ought/raft-private-labels $YEAR-$MONTH-$DAY 7
--------------------------------------------------------------------------------
/benchmarks/dummy/evaluation.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from evaluate import load  # type: ignore
3 | 
4 | from hf_benchmarks import Evaluation, Metric, Result, Task
5 | 
6 | 
7 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> Evaluation:
8 |     """Computes
metrics for a benchmark. 9 | 10 | Args: 11 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 12 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 13 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 14 | 15 | Returns: 16 | evaluation (:obj:`Evaluation`): The evaluation metrics. 17 | """ 18 | 19 | # Load datasets associated with benchmark 20 | evaluation_ds = load_dataset(evaluation_dataset, use_auth_token=use_auth_token, split="test") 21 | submission_ds = load_dataset(submission_dataset, use_auth_token=use_auth_token, split="test") 22 | # Load metric 23 | f1 = load("f1") 24 | # Define container to store metrics 25 | evaluation = Evaluation(results=[]) 26 | # Compute metrics and build up list of dictionaries, one per task in the benchmark 27 | task_data = Task(name="default", type="text-classification", metrics=[]) 28 | scores = f1.compute( 29 | predictions=submission_ds["label"], 30 | references=evaluation_ds["label"], 31 | average="macro", 32 | ) 33 | for k, v in scores.items(): 34 | task_data["metrics"].append(Metric(name=k, type=k, value=v)) 35 | # Collect results 36 | result = Result(task=task_data) 37 | evaluation["results"].append(result) 38 | 39 | return evaluation 40 | -------------------------------------------------------------------------------- /benchmarks/gem/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | from typing import List 4 | 5 | from huggingface_hub import hf_hub_download # type: ignore 6 | 7 | 8 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> List[dict]: 9 | """Computes metrics for a benchmark. 10 | 11 | Args: 12 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 13 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 14 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 15 | 16 | Returns: 17 | metrics (:obj:`List[dict]`): The evaluation metrics. 18 | """ 19 | # AutoTrain runs the evaluation job inside a Docker container, so we need to 20 | # save the metrics in the root directory to avoid permission errors. 
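    # NOTE: `/app` is assumed to be the working directory of the AutoTrain evaluation container,
    # i.e. a location the job is allowed to write to; `gem_metrics` saves its JSON report there
    # via the `-o` flag used below.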
21 | metrics_filepath = "/app/metrics.json" 22 | # This assumes that the GEM submissions are a single file, with a predefined name 23 | # We'll need to enforce this on the submission repositories 24 | submission_filename = "submission.json" 25 | submission_filepath = hf_hub_download( 26 | repo_id=submission_dataset, filename=submission_filename, repo_type="dataset", use_auth_token=use_auth_token 27 | ) 28 | # gem_metrics automatically downloads the evaluation splits from the Hub 29 | process = subprocess.run( 30 | ["gem_metrics", f"{submission_filepath}", "-o", f"{metrics_filepath}"], stdout=subprocess.PIPE 31 | ) 32 | if process.returncode == -1: 33 | raise ValueError(f"Error running gem_metrics for submission {submission_dataset} on {evaluation_dataset}!") 34 | else: 35 | with open(metrics_filepath, "r") as f: 36 | metrics = json.load(f) 37 | 38 | return [metrics] 39 | -------------------------------------------------------------------------------- /benchmarks/raft/evaluation.py: -------------------------------------------------------------------------------- 1 | from datasets import get_dataset_config_names, load_dataset, load_metric 2 | 3 | from hf_benchmarks import Evaluation, Metric, Result, Task 4 | 5 | 6 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> Evaluation: 7 | """Computes metrics for a benchmark. 8 | 9 | Args: 10 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 11 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 12 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 13 | 14 | Returns: 15 | evaluation (:obj:`Evaluation`): The evaluation metrics. 16 | """ 17 | 18 | # We need to use the public dataset to get the task names 19 | tasks = get_dataset_config_names("ought/raft") 20 | # Load metric 21 | f1 = load_metric("f1") 22 | # Define container to store metrics 23 | evaluation = Evaluation(results=[]) 24 | # Iterate over tasks and build up metrics 25 | for task in sorted(tasks): 26 | task_data = Task(name=task, type="text-classification", metrics=[]) 27 | # Load datasets associated with task 28 | evaluation_ds = load_dataset(path=evaluation_dataset, name=task, use_auth_token=use_auth_token, split="test") 29 | submission_ds = load_dataset(path=submission_dataset, name=task, use_auth_token=use_auth_token, split="test") 30 | # Sort IDs to ensure we compare the correct examples 31 | evaluation_ds = evaluation_ds.sort("ID") 32 | submission_ds = submission_ds.sort("ID") 33 | # Compute metrics and build up list of dictionaries, one per task in the benchmark 34 | scores = f1.compute( 35 | predictions=submission_ds["Label"], 36 | references=evaluation_ds["Label"], 37 | average="macro", 38 | ) 39 | for k, v in scores.items(): 40 | task_data["metrics"].append(Metric(name=k, type=k, value=v)) 41 | # Collect results 42 | result = Result(task=task_data) 43 | evaluation["results"].append(result) 44 | 45 | return evaluation 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # mac OS 132 | .DS_Store 133 | 134 | # Repo settings 135 | data/ 136 | -------------------------------------------------------------------------------- /tests/test_hub.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import pandas as pd 5 | from huggingface_hub import HfFolder 6 | 7 | from hf_benchmarks import get_benchmark_repos 8 | 9 | from .testing_utils import BOGUS_BENCHMARK_NAME, DUMMY_BENCHMARK_NAME, DUMMY_EVALUATION_ID, DUMMY_SUBMISSION_ID 10 | 11 | 12 | class GetBenchmarkReposTest(TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | """ 16 | Share this valid token in all tests below. 
Needed for CI 17 | """ 18 | token = os.getenv("HF_TOKEN") 19 | if token: 20 | HfFolder.save_token(token) 21 | 22 | def test_no_datasets_repo(self): 23 | data = get_benchmark_repos(benchmark=BOGUS_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 24 | self.assertEqual(len(data), 0) 25 | 26 | def test_prediction_repo(self): 27 | data = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 28 | self.assertEqual(len(data), 1) 29 | self.assertEqual(data[0].id, DUMMY_SUBMISSION_ID) 30 | 31 | def test_evaluation_repo(self): 32 | data = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="evaluation") 33 | self.assertEqual(data[0].id, DUMMY_EVALUATION_ID) 34 | 35 | def test_repo_in_submission_window(self): 36 | repo = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 37 | submission_time = pd.to_datetime(repo[0].lastModified) 38 | start_date = (submission_time - pd.Timedelta(days=1)).date() 39 | end_date = (submission_time + pd.Timedelta(days=1)).date() 40 | data = get_benchmark_repos( 41 | benchmark=DUMMY_BENCHMARK_NAME, 42 | use_auth_token=True, 43 | repo_type="prediction", 44 | start_date=start_date, 45 | end_date=end_date, 46 | ) 47 | self.assertEqual(len(data), 1) 48 | self.assertEqual(data[0].id, DUMMY_SUBMISSION_ID) 49 | 50 | def test_repo_outside_submission_window(self): 51 | repo = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 52 | submission_time = pd.to_datetime(repo[0].lastModified) 53 | start_date = (submission_time + pd.Timedelta(days=1)).date() 54 | end_date = (submission_time + pd.Timedelta(days=2)).date() 55 | data = get_benchmark_repos( 56 | benchmark=DUMMY_BENCHMARK_NAME, 57 | use_auth_token=True, 58 | repo_type="prediction", 59 | start_date=start_date, 60 | end_date=end_date, 61 | ) 62 | self.assertEqual(len(data), 0) 63 | -------------------------------------------------------------------------------- /benchmarks/generic_competition/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd # type: ignore 4 | from huggingface_hub import hf_hub_download # type: ignore 5 | from sklearn import metrics # type: ignore 6 | 7 | 8 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str, **kwargs): 9 | """Computes metrics for a benchmark. 10 | 11 | Args: 12 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 13 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 14 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 15 | 16 | Returns: 17 | evaluation (:obj:`Evaluation`): The evaluation metrics. 
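    Note:
        This generic evaluator additionally expects ``user_id`` and ``submission_id`` to be passed via
        ``**kwargs``; they identify the submission file ``submissions/{user_id}-{submission_id}.csv`` in
        the submission dataset, and a ``ValueError`` is raised if either is missing.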
18 | """ 19 | 20 | user_id = kwargs.get("user_id", None) 21 | if user_id is None: 22 | raise ValueError("user_id is required") 23 | submission_id = kwargs.get("submission_id", None) 24 | if submission_id is None: 25 | raise ValueError("submission_id is required") 26 | 27 | eval_fname = hf_hub_download( 28 | repo_id=evaluation_dataset, 29 | filename="solution.csv", 30 | use_auth_token=use_auth_token, 31 | repo_type="dataset", 32 | ) 33 | # download conf 34 | conf_fname = hf_hub_download( 35 | repo_id=evaluation_dataset, 36 | filename="conf.json", 37 | use_auth_token=use_auth_token, 38 | repo_type="dataset", 39 | ) 40 | 41 | # read conf json 42 | with open(conf_fname, "r") as f: 43 | conf = json.load(f) 44 | 45 | metric = conf["EVAL_METRIC"] 46 | 47 | eval_df = pd.read_csv(eval_fname) 48 | 49 | submission_filename = f"submissions/{user_id}-{submission_id}.csv" 50 | sub_fname = hf_hub_download( 51 | repo_id=submission_dataset, 52 | filename=submission_filename, 53 | use_auth_token=use_auth_token, 54 | repo_type="dataset", 55 | ) 56 | sub_df = pd.read_csv(sub_fname) 57 | 58 | # fetch the metric function 59 | _metric = getattr(metrics, metric) 60 | 61 | public_ids = eval_df[eval_df.split == "public"].id.values 62 | private_ids = eval_df[eval_df.split == "private"].id.values 63 | 64 | target_cols = [col for col in eval_df.columns if col not in ["id", "split"]] 65 | public_score = _metric( 66 | eval_df[eval_df.id.isin(public_ids)][target_cols], 67 | sub_df[sub_df.id.isin(public_ids)][target_cols], 68 | ) 69 | private_score = _metric( 70 | eval_df[eval_df.id.isin(private_ids)][target_cols], 71 | sub_df[sub_df.id.isin(private_ids)][target_cols], 72 | ) 73 | 74 | evaluation = { 75 | "public_score": public_score, 76 | "private_score": private_score, 77 | } 78 | return evaluation 79 | -------------------------------------------------------------------------------- /src/hf_benchmarks/hub.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | import pandas as pd 4 | import requests 5 | import typer 6 | from huggingface_hub import HfApi, list_datasets 7 | 8 | 9 | def delete_repos(repository_ids: List[str], auth_token: str, repo_type: str = "dataset") -> None: 10 | typer.echo(f"Found {len(repository_ids)} repos to delete") 11 | for repo_id in repository_ids: 12 | org, name = repo_id.split("/") 13 | HfApi().delete_repo(token=auth_token, organization=org, name=name, repo_type=repo_type) 14 | typer.echo(f"Deleted repo: {repo_id}") 15 | 16 | 17 | def is_time_between(begin_time: str, end_time: str, check_time: str = None) -> bool: 18 | # Adapted from: https://stackoverflow.com/questions/10048249/how-do-i-determine-if-current-time-is-within-a-specified-range-using-pythons-da 19 | # If check time is not given, default to current UTC time 20 | begin_time = pd.to_datetime(begin_time).tz_localize("UTC") 21 | end_time = pd.to_datetime(end_time).tz_localize("UTC") 22 | check_time = pd.to_datetime(check_time) or pd.Timestamp.now() 23 | if begin_time < end_time: 24 | return check_time >= begin_time and check_time <= end_time 25 | else: # crosses midnight 26 | return check_time >= begin_time or check_time <= end_time 27 | 28 | 29 | def get_benchmark_repos( 30 | benchmark: str, 31 | use_auth_token: Union[bool, str, None] = None, 32 | repo_type: str = "prediction", 33 | start_date: Union[str, pd.Timestamp] = None, 34 | end_date: Union[str, pd.Timestamp] = None, 35 | ) -> List[Dict]: 36 | """Gets the metadata associated with benchmark submission 
and evaluation repositories. 37 | 38 | Args: 39 | benchmark: The benchmark name. 40 | auth_token: The authentication token for the Hugging Face Hub 41 | repo_type: The type of benchmark repository. Can be `prediction`, `model` or `evaluation`. 42 | start_date: The timestamp for the start of the submission window. 43 | end_date: The timestamp for the end of the submission window. 44 | 45 | Returns: 46 | The benchmark repositories' metadata of a given `repo_type`. 47 | """ 48 | submissions_to_evaluate = [] 49 | submissions = list_datasets(filter=f"benchmark:{benchmark}", full=True, use_auth_token=use_auth_token) 50 | 51 | # Filter for repos that fall within submission window 52 | if start_date and end_date: 53 | submissions = [ 54 | submission for submission in submissions if is_time_between(start_date, end_date, submission.lastModified) 55 | ] 56 | 57 | for submission in submissions: 58 | # Filter submission templates which have the submission_name="none" default value 59 | card_data = submission.cardData 60 | if ( 61 | card_data.get("benchmark") == benchmark 62 | and card_data.get("submission_name") != "none" 63 | and card_data.get("type") == repo_type 64 | ): 65 | submissions_to_evaluate.append(submission) 66 | 67 | return submissions_to_evaluate 68 | 69 | 70 | def get_model_index(submissions): 71 | all_scores = [] 72 | for submission in submissions: 73 | card_data = submission.cardData 74 | scores = card_data["model-index"][0] 75 | all_scores.append(scores) 76 | return all_scores 77 | 78 | 79 | def get_auth_headers(token: str, prefix: str = "Bearer"): 80 | return {"Authorization": f"{prefix} {token}"} 81 | 82 | 83 | def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response: 84 | """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached""" 85 | try: 86 | response = requests.post( 87 | url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True, params=params 88 | ) 89 | except requests.exceptions.ConnectionError: 90 | print("❌ Failed to reach AutoNLP API, check your internet connection") 91 | response.raise_for_status() 92 | return response 93 | 94 | 95 | def http_get( 96 | path: str, 97 | token: str, 98 | domain: str = None, 99 | ) -> requests.Response: 100 | """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached""" 101 | try: 102 | response = requests.get(url=domain + path, headers=get_auth_headers(token=token), allow_redirects=True) 103 | except requests.exceptions.ConnectionError: 104 | print("❌ Failed to reach AutoNLP API, check your internet connection") 105 | response.raise_for_status() 106 | return response 107 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Lint as: python3 2 | """ Hugging Face Benchmarks is an open-source library for evaluating machine learning benchmarks. 3 | 4 | Note: 5 | 6 | VERSION needs to be formatted following the MAJOR.MINOR.PATCH convention 7 | (we need to follow this convention to be able to retrieve versioned scripts) 8 | 9 | Simple check list for release from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 10 | 11 | To create the package for pypi. 12 | 13 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py. 14 | 15 | 2. Commit these changes with the message: "Release: VERSION" 16 | 17 | 3. 
Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 18 | Push the tag to git: git push --tags origin master 19 | 20 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 21 | creating the wheel and the source distribution (obviously). 22 | 23 | For the wheel, run: "python setup.py bdist_wheel" in the top level directory. 24 | (this will build a wheel for the python version you use to build it). 25 | 26 | For the sources, run: "python setup.py sdist" 27 | You should now have a /dist directory with both .whl and .tar.gz source versions. 28 | 29 | 5. Check that everything looks correct by uploading the package to the pypi test server: 30 | 31 | twine upload dist/* -r pypitest 32 | (pypi suggest using twine as other methods upload files via plaintext.) 33 | You may have to specify the repository url, use the following command then: 34 | twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ 35 | 36 | Check that you can install it in a virtualenv by running: 37 | pip install -i https://testpypi.python.org/pypi datasets 38 | 39 | 6. Upload the final version to actual pypi: 40 | twine upload dist/* -r pypi 41 | 42 | 7. Fill release notes in the tag in github once everything is looking hunky-dory. 43 | 44 | 8. Update the documentation commit in .circleci/deploy.sh for the accurate documentation to be displayed 45 | Update the version mapping in docs/source/_static/js/custom.js with utils/release.py, 46 | and set version to X.X.X+1.dev0 (e.g. 1.8.0 -> 1.8.1.dev0) in setup.py and __init__.py 47 | 48 | """ 49 | 50 | from pathlib import Path 51 | 52 | from setuptools import find_packages, setup 53 | 54 | 55 | DOCLINES = __doc__.split("\n") 56 | 57 | # We must upper bound the datasets version to match that in the AutoTrain backend 58 | REQUIRED_PKGS = [ 59 | "datasets<=2.2", 60 | "typer>=0.3.2", 61 | "click==8.0", 62 | "python-dotenv>=0.18.0", 63 | "evaluate==0.1.2", 64 | "scikit-learn==1.1.1", 65 | "huggingface-hub==0.10.1", 66 | ] 67 | 68 | QUALITY_REQUIRE = ["black", "flake8", "isort", "pyyaml>=5.3.1", "mypy", "types-requests"] 69 | 70 | TESTS_REQUIRE = ["pytest", "pytest-cov"] 71 | 72 | EXTRAS_REQUIRE = {"quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE} 73 | 74 | 75 | def combine_requirements(base_keys): 76 | return list(set(k for v in base_keys for k in EXTRAS_REQUIRE[v])) 77 | 78 | 79 | EXTRAS_REQUIRE["dev"] = combine_requirements([k for k in EXTRAS_REQUIRE]) 80 | EXTRAS_REQUIRE["cron"] = ["requests"] 81 | 82 | benchmark_dependencies = list(Path("benchmarks/").glob("**/requirements.txt")) 83 | for benchmark in benchmark_dependencies: 84 | with open(benchmark, "r") as f: 85 | deps = f.read().splitlines() 86 | EXTRAS_REQUIRE[benchmark.parent.name] = deps 87 | 88 | setup( 89 | name="hf_benchmarks", 90 | version="0.0.1", 91 | description=DOCLINES[0], 92 | long_description="\n".join(DOCLINES[2:]), 93 | author="HuggingFace Inc.", 94 | author_email="lewis@huggingface.co", 95 | url="https://github.com/huggingface/hf_benchmarks", 96 | download_url="https://github.com/huggingface/hf_benchmarks/tags", 97 | license="Apache 2.0", 98 | package_dir={"": "src"}, 99 | packages=find_packages("src"), 100 | install_requires=REQUIRED_PKGS, 101 | extras_require=EXTRAS_REQUIRE, 102 | classifiers=[ 103 | "Development Status :: 1 - Planning", 104 | "Intended Audience :: Developers", 105 | "Intended Audience :: Education", 106 | "Intended Audience :: Science/Research", 107 | "License :: OSI Approved :: Apache Software License", 
108 | "Operating System :: OS Independent", 109 | "Programming Language :: Python :: 3", 110 | "Programming Language :: Python :: 3.8", 111 | "Programming Language :: Python :: 3.9", 112 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 113 | ], 114 | keywords="machine learning benchmarks evaluation metrics", 115 | zip_safe=False, # Required for mypy to find the py.typed file 116 | ) 117 | -------------------------------------------------------------------------------- /scripts/run_evaluation_dummy.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import typer 6 | from dotenv import load_dotenv 7 | 8 | from hf_benchmarks import get_benchmark_repos, http_get, http_post 9 | 10 | 11 | if Path(".env").is_file(): 12 | load_dotenv(".env") 13 | 14 | HF_TOKEN = os.getenv("HF_TOKEN") 15 | AUTOTRAIN_TOKEN = os.getenv("AUTOTRAIN_TOKEN") 16 | AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME") 17 | AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API") 18 | 19 | app = typer.Typer() 20 | 21 | 22 | @app.command() 23 | def run( 24 | benchmark: str = "dummy", 25 | evaluation_dataset: str = "lewtun/benchmarks-dummy-private-labels", 26 | end_date: str = "2022-06-22", 27 | previous_days: int = 7, 28 | ): 29 | start_date = pd.to_datetime(end_date) - pd.Timedelta(days=previous_days) 30 | typer.echo(f"Evaluating submissions on benchmark {benchmark} from {start_date} to {end_date}") 31 | submissions = get_benchmark_repos(benchmark, use_auth_token=HF_TOKEN, start_date=start_date, end_date=end_date) 32 | typer.echo(f"Found {len(submissions)} submissions to evaluate on benchmark {benchmark}") 33 | for submission in submissions: 34 | submission_dataset = submission.id 35 | typer.echo(f"Evaluating submission {submission_dataset}") 36 | card_data = submission.cardData 37 | # Format submission name to comply with AutoTrain API 38 | # _XXX_ for spaces, _DDD_ for double dashes 39 | # TODO: remove these dirty hacks - should really apply validation at submission time! 
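        # e.g. a (hypothetical) submission name "my awesome--entry" becomes "my_XXX_awesome_DDD_entry"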
40 | submission_name = card_data.get("submission_name").replace(" ", "_XXX_") 41 | submission_name = submission_name.replace("--", "_DDD_") 42 | # Extract submission timestamp and convert to Unix epoch in nanoseconds 43 | timestamp = pd.to_datetime(submission.lastModified) 44 | submission_timestamp = int(timestamp.tz_localize(None).timestamp()) 45 | # Use the user-generated submission name, Git commit SHA and timestamp to create submission ID 46 | submission_id = submission_name + "__" + submission.sha[:6] + "__" + str(submission_timestamp) 47 | # Define AutoTrain payload 48 | project_config = {} 49 | # Need a dummy dataset to use the dataset loader in AutoTrain 50 | project_config["dataset_name"] = "lewtun/imdb-dummy" 51 | project_config["dataset_config"] = "lewtun--imdb-dummy" 52 | project_config["dataset_split"] = "train" 53 | project_config["col_mapping"] = {"text": "text", "label": "target"} 54 | # Specify benchmark parameters 55 | project_config["dataset"] = evaluation_dataset 56 | project_config["model"] = benchmark 57 | project_config["submission_dataset"] = submission_dataset 58 | 59 | # Create project 60 | payload = { 61 | "username": AUTOTRAIN_USERNAME, 62 | "proj_name": submission_id, 63 | "task": 1, 64 | "config": { 65 | "language": "en", 66 | "max_models": 5, 67 | "instance": { 68 | "provider": "aws", 69 | "instance_type": "ml.g4dn.4xlarge", 70 | "max_runtime_seconds": 172800, 71 | "num_instances": 1, 72 | "disk_size_gb": 150, 73 | }, 74 | "benchmark": { 75 | "dataset": project_config["dataset"], 76 | "model": project_config["model"], 77 | "submission_dataset": project_config["submission_dataset"], 78 | }, 79 | }, 80 | } 81 | project_json_resp = http_post( 82 | path="/projects/create", payload=payload, token=AUTOTRAIN_TOKEN, domain=AUTOTRAIN_BACKEND_API 83 | ).json() 84 | typer.echo(f"Project creation: {project_json_resp}") 85 | 86 | # Upload data 87 | payload = { 88 | "split": 4, 89 | "col_mapping": project_config["col_mapping"], 90 | "load_config": {"max_size_bytes": 0, "shuffle": False}, 91 | } 92 | data_json_resp = http_post( 93 | path=f"/projects/{project_json_resp['id']}/data/{project_config['dataset_name']}", 94 | payload=payload, 95 | token=AUTOTRAIN_TOKEN, 96 | domain=AUTOTRAIN_BACKEND_API, 97 | params={ 98 | "type": "dataset", 99 | "config_name": project_config["dataset_config"], 100 | "split_name": project_config["dataset_split"], 101 | }, 102 | ).json() 103 | typer.echo(f"Dataset creation: {data_json_resp}") 104 | 105 | # Run training 106 | train_json_resp = http_get( 107 | path=f"/projects/{project_json_resp['id']}/data/start_process", 108 | token=AUTOTRAIN_TOKEN, 109 | domain=AUTOTRAIN_BACKEND_API, 110 | ).json() 111 | typer.echo(f"Training job response: {train_json_resp}") 112 | 113 | 114 | if __name__ == "__main__": 115 | app() 116 | -------------------------------------------------------------------------------- /scripts/run_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | import typer 7 | from dotenv import load_dotenv 8 | 9 | from hf_benchmarks import get_benchmark_repos, http_get, http_post 10 | 11 | 12 | if Path(".env").is_file(): 13 | load_dotenv(".env") 14 | 15 | HF_TOKEN = os.getenv("HF_TOKEN") 16 | AUTOTRAIN_TOKEN = os.getenv("AUTOTRAIN_TOKEN") 17 | AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME") 18 | AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API") 19 | 20 | app = typer.Typer() 21 | 22 | 23 | 
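# Example invocation (mirrors .github/workflows/run_raft_evaluation.yaml; the end date is illustrative):
#   python scripts/run_evaluation.py raft ought/raft-private-labels 2022-06-26 7
# i.e. evaluate all `raft` submissions from the 7 days preceding 2022-06-26 against the private labels.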
@app.command() 24 | def run(benchmark: str, evaluation_dataset: str, end_date: str, previous_days: int): 25 | start_date = pd.to_datetime(end_date) - pd.Timedelta(days=previous_days) 26 | typer.echo(f"Evaluating submissions on benchmark {benchmark} from {start_date} to {end_date}") 27 | submissions = get_benchmark_repos(benchmark, use_auth_token=HF_TOKEN, start_date=start_date, end_date=end_date) 28 | typer.echo( 29 | f"Found {len(submissions)} submissions to evaluate on benchmark {benchmark}: {[s.id for s in submissions]}" 30 | ) 31 | for submission in submissions: 32 | submission_dataset = submission.id 33 | typer.echo(f"Evaluating submission {submission_dataset}") 34 | card_data = submission.cardData 35 | # Format submission name to comply with AutoTrain API 36 | # _XXX_ for spaces, _DDD_ for double dashes 37 | # TODO: remove these dirty hacks - should really apply validation at submission time! 38 | submission_name = card_data.get("submission_name").replace(" ", "_XXX_") 39 | submission_name = submission_name.replace("--", "_DDD_") 40 | # Extract submission timestamp and convert to Unix epoch in nanoseconds 41 | timestamp = pd.to_datetime(submission.lastModified) 42 | submission_timestamp = int(timestamp.tz_localize(None).timestamp()) 43 | # Use the user-generated submission name, Git commit SHA and timestamp to create submission ID 44 | submission_id = submission_name + "__" + submission.sha[:6] + "__" + str(submission_timestamp) 45 | # Define AutoTrain payload 46 | project_config = {} 47 | # Need a dummy dataset to use the dataset loader in AutoTrain 48 | # Derived from the `emotion` dataset => multiclass classification task 49 | project_config["dataset_name"] = "autoevaluator/benchmark-dummy-data" 50 | project_config["dataset_config"] = "autoevaluator--benchmark-dummy-data" 51 | project_config["dataset_split"] = "train" 52 | project_config["col_mapping"] = {"text": "text", "label": "target"} 53 | # Specify benchmark parameters 54 | project_config["dataset"] = evaluation_dataset 55 | project_config["model"] = benchmark 56 | project_config["submission_dataset"] = submission_dataset 57 | 58 | # Create project 59 | payload = { 60 | "username": AUTOTRAIN_USERNAME, 61 | "proj_name": submission_id, 62 | "task": 2, # Need multi-class classification task to align with dummy dataset 63 | "config": { 64 | "language": "en", 65 | "max_models": 5, 66 | "instance": { 67 | "provider": "ovh", 68 | "instance_type": "p3", 69 | "max_runtime_seconds": 172800, 70 | "num_instances": 1, 71 | "disk_size_gb": 150, 72 | }, 73 | "benchmark": { 74 | "dataset": project_config["dataset"], 75 | "model": project_config["model"], 76 | "submission_dataset": project_config["submission_dataset"], 77 | }, 78 | }, 79 | } 80 | project_json_resp = http_post( 81 | path="/projects/create", payload=payload, token=AUTOTRAIN_TOKEN, domain=AUTOTRAIN_BACKEND_API 82 | ).json() 83 | typer.echo("🎨🎨🎨 Project creation 🎨🎨🎨") 84 | typer.echo(project_json_resp) 85 | 86 | if project_json_resp["created"]: 87 | data_payload = { 88 | "split": 4, # use "auto" split choice in AutoTrain 89 | "col_mapping": project_config["col_mapping"], 90 | "load_config": {"max_size_bytes": 0, "shuffle": False}, 91 | "dataset_id": project_config["dataset_name"], 92 | "dataset_config": project_config["dataset_config"], 93 | "dataset_split": project_config["dataset_split"], 94 | } 95 | data_json_resp = http_post( 96 | path=f"/projects/{project_json_resp['id']}/data/dataset", 97 | payload=data_payload, 98 | token=AUTOTRAIN_TOKEN, 99 | 
domain=AUTOTRAIN_BACKEND_API, 100 | ).json() 101 | typer.echo("💾💾💾 Dataset creation 💾💾💾") 102 | typer.echo(data_json_resp) 103 | 104 | # Process data 105 | data_proc_json_resp = http_post( 106 | path=f"/projects/{project_json_resp['id']}/data/start_processing", 107 | token=AUTOTRAIN_TOKEN, 108 | domain=AUTOTRAIN_BACKEND_API, 109 | ).json() 110 | typer.echo(f"🍪 Start data processing response: {data_proc_json_resp}") 111 | 112 | typer.echo("⏳ Waiting for data processing to complete ...") 113 | is_data_processing_success = False 114 | while is_data_processing_success is not True: 115 | project_status = http_get( 116 | path=f"/projects/{project_json_resp['id']}", 117 | token=AUTOTRAIN_TOKEN, 118 | domain=AUTOTRAIN_BACKEND_API, 119 | ).json() 120 | # See database.database.enums.ProjectStatus for definitions of `status` 121 | if project_status["status"] == 3: 122 | is_data_processing_success = True 123 | print("✅ Data processing complete!") 124 | time.sleep(3) 125 | else: 126 | time.sleep(10) 127 | typer.echo("🥱 Dataset not ready, waiting 10 more seconds ...") 128 | 129 | # Approve training job 130 | train_job_resp = http_post( 131 | path=f"/projects/{project_json_resp['id']}/start_training", 132 | token=AUTOTRAIN_TOKEN, 133 | domain=AUTOTRAIN_BACKEND_API, 134 | ).json() 135 | print(f"🏃 Training job approval response: {train_job_resp}") 136 | print("🔥 Project and dataset preparation completed!") 137 | 138 | 139 | if __name__ == "__main__": 140 | app() 141 | -------------------------------------------------------------------------------- /scripts/run_gem_scoring.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import requests 8 | import typer 9 | from dotenv import load_dotenv 10 | from huggingface_hub import Repository, cached_download, hf_hub_url 11 | 12 | from hf_benchmarks import get_benchmark_repos, get_model_index, load_json, save_json 13 | 14 | 15 | if Path(".env").is_file(): 16 | load_dotenv(".env") 17 | 18 | auth_token = os.getenv("HF_GEM_TOKEN") 19 | 20 | SCORES_REPO_URL = "https://huggingface.co/datasets/GEM-submissions/submission-scores" 21 | OUTPUTS_REPO_URL = "https://huggingface.co/datasets/GEM-submissions/v2-outputs-and-scores" 22 | LOCAL_SCORES_REPO = "data/submission-scores" 23 | LOCAL_OUTPUTS_REPO = "data/v2-outputs-and-scores" 24 | LOCAL_GEM_V1_PATH = "data/gem-v1-outputs-and-scores" 25 | # This file is used to configure the filtering of the raw submissions and also used to configure the GEM website 26 | EVAL_CONFIG_URL = ( 27 | "https://raw.githubusercontent.com/GEM-benchmark/GEM-benchmark.github.io/main/web/results/eval_config.json" 28 | ) 29 | 30 | app = typer.Typer() 31 | 32 | 33 | def extract_relevant_metrics(config: dict): 34 | """Extract the `measures` field from the config.""" 35 | metric_names = [] 36 | for k, v in config["measures"].items(): 37 | metric_names.extend(v) 38 | return metric_names 39 | 40 | 41 | def drop_unnecessary_metrics(submission_scores: dict, list_of_metrics: list): 42 | """Return submission_scores with every metric not in list_of_metrics removed.""" 43 | for data_name, data in submission_scores.items(): 44 | if data_name in ["param_count", "submission_name"]: 45 | continue 46 | filtered_scores = {k: v for k, v in data.items() if k in list_of_metrics} 47 | submission_scores[data_name] = filtered_scores 48 | return submission_scores 49 | 50 | 51 | def _round_subelements(v): 52 | """traverses object and 
rounds items.""" 53 | if isinstance(v, float): 54 | return round(v, 3) 55 | elif isinstance(v, int) or isinstance(v, str): 56 | return v 57 | elif isinstance(v, dict): 58 | return {k: (round(d, 3) if isinstance(d, float) else d) for k, d in v.items()} 59 | else: 60 | raise ValueError(f"unexpected type: {type(v)}: {v}.") 61 | 62 | 63 | def round_results(submission_scores: dict): 64 | """rounds every metric result to three decimal places.""" 65 | for data_name, data in submission_scores.items(): 66 | if data_name in ["param_count", "submission_name"]: 67 | continue 68 | rounded_scores = {k: _round_subelements(v) for k, v in data.items()} 69 | submission_scores[data_name] = rounded_scores 70 | return submission_scores 71 | 72 | 73 | def filter_submission_output(submission_scores: dict, config: dict): 74 | relevant_metrics = extract_relevant_metrics(config) 75 | filtered_scores = [drop_unnecessary_metrics(d, relevant_metrics) for d in submission_scores] 76 | return [round_results(d) for d in filtered_scores] 77 | 78 | 79 | @app.command() 80 | def run(): 81 | # Download the submission from v1 of the GEM benchmark 82 | gem_v1_url = hf_hub_url( 83 | "GEM-submissions/v1-outputs-and-scores", filename="gem-v1-outputs-and-scores.zip", repo_type="dataset" 84 | ) 85 | gem_v1_path = cached_download(gem_v1_url) 86 | # Load the submissions from v1 87 | with zipfile.ZipFile(gem_v1_path) as zf: 88 | zf.extractall("data") 89 | 90 | gem_v1_files = [p for p in Path(LOCAL_GEM_V1_PATH).glob("*.scores.json")] 91 | gem_v1_submissions = [load_json(p) for p in gem_v1_files] 92 | typer.echo(f"Number of submissions from version 1 of the benchmark: {len(gem_v1_submissions)}") 93 | # Some fields have NaNs which breaks the frontend - replace with -999 as a workaround 94 | gem_v1_scores = [] 95 | for scores in gem_v1_submissions: 96 | for k, v in scores.items(): 97 | if isinstance(v, dict): 98 | for kk, vv in v.items(): 99 | if "msttr" in kk and np.isnan(vv): 100 | scores[k][kk] = -999 101 | gem_v1_scores.append(scores) 102 | # Download submission metadata from the Hub and combine with v1 scores 103 | hub_submissions = get_benchmark_repos(benchmark="gem", repo_type="evaluation", use_auth_token=auth_token) 104 | # Filter out the test submissions 105 | hub_submissions = [sub for sub in hub_submissions if "lewtun" not in sub.id] 106 | all_scores = get_model_index(hub_submissions) 107 | all_scores.extend(gem_v1_scores) 108 | typer.echo(f"Number of raw scores: {len(all_scores)}") 109 | # Clone the Hub repo with the scores 110 | scores_repo = Repository( 111 | local_dir=LOCAL_SCORES_REPO, 112 | clone_from=SCORES_REPO_URL, 113 | repo_type="dataset", 114 | use_auth_token=auth_token, 115 | ) 116 | # Filter the scores for smaller payload to the website / Spaces 117 | eval_config = requests.get(EVAL_CONFIG_URL).json() 118 | filtered_scores = filter_submission_output(all_scores, eval_config) 119 | typer.echo(f"Number of filtered scores: {len(filtered_scores)}") 120 | if len(all_scores) != len(filtered_scores): 121 | raise ValueError("The raw and filtered scores must have the same count!") 122 | # Save and update the raw and filtered scores 123 | save_json(f"{LOCAL_SCORES_REPO}/scores.json", all_scores) 124 | save_json(f"{LOCAL_SCORES_REPO}/filtered_scores.json", filtered_scores) 125 | 126 | if scores_repo.is_repo_clean(): 127 | typer.echo("No new submissions were found! 
Skipping update to the scores repo ...") 128 | else: 129 | scores_repo.git_add() 130 | typer.echo("Pushing scores and outputs to the hub ...") 131 | scores_repo.push_to_hub("Update submission scores") 132 | 133 | # Dumping all scores and outputs - refactor this! 134 | # Clone the Hub repo with the scores 135 | outputs_repo = Repository( 136 | local_dir=LOCAL_OUTPUTS_REPO, 137 | clone_from=OUTPUTS_REPO_URL, 138 | repo_type="dataset", 139 | use_auth_token=auth_token, 140 | ) 141 | 142 | # Load the submissions from v1 143 | gem_v1_scores_files = [p for p in Path(LOCAL_GEM_V1_PATH).glob("*.scores.json")] 144 | gem_v1_outputs_files = [p for p in Path(LOCAL_GEM_V1_PATH).glob("*.outputs.json")] 145 | # Load scores from v2 146 | gem_v2_scores = get_model_index(hub_submissions) 147 | scores_submission_names = [] 148 | gem_v2_scores_files = [] 149 | for score in gem_v2_scores: 150 | submission_name = score["submission_name"] 151 | scores_submission_names.append(submission_name) 152 | filename = f"data/tmp/{submission_name}.scores.json" 153 | gem_v2_scores_files.append(Path(f"data/tmp/{submission_name}.scores.json")) 154 | save_json(filename, score) 155 | 156 | gem_v2_outputs = get_benchmark_repos("gem", use_auth_token=auth_token) 157 | gem_v2_outputs = [s for s in gem_v2_outputs if "lewtun" not in s.id] 158 | gem_v2_outputs_files = [] 159 | 160 | for submission in gem_v2_outputs: 161 | card_data = submission.cardData 162 | submission_name = card_data["submission_name"] 163 | if submission_name in scores_submission_names: 164 | url = hf_hub_url(submission.id, "submission.json", repo_type="dataset") 165 | cache_filepath = cached_download( 166 | url, cache_dir="data/tmp/", force_filename=f"{submission_name}.outputs.json" 167 | ) 168 | gem_v2_outputs_files.append(Path(cache_filepath)) 169 | 170 | with zipfile.ZipFile(f"{LOCAL_OUTPUTS_REPO}/gem-v2-outputs-and-scores.zip", "w") as f: 171 | for path in gem_v1_scores_files: 172 | f.write(path, path.relative_to("data/gem-v1-outputs-and-scores"), compress_type=zipfile.ZIP_DEFLATED) 173 | for path in gem_v1_outputs_files: 174 | f.write(path, path.relative_to("data/gem-v1-outputs-and-scores"), compress_type=zipfile.ZIP_DEFLATED) 175 | for path in gem_v2_outputs_files: 176 | f.write(path, path.relative_to("data/tmp"), compress_type=zipfile.ZIP_DEFLATED) 177 | for path in gem_v2_scores_files: 178 | f.write(path, path.relative_to("data/tmp"), compress_type=zipfile.ZIP_DEFLATED) 179 | 180 | if outputs_repo.is_repo_clean(): 181 | typer.echo("No new outputs were found! Skipping update to the outputs repo ...") 182 | else: 183 | outputs_repo.git_add() 184 | typer.echo("Pushing scores and outputs to the hub ...") 185 | outputs_repo.push_to_hub("Update scores and outputs") 186 | 187 | # Flush local repos 188 | shutil.rmtree(LOCAL_SCORES_REPO, ignore_errors=True) 189 | shutil.rmtree(LOCAL_OUTPUTS_REPO, ignore_errors=True) 190 | 191 | 192 | if __name__ == "__main__": 193 | app() 194 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------