├── data └── .gitkeep ├── tests ├── __init__.py ├── testing_utils.py ├── test_evaluate.py ├── test_dummy_benchmark.py └── test_hub.py ├── benchmarks ├── dummy │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── gem │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── raft │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── generic_competition │ ├── __init__.py │ ├── requirements.txt │ └── evaluation.py ├── __init__.py ├── README.md └── registration.py ├── src └── hf_benchmarks │ ├── __init__.py │ ├── file_utils.py │ ├── schemas.py │ └── hub.py ├── .env.example ├── Makefile ├── setup.cfg ├── .github └── workflows │ ├── run_gem_scoring.yml │ ├── test_benchmarks.yaml │ └── run_raft_evaluation.yaml ├── README.md ├── scripts ├── submission_table.py ├── run_evaluation_dummy.py ├── run_evaluation.py └── run_gem_scoring.py ├── .gitignore ├── setup.py └── LICENSE /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/dummy/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/gem/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/raft/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/generic_competition/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /benchmarks/raft/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.24.2 -------------------------------------------------------------------------------- /benchmarks/generic_competition/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.24.2 -------------------------------------------------------------------------------- /benchmarks/dummy/requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.24.2 2 | datasets<=2.2 3 | evaluate==0.1.2 -------------------------------------------------------------------------------- /benchmarks/gem/requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==1.17.0 # DO NOT CHANGE! 
2 | gem-metrics @ git+https://github.com/GEM-benchmark/GEM-metrics.git -------------------------------------------------------------------------------- /src/hf_benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from .file_utils import load_json, save_json 2 | from .hub import get_benchmark_repos, get_model_index, http_get, http_post 3 | from .schemas import Evaluation, Metric, Result, Task 4 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | HF_TOKEN=hf_app_xxx # A "God" token to download private submission repos 2 | AUTOTRAIN_TOKEN=hf_xxx # A Hugging Face access token associated with a valid AutoTrain account 3 | AUTOTRAIN_BACKEND_API=https://api.autotrain.huggingface.co -------------------------------------------------------------------------------- /tests/testing_utils.py: -------------------------------------------------------------------------------- 1 | BOGUS_BENCHMARK_NAME = "bogus" 2 | DUMMY_BENCHMARK_NAME = "dummy" 3 | DUMMY_EVALUATION_ID = "lewtun/benchmarks-dummy-evaluation" 4 | DUMMY_PRIVATE_LABELS_ID = "lewtun/benchmarks-dummy-private-labels" 5 | DUMMY_SUBMISSION_ID = "lewtun/benchmarks-dummy-submission" 6 | -------------------------------------------------------------------------------- /benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | from .registration import Benchmark, registry 2 | 3 | 4 | raft = Benchmark(name="raft") 5 | gem = Benchmark(name="gem") 6 | dummy = Benchmark(name="dummy") 7 | 8 | registry.register_benchmark(raft) 9 | registry.register_benchmark(gem) 10 | registry.register_benchmark(dummy) 11 | -------------------------------------------------------------------------------- /src/hf_benchmarks/file_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | 5 | def load_json(path): 6 | with open(path, "r") as f: 7 | return json.load(f) 8 | 9 | 10 | def save_json(path, data): 11 | Path(path).parent.mkdir(parents=True, exist_ok=True) 12 | with open(path, "w", encoding="utf-8") as f: 13 | json.dump(data, f) 14 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | style: 2 | python -m black --line-length 119 --target-version py39 . 3 | python -m isort . 4 | 5 | quality: 6 | python -m black --check --line-length 119 --target-version py39 . 7 | python -m isort --check-only . 
8 | python -m flake8 --max-line-length 119 9 | 10 | typecheck-benchmarks: 11 | python -m mypy ./benchmarks 12 | 13 | test: 14 | python -m pytest -sv tests/ -------------------------------------------------------------------------------- /src/hf_benchmarks/schemas.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, TypedDict, Union 2 | 3 | 4 | class Metric(TypedDict): 5 | name: str 6 | type: str 7 | value: Union[float, Optional[dict]] 8 | 9 | 10 | class Task(TypedDict): 11 | name: str 12 | type: str 13 | metrics: List[Metric] 14 | 15 | 16 | class Result(TypedDict): 17 | task: Task 18 | 19 | 20 | class Evaluation(TypedDict): 21 | results: List[Result] 22 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [mypy] 2 | [mypy-datasets] 3 | follow_imports = skip 4 | [mypy-hf_benchmarks] 5 | ignore_missing_imports = True 6 | 7 | [isort] 8 | multi_line_output = 3 9 | include_trailing_comma = True 10 | force_grid_wrap = 0 11 | use_parentheses = True 12 | ensure_newline_before_comments = True 13 | line_length = 119 14 | lines_after_imports = 2 15 | 16 | [flake8] 17 | ignore = E203, E501, W503 18 | max-line-length = 119 19 | per-file-ignores = 20 | # imported but unused 21 | __init__.py: F401 -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Benchmarks 2 | 3 | ## AutoTrain configuration details 4 | 5 | Benchmarks are evaluated by AutoTrain, with the payload sent to the `AUTOTRAIN_BACKEND_API` environment variable. The current configuration for the hosted benchmarks is shown in the table below. 6 | 7 | | Benchmark | Backend API | 8 | |:---------:|:----------------------------------------------:| 9 | | RAFT | `https://api.autotrain.huggingface.co` | 10 | | GEM | `https://api.autotrain.huggingface.co` | 11 | -------------------------------------------------------------------------------- /tests/test_evaluate.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import inspect 3 | 4 | from benchmarks import registry 5 | 6 | 7 | # TODO(lewtun): use common.evaluation.evaluate as reference? 
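# Every benchmark package must expose an `evaluation.compute_metrics` function whose signature
# uses exactly the parameter names below; `test_evaluate_signature` enforces this contract for all
# registered benchmarks. A minimal sketch of the expected shape (see e.g. benchmarks/dummy/evaluation.py):
#
#   def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> Evaluation:
#       ...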
8 | EVALUATE_ARGS = {"evaluation_dataset", "submission_dataset", "use_auth_token"} 9 | 10 | 11 | def test_evaluate_signature(): 12 | benchmarks = registry.list_benchmarks() 13 | for benchmark in benchmarks: 14 | evaluate_module = importlib.import_module(f"benchmarks.{benchmark.name}.evaluation") 15 | args = inspect.signature(evaluate_module.compute_metrics).parameters.keys() 16 | assert len(args) == len(EVALUATE_ARGS) and sorted(args) == sorted(EVALUATE_ARGS) 17 | -------------------------------------------------------------------------------- /.github/workflows/run_gem_scoring.yml: -------------------------------------------------------------------------------- 1 | name: Update GEM scores 2 | 3 | on: 4 | schedule: 5 | - cron: '0 * * * *' # Update score every hour 6 | 7 | jobs: 8 | 9 | build: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v2 15 | 16 | - name: Setup Python Environment 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: 3.8 20 | 21 | - name: Install requirements 22 | run: pip install '.[cron]' 23 | 24 | - name: Execute scoring script 25 | env: 26 | HF_GEM_TOKEN: ${{ secrets.HF_GEM_TOKEN }} 27 | run: | 28 | HF_GEM_TOKEN=$HF_GEM_TOKEN python scripts/run_gem_scoring.py -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hugging Face Benchmarks 2 | 3 | > A toolkit for evaluating benchmarks on the [Hugging Face Hub](https://huggingface.co) 4 | 5 | ## Hosted benchmarks 6 | 7 | The list of hosted benchmarks is shown in the table below: 8 | 9 | | Benchmark | Description | Submission | Leaderboard | 10 | | :---: | :---: | :---: | :---: | 11 | | RAFT | A benchmark to test few-shot learning in NLP | [`ought/raft-submission`](https://huggingface.co/datasets/ought/raft-submission) | [`ought/raft-leaderboard`](https://huggingface.co/spaces/ought/raft-leaderboard) | 12 | | GEM | A large-scale benchmark for natural language generation | [`GEM/submission-form`](https://huggingface.co/spaces/GEM/submission-form) | [`GEM/results`](https://huggingface.co/spaces/GEM/results) | 13 | 14 | ## Developer installation 15 | 16 | Clone the repository and install the requirements: 17 | 18 | ``` 19 | git clone git@github.com:huggingface/hf_benchmarks.git 20 | cd hf_benchmarks 21 | pip install '.[dev]' 22 | ``` 23 | 24 | -------------------------------------------------------------------------------- /tests/test_dummy_benchmark.py: -------------------------------------------------------------------------------- 1 | import importlib 2 | import os 3 | from unittest import TestCase 4 | 5 | from huggingface_hub import HfFolder 6 | 7 | from .testing_utils import DUMMY_PRIVATE_LABELS_ID, DUMMY_SUBMISSION_ID 8 | 9 | 10 | class DummyBenchmarkTest(TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | """ 14 | Share this valid token in all tests below. 
Needed for CI 15 | """ 16 | token = os.getenv("HF_TOKEN") 17 | if token: 18 | HfFolder.save_token(token) 19 | 20 | def test_compute_metrics(self): 21 | eval_module = importlib.import_module("benchmarks.dummy.evaluation") 22 | token = HfFolder.get_token() 23 | results = eval_module.compute_metrics(DUMMY_PRIVATE_LABELS_ID, DUMMY_SUBMISSION_ID, use_auth_token=token) 24 | expected_results = { 25 | "results": [ 26 | { 27 | "task": { 28 | "name": "default", 29 | "type": "text-classification", 30 | "metrics": [{"name": "f1", "type": "f1", "value": 0.5}], 31 | } 32 | } 33 | ] 34 | } 35 | self.assertDictEqual(expected_results, results) 36 | -------------------------------------------------------------------------------- /.github/workflows/test_benchmarks.yaml: -------------------------------------------------------------------------------- 1 | name: Benchmark tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | 13 | test_library: 14 | name: Test library 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout code 18 | uses: actions/checkout@v2 19 | - name: Setup Python environment 20 | uses: actions/setup-python@v2 21 | with: 22 | python-version: 3.9 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | python -m pip install ".[tests]" 27 | - name: Run unit tests 28 | env: 29 | HF_TOKEN: ${{ secrets.HF_TOKEN }} 30 | run: pytest 31 | 32 | check_code_quality: 33 | name: Check code quality 34 | runs-on: ubuntu-latest 35 | steps: 36 | - name: Checkout code 37 | uses: actions/checkout@v2 38 | - name: Setup Python environment 39 | uses: actions/setup-python@v2 40 | with: 41 | python-version: 3.9 42 | - name: Install dependencies 43 | run: | 44 | python -m pip install --upgrade pip 45 | python -m pip install ".[quality]" 46 | - name: Code quality 47 | run: | 48 | make quality 49 | make typecheck-benchmarks -------------------------------------------------------------------------------- /benchmarks/registration.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class Benchmark: 6 | name: str 7 | 8 | 9 | class BenchmarkRegistry: 10 | """ 11 | Registry for all registered benchmarks. 12 | """ 13 | 14 | def __init__(self): 15 | self.benchmarks = {} 16 | 17 | def register_benchmark(self, benchmark): 18 | """ 19 | Register a benchmark. 20 | 21 | Args: 22 | benchmark: Benchmark to register. 23 | """ 24 | name = benchmark.name 25 | if name in self.benchmarks: 26 | raise ValueError(f"Benchmark with name {name} already registered.") 27 | self.benchmarks[name] = benchmark 28 | 29 | def get_benchmark(self, name): 30 | """ 31 | Get a registered benchmark. 32 | 33 | Args: 34 | name: Name of the benchmark. 35 | 36 | Returns: 37 | Benchmark with the given name. 38 | """ 39 | if name not in self.benchmarks: 40 | raise ValueError("Benchmark with name {} not registered.".format(name)) 41 | return self.benchmarks[name] 42 | 43 | def list_benchmarks(self): 44 | """ 45 | List all registered benchmarks. 46 | 47 | Returns: 48 | List of all registered benchmarks. 
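        Example (illustrative; reflects the benchmarks registered in ``benchmarks/__init__.py``):

            >>> registry.list_benchmarks()
            [Benchmark(name='raft'), Benchmark(name='gem'), Benchmark(name='dummy')]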
49 |         """
50 |         return list(self.benchmarks.values())
51 | 
52 | 
53 | registry = BenchmarkRegistry()
54 | 
--------------------------------------------------------------------------------
/scripts/submission_table.py:
--------------------------------------------------------------------------------
1 | import os
2 | 
3 | import pandas as pd
4 | import typer
5 | 
6 | from hf_benchmarks import get_benchmark_repos
7 | 
8 | 
9 | app = typer.Typer()
10 | 
11 | 
12 | @app.command()
13 | def run(
14 |     benchmark: str,
15 |     repo_type: str = "prediction",
16 |     start_date: str = None,
17 |     end_date: str = None,
18 |     save_path: str = "./data",
19 | ):
20 |     if start_date is None or end_date is None:
21 |         default_start_time = pd.Timestamp.now() - pd.Timedelta(days=7)
22 |         default_end_time = pd.Timestamp.now()
23 |         typer.echo(
24 |             f"Submission window not provided, so using past week from {default_start_time.date()} as default window"
25 |         )
26 |         start_date = str(default_start_time.date())
27 |         end_date = str(default_end_time.date())
28 | 
29 |     submissions = get_benchmark_repos(
30 |         benchmark=benchmark,
31 |         use_auth_token=True,
32 |         repo_type=repo_type,
33 |         start_date=start_date,
34 |         end_date=end_date,
35 |     )
36 |     typer.echo(f"Found {len(submissions)} submissions for evaluation!")
37 |     df = pd.DataFrame(submissions)
38 |     file_path = os.path.join(save_path, f"{benchmark}_submissions_{start_date}_{end_date}.csv")
39 |     df.to_csv(file_path, index=False)
40 |     typer.echo(f"Saved submissions to {os.path.abspath(file_path)}")
41 | 
42 | 
43 | if __name__ == "__main__":
44 |     app()
45 | 
--------------------------------------------------------------------------------
/.github/workflows/run_raft_evaluation.yaml:
--------------------------------------------------------------------------------
1 | name: Run RAFT evaluation
2 | 
3 | on:
4 |   workflow_dispatch:
5 |   schedule:
6 |     - cron: '0 0 * * SUN' # Run evaluation at midnight every Sunday
7 | 
8 | jobs:
9 | 
10 |   build:
11 |     runs-on: ubuntu-latest
12 | 
13 |     steps:
14 |     - name: Checkout code
15 |       uses: actions/checkout@v2
16 | 
17 |     - name: Setup Python Environment
18 |       uses: actions/setup-python@v2
19 |       with:
20 |         python-version: 3.8
21 | 
22 |     - name: Install requirements
23 |       run: pip install '.[cron]'
24 | 
25 |     - name: Get current date
26 |       uses: 1466587594/get-current-time@v2
27 |       id: current-time
28 |       with:
29 |         format: YYYYMMDD-HH
30 |         utcOffset: "+08:00"
31 | 
32 |     - name: Execute evaluation script
33 |       env:
34 |         YEAR: "${{ steps.current-time.outputs.year }}"
35 |         MONTH: "${{ steps.current-time.outputs.month }}"
36 |         DAY: "${{ steps.current-time.outputs.day }}"
37 |         HF_TOKEN: ${{ secrets.HF_TOKEN }}
38 |         AUTOTRAIN_USERNAME: ${{ secrets.AUTOTRAIN_USERNAME }}
39 |         AUTOTRAIN_TOKEN: ${{ secrets.AUTOTRAIN_TOKEN }}
40 |         AUTOTRAIN_BACKEND_API: ${{ secrets.AUTOTRAIN_BACKEND_API }}
41 |       run: |
42 |         HF_TOKEN=$HF_TOKEN AUTOTRAIN_USERNAME=$AUTOTRAIN_USERNAME AUTOTRAIN_TOKEN=$AUTOTRAIN_TOKEN AUTOTRAIN_BACKEND_API=$AUTOTRAIN_BACKEND_API python scripts/run_evaluation.py raft ought/raft-private-labels $YEAR-$MONTH-$DAY 7
--------------------------------------------------------------------------------
/benchmarks/dummy/evaluation.py:
--------------------------------------------------------------------------------
1 | from datasets import load_dataset
2 | from evaluate import load  # type: ignore
3 | 
4 | from hf_benchmarks import Evaluation, Metric, Result, Task
5 | 
6 | 
7 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> Evaluation:
8 |     """Computes
metrics for a benchmark. 9 | 10 | Args: 11 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 12 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 13 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 14 | 15 | Returns: 16 | evaluation (:obj:`Evaluation`): The evaluation metrics. 17 | """ 18 | 19 | # Load datasets associated with benchmark 20 | evaluation_ds = load_dataset(evaluation_dataset, use_auth_token=use_auth_token, split="test") 21 | submission_ds = load_dataset(submission_dataset, use_auth_token=use_auth_token, split="test") 22 | # Load metric 23 | f1 = load("f1") 24 | # Define container to store metrics 25 | evaluation = Evaluation(results=[]) 26 | # Compute metrics and build up list of dictionaries, one per task in the benchmark 27 | task_data = Task(name="default", type="text-classification", metrics=[]) 28 | scores = f1.compute( 29 | predictions=submission_ds["label"], 30 | references=evaluation_ds["label"], 31 | average="macro", 32 | ) 33 | for k, v in scores.items(): 34 | task_data["metrics"].append(Metric(name=k, type=k, value=v)) 35 | # Collect results 36 | result = Result(task=task_data) 37 | evaluation["results"].append(result) 38 | 39 | return evaluation 40 | -------------------------------------------------------------------------------- /benchmarks/gem/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import subprocess 3 | from typing import List 4 | 5 | from huggingface_hub import hf_hub_download # type: ignore 6 | 7 | 8 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> List[dict]: 9 | """Computes metrics for a benchmark. 10 | 11 | Args: 12 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 13 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 14 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 15 | 16 | Returns: 17 | metrics (:obj:`List[dict]`): The evaluation metrics. 18 | """ 19 | # AutoTrain runs the evaluation job inside a Docker container, so we need to 20 | # save the metrics in the root directory to avoid permission errors. 
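    # NOTE: `/app` is assumed to be the working directory of the AutoTrain evaluation container,
    # i.e. a location the job is allowed to write to; `gem_metrics` saves its JSON report there
    # via the `-o` flag used below.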
21 | metrics_filepath = "/app/metrics.json" 22 | # This assumes that the GEM submissions are a single file, with a predefined name 23 | # We'll need to enforce this on the submission repositories 24 | submission_filename = "submission.json" 25 | submission_filepath = hf_hub_download( 26 | repo_id=submission_dataset, filename=submission_filename, repo_type="dataset", use_auth_token=use_auth_token 27 | ) 28 | # gem_metrics automatically downloads the evaluation splits from the Hub 29 | process = subprocess.run( 30 | ["gem_metrics", f"{submission_filepath}", "-o", f"{metrics_filepath}"], stdout=subprocess.PIPE 31 | ) 32 | if process.returncode == -1: 33 | raise ValueError(f"Error running gem_metrics for submission {submission_dataset} on {evaluation_dataset}!") 34 | else: 35 | with open(metrics_filepath, "r") as f: 36 | metrics = json.load(f) 37 | 38 | return [metrics] 39 | -------------------------------------------------------------------------------- /benchmarks/raft/evaluation.py: -------------------------------------------------------------------------------- 1 | from datasets import get_dataset_config_names, load_dataset, load_metric 2 | 3 | from hf_benchmarks import Evaluation, Metric, Result, Task 4 | 5 | 6 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str) -> Evaluation: 7 | """Computes metrics for a benchmark. 8 | 9 | Args: 10 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 11 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 12 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 13 | 14 | Returns: 15 | evaluation (:obj:`Evaluation`): The evaluation metrics. 16 | """ 17 | 18 | # We need to use the public dataset to get the task names 19 | tasks = get_dataset_config_names("ought/raft") 20 | # Load metric 21 | f1 = load_metric("f1") 22 | # Define container to store metrics 23 | evaluation = Evaluation(results=[]) 24 | # Iterate over tasks and build up metrics 25 | for task in sorted(tasks): 26 | task_data = Task(name=task, type="text-classification", metrics=[]) 27 | # Load datasets associated with task 28 | evaluation_ds = load_dataset(path=evaluation_dataset, name=task, use_auth_token=use_auth_token, split="test") 29 | submission_ds = load_dataset(path=submission_dataset, name=task, use_auth_token=use_auth_token, split="test") 30 | # Sort IDs to ensure we compare the correct examples 31 | evaluation_ds = evaluation_ds.sort("ID") 32 | submission_ds = submission_ds.sort("ID") 33 | # Compute metrics and build up list of dictionaries, one per task in the benchmark 34 | scores = f1.compute( 35 | predictions=submission_ds["Label"], 36 | references=evaluation_ds["Label"], 37 | average="macro", 38 | ) 39 | for k, v in scores.items(): 40 | task_data["metrics"].append(Metric(name=k, type=k, value=v)) 41 | # Collect results 42 | result = Result(task=task_data) 43 | evaluation["results"].append(result) 44 | 45 | return evaluation 46 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | 
pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # mac OS 132 | .DS_Store 133 | 134 | # Repo settings 135 | data/ 136 | -------------------------------------------------------------------------------- /tests/test_hub.py: -------------------------------------------------------------------------------- 1 | import os 2 | from unittest import TestCase 3 | 4 | import pandas as pd 5 | from huggingface_hub import HfFolder 6 | 7 | from hf_benchmarks import get_benchmark_repos 8 | 9 | from .testing_utils import BOGUS_BENCHMARK_NAME, DUMMY_BENCHMARK_NAME, DUMMY_EVALUATION_ID, DUMMY_SUBMISSION_ID 10 | 11 | 12 | class GetBenchmarkReposTest(TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | """ 16 | Share this valid token in all tests below. 
Needed for CI 17 | """ 18 | token = os.getenv("HF_TOKEN") 19 | if token: 20 | HfFolder.save_token(token) 21 | 22 | def test_no_datasets_repo(self): 23 | data = get_benchmark_repos(benchmark=BOGUS_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 24 | self.assertEqual(len(data), 0) 25 | 26 | def test_prediction_repo(self): 27 | data = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 28 | self.assertEqual(len(data), 1) 29 | self.assertEqual(data[0].id, DUMMY_SUBMISSION_ID) 30 | 31 | def test_evaluation_repo(self): 32 | data = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="evaluation") 33 | self.assertEqual(data[0].id, DUMMY_EVALUATION_ID) 34 | 35 | def test_repo_in_submission_window(self): 36 | repo = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 37 | submission_time = pd.to_datetime(repo[0].lastModified) 38 | start_date = (submission_time - pd.Timedelta(days=1)).date() 39 | end_date = (submission_time + pd.Timedelta(days=1)).date() 40 | data = get_benchmark_repos( 41 | benchmark=DUMMY_BENCHMARK_NAME, 42 | use_auth_token=True, 43 | repo_type="prediction", 44 | start_date=start_date, 45 | end_date=end_date, 46 | ) 47 | self.assertEqual(len(data), 1) 48 | self.assertEqual(data[0].id, DUMMY_SUBMISSION_ID) 49 | 50 | def test_repo_outside_submission_window(self): 51 | repo = get_benchmark_repos(benchmark=DUMMY_BENCHMARK_NAME, use_auth_token=True, repo_type="prediction") 52 | submission_time = pd.to_datetime(repo[0].lastModified) 53 | start_date = (submission_time + pd.Timedelta(days=1)).date() 54 | end_date = (submission_time + pd.Timedelta(days=2)).date() 55 | data = get_benchmark_repos( 56 | benchmark=DUMMY_BENCHMARK_NAME, 57 | use_auth_token=True, 58 | repo_type="prediction", 59 | start_date=start_date, 60 | end_date=end_date, 61 | ) 62 | self.assertEqual(len(data), 0) 63 | -------------------------------------------------------------------------------- /benchmarks/generic_competition/evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | import pandas as pd # type: ignore 4 | from huggingface_hub import hf_hub_download # type: ignore 5 | from sklearn import metrics # type: ignore 6 | 7 | 8 | def compute_metrics(evaluation_dataset: str, submission_dataset: str, use_auth_token: str, **kwargs): 9 | """Computes metrics for a benchmark. 10 | 11 | Args: 12 | evaluation_dataset (:obj:`str`): Name of private dataset with ground truth labels. 13 | submission_dataset (:obj:`str`): Name of user submission dataset with model predictions. 14 | use_auth_token (:obj:`str`): The API token to access your private dataset on the Hugging Face Hub. 15 | 16 | Returns: 17 | evaluation (:obj:`Evaluation`): The evaluation metrics. 
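    Note:
        This generic evaluator additionally expects ``user_id`` and ``submission_id`` to be passed via
        ``**kwargs``; they identify the submission file ``submissions/{user_id}-{submission_id}.csv`` in
        the submission dataset, and a ``ValueError`` is raised if either is missing.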
18 | """ 19 | 20 | user_id = kwargs.get("user_id", None) 21 | if user_id is None: 22 | raise ValueError("user_id is required") 23 | submission_id = kwargs.get("submission_id", None) 24 | if submission_id is None: 25 | raise ValueError("submission_id is required") 26 | 27 | eval_fname = hf_hub_download( 28 | repo_id=evaluation_dataset, 29 | filename="solution.csv", 30 | use_auth_token=use_auth_token, 31 | repo_type="dataset", 32 | ) 33 | # download conf 34 | conf_fname = hf_hub_download( 35 | repo_id=evaluation_dataset, 36 | filename="conf.json", 37 | use_auth_token=use_auth_token, 38 | repo_type="dataset", 39 | ) 40 | 41 | # read conf json 42 | with open(conf_fname, "r") as f: 43 | conf = json.load(f) 44 | 45 | metric = conf["EVAL_METRIC"] 46 | 47 | eval_df = pd.read_csv(eval_fname) 48 | 49 | submission_filename = f"submissions/{user_id}-{submission_id}.csv" 50 | sub_fname = hf_hub_download( 51 | repo_id=submission_dataset, 52 | filename=submission_filename, 53 | use_auth_token=use_auth_token, 54 | repo_type="dataset", 55 | ) 56 | sub_df = pd.read_csv(sub_fname) 57 | 58 | # fetch the metric function 59 | _metric = getattr(metrics, metric) 60 | 61 | public_ids = eval_df[eval_df.split == "public"].id.values 62 | private_ids = eval_df[eval_df.split == "private"].id.values 63 | 64 | target_cols = [col for col in eval_df.columns if col not in ["id", "split"]] 65 | public_score = _metric( 66 | eval_df[eval_df.id.isin(public_ids)][target_cols], 67 | sub_df[sub_df.id.isin(public_ids)][target_cols], 68 | ) 69 | private_score = _metric( 70 | eval_df[eval_df.id.isin(private_ids)][target_cols], 71 | sub_df[sub_df.id.isin(private_ids)][target_cols], 72 | ) 73 | 74 | evaluation = { 75 | "public_score": public_score, 76 | "private_score": private_score, 77 | } 78 | return evaluation 79 | -------------------------------------------------------------------------------- /src/hf_benchmarks/hub.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Union 2 | 3 | import pandas as pd 4 | import requests 5 | import typer 6 | from huggingface_hub import HfApi, list_datasets 7 | 8 | 9 | def delete_repos(repository_ids: List[str], auth_token: str, repo_type: str = "dataset") -> None: 10 | typer.echo(f"Found {len(repository_ids)} repos to delete") 11 | for repo_id in repository_ids: 12 | org, name = repo_id.split("/") 13 | HfApi().delete_repo(token=auth_token, organization=org, name=name, repo_type=repo_type) 14 | typer.echo(f"Deleted repo: {repo_id}") 15 | 16 | 17 | def is_time_between(begin_time: str, end_time: str, check_time: str = None) -> bool: 18 | # Adapted from: https://stackoverflow.com/questions/10048249/how-do-i-determine-if-current-time-is-within-a-specified-range-using-pythons-da 19 | # If check time is not given, default to current UTC time 20 | begin_time = pd.to_datetime(begin_time).tz_localize("UTC") 21 | end_time = pd.to_datetime(end_time).tz_localize("UTC") 22 | check_time = pd.to_datetime(check_time) or pd.Timestamp.now() 23 | if begin_time < end_time: 24 | return check_time >= begin_time and check_time <= end_time 25 | else: # crosses midnight 26 | return check_time >= begin_time or check_time <= end_time 27 | 28 | 29 | def get_benchmark_repos( 30 | benchmark: str, 31 | use_auth_token: Union[bool, str, None] = None, 32 | repo_type: str = "prediction", 33 | start_date: Union[str, pd.Timestamp] = None, 34 | end_date: Union[str, pd.Timestamp] = None, 35 | ) -> List[Dict]: 36 | """Gets the metadata associated with benchmark submission 
and evaluation repositories. 37 | 38 | Args: 39 | benchmark: The benchmark name. 40 | auth_token: The authentication token for the Hugging Face Hub 41 | repo_type: The type of benchmark repository. Can be `prediction`, `model` or `evaluation`. 42 | start_date: The timestamp for the start of the submission window. 43 | end_date: The timestamp for the end of the submission window. 44 | 45 | Returns: 46 | The benchmark repositories' metadata of a given `repo_type`. 47 | """ 48 | submissions_to_evaluate = [] 49 | submissions = list_datasets(filter=f"benchmark:{benchmark}", full=True, use_auth_token=use_auth_token) 50 | 51 | # Filter for repos that fall within submission window 52 | if start_date and end_date: 53 | submissions = [ 54 | submission for submission in submissions if is_time_between(start_date, end_date, submission.lastModified) 55 | ] 56 | 57 | for submission in submissions: 58 | # Filter submission templates which have the submission_name="none" default value 59 | card_data = submission.cardData 60 | if ( 61 | card_data.get("benchmark") == benchmark 62 | and card_data.get("submission_name") != "none" 63 | and card_data.get("type") == repo_type 64 | ): 65 | submissions_to_evaluate.append(submission) 66 | 67 | return submissions_to_evaluate 68 | 69 | 70 | def get_model_index(submissions): 71 | all_scores = [] 72 | for submission in submissions: 73 | card_data = submission.cardData 74 | scores = card_data["model-index"][0] 75 | all_scores.append(scores) 76 | return all_scores 77 | 78 | 79 | def get_auth_headers(token: str, prefix: str = "Bearer"): 80 | return {"Authorization": f"{prefix} {token}"} 81 | 82 | 83 | def http_post(path: str, token: str, payload=None, domain: str = None, params=None) -> requests.Response: 84 | """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached""" 85 | try: 86 | response = requests.post( 87 | url=domain + path, json=payload, headers=get_auth_headers(token=token), allow_redirects=True, params=params 88 | ) 89 | except requests.exceptions.ConnectionError: 90 | print("❌ Failed to reach AutoNLP API, check your internet connection") 91 | response.raise_for_status() 92 | return response 93 | 94 | 95 | def http_get( 96 | path: str, 97 | token: str, 98 | domain: str = None, 99 | ) -> requests.Response: 100 | """HTTP POST request to the AutoNLP API, raises UnreachableAPIError if the API cannot be reached""" 101 | try: 102 | response = requests.get(url=domain + path, headers=get_auth_headers(token=token), allow_redirects=True) 103 | except requests.exceptions.ConnectionError: 104 | print("❌ Failed to reach AutoNLP API, check your internet connection") 105 | response.raise_for_status() 106 | return response 107 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # Lint as: python3 2 | """ Hugging Face Benchmarks is an open-source library for evaluating machine learning benchmarks. 3 | 4 | Note: 5 | 6 | VERSION needs to be formatted following the MAJOR.MINOR.PATCH convention 7 | (we need to follow this convention to be able to retrieve versioned scripts) 8 | 9 | Simple check list for release from AllenNLP repo: https://github.com/allenai/allennlp/blob/master/setup.py 10 | 11 | To create the package for pypi. 12 | 13 | 1. Change the version in __init__.py, setup.py as well as docs/source/conf.py. 14 | 15 | 2. Commit these changes with the message: "Release: VERSION" 16 | 17 | 3. 
Add a tag in git to mark the release: "git tag VERSION -m'Adds tag VERSION for pypi' " 18 | Push the tag to git: git push --tags origin master 19 | 20 | 4. Build both the sources and the wheel. Do not change anything in setup.py between 21 | creating the wheel and the source distribution (obviously). 22 | 23 | For the wheel, run: "python setup.py bdist_wheel" in the top level directory. 24 | (this will build a wheel for the python version you use to build it). 25 | 26 | For the sources, run: "python setup.py sdist" 27 | You should now have a /dist directory with both .whl and .tar.gz source versions. 28 | 29 | 5. Check that everything looks correct by uploading the package to the pypi test server: 30 | 31 | twine upload dist/* -r pypitest 32 | (pypi suggest using twine as other methods upload files via plaintext.) 33 | You may have to specify the repository url, use the following command then: 34 | twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ 35 | 36 | Check that you can install it in a virtualenv by running: 37 | pip install -i https://testpypi.python.org/pypi datasets 38 | 39 | 6. Upload the final version to actual pypi: 40 | twine upload dist/* -r pypi 41 | 42 | 7. Fill release notes in the tag in github once everything is looking hunky-dory. 43 | 44 | 8. Update the documentation commit in .circleci/deploy.sh for the accurate documentation to be displayed 45 | Update the version mapping in docs/source/_static/js/custom.js with utils/release.py, 46 | and set version to X.X.X+1.dev0 (e.g. 1.8.0 -> 1.8.1.dev0) in setup.py and __init__.py 47 | 48 | """ 49 | 50 | from pathlib import Path 51 | 52 | from setuptools import find_packages, setup 53 | 54 | 55 | DOCLINES = __doc__.split("\n") 56 | 57 | # We must upper bound the datasets version to match that in the AutoTrain backend 58 | REQUIRED_PKGS = [ 59 | "datasets<=2.2", 60 | "typer>=0.3.2", 61 | "click==8.0", 62 | "python-dotenv>=0.18.0", 63 | "evaluate==0.1.2", 64 | "scikit-learn==1.1.1", 65 | "huggingface-hub==0.10.1", 66 | ] 67 | 68 | QUALITY_REQUIRE = ["black", "flake8", "isort", "pyyaml>=5.3.1", "mypy", "types-requests"] 69 | 70 | TESTS_REQUIRE = ["pytest", "pytest-cov"] 71 | 72 | EXTRAS_REQUIRE = {"quality": QUALITY_REQUIRE, "tests": TESTS_REQUIRE} 73 | 74 | 75 | def combine_requirements(base_keys): 76 | return list(set(k for v in base_keys for k in EXTRAS_REQUIRE[v])) 77 | 78 | 79 | EXTRAS_REQUIRE["dev"] = combine_requirements([k for k in EXTRAS_REQUIRE]) 80 | EXTRAS_REQUIRE["cron"] = ["requests"] 81 | 82 | benchmark_dependencies = list(Path("benchmarks/").glob("**/requirements.txt")) 83 | for benchmark in benchmark_dependencies: 84 | with open(benchmark, "r") as f: 85 | deps = f.read().splitlines() 86 | EXTRAS_REQUIRE[benchmark.parent.name] = deps 87 | 88 | setup( 89 | name="hf_benchmarks", 90 | version="0.0.1", 91 | description=DOCLINES[0], 92 | long_description="\n".join(DOCLINES[2:]), 93 | author="HuggingFace Inc.", 94 | author_email="lewis@huggingface.co", 95 | url="https://github.com/huggingface/hf_benchmarks", 96 | download_url="https://github.com/huggingface/hf_benchmarks/tags", 97 | license="Apache 2.0", 98 | package_dir={"": "src"}, 99 | packages=find_packages("src"), 100 | install_requires=REQUIRED_PKGS, 101 | extras_require=EXTRAS_REQUIRE, 102 | classifiers=[ 103 | "Development Status :: 1 - Planning", 104 | "Intended Audience :: Developers", 105 | "Intended Audience :: Education", 106 | "Intended Audience :: Science/Research", 107 | "License :: OSI Approved :: Apache Software License", 
108 | "Operating System :: OS Independent", 109 | "Programming Language :: Python :: 3", 110 | "Programming Language :: Python :: 3.8", 111 | "Programming Language :: Python :: 3.9", 112 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 113 | ], 114 | keywords="machine learning benchmarks evaluation metrics", 115 | zip_safe=False, # Required for mypy to find the py.typed file 116 | ) 117 | -------------------------------------------------------------------------------- /scripts/run_evaluation_dummy.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | import pandas as pd 5 | import typer 6 | from dotenv import load_dotenv 7 | 8 | from hf_benchmarks import get_benchmark_repos, http_get, http_post 9 | 10 | 11 | if Path(".env").is_file(): 12 | load_dotenv(".env") 13 | 14 | HF_TOKEN = os.getenv("HF_TOKEN") 15 | AUTOTRAIN_TOKEN = os.getenv("AUTOTRAIN_TOKEN") 16 | AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME") 17 | AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API") 18 | 19 | app = typer.Typer() 20 | 21 | 22 | @app.command() 23 | def run( 24 | benchmark: str = "dummy", 25 | evaluation_dataset: str = "lewtun/benchmarks-dummy-private-labels", 26 | end_date: str = "2022-06-22", 27 | previous_days: int = 7, 28 | ): 29 | start_date = pd.to_datetime(end_date) - pd.Timedelta(days=previous_days) 30 | typer.echo(f"Evaluating submissions on benchmark {benchmark} from {start_date} to {end_date}") 31 | submissions = get_benchmark_repos(benchmark, use_auth_token=HF_TOKEN, start_date=start_date, end_date=end_date) 32 | typer.echo(f"Found {len(submissions)} submissions to evaluate on benchmark {benchmark}") 33 | for submission in submissions: 34 | submission_dataset = submission.id 35 | typer.echo(f"Evaluating submission {submission_dataset}") 36 | card_data = submission.cardData 37 | # Format submission name to comply with AutoTrain API 38 | # _XXX_ for spaces, _DDD_ for double dashes 39 | # TODO: remove these dirty hacks - should really apply validation at submission time! 
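        # e.g. a (hypothetical) submission name "my awesome--entry" becomes "my_XXX_awesome_DDD_entry"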
40 | submission_name = card_data.get("submission_name").replace(" ", "_XXX_") 41 | submission_name = submission_name.replace("--", "_DDD_") 42 | # Extract submission timestamp and convert to Unix epoch in nanoseconds 43 | timestamp = pd.to_datetime(submission.lastModified) 44 | submission_timestamp = int(timestamp.tz_localize(None).timestamp()) 45 | # Use the user-generated submission name, Git commit SHA and timestamp to create submission ID 46 | submission_id = submission_name + "__" + submission.sha[:6] + "__" + str(submission_timestamp) 47 | # Define AutoTrain payload 48 | project_config = {} 49 | # Need a dummy dataset to use the dataset loader in AutoTrain 50 | project_config["dataset_name"] = "lewtun/imdb-dummy" 51 | project_config["dataset_config"] = "lewtun--imdb-dummy" 52 | project_config["dataset_split"] = "train" 53 | project_config["col_mapping"] = {"text": "text", "label": "target"} 54 | # Specify benchmark parameters 55 | project_config["dataset"] = evaluation_dataset 56 | project_config["model"] = benchmark 57 | project_config["submission_dataset"] = submission_dataset 58 | 59 | # Create project 60 | payload = { 61 | "username": AUTOTRAIN_USERNAME, 62 | "proj_name": submission_id, 63 | "task": 1, 64 | "config": { 65 | "language": "en", 66 | "max_models": 5, 67 | "instance": { 68 | "provider": "aws", 69 | "instance_type": "ml.g4dn.4xlarge", 70 | "max_runtime_seconds": 172800, 71 | "num_instances": 1, 72 | "disk_size_gb": 150, 73 | }, 74 | "benchmark": { 75 | "dataset": project_config["dataset"], 76 | "model": project_config["model"], 77 | "submission_dataset": project_config["submission_dataset"], 78 | }, 79 | }, 80 | } 81 | project_json_resp = http_post( 82 | path="/projects/create", payload=payload, token=AUTOTRAIN_TOKEN, domain=AUTOTRAIN_BACKEND_API 83 | ).json() 84 | typer.echo(f"Project creation: {project_json_resp}") 85 | 86 | # Upload data 87 | payload = { 88 | "split": 4, 89 | "col_mapping": project_config["col_mapping"], 90 | "load_config": {"max_size_bytes": 0, "shuffle": False}, 91 | } 92 | data_json_resp = http_post( 93 | path=f"/projects/{project_json_resp['id']}/data/{project_config['dataset_name']}", 94 | payload=payload, 95 | token=AUTOTRAIN_TOKEN, 96 | domain=AUTOTRAIN_BACKEND_API, 97 | params={ 98 | "type": "dataset", 99 | "config_name": project_config["dataset_config"], 100 | "split_name": project_config["dataset_split"], 101 | }, 102 | ).json() 103 | typer.echo(f"Dataset creation: {data_json_resp}") 104 | 105 | # Run training 106 | train_json_resp = http_get( 107 | path=f"/projects/{project_json_resp['id']}/data/start_process", 108 | token=AUTOTRAIN_TOKEN, 109 | domain=AUTOTRAIN_BACKEND_API, 110 | ).json() 111 | typer.echo(f"Training job response: {train_json_resp}") 112 | 113 | 114 | if __name__ == "__main__": 115 | app() 116 | -------------------------------------------------------------------------------- /scripts/run_evaluation.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | from pathlib import Path 4 | 5 | import pandas as pd 6 | import typer 7 | from dotenv import load_dotenv 8 | 9 | from hf_benchmarks import get_benchmark_repos, http_get, http_post 10 | 11 | 12 | if Path(".env").is_file(): 13 | load_dotenv(".env") 14 | 15 | HF_TOKEN = os.getenv("HF_TOKEN") 16 | AUTOTRAIN_TOKEN = os.getenv("AUTOTRAIN_TOKEN") 17 | AUTOTRAIN_USERNAME = os.getenv("AUTOTRAIN_USERNAME") 18 | AUTOTRAIN_BACKEND_API = os.getenv("AUTOTRAIN_BACKEND_API") 19 | 20 | app = typer.Typer() 21 | 22 | 23 | 
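# Example invocation (mirrors .github/workflows/run_raft_evaluation.yaml; the end date is illustrative):
#   python scripts/run_evaluation.py raft ought/raft-private-labels 2022-06-26 7
# i.e. evaluate all `raft` submissions from the 7 days preceding 2022-06-26 against the private labels.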
@app.command() 24 | def run(benchmark: str, evaluation_dataset: str, end_date: str, previous_days: int): 25 | start_date = pd.to_datetime(end_date) - pd.Timedelta(days=previous_days) 26 | typer.echo(f"Evaluating submissions on benchmark {benchmark} from {start_date} to {end_date}") 27 | submissions = get_benchmark_repos(benchmark, use_auth_token=HF_TOKEN, start_date=start_date, end_date=end_date) 28 | typer.echo( 29 | f"Found {len(submissions)} submissions to evaluate on benchmark {benchmark}: {[s.id for s in submissions]}" 30 | ) 31 | for submission in submissions: 32 | submission_dataset = submission.id 33 | typer.echo(f"Evaluating submission {submission_dataset}") 34 | card_data = submission.cardData 35 | # Format submission name to comply with AutoTrain API 36 | # _XXX_ for spaces, _DDD_ for double dashes 37 | # TODO: remove these dirty hacks - should really apply validation at submission time! 38 | submission_name = card_data.get("submission_name").replace(" ", "_XXX_") 39 | submission_name = submission_name.replace("--", "_DDD_") 40 | # Extract submission timestamp and convert to Unix epoch in nanoseconds 41 | timestamp = pd.to_datetime(submission.lastModified) 42 | submission_timestamp = int(timestamp.tz_localize(None).timestamp()) 43 | # Use the user-generated submission name, Git commit SHA and timestamp to create submission ID 44 | submission_id = submission_name + "__" + submission.sha[:6] + "__" + str(submission_timestamp) 45 | # Define AutoTrain payload 46 | project_config = {} 47 | # Need a dummy dataset to use the dataset loader in AutoTrain 48 | # Derived from the `emotion` dataset => multiclass classification task 49 | project_config["dataset_name"] = "autoevaluator/benchmark-dummy-data" 50 | project_config["dataset_config"] = "autoevaluator--benchmark-dummy-data" 51 | project_config["dataset_split"] = "train" 52 | project_config["col_mapping"] = {"text": "text", "label": "target"} 53 | # Specify benchmark parameters 54 | project_config["dataset"] = evaluation_dataset 55 | project_config["model"] = benchmark 56 | project_config["submission_dataset"] = submission_dataset 57 | 58 | # Create project 59 | payload = { 60 | "username": AUTOTRAIN_USERNAME, 61 | "proj_name": submission_id, 62 | "task": 2, # Need multi-class classification task to align with dummy dataset 63 | "config": { 64 | "language": "en", 65 | "max_models": 5, 66 | "instance": { 67 | "provider": "ovh", 68 | "instance_type": "p3", 69 | "max_runtime_seconds": 172800, 70 | "num_instances": 1, 71 | "disk_size_gb": 150, 72 | }, 73 | "benchmark": { 74 | "dataset": project_config["dataset"], 75 | "model": project_config["model"], 76 | "submission_dataset": project_config["submission_dataset"], 77 | }, 78 | }, 79 | } 80 | project_json_resp = http_post( 81 | path="/projects/create", payload=payload, token=AUTOTRAIN_TOKEN, domain=AUTOTRAIN_BACKEND_API 82 | ).json() 83 | typer.echo("🎨🎨🎨 Project creation 🎨🎨🎨") 84 | typer.echo(project_json_resp) 85 | 86 | if project_json_resp["created"]: 87 | data_payload = { 88 | "split": 4, # use "auto" split choice in AutoTrain 89 | "col_mapping": project_config["col_mapping"], 90 | "load_config": {"max_size_bytes": 0, "shuffle": False}, 91 | "dataset_id": project_config["dataset_name"], 92 | "dataset_config": project_config["dataset_config"], 93 | "dataset_split": project_config["dataset_split"], 94 | } 95 | data_json_resp = http_post( 96 | path=f"/projects/{project_json_resp['id']}/data/dataset", 97 | payload=data_payload, 98 | token=AUTOTRAIN_TOKEN, 99 | 
domain=AUTOTRAIN_BACKEND_API, 100 | ).json() 101 | typer.echo("💾💾💾 Dataset creation 💾💾💾") 102 | typer.echo(data_json_resp) 103 | 104 | # Process data 105 | data_proc_json_resp = http_post( 106 | path=f"/projects/{project_json_resp['id']}/data/start_processing", 107 | token=AUTOTRAIN_TOKEN, 108 | domain=AUTOTRAIN_BACKEND_API, 109 | ).json() 110 | typer.echo(f"🍪 Start data processing response: {data_proc_json_resp}") 111 | 112 | typer.echo("⏳ Waiting for data processing to complete ...") 113 | is_data_processing_success = False 114 | while is_data_processing_success is not True: 115 | project_status = http_get( 116 | path=f"/projects/{project_json_resp['id']}", 117 | token=AUTOTRAIN_TOKEN, 118 | domain=AUTOTRAIN_BACKEND_API, 119 | ).json() 120 | # See database.database.enums.ProjectStatus for definitions of `status` 121 | if project_status["status"] == 3: 122 | is_data_processing_success = True 123 | print("✅ Data processing complete!") 124 | time.sleep(3) 125 | else: 126 | time.sleep(10) 127 | typer.echo("🥱 Dataset not ready, waiting 10 more seconds ...") 128 | 129 | # Approve training job 130 | train_job_resp = http_post( 131 | path=f"/projects/{project_json_resp['id']}/start_training", 132 | token=AUTOTRAIN_TOKEN, 133 | domain=AUTOTRAIN_BACKEND_API, 134 | ).json() 135 | print(f"🏃 Training job approval response: {train_job_resp}") 136 | print("🔥 Project and dataset preparation completed!") 137 | 138 | 139 | if __name__ == "__main__": 140 | app() 141 | -------------------------------------------------------------------------------- /scripts/run_gem_scoring.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import zipfile 4 | from pathlib import Path 5 | 6 | import numpy as np 7 | import requests 8 | import typer 9 | from dotenv import load_dotenv 10 | from huggingface_hub import Repository, cached_download, hf_hub_url 11 | 12 | from hf_benchmarks import get_benchmark_repos, get_model_index, load_json, save_json 13 | 14 | 15 | if Path(".env").is_file(): 16 | load_dotenv(".env") 17 | 18 | auth_token = os.getenv("HF_GEM_TOKEN") 19 | 20 | SCORES_REPO_URL = "https://huggingface.co/datasets/GEM-submissions/submission-scores" 21 | OUTPUTS_REPO_URL = "https://huggingface.co/datasets/GEM-submissions/v2-outputs-and-scores" 22 | LOCAL_SCORES_REPO = "data/submission-scores" 23 | LOCAL_OUTPUTS_REPO = "data/v2-outputs-and-scores" 24 | LOCAL_GEM_V1_PATH = "data/gem-v1-outputs-and-scores" 25 | # This file is used to configure the filtering of the raw submissions and also used to configure the GEM website 26 | EVAL_CONFIG_URL = ( 27 | "https://raw.githubusercontent.com/GEM-benchmark/GEM-benchmark.github.io/main/web/results/eval_config.json" 28 | ) 29 | 30 | app = typer.Typer() 31 | 32 | 33 | def extract_relevant_metrics(config: dict): 34 | """Extract the `measures` field from the config.""" 35 | metric_names = [] 36 | for k, v in config["measures"].items(): 37 | metric_names.extend(v) 38 | return metric_names 39 | 40 | 41 | def drop_unnecessary_metrics(submission_scores: dict, list_of_metrics: list): 42 | """Return submission_scores with every metric not in list_of_metrics removed.""" 43 | for data_name, data in submission_scores.items(): 44 | if data_name in ["param_count", "submission_name"]: 45 | continue 46 | filtered_scores = {k: v for k, v in data.items() if k in list_of_metrics} 47 | submission_scores[data_name] = filtered_scores 48 | return submission_scores 49 | 50 | 51 | def _round_subelements(v): 52 | """traverses object and 
rounds items.""" 53 | if isinstance(v, float): 54 | return round(v, 3) 55 | elif isinstance(v, int) or isinstance(v, str): 56 | return v 57 | elif isinstance(v, dict): 58 | return {k: (round(d, 3) if isinstance(d, float) else d) for k, d in v.items()} 59 | else: 60 | raise ValueError(f"unexpected type: {type(v)}: {v}.") 61 | 62 | 63 | def round_results(submission_scores: dict): 64 | """rounds every metric result to three decimal places.""" 65 | for data_name, data in submission_scores.items(): 66 | if data_name in ["param_count", "submission_name"]: 67 | continue 68 | rounded_scores = {k: _round_subelements(v) for k, v in data.items()} 69 | submission_scores[data_name] = rounded_scores 70 | return submission_scores 71 | 72 | 73 | def filter_submission_output(submission_scores: dict, config: dict): 74 | relevant_metrics = extract_relevant_metrics(config) 75 | filtered_scores = [drop_unnecessary_metrics(d, relevant_metrics) for d in submission_scores] 76 | return [round_results(d) for d in filtered_scores] 77 | 78 | 79 | @app.command() 80 | def run(): 81 | # Download the submission from v1 of the GEM benchmark 82 | gem_v1_url = hf_hub_url( 83 | "GEM-submissions/v1-outputs-and-scores", filename="gem-v1-outputs-and-scores.zip", repo_type="dataset" 84 | ) 85 | gem_v1_path = cached_download(gem_v1_url) 86 | # Load the submissions from v1 87 | with zipfile.ZipFile(gem_v1_path) as zf: 88 | zf.extractall("data") 89 | 90 | gem_v1_files = [p for p in Path(LOCAL_GEM_V1_PATH).glob("*.scores.json")] 91 | gem_v1_submissions = [load_json(p) for p in gem_v1_files] 92 | typer.echo(f"Number of submissions from version 1 of the benchmark: {len(gem_v1_submissions)}") 93 | # Some fields have NaNs which breaks the frontend - replace with -999 as a workaround 94 | gem_v1_scores = [] 95 | for scores in gem_v1_submissions: 96 | for k, v in scores.items(): 97 | if isinstance(v, dict): 98 | for kk, vv in v.items(): 99 | if "msttr" in kk and np.isnan(vv): 100 | scores[k][kk] = -999 101 | gem_v1_scores.append(scores) 102 | # Download submission metadata from the Hub and combine with v1 scores 103 | hub_submissions = get_benchmark_repos(benchmark="gem", repo_type="evaluation", use_auth_token=auth_token) 104 | # Filter out the test submissions 105 | hub_submissions = [sub for sub in hub_submissions if "lewtun" not in sub.id] 106 | all_scores = get_model_index(hub_submissions) 107 | all_scores.extend(gem_v1_scores) 108 | typer.echo(f"Number of raw scores: {len(all_scores)}") 109 | # Clone the Hub repo with the scores 110 | scores_repo = Repository( 111 | local_dir=LOCAL_SCORES_REPO, 112 | clone_from=SCORES_REPO_URL, 113 | repo_type="dataset", 114 | use_auth_token=auth_token, 115 | ) 116 | # Filter the scores for smaller payload to the website / Spaces 117 | eval_config = requests.get(EVAL_CONFIG_URL).json() 118 | filtered_scores = filter_submission_output(all_scores, eval_config) 119 | typer.echo(f"Number of filtered scores: {len(filtered_scores)}") 120 | if len(all_scores) != len(filtered_scores): 121 | raise ValueError("The raw and filtered scores must have the same count!") 122 | # Save and update the raw and filtered scores 123 | save_json(f"{LOCAL_SCORES_REPO}/scores.json", all_scores) 124 | save_json(f"{LOCAL_SCORES_REPO}/filtered_scores.json", filtered_scores) 125 | 126 | if scores_repo.is_repo_clean(): 127 | typer.echo("No new submissions were found! 
Skipping update to the scores repo ...") 128 | else: 129 | scores_repo.git_add() 130 | typer.echo("Pushing scores and outputs to the hub ...") 131 | scores_repo.push_to_hub("Update submission scores") 132 | 133 | # Dumping all scores and outputs - refactor this! 134 | # Clone the Hub repo with the scores 135 | outputs_repo = Repository( 136 | local_dir=LOCAL_OUTPUTS_REPO, 137 | clone_from=OUTPUTS_REPO_URL, 138 | repo_type="dataset", 139 | use_auth_token=auth_token, 140 | ) 141 | 142 | # Load the submissions from v1 143 | gem_v1_scores_files = [p for p in Path(LOCAL_GEM_V1_PATH).glob("*.scores.json")] 144 | gem_v1_outputs_files = [p for p in Path(LOCAL_GEM_V1_PATH).glob("*.outputs.json")] 145 | # Load scores from v2 146 | gem_v2_scores = get_model_index(hub_submissions) 147 | scores_submission_names = [] 148 | gem_v2_scores_files = [] 149 | for score in gem_v2_scores: 150 | submission_name = score["submission_name"] 151 | scores_submission_names.append(submission_name) 152 | filename = f"data/tmp/{submission_name}.scores.json" 153 | gem_v2_scores_files.append(Path(f"data/tmp/{submission_name}.scores.json")) 154 | save_json(filename, score) 155 | 156 | gem_v2_outputs = get_benchmark_repos("gem", use_auth_token=auth_token) 157 | gem_v2_outputs = [s for s in gem_v2_outputs if "lewtun" not in s.id] 158 | gem_v2_outputs_files = [] 159 | 160 | for submission in gem_v2_outputs: 161 | card_data = submission.cardData 162 | submission_name = card_data["submission_name"] 163 | if submission_name in scores_submission_names: 164 | url = hf_hub_url(submission.id, "submission.json", repo_type="dataset") 165 | cache_filepath = cached_download( 166 | url, cache_dir="data/tmp/", force_filename=f"{submission_name}.outputs.json" 167 | ) 168 | gem_v2_outputs_files.append(Path(cache_filepath)) 169 | 170 | with zipfile.ZipFile(f"{LOCAL_OUTPUTS_REPO}/gem-v2-outputs-and-scores.zip", "w") as f: 171 | for path in gem_v1_scores_files: 172 | f.write(path, path.relative_to("data/gem-v1-outputs-and-scores"), compress_type=zipfile.ZIP_DEFLATED) 173 | for path in gem_v1_outputs_files: 174 | f.write(path, path.relative_to("data/gem-v1-outputs-and-scores"), compress_type=zipfile.ZIP_DEFLATED) 175 | for path in gem_v2_outputs_files: 176 | f.write(path, path.relative_to("data/tmp"), compress_type=zipfile.ZIP_DEFLATED) 177 | for path in gem_v2_scores_files: 178 | f.write(path, path.relative_to("data/tmp"), compress_type=zipfile.ZIP_DEFLATED) 179 | 180 | if outputs_repo.is_repo_clean(): 181 | typer.echo("No new outputs were found! Skipping update to the outputs repo ...") 182 | else: 183 | outputs_repo.git_add() 184 | typer.echo("Pushing scores and outputs to the hub ...") 185 | outputs_repo.push_to_hub("Update scores and outputs") 186 | 187 | # Flush local repos 188 | shutil.rmtree(LOCAL_SCORES_REPO, ignore_errors=True) 189 | shutil.rmtree(LOCAL_OUTPUTS_REPO, ignore_errors=True) 190 | 191 | 192 | if __name__ == "__main__": 193 | app() 194 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | --------------------------------------------------------------------------------