├── .github ├── hub │ ├── push_evaluations_to_hub.py │ └── requirements.txt └── workflows │ ├── build_documentation.yml │ ├── build_pr_documentation.yml │ ├── ci.yml │ ├── delete_doc_comment.yml │ ├── python-release.yml │ ├── trufflehog.yml │ └── update_spaces.yml ├── .gitignore ├── AUTHORS ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── Makefile ├── README.md ├── additional-tests-requirements.txt ├── comparisons ├── exact_match │ ├── README.md │ ├── app.py │ ├── exact_match.py │ └── requirements.txt ├── mcnemar │ ├── README.md │ ├── app.py │ ├── mcnemar.py │ └── requirements.txt └── wilcoxon │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── wilcoxon.py ├── docs ├── README.md └── source │ ├── _toctree.yml │ ├── a_quick_tour.mdx │ ├── base_evaluator.mdx │ ├── choosing_a_metric.mdx │ ├── considerations.mdx │ ├── creating_and_sharing.mdx │ ├── custom_evaluator.mdx │ ├── evaluation_suite.mdx │ ├── index.mdx │ ├── installation.mdx │ ├── keras_integrations.md │ ├── package_reference │ ├── evaluator_classes.mdx │ ├── hub_methods.mdx │ ├── loading_methods.mdx │ ├── logging_methods.mdx │ ├── main_classes.mdx │ ├── saving_methods.mdx │ └── visualization_methods.mdx │ ├── sklearn_integrations.mdx │ ├── transformers_integrations.mdx │ └── types_of_evaluations.mdx ├── measurements ├── honest │ ├── README.md │ ├── app.py │ ├── honest.py │ └── requirements.txt ├── label_distribution │ ├── README.md │ ├── app.py │ ├── label_distribution.py │ └── requirements.txt ├── perplexity │ ├── README.md │ ├── app.py │ ├── perplexity.py │ └── requirements.txt ├── regard │ ├── README.md │ ├── app.py │ ├── regard.py │ └── requirements.txt ├── text_duplicates │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── text_duplicates.py ├── toxicity │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── toxicity.py ├── word_count │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── word_count.py └── word_length │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── word_length.py ├── metrics ├── accuracy │ ├── README.md │ ├── accuracy.py │ ├── app.py │ └── requirements.txt ├── bertscore │ ├── README.md │ ├── app.py │ ├── bertscore.py │ └── requirements.txt ├── bleu │ ├── README.md │ ├── app.py │ ├── bleu.py │ ├── requirements.txt │ └── tokenizer_13a.py ├── bleurt │ ├── README.md │ ├── app.py │ ├── bleurt.py │ └── requirements.txt ├── brier_score │ ├── README.md │ ├── app.py │ ├── brier_score.py │ └── requirements.txt ├── cer │ ├── README.md │ ├── app.py │ ├── cer.py │ ├── requirements.txt │ └── test_cer.py ├── character │ ├── README.md │ ├── app.py │ ├── character.py │ └── requirements.txt ├── charcut_mt │ ├── README.md │ ├── app.py │ ├── charcut_mt.py │ └── requirements.txt ├── chrf │ ├── README.md │ ├── app.py │ ├── chrf.py │ └── requirements.txt ├── code_eval │ ├── README.md │ ├── app.py │ ├── code_eval.py │ ├── execute.py │ └── requirements.txt ├── comet │ ├── README.md │ ├── app.py │ ├── comet.py │ └── requirements.txt ├── competition_math │ ├── README.md │ ├── app.py │ ├── competition_math.py │ └── requirements.txt ├── confusion_matrix │ ├── README.md │ ├── app.py │ ├── confusion_matrix.py │ └── requirements.txt ├── coval │ ├── README.md │ ├── app.py │ ├── coval.py │ └── requirements.txt ├── cuad │ ├── README.md │ ├── app.py │ ├── compute_score.py │ ├── cuad.py │ └── requirements.txt ├── exact_match │ ├── README.md │ ├── app.py │ ├── exact_match.py │ └── requirements.txt ├── f1 │ ├── README.md │ ├── app.py │ ├── f1.py │ └── requirements.txt ├── frugalscore │ ├── README.md │ ├── app.py │ 
├── frugalscore.py │ └── requirements.txt ├── glue │ ├── README.md │ ├── app.py │ ├── glue.py │ └── requirements.txt ├── google_bleu │ ├── README.md │ ├── app.py │ ├── google_bleu.py │ ├── requirements.txt │ └── tokenizer_13a.py ├── indic_glue │ ├── README.md │ ├── app.py │ ├── indic_glue.py │ └── requirements.txt ├── mae │ ├── README.md │ ├── app.py │ ├── mae.py │ └── requirements.txt ├── mahalanobis │ ├── README.md │ ├── app.py │ ├── mahalanobis.py │ └── requirements.txt ├── mape │ ├── README.md │ ├── app.py │ ├── mape.py │ └── requirements.txt ├── mase │ ├── README.md │ ├── app.py │ ├── mase.py │ └── requirements.txt ├── matthews_correlation │ ├── README.md │ ├── app.py │ ├── matthews_correlation.py │ └── requirements.txt ├── mauve │ ├── README.md │ ├── app.py │ ├── mauve.py │ └── requirements.txt ├── mean_iou │ ├── README.md │ ├── app.py │ ├── mean_iou.py │ └── requirements.txt ├── meteor │ ├── README.md │ ├── app.py │ ├── meteor.py │ └── requirements.txt ├── mse │ ├── README.md │ ├── app.py │ ├── mse.py │ └── requirements.txt ├── nist_mt │ ├── README.md │ ├── app.py │ ├── nist_mt.py │ ├── requirements.txt │ └── tests.py ├── pearsonr │ ├── README.md │ ├── app.py │ ├── pearsonr.py │ └── requirements.txt ├── perplexity │ ├── README.md │ ├── app.py │ ├── perplexity.py │ └── requirements.txt ├── poseval │ ├── README.md │ ├── app.py │ ├── poseval.py │ └── requirements.txt ├── precision │ ├── README.md │ ├── app.py │ ├── precision.py │ └── requirements.txt ├── r_squared │ ├── README.md │ ├── app.py │ ├── r_squared.py │ └── requirements.txt ├── recall │ ├── README.md │ ├── app.py │ ├── recall.py │ └── requirements.txt ├── rl_reliability │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── rl_reliability.py ├── roc_auc │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── roc_auc.py ├── rouge │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── rouge.py ├── sacrebleu │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── sacrebleu.py ├── sari │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── sari.py ├── seqeval │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── seqeval.py ├── smape │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── smape.py ├── spearmanr │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── spearmanr.py ├── squad │ ├── README.md │ ├── app.py │ ├── compute_score.py │ ├── requirements.txt │ └── squad.py ├── squad_v2 │ ├── README.md │ ├── app.py │ ├── compute_score.py │ ├── requirements.txt │ └── squad_v2.py ├── super_glue │ ├── README.md │ ├── app.py │ ├── record_evaluation.py │ ├── requirements.txt │ └── super_glue.py ├── ter │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── ter.py ├── trec_eval │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── trec_eval.py ├── wer │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── wer.py ├── wiki_split │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── wiki_split.py ├── xnli │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── xnli.py └── xtreme_s │ ├── README.md │ ├── app.py │ ├── requirements.txt │ └── xtreme_s.py ├── setup.cfg ├── setup.py ├── src └── evaluate │ ├── __init__.py │ ├── commands │ ├── __init__.py │ └── evaluate_cli.py │ ├── config.py │ ├── evaluation_suite │ └── __init__.py │ ├── evaluator │ ├── __init__.py │ ├── audio_classification.py │ ├── automatic_speech_recognition.py │ ├── base.py │ ├── image_classification.py │ ├── question_answering.py │ ├── text2text_generation.py │ ├── text_classification.py │ ├── text_generation.py │ ├── 
token_classification.py │ └── utils.py │ ├── hub.py │ ├── info.py │ ├── inspect.py │ ├── loading.py │ ├── module.py │ ├── naming.py │ ├── saving.py │ ├── utils │ ├── __init__.py │ ├── file_utils.py │ ├── gradio.py │ └── logging.py │ └── visualization.py ├── templates ├── cookiecutter.json └── {{ cookiecutter.module_slug }} │ ├── README.md │ ├── app.py │ ├── requirements.txt │ ├── tests.py │ └── {{ cookiecutter.module_slug }}.py └── tests ├── __init__.py ├── conftest.py ├── test_evaluation_suite.py ├── test_evaluator.py ├── test_file_utils.py ├── test_hub.py ├── test_load.py ├── test_metric.py ├── test_metric_common.py ├── test_save.py ├── test_trainer_evaluator_parity.py ├── test_viz.py └── utils.py /.github/hub/requirements.txt: -------------------------------------------------------------------------------- 1 | huggingface_hub -------------------------------------------------------------------------------- /.github/workflows/build_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.sha }} 15 | package: evaluate 16 | secrets: 17 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 18 | -------------------------------------------------------------------------------- /.github/workflows/build_pr_documentation.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: evaluate -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - main 7 | push: 8 | branches: 9 | - main 10 | - ci-* 11 | 12 | env: 13 | HF_ALLOW_CODE_EVAL: 1 14 | 15 | jobs: 16 | 17 | check_code_quality: 18 | runs-on: ubuntu-latest 19 | steps: 20 | - uses: actions/checkout@v3 21 | - name: Set up Python 22 | uses: actions/setup-python@v4 23 | with: 24 | python-version: "3.8" 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install .[quality] 29 | - name: Check quality 30 | run: | 31 | black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements 32 | isort --check-only tests src metrics comparisons measurements 33 | flake8 tests src metrics 34 | 35 | test: 36 | needs: check_code_quality 37 | strategy: 38 | fail-fast: false 39 | matrix: 40 | test: ['unit', 'parity'] 41 | os: [ubuntu-latest, windows-latest] 42 | runs-on: ${{ matrix.os }} 43 | steps: 44 | - uses: actions/checkout@v3 45 | with: 46 | fetch-depth: 0 47 | - name: Set up Python 3.8 48 | uses: actions/setup-python@v4 49 | with: 50 | python-version: "3.8" 51 | - name: Upgrade pip 52 | run: python -m pip install --upgrade pip 53 | - name: Install dependencies 54 | run: | 55 | pip install .[tests] 56 | pip install -r additional-tests-requirements.txt 
--no-deps 57 | - name: Test with pytest 58 | if: ${{ matrix.test == 'unit' }} 59 | run: | 60 | python -m pytest -n 2 --dist loadfile -sv ./tests/ --ignore=./tests/test_trainer_evaluator_parity.py 61 | - name: Integration test with transformers 62 | if: ${{ matrix.test == 'parity' }} 63 | run: | 64 | python -m pytest -n 2 --dist loadfile -sv ./tests/test_trainer_evaluator_parity.py 65 | -------------------------------------------------------------------------------- /.github/workflows/delete_doc_comment.yml: -------------------------------------------------------------------------------- 1 | name: Delete dev documentation 2 | 3 | on: 4 | pull_request: 5 | types: [ closed ] 6 | 7 | 8 | jobs: 9 | delete: 10 | uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main 11 | with: 12 | pr_number: ${{ github.event.number }} 13 | package: evaluate -------------------------------------------------------------------------------- /.github/workflows/python-release.yml: -------------------------------------------------------------------------------- 1 | name: Python release 2 | 3 | on: 4 | push: 5 | tags: 6 | - v* 7 | 8 | env: 9 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST }} 10 | 11 | jobs: 12 | python_release: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - uses: actions/checkout@v2 17 | - name: Set up Python 18 | uses: actions/setup-python@v2 19 | with: 20 | python-version: 3.9 21 | - name: Install dependencies 22 | run: | 23 | pip install --upgrade pip 24 | pip install setuptools wheel 25 | - run: python setup.py sdist bdist_wheel 26 | 27 | - run: | 28 | pip install twine 29 | - name: Upload to PyPi 30 | run: | 31 | twine upload dist/* -u __token__ -p "$PYPI_TOKEN" 32 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Secret Leaks 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: ubuntu-latest 9 | steps: 10 | - name: Checkout code 11 | uses: actions/checkout@v4 12 | with: 13 | fetch-depth: 0 14 | - name: Secret Scanning 15 | uses: trufflesecurity/trufflehog@main 16 | -------------------------------------------------------------------------------- /.github/workflows/update_spaces.yml: -------------------------------------------------------------------------------- 1 | name: Update Hub repositories 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | update-hub-repositories: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout repository 13 | uses: actions/checkout@v2 14 | with: 15 | fetch-depth: 0 16 | - name: Set up Python 17 | uses: actions/setup-python@v2 18 | with: 19 | python-version: "3.8" 20 | - name: Set up default Git config 21 | run: | 22 | git config --global user.name evaluate-bot 23 | git config --global user.email leandro@huggingface.co 24 | - name: Install dependencies 25 | working-directory: ./.github/hub 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install -r requirements.txt 29 | - name: Update Hub repositories 30 | working-directory: ./.github/hub 31 | run: | 32 | export HF_TOKEN=${{ secrets.HF_HUB_TOKEN }} 33 | export EVALUATE_LIB_PATH=$GITHUB_WORKSPACE 34 | export GIT_HASH=$GITHUB_SHA 35 | export GIT_LFS_SKIP_SMUDGE=1 36 | python push_evaluations_to_hub.py -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Locked files 2 | 
*.lock 3 | !dvc.lock 4 | 5 | # Extracted dummy data 6 | datasets/**/dummy_data-zip-extracted/ 7 | 8 | # Compiled python modules. 9 | *.pyc 10 | 11 | # Byte-compiled 12 | _pycache__/ 13 | .cache/ 14 | 15 | # Python egg metadata, regenerated from source files by setuptools. 16 | *.egg-info 17 | .eggs/ 18 | 19 | # PyPI distribution artifacts. 20 | build/ 21 | dist/ 22 | 23 | # Environments 24 | .env 25 | .venv 26 | env/ 27 | venv/ 28 | ENV/ 29 | env.bak/ 30 | venv.bak/ 31 | 32 | # pyenv 33 | .python-version 34 | 35 | # Tests 36 | .pytest_cache/ 37 | 38 | # Other 39 | *.DS_Store 40 | 41 | # PyCharm/vscode 42 | .idea 43 | .vscode 44 | 45 | # keep only the empty datasets and metrics directory with it's __init__.py file 46 | /src/*/datasets/* 47 | !/src/*/datasets/__init__.py 48 | 49 | /src/*/metrics/* 50 | !/src/*/metrics/__init__.py 51 | 52 | # Vim 53 | .*.swp 54 | 55 | # playground 56 | /playground 57 | 58 | # Sphinx documentation 59 | docs/_build/ 60 | docs/source/_build/ 61 | 62 | # Benchmark results 63 | report.json 64 | report.md -------------------------------------------------------------------------------- /AUTHORS: -------------------------------------------------------------------------------- 1 | # This is the list of HuggingFace Datasets authors for copyright purposes. 2 | # 3 | # This does not necessarily list everyone who has contributed code, since in 4 | # some cases, their employer may be the copyright holder. To see the full list 5 | # of contributors, see the revision history in source control. 6 | 7 | Google Inc. 8 | HuggingFace Inc. 9 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: quality style test 2 | 3 | # Check that source code meets quality standards 4 | 5 | quality: 6 | black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements 7 | isort --check-only tests src metrics measurements 8 | flake8 tests src metrics 9 | 10 | # Format source code automatically 11 | 12 | style: 13 | black --line-length 119 --target-version py36 tests src metrics comparisons measurements 14 | isort tests src metrics measurements 15 | 16 | # Run tests for the library 17 | 18 | test: 19 | python -m pytest -n auto --dist=loadfile -s -v ./tests/ 20 | -------------------------------------------------------------------------------- /additional-tests-requirements.txt: -------------------------------------------------------------------------------- 1 | unbabel-comet>=1.0.0;python_version>'3.6' 2 | git+https://github.com/google-research/bleurt.git 3 | git+https://github.com/ns-moosavi/coval.git 4 | git+https://github.com/hendrycks/math.git 5 | git+https://github.com/google-research/rl-reliability-metrics 6 | gin-config -------------------------------------------------------------------------------- /comparisons/exact_match/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Exact Match 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: green 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - comparison 13 | description: >- 14 | Returns the rate at which the predictions of one model exactly match those of another model. 
15 | --- 16 | 17 | 18 | # Comparison Card for Exact Match 19 | 20 | ## Comparison description 21 | 22 | Given two sets of model predictions, the exact match score is 1 for each pair of predictions that is identical and 0 otherwise. The overall exact match score is the average over all pairs. 23 | 24 | - **Example 1**: The exact match score is 1.0 if prediction 1 is [0, 1] and prediction 2 is [0, 1]. 25 | - **Example 2**: The exact match score is 0.0 if prediction 1 is [0, 1] and prediction 2 is [1, 0]. 26 | - **Example 3**: The exact match score is 0.5 if prediction 1 is [0, 1] and prediction 2 is [1, 1]. 27 | 28 | ## How to use 29 | 30 | At minimum, this comparison takes as input the predictions of two models: 31 | ```python 32 | >>> exact_match = evaluate.load("exact_match", module_type="comparison") 33 | >>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1]) 34 | >>> print(results) 35 | {'exact_match': 0.66} 36 | ``` 37 | 38 | ## Output values 39 | 40 | Returns a float between 0.0 and 1.0 inclusive. 41 | 42 | ## Examples 43 | 44 | ```python 45 | >>> exact_match = evaluate.load("exact_match", module_type="comparison") 46 | >>> results = exact_match.compute(predictions1=[0, 0, 0], predictions2=[1, 1, 1]) 47 | >>> print(results) 48 | {'exact_match': 0.0} 49 | ``` 50 | 51 | ```python 52 | >>> exact_match = evaluate.load("exact_match", module_type="comparison") 53 | >>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1]) 54 | >>> print(results) 55 | {'exact_match': 0.66} 56 | ``` 57 | 58 | 59 | ## Limitations and bias 60 | 61 | ## Citations 62 | -------------------------------------------------------------------------------- /comparisons/exact_match/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("exact_match", module_type="comparison") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /comparisons/exact_match/exact_match.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Evaluate Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Exact match test for model comparison.""" 15 | 16 | import datasets 17 | import numpy as np 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Returns the rate at which the predictions of one model exactly match those of another model. 24 | """ 25 | 26 | 27 | _KWARGS_DESCRIPTION = """ 28 | Args: 29 | predictions1 (`list` of `int`): Predicted labels for model 1. 30 | predictions2 (`list` of `int`): Predicted labels for model 2. 31 | 32 | Returns: 33 | exact_match (`float`): The exact match rate, i.e. the proportion of positions at which the two models' predictions agree. Possible values are between 0.0 and 1.0, inclusive.
34 | 35 | Examples: 36 | >>> exact_match = evaluate.load("exact_match", module_type="comparison") 37 | >>> results = exact_match.compute(predictions1=[1, 1, 1], predictions2=[1, 1, 1]) 38 | >>> print(results) 39 | {'exact_match': 1.0} 40 | """ 41 | 42 | 43 | _CITATION = """ 44 | """ 45 | 46 | 47 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 48 | class ExactMatch(evaluate.Comparison): 49 | def _info(self): 50 | return evaluate.ComparisonInfo( 51 | module_type="comparison", 52 | description=_DESCRIPTION, 53 | citation=_CITATION, 54 | inputs_description=_KWARGS_DESCRIPTION, 55 | features=datasets.Features( 56 | { 57 | "predictions1": datasets.Value("int64"), 58 | "predictions2": datasets.Value("int64"), 59 | } 60 | ), 61 | ) 62 | 63 | def _compute(self, predictions1, predictions2): 64 | score_list = [p1 == p2 for p1, p2 in zip(predictions1, predictions2)] 65 | return {"exact_match": np.mean(score_list)} 66 | -------------------------------------------------------------------------------- /comparisons/exact_match/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy -------------------------------------------------------------------------------- /comparisons/mcnemar/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: McNemar 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: green 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - comparison 13 | description: >- 14 | McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with: 15 | McNemar = (SE - SP)**2 / SE + SP 16 | Where: 17 | SE: Sensitivity (Test 1 positive; Test 2 negative) 18 | SP: Specificity (Test 1 negative; Test 2 positive) 19 | --- 20 | 21 | 22 | # Comparison Card for McNemar 23 | 24 | ## Comparison description 25 | 26 | McNemar's test is a non-parametric diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with: 27 | 28 | McNemar = (SE - SP)**2 / SE + SP 29 | 30 | Where: 31 | * SE: Sensitivity (Test 1 positive; Test 2 negative) 32 | * SP: Specificity (Test 1 negative; Test 2 positive) 33 | 34 | In other words, SE and SP are the diagonal elements of the contingency table for the classifier predictions (`predictions1` and `predictions2`) with respect to the ground truth `references`. 35 | 36 | ## How to use 37 | 38 | The McNemar comparison calculates the proportions of responses that exhibit disagreement between two classifiers. It is used to analyze paired nominal data. 39 | 40 | ## Inputs 41 | 42 | Its arguments are: 43 | 44 | `predictions1`: a list of predictions from the first model. 45 | 46 | `predictions2`: a list of predictions from the second model. 47 | 48 | `references`: a list of the ground truth reference labels. 49 | 50 | ## Output values 51 | 52 | The McNemar comparison outputs two things: 53 | 54 | `stat`: The McNemar statistic. 55 | 56 | `p`: The p value. 
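Under the hood, `stat` and `p` come from the off-diagonal counts of the contingency table of the two models' correctness: the squared difference of the two disagreement counts divided by their sum, with the p value taken from a chi-squared distribution with one degree of freedom. The sketch below mirrors the `_compute` logic in `mcnemar.py` in this repository; the helper name `mcnemar_sketch` is illustrative only and not part of the module's API:

```python
from scipy.stats import chi2


def mcnemar_sketch(references, predictions1, predictions2):
    # b: model 1 correct, model 2 wrong; c: model 1 wrong, model 2 correct
    b = sum(p1 == gt and p2 != gt for gt, p1, p2 in zip(references, predictions1, predictions2))
    c = sum(p1 != gt and p2 == gt for gt, p1, p2 in zip(references, predictions1, predictions2))
    stat = abs(b - c) ** 2 / (b + c)  # McNemar statistic from the off-diagonal counts
    p = chi2.sf(stat, 1)              # survival function of chi-squared with 1 degree of freedom
    return {"stat": stat, "p": p}


print(mcnemar_sketch(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1]))
# {'stat': 1.0, 'p': 0.31731050786291115}
```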
57 | 58 | ## Examples 59 | 60 | Example comparison: 61 | 62 | ```python 63 | mcnemar = evaluate.load("mcnemar") 64 | results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1]) 65 | print(results) 66 | {'stat': 1.0, 'p': 0.31731050786291115} 67 | ``` 68 | 69 | ## Limitations and bias 70 | 71 | The McNemar test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired nominal data only. 72 | 73 | ## Citations 74 | 75 | ```bibtex 76 | @article{mcnemar1947note, 77 | title={Note on the sampling error of the difference between correlated proportions or percentages}, 78 | author={McNemar, Quinn}, 79 | journal={Psychometrika}, 80 | volume={12}, 81 | number={2}, 82 | pages={153--157}, 83 | year={1947}, 84 | publisher={Springer-Verlag} 85 | } 86 | ``` 87 | -------------------------------------------------------------------------------- /comparisons/mcnemar/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("mcnemar", module_type="comparison") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /comparisons/mcnemar/mcnemar.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Evaluate Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """McNemar test for model comparison.""" 15 | 16 | import datasets 17 | from scipy.stats import chi2 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same group reference labels. It can be computed with: 24 | McNemar = (SE - SP)**2 / SE + SP 25 | Where: 26 | SE: Sensitivity (Test 1 positive; Test 2 negative) 27 | SP: Specificity (Test 1 negative; Test 2 positive) 28 | """ 29 | 30 | 31 | _KWARGS_DESCRIPTION = """ 32 | Args: 33 | predictions1 (`list` of `int`): Predicted labels for model 1. 34 | predictions2 (`list` of `int`): Predicted labels for model 2. 35 | references (`list` of `int`): Ground truth labels. 36 | 37 | Returns: 38 | stat (`float`): McNemar test score. 39 | p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference. 
40 | 41 | Examples: 42 | >>> mcnemar = evaluate.load("mcnemar") 43 | >>> results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1]) 44 | >>> print(results) 45 | {'stat': 1.0, 'p': 0.31731050786291115} 46 | """ 47 | 48 | 49 | _CITATION = """ 50 | @article{mcnemar1947note, 51 | title={Note on the sampling error of the difference between correlated proportions or percentages}, 52 | author={McNemar, Quinn}, 53 | journal={Psychometrika}, 54 | volume={12}, 55 | number={2}, 56 | pages={153--157}, 57 | year={1947}, 58 | publisher={Springer-Verlag} 59 | } 60 | """ 61 | 62 | 63 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 64 | class McNemar(evaluate.Comparison): 65 | def _info(self): 66 | return evaluate.ComparisonInfo( 67 | module_type="comparison", 68 | description=_DESCRIPTION, 69 | citation=_CITATION, 70 | inputs_description=_KWARGS_DESCRIPTION, 71 | features=datasets.Features( 72 | { 73 | "predictions1": datasets.Value("int64"), 74 | "predictions2": datasets.Value("int64"), 75 | "references": datasets.Value("int64"), 76 | } 77 | ), 78 | ) 79 | 80 | def _compute(self, predictions1, predictions2, references): 81 | # construct contingency table 82 | tbl = [[0, 0], [0, 0]] 83 | for gt, p1, p2 in zip(references, predictions1, predictions2): 84 | if p1 == gt and p2 == gt: 85 | tbl[0][0] += 1 86 | elif p1 == gt: 87 | tbl[0][1] += 1 88 | elif p2 == gt: 89 | tbl[1][0] += 1 90 | else: 91 | tbl[1][1] += 1 92 | 93 | # compute statistic 94 | b, c = tbl[0][1], tbl[1][0] 95 | statistic = abs(b - c) ** 2 / (1.0 * (b + c)) 96 | df = 1 97 | pvalue = chi2.sf(statistic, df) 98 | return {"stat": statistic, "p": pvalue} 99 | -------------------------------------------------------------------------------- /comparisons/mcnemar/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy -------------------------------------------------------------------------------- /comparisons/wilcoxon/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Wilcoxon 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: green 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - comparison 13 | description: >- 14 | Wilcoxon's test is a signed-rank test for comparing paired samples. 15 | --- 16 | 17 | 18 | # Comparison Card for Wilcoxon 19 | 20 | ## Comparison description 21 | 22 | Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models. 23 | 24 | ## How to use 25 | 26 | The Wilcoxon comparison is used to analyze paired ordinal data. 27 | 28 | ## Inputs 29 | 30 | Its arguments are: 31 | 32 | `predictions1`: a list of predictions from the first model. 33 | 34 | `predictions2`: a list of predictions from the second model. 35 | 36 | ## Output values 37 | 38 | The Wilcoxon comparison outputs two things: 39 | 40 | `stat`: The Wilcoxon statistic. 41 | 42 | `p`: The p value. 
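Both values come directly from SciPy's signed-rank test applied to the element-wise differences between the two prediction lists, mirroring `wilcoxon.py` in this repository. A minimal sketch, using the same inputs as the example below:

```python
from scipy.stats import wilcoxon

predictions1 = [-7, 123.45, 43, 4.91, 5]
predictions2 = [1337.12, -9.74, 1, 2, 3.21]

# Paired differences between the two models' predictions
differences = [p1 - p2 for p1, p2 in zip(predictions1, predictions2)]

res = wilcoxon(differences)
print({"stat": res.statistic, "p": res.pvalue})
# {'stat': 5.0, 'p': 0.625}
```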
43 | 44 | ## Examples 45 | 46 | Example comparison: 47 | 48 | ```python 49 | wilcoxon = evaluate.load("wilcoxon") 50 | results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21]) 51 | print(results) 52 | {'stat': 5.0, 'p': 0.625} 53 | ``` 54 | 55 | ## Limitations and bias 56 | 57 | The Wilcoxon test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired ordinal data only. 58 | 59 | ## Citations 60 | 61 | ```bibtex 62 | @incollection{wilcoxon1992individual, 63 | title={Individual comparisons by ranking methods}, 64 | author={Wilcoxon, Frank}, 65 | booktitle={Breakthroughs in statistics}, 66 | pages={196--202}, 67 | year={1992}, 68 | publisher={Springer} 69 | } 70 | ``` 71 | -------------------------------------------------------------------------------- /comparisons/wilcoxon/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("wilcoxon", module_type="comparison") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /comparisons/wilcoxon/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@a45df1eb9996eec64ec3282ebe554061cb366388 2 | datasets~=2.0 3 | scipy 4 | -------------------------------------------------------------------------------- /comparisons/wilcoxon/wilcoxon.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Evaluate Authors 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Wilcoxon test for model comparison.""" 15 | 16 | import datasets 17 | from scipy.stats import wilcoxon 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models. 24 | """ 25 | 26 | 27 | _KWARGS_DESCRIPTION = """ 28 | Args: 29 | predictions1 (`list` of `float`): Predictions for model 1. 30 | predictions2 (`list` of `float`): Predictions for model 2. 31 | 32 | Returns: 33 | stat (`float`): Wilcoxon test score. 34 | p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference. 
35 | 36 | Examples: 37 | >>> wilcoxon = evaluate.load("wilcoxon") 38 | >>> results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21]) 39 | >>> print(results) 40 | {'stat': 5.0, 'p': 0.625} 41 | """ 42 | 43 | 44 | _CITATION = """ 45 | @incollection{wilcoxon1992individual, 46 | title={Individual comparisons by ranking methods}, 47 | author={Wilcoxon, Frank}, 48 | booktitle={Breakthroughs in statistics}, 49 | pages={196--202}, 50 | year={1992}, 51 | publisher={Springer} 52 | } 53 | """ 54 | 55 | 56 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 57 | class Wilcoxon(evaluate.Comparison): 58 | def _info(self): 59 | return evaluate.ComparisonInfo( 60 | module_type="comparison", 61 | description=_DESCRIPTION, 62 | citation=_CITATION, 63 | inputs_description=_KWARGS_DESCRIPTION, 64 | features=datasets.Features( 65 | { 66 | "predictions1": datasets.Value("float"), 67 | "predictions2": datasets.Value("float"), 68 | } 69 | ), 70 | ) 71 | 72 | def _compute(self, predictions1, predictions2): 73 | # calculate difference 74 | d = [p1 - p2 for (p1, p2) in zip(predictions1, predictions2)] 75 | 76 | # compute statistic 77 | res = wilcoxon(d) 78 | return {"stat": res.statistic, "p": res.pvalue} 79 | -------------------------------------------------------------------------------- /docs/source/_toctree.yml: -------------------------------------------------------------------------------- 1 | - sections: 2 | - local: index 3 | title: 🤗 Evaluate 4 | title: Get started 5 | - sections: 6 | - local: installation 7 | title: Installation 8 | - local: a_quick_tour 9 | title: A quick tour 10 | title: Tutorials 11 | - sections: 12 | - local: choosing_a_metric 13 | title: Choosing the right metric 14 | - local: creating_and_sharing 15 | title: Adding new evaluations 16 | - local: base_evaluator 17 | title: Using the evaluator 18 | - local: custom_evaluator 19 | title: Using the evaluator with custom pipelines 20 | - local: evaluation_suite 21 | title: Creating an EvaluationSuite 22 | - sections: 23 | - local: transformers_integrations 24 | title: Transformers 25 | - local: keras_integrations 26 | title: Keras and Tensorflow 27 | - local: sklearn_integrations 28 | title: scikit-learn 29 | title: Using 🤗 Evaluate with other ML frameworks 30 | title: "How-to guides" 31 | - sections: 32 | - local: types_of_evaluations 33 | title: Types of evaluations 34 | - local: considerations 35 | title: Considerations for model evaluation 36 | title: "Conceptual guides" 37 | - sections: 38 | - local: package_reference/main_classes 39 | title: Main classes 40 | - local: package_reference/loading_methods 41 | title: Loading methods 42 | - local: package_reference/saving_methods 43 | title: Saving methods 44 | - local: package_reference/hub_methods 45 | title: Hub methods 46 | - local: package_reference/evaluator_classes 47 | title: Evaluator classes 48 | - local: package_reference/visualization_methods 49 | title: Visualization methods 50 | - local: package_reference/logging_methods 51 | title: Logging methods 52 | title: "Reference" 53 | -------------------------------------------------------------------------------- /docs/source/evaluation_suite.mdx: -------------------------------------------------------------------------------- 1 | # Creating an EvaluationSuite 2 | 3 | It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. 
Assessing the model on several types of tasks can reveal gaps in performance along some axis. For example, when training a language model, it is often useful to measure perplexity on an in-domain corpus, but also to concurrently evaluate on tasks which test for general language capabilities like natural language entailment or question-answering, or tasks designed to probe the model along fairness and bias dimensions. 4 | 5 | The `EvaluationSuite` provides a way to compose any number of ([evaluator](base_evaluator), dataset, metric) tuples as a SubTask to evaluate a model on a collection of several evaluation tasks. See the [evaluator documentation](base_evaluator) for a list of currently supported tasks. 6 | 7 | A new `EvaluationSuite` is made up of a list of `SubTask` classes, each defining an evaluation task. The Python file containing the definition can be uploaded to a Space on the Hugging Face Hub so it can be shared with the community or saved/loaded locally as a Python script. 8 | 9 | Some datasets require additional preprocessing before passing them to an `Evaluator`. You can set a `data_preprocessor` for each `SubTask` which is applied via a `map` operation using the `datasets` library. Keyword arguments for the `Evaluator` can be passed down through the `args_for_task` attribute. 10 | 11 | To create a new `EvaluationSuite`, create a [new Space](https://huggingface.co/new-space) with a .py file which matches the name of the Space, add the below template to a Python file, and fill in the attributes for a new task. 12 | 13 | The mandatory attributes for a new `SubTask` are `task_type` and `data`. 14 | 1. [`task_type`] maps to the tasks currently supported by the Evaluator. 15 | 2. [`data`] can be an instantiated Hugging Face dataset object or the name of a dataset. 16 | 3. [`subset`] and [`split`] can be used to define which name and split of the dataset should be used for evaluation. 17 | 4. [`args_for_task`] should be a dictionary with kwargs to be passed to the Evaluator. 18 | 19 | ```python 20 | import evaluate 21 | from evaluate.evaluation_suite import SubTask 22 | 23 | class Suite(evaluate.EvaluationSuite): 24 | 25 | def __init__(self, name): 26 | super().__init__(name) 27 | self.preprocessor = lambda x: {"text": x["text"].lower()} 28 | self.suite = [ 29 | SubTask( 30 | task_type="text-classification", 31 | data="glue", 32 | subset="sst2", 33 | split="validation[:10]", 34 | args_for_task={ 35 | "metric": "accuracy", 36 | "input_column": "sentence", 37 | "label_column": "label", 38 | "label_mapping": { 39 | "LABEL_0": 0.0, 40 | "LABEL_1": 1.0 41 | } 42 | } 43 | ), 44 | SubTask( 45 | task_type="text-classification", 46 | data="glue", 47 | subset="rte", 48 | split="validation[:10]", 49 | args_for_task={ 50 | "metric": "accuracy", 51 | "input_column": "sentence1", 52 | "second_input_column": "sentence2", 53 | "label_column": "label", 54 | "label_mapping": { 55 | "LABEL_0": 0, 56 | "LABEL_1": 1 57 | } 58 | } 59 | ) 60 | ] 61 | ``` 62 | 63 | An `EvaluationSuite` can be loaded by name from the Hugging Face Hub, or locally by providing a path, and run with the `run(model_or_pipeline)` method. The evaluation results are returned along with their task names and information about the time it took to obtain predictions through the pipeline. 
These can be easily displayed with a `pandas.DataFrame`: 64 | 65 | ``` 66 | >>> from evaluate import EvaluationSuite 67 | >>> suite = EvaluationSuite.load('mathemakitten/glue-evaluation-suite') 68 | >>> results = suite.run("gpt2") 69 | ``` 70 | 71 | | accuracy | total_time_in_seconds | samples_per_second | latency_in_seconds | task_name | 72 | |-----------:|------------------------:|---------------------:|---------------------:|:------------| 73 | | 0.5 | 0.740811 | 13.4987 | 0.0740811 | glue/sst2 | 74 | | 0.4 | 1.67552 | 5.9683 | 0.167552 | glue/rte | 75 | -------------------------------------------------------------------------------- /docs/source/index.mdx: -------------------------------------------------------------------------------- 1 |

2 |
3 | 4 |
5 |

6 | 7 | # 🤗 Evaluate 8 | 9 | A library for easily evaluating machine learning models and datasets. 10 | 11 | With a single line of code, you get access to dozens of evaluation methods for different domains (NLP, Computer Vision, Reinforcement Learning, and more!). Be it on your local machine or in a distributed training setup, you can evaluate your models in a consistent and reproducible way! 12 | 13 | Visit the 🤗 Evaluate [organization](https://huggingface.co/evaluate-metric) for a full list of available metrics. Each metric has a dedicated Space with an interactive demo for how to use the metric, and a documentation card detailing the metrics limitations and usage. 14 | 15 | > **Tip:** For more recent evaluation approaches, for example for evaluating LLMs, we recommend our newer and more actively maintained library [LightEval](https://github.com/huggingface/lighteval). 16 | 17 |
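As a minimal example of that single line of code (shown here with the `accuracy` metric; any metric from the organization above is loaded the same way):

```python
import evaluate

accuracy = evaluate.load("accuracy")
print(accuracy.compute(references=[0, 1, 1], predictions=[0, 1, 1]))
# {'accuracy': 1.0}
```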
18 |
19 |
- **Tutorials**: Learn the basics and become familiar with loading, computing, and saving with 🤗 Evaluate. Start here if you are using 🤗 Evaluate for the first time!
- **How-to guides**: Practical guides to help you achieve a specific goal. Take a look at these guides to learn how to use 🤗 Evaluate to solve real-world problems.
- **Conceptual guides**: High-level explanations for building a better understanding of important topics such as considerations going into evaluating a model or dataset and the difference between metrics, measurements, and comparisons.
- **Reference**: Technical descriptions of how 🤗 Evaluate classes and methods work.
37 | -------------------------------------------------------------------------------- /docs/source/installation.mdx: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | Before you start, you will need to setup your environment and install the appropriate packages. 🤗 Evaluate is tested on **Python 3.7+**. 4 | 5 | ## Virtual environment 6 | 7 | You should install 🤗 Evaluate in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep everything neat and tidy. 8 | 9 | 1. Create and navigate to your project directory: 10 | 11 | ```bash 12 | mkdir ~/my-project 13 | cd ~/my-project 14 | ``` 15 | 16 | 2. Start a virtual environment inside the directory: 17 | 18 | ```bash 19 | python -m venv .env 20 | ``` 21 | 22 | 3. Activate and deactivate the virtual environment with the following commands: 23 | 24 | ```bash 25 | # Activate the virtual environment 26 | source .env/bin/activate 27 | 28 | # Deactivate the virtual environment 29 | source .env/bin/deactivate 30 | ``` 31 | 32 | Once you have created your virtual environment, you can install 🤗 Evaluate in it. 33 | 34 | ## pip 35 | 36 | The most straightforward way to install 🤗 Evaluate is with pip: 37 | 38 | ```bash 39 | pip install evaluate 40 | ``` 41 | 42 | Run the following command to check if 🤗 Evaluate has been properly installed: 43 | 44 | ```bash 45 | python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))" 46 | ``` 47 | 48 | This should return: 49 | 50 | ```bash 51 | {'exact_match': 1.0} 52 | ``` 53 | 54 | ## source 55 | 56 | Building 🤗 Evaluate from source lets you make changes to the code base. To install from source, clone the repository and install with the following commands: 57 | 58 | ```bash 59 | git clone https://github.com/huggingface/evaluate.git 60 | cd evaluate 61 | pip install -e . 62 | ``` 63 | 64 | Again, you can check if 🤗 Evaluate has been properly installed with: 65 | 66 | ```bash 67 | python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))" 68 | ``` -------------------------------------------------------------------------------- /docs/source/keras_integrations.md: -------------------------------------------------------------------------------- 1 | # Working with Keras and Tensorflow 2 | 3 | 4 | 5 | Evaluate can be easily intergrated into your Keras and Tensorflow workflow. We'll demonstrate two ways of incorporating Evaluate into model training, using the Fashion MNIST example dataset. We'll train a standard classifier to predict two classes from this dataset, and show how to use a metric as a callback during training or afterwards for evaluation. 
6 | 7 | 8 | ```python 9 | import numpy as np 10 | from tensorflow import keras 11 | from tensorflow.keras import layers 12 | import evaluate 13 | 14 | # We pull example code from Keras.io's guide on classifying with MNIST 15 | # Located here: https://keras.io/examples/vision/mnist_convnet/ 16 | 17 | # Model / data parameters 18 | input_shape = (28, 28, 1) 19 | 20 | # Load the data and split it between train and test sets 21 | (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data() 22 | 23 | 24 | # Only select tshirts/tops and trousers, classes 0 and 1 25 | def get_tshirts_tops_and_trouser(x_vals, y_vals): 26 | mask = np.where((y_vals == 0) | (y_vals == 1)) 27 | return x_vals[mask], y_vals[mask] 28 | 29 | x_train, y_train = get_tshirts_tops_and_trouser(x_train, y_train) 30 | x_test, y_test = get_tshirts_tops_and_trouser(x_test, y_test) 31 | 32 | 33 | # Scale images to the [0, 1] range 34 | x_train = x_train.astype("float32") / 255 35 | x_test = x_test.astype("float32") / 255 36 | 37 | x_train = np.expand_dims(x_train, -1) 38 | x_test = np.expand_dims(x_test, -1) 39 | 40 | 41 | model = keras.Sequential( 42 | [ 43 | keras.Input(shape=input_shape), 44 | layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), 45 | layers.MaxPooling2D(pool_size=(2, 2)), 46 | layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), 47 | layers.MaxPooling2D(pool_size=(2, 2)), 48 | layers.Flatten(), 49 | layers.Dropout(0.5), 50 | layers.Dense(1, activation="sigmoid"), 51 | ] 52 | ) 53 | ``` 54 | 55 | ## Callbacks 56 | 57 | Suppose we want to keep track of model metrics while a model is training. We can use a Callback in order to calculate this metric during training, after an epoch ends. 58 | 59 | We'll define a callback here that will take a metric name and our training data, and have it calculate a metric after the epoch ends. 60 | 61 | 62 | ```python 63 | class MetricsCallback(keras.callbacks.Callback): 64 | 65 | def __init__(self, metric_name, x_data, y_data) -> None: 66 | super(MetricsCallback, self).__init__() 67 | 68 | self.x_data = x_data 69 | self.y_data = y_data 70 | self.metric_name = metric_name 71 | self.metric = evaluate.load(metric_name) 72 | 73 | def on_epoch_end(self, epoch, logs=dict()): 74 | m = self.model 75 | # Ensure we get labels of "1" or "0" 76 | training_preds = np.round(m.predict(self.x_data)) 77 | training_labels = self.y_data 78 | 79 | # Compute score and save 80 | score = self.metric.compute(predictions = training_preds, references = training_labels) 81 | 82 | logs.update(score) 83 | ``` 84 | 85 | We can pass this class to the `callbacks` keyword-argument to use it during training: 86 | 87 | 88 | ```python 89 | batch_size = 128 90 | epochs = 2 91 | 92 | model.compile(loss="binary_crossentropy", optimizer="adam") 93 | 94 | model_history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1, 95 | callbacks = [MetricsCallback(x_data = x_train, y_data = y_train, metric_name = "accuracy")]) 96 | ``` 97 | 98 | ## Using an Evaluate Metric for... Evaluation! 99 | 100 | We can also use the same metric after model training! 
Here, we show how to check accuracy of the model after training on the test set: 101 | 102 | 103 | ```python 104 | acc = evaluate.load("accuracy") 105 | # Round the predictions to turn them into "0" or "1" labels 106 | test_preds = np.round(model.predict(x_test)) 107 | test_labels = y_test 108 | ``` 109 | 110 | ```python 111 | print("Test accuracy is : ", acc.compute(predictions = test_preds, references = test_labels)) 112 | # Test accuracy is : 0.9855 113 | ``` 114 | -------------------------------------------------------------------------------- /docs/source/package_reference/evaluator_classes.mdx: -------------------------------------------------------------------------------- 1 | # Evaluator 2 | 3 | The evaluator classes for automatic evaluation. 4 | 5 | ## Evaluator classes 6 | 7 | The main entry point for using the evaluator: 8 | 9 | [[autodoc]] evaluate.evaluator 10 | 11 | The base class for all evaluator classes: 12 | 13 | [[autodoc]] evaluate.Evaluator 14 | 15 | ## The task specific evaluators 16 | 17 | ### ImageClassificationEvaluator 18 | 19 | [[autodoc]] evaluate.ImageClassificationEvaluator 20 | 21 | ### QuestionAnsweringEvaluator 22 | 23 | [[autodoc]] evaluate.QuestionAnsweringEvaluator 24 | - compute 25 | 26 | ### TextClassificationEvaluator 27 | 28 | [[autodoc]] evaluate.TextClassificationEvaluator 29 | 30 | ### TokenClassificationEvaluator 31 | 32 | [[autodoc]] evaluate.TokenClassificationEvaluator 33 | - compute 34 | 35 | ### TextGenerationEvaluator 36 | 37 | [[autodoc]] evaluate.TextGenerationEvaluator 38 | - compute 39 | 40 | ### Text2TextGenerationEvaluator 41 | 42 | [[autodoc]] evaluate.Text2TextGenerationEvaluator 43 | - compute 44 | 45 | ### SummarizationEvaluator 46 | 47 | [[autodoc]] evaluate.SummarizationEvaluator 48 | - compute 49 | 50 | ### TranslationEvaluator 51 | 52 | [[autodoc]] evaluate.TranslationEvaluator 53 | - compute 54 | 55 | ### AutomaticSpeechRecognitionEvaluator 56 | 57 | [[autodoc]] evaluate.AutomaticSpeechRecognitionEvaluator 58 | - compute 59 | 60 | ### AudioClassificationEvaluator 61 | 62 | [[autodoc]] evaluate.AudioClassificationEvaluator 63 | - compute -------------------------------------------------------------------------------- /docs/source/package_reference/hub_methods.mdx: -------------------------------------------------------------------------------- 1 | # Hub methods 2 | 3 | Methods for using the Hugging Face Hub: 4 | 5 | ## Push to hub 6 | 7 | [[autodoc]] evaluate.push_to_hub 8 | 9 | -------------------------------------------------------------------------------- /docs/source/package_reference/loading_methods.mdx: -------------------------------------------------------------------------------- 1 | # Loading methods 2 | 3 | Methods for listing and loading evaluation modules: 4 | 5 | ## List 6 | 7 | [[autodoc]] evaluate.list_evaluation_modules 8 | 9 | ## Load 10 | 11 | [[autodoc]] evaluate.load 12 | -------------------------------------------------------------------------------- /docs/source/package_reference/logging_methods.mdx: -------------------------------------------------------------------------------- 1 | # Logging methods 2 | 3 | 🤗 Evaluate strives to be transparent and explicit about how it works, but this can be quite verbose at times. We have included a series of logging methods which allow you to easily adjust the level of verbosity of the entire library. Currently the default verbosity of the library is set to `WARNING`. 4 | 5 | To change the level of verbosity, use one of the direct setters. 
For instance, here is how to change the verbosity to the `INFO` level: 6 | 7 | ```py 8 | import evaluate 9 | evaluate.logging.set_verbosity_info() 10 | ``` 11 | 12 | You can also use the environment variable `EVALUATE_VERBOSITY` to override the default verbosity, and set it to one of the following: `debug`, `info`, `warning`, `error`, `critical`: 13 | 14 | ```bash 15 | EVALUATE_VERBOSITY=error ./myprogram.py 16 | ``` 17 | 18 | All the methods of this logging module are documented below. The main ones are: 19 | 20 | - [`logging.get_verbosity`] to get the current level of verbosity in the logger 21 | - [`logging.set_verbosity`] to set the verbosity to the level of your choice 22 | 23 | In order from the least to the most verbose (with their corresponding `int` values): 24 | 25 | 1. `logging.CRITICAL` or `logging.FATAL` (int value, 50): only report the most critical errors. 26 | 2. `logging.ERROR` (int value, 40): only report errors. 27 | 3. `logging.WARNING` or `logging.WARN` (int value, 30): only reports error and warnings. This the default level used by the library. 28 | 4. `logging.INFO` (int value, 20): reports error, warnings and basic information. 29 | 5. `logging.DEBUG` (int value, 10): report all information. 30 | 31 | By default, `tqdm` progress bars will be displayed during evaluate download and processing. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to suppress or unsuppress this behavior. 32 | 33 | ## Functions 34 | 35 | [[autodoc]] evaluate.logging.get_verbosity 36 | 37 | [[autodoc]] evaluate.logging.set_verbosity 38 | 39 | [[autodoc]] evaluate.logging.set_verbosity_info 40 | 41 | [[autodoc]] evaluate.logging.set_verbosity_warning 42 | 43 | [[autodoc]] evaluate.logging.set_verbosity_debug 44 | 45 | [[autodoc]] evaluate.logging.set_verbosity_error 46 | 47 | [[autodoc]] evaluate.logging.disable_propagation 48 | 49 | [[autodoc]] evaluate.logging.enable_propagation 50 | 51 | [[autodoc]] evaluate.logging.get_logger 52 | 53 | [[autodoc]] evaluate.logging.enable_progress_bar 54 | 55 | [[autodoc]] evaluate.logging.disable_progress_bar 56 | 57 | ## Levels 58 | 59 | ### evaluate.logging.CRITICAL 60 | 61 | evaluate.logging.CRITICAL = 50 62 | 63 | ### evaluate.logging.DEBUG 64 | 65 | evaluate.logging.DEBUG = 10 66 | 67 | ### evaluate.logging.ERROR 68 | 69 | evaluate.logging.ERROR = 40 70 | 71 | ### evaluate.logging.FATAL 72 | 73 | evaluate.logging.FATAL = 50 74 | 75 | ### evaluate.logging.INFO 76 | 77 | evaluate.logging.INFO = 20 78 | 79 | ### evaluate.logging.NOTSET 80 | 81 | evaluate.logging.NOTSET = 0 82 | 83 | ### evaluate.logging.WARN 84 | 85 | evaluate.logging.WARN = 30 86 | 87 | ### evaluate.logging.WARNING 88 | 89 | evaluate.logging.WARNING = 30 90 | -------------------------------------------------------------------------------- /docs/source/package_reference/main_classes.mdx: -------------------------------------------------------------------------------- 1 | # Main classes 2 | 3 | ## EvaluationModuleInfo 4 | 5 | The base class `EvaluationModuleInfo` implements a the logic for the subclasses `MetricInfo`, `ComparisonInfo`, and `MeasurementInfo`. 6 | 7 | [[autodoc]] evaluate.EvaluationModuleInfo 8 | 9 | [[autodoc]] evaluate.MetricInfo 10 | 11 | [[autodoc]] evaluate.ComparisonInfo 12 | 13 | [[autodoc]] evaluate.MeasurementInfo 14 | 15 | ## EvaluationModule 16 | 17 | The base class `EvaluationModule` implements a the logic for the subclasses `Metric`, `Comparison`, and `Measurement`. 
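For orientation, all three subclasses expose the same `add`, `add_batch`, and `compute` interface; a minimal sketch using a metric loaded from the Hub:

```python
import evaluate

metric = evaluate.load("accuracy")  # returns a Metric, a subclass of EvaluationModule
metric.add_batch(references=[0, 1], predictions=[0, 1])
print(metric.compute())
# {'accuracy': 1.0}
```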
18 | 19 | [[autodoc]] evaluate.EvaluationModule 20 | 21 | [[autodoc]] evaluate.Metric 22 | 23 | [[autodoc]] evaluate.Comparison 24 | 25 | [[autodoc]] evaluate.Measurement 26 | 27 | ## CombinedEvaluations 28 | 29 | The `combine` function allows to combine multiple `EvaluationModule`s into a single `CombinedEvaluations`. 30 | 31 | [[autodoc]] evaluate.combine 32 | 33 | [[autodoc]] CombinedEvaluations 34 | -------------------------------------------------------------------------------- /docs/source/package_reference/saving_methods.mdx: -------------------------------------------------------------------------------- 1 | # Saving methods 2 | 3 | Methods for saving evaluations results: 4 | 5 | ## Save 6 | 7 | [[autodoc]] evaluate.save 8 | 9 | -------------------------------------------------------------------------------- /docs/source/package_reference/visualization_methods.mdx: -------------------------------------------------------------------------------- 1 | # Visualization methods 2 | 3 | Methods for visualizing evaluations results: 4 | 5 | ## Radar Plot 6 | 7 | [[autodoc]] evaluate.visualization.radar_plot 8 | -------------------------------------------------------------------------------- /docs/source/sklearn_integrations.mdx: -------------------------------------------------------------------------------- 1 | # Scikit-Learn 2 | 3 | To run the scikit-learn examples make sure you have installed the following library: 4 | 5 | ```bash 6 | pip install -U scikit-learn 7 | ``` 8 | 9 | The metrics in `evaluate` can be easily integrated with an Scikit-Learn estimator or [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline). 10 | 11 | However, these metrics require that we generate the predictions from the model. The predictions and labels from the estimators can be passed to `evaluate` mertics to compute the required values. 12 | 13 | ```python 14 | import numpy as np 15 | np.random.seed(0) 16 | import evaluate 17 | from sklearn.compose import ColumnTransformer 18 | from sklearn.datasets import fetch_openml 19 | from sklearn.pipeline import Pipeline 20 | from sklearn.impute import SimpleImputer 21 | from sklearn.preprocessing import StandardScaler, OneHotEncoder 22 | from sklearn.linear_model import LogisticRegression 23 | from sklearn.model_selection import train_test_split 24 | ``` 25 | 26 | Load data from https://www.openml.org/d/40945: 27 | 28 | ```python 29 | X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True) 30 | ``` 31 | 32 | Alternatively X and y can be obtained directly from the frame attribute: 33 | 34 | ```python 35 | X = titanic.frame.drop('survived', axis=1) 36 | y = titanic.frame['survived'] 37 | ``` 38 | 39 | We create the preprocessing pipelines for both numeric and categorical data. Note that pclass could either be treated as a categorical or numeric feature. 40 | 41 | ```python 42 | numeric_features = ["age", "fare"] 43 | numeric_transformer = Pipeline( 44 | steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())] 45 | ) 46 | 47 | categorical_features = ["embarked", "sex", "pclass"] 48 | categorical_transformer = OneHotEncoder(handle_unknown="ignore") 49 | 50 | preprocessor = ColumnTransformer( 51 | transformers=[ 52 | ("num", numeric_transformer, numeric_features), 53 | ("cat", categorical_transformer, categorical_features), 54 | ] 55 | ) 56 | ``` 57 | 58 | Append classifier to preprocessing pipeline. Now we have a full prediction pipeline. 
59 | 60 | ```python 61 | clf = Pipeline( 62 | steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())] 63 | ) 64 | 65 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) 66 | 67 | clf.fit(X_train, y_train) 68 | y_pred = clf.predict(X_test) 69 | ``` 70 | 71 | As `Evaluate` metrics use lists as inputs for references and predictions, we need to convert them to Python lists. 72 | 73 | 74 | ```python 75 | # Evaluate metrics accept lists as inputs for values of references and predictions 76 | 77 | y_test = y_test.tolist() 78 | y_pred = y_pred.tolist() 79 | 80 | # Accuracy 81 | 82 | accuracy_metric = evaluate.load("accuracy") 83 | accuracy = accuracy_metric.compute(references=y_test, predictions=y_pred) 84 | print("Accuracy:", accuracy) 85 | # Accuracy: 0.79 86 | ``` 87 | 88 | You can use any suitable `evaluate` metric with the estimators as long as they are compatible with the task and predictions. 89 | -------------------------------------------------------------------------------- /docs/source/types_of_evaluations.mdx: -------------------------------------------------------------------------------- 1 | # Types of Evaluations in 🤗 Evaluate 2 | 3 | The goal of the 🤗 Evaluate library is to support different types of evaluation, depending on different goals, datasets and models. 4 | 5 | Here are the types of evaluations that are currently supported with a few examples for each: 6 | 7 | ## Metrics 8 | A metric measures the performance of a model on a given dataset. This is often based on an existing ground truth (i.e. a set of references), but there are also *referenceless metrics* which allow evaluating generated text by leveraging a pretrained model such as [GPT-2](https://huggingface.co/gpt2). 9 | 10 | Examples of metrics include: 11 | - [Accuracy](https://huggingface.co/metrics/accuracy): the proportion of correct predictions among the total number of cases processed. 12 | - [Exact Match](https://huggingface.co/metrics/exact_match): the rate at which the input predicted strings exactly match their references. 13 | - [Mean Intersection over Union (IoU)](https://huggingface.co/metrics/mean_iou): the area of overlap between the predicted segmentation of an image and the ground truth divided by the area of union between the predicted segmentation and the ground truth. 14 | 15 | Metrics are often used to track model performance on benchmark datasets, and to report progress on tasks such as [machine translation](https://huggingface.co/tasks/translation) and [image classification](https://huggingface.co/tasks/image-classification). 16 | 17 | ## Comparisons 18 | 19 | Comparisons can be useful for comparing the performance of two or more models on a single test dataset. 20 | 21 | For instance, the [McNemar Test](https://github.com/huggingface/evaluate/tree/main/comparisons/mcnemar) is a paired nonparametric statistical hypothesis test that takes the predictions of two models and compares them, aiming to measure whether the models' predictions diverge or not. The p value it outputs, which ranges from `0.0` to `1.0`, indicates the difference between the two models' predictions, with a lower p value indicating a more significant difference. 22 | 23 | Comparisons have yet to be systematically used when comparing and reporting model performance; however, they are useful tools for going beyond simply comparing leaderboard scores and for getting more information on the way model predictions differ.
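To make the comparison workflow concrete, here is a minimal sketch of running the McNemar comparison with 🤗 Evaluate. The `predictions1`/`predictions2`/`references` argument names follow the McNemar comparison card in this repository, and the toy labels below are purely illustrative:

```python
import evaluate

# Load the McNemar comparison from the Hub.
mcnemar = evaluate.load("mcnemar", module_type="comparison")

# Predictions of two models on the same test set, plus the ground-truth labels.
model_a_preds = [1, 0, 1, 1, 0, 1]
model_b_preds = [1, 1, 0, 1, 0, 1]
references    = [1, 0, 0, 1, 0, 1]

results = mcnemar.compute(
    predictions1=model_a_preds,
    predictions2=model_b_preds,
    references=references,
)
# The result is a dictionary with the test statistic and the p value;
# a low p value suggests the two models' predictions differ significantly.
print(results)
```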
24 | 25 | ## Measurements 26 | 27 | In the 🤗 Evaluate library, measurements are tools for gaining more insight into datasets and model predictions. 28 | 29 | For instance, in the case of datasets, it can be useful to calculate the [average word length](https://github.com/huggingface/evaluate/tree/main/measurements/word_length) of a dataset's entries, and how it is distributed -- this can help when choosing the maximum input length for a [Tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer). 30 | 31 | In the case of model predictions, it can help to calculate the average [perplexity](https://huggingface.co/metrics/perplexity) of model predictions using different models such as [GPT-2](https://huggingface.co/gpt2) and [BERT](https://huggingface.co/bert-base-uncased), which can indicate the quality of generated text when no reference is available. 32 | 33 | All three types of evaluation supported by the 🤗 Evaluate library are meant to be mutually complementary, and to help our community carry out more mindful and responsible evaluation. 34 | 35 | We will continue adding more types of metrics, measurements and comparisons in the coming months, and are counting on community involvement (via [PRs](https://github.com/huggingface/evaluate/compare) and [issues](https://github.com/huggingface/evaluate/issues/new/choose)) to make the library as extensive and inclusive as possible! 36 | -------------------------------------------------------------------------------- /measurements/honest/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("honest", "en") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/honest/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | transformers 3 | unidecode==1.3.4 4 | torch 5 | -------------------------------------------------------------------------------- /measurements/label_distribution/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Label Distribution 3 | emoji: 🤗 4 | colorFrom: green 5 | colorTo: purple 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - measurement 13 | description: >- 14 | Returns the label distribution and skew of the input data. 15 | --- 16 | 17 | # Measurement Card for Label Distribution 18 | 19 | ## Measurement Description 20 | The label distribution measurement returns the fraction of each label represented in the dataset. 21 | 22 | ## Intended Uses 23 | 24 | Calculating the distribution of labels in a dataset allows you to see how balanced the labels in your dataset are, which 25 | can help when choosing a relevant metric (e.g. accuracy when the dataset is balanced, versus F1 score when there is an 26 | imbalance). 27 | 28 | ## How to Use 29 | 30 | The measurement takes a list of labels as input: 31 | 32 | ```python 33 | >>> distribution = evaluate.load("label_distribution") 34 | >>> data = [1, 0, 2, 2, 0, 0, 0, 0, 0, 2] 35 | >>> results = distribution.compute(data=data) 36 | ``` 37 | 38 | ### Inputs 39 | - **data** (`list`): a list of integers or strings containing the data labels.
40 | 41 | ### Output Values 42 | By default, this metric outputs a dictionary that contains : 43 | -**label_distribution** (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label. 44 | -**label_skew** (`scalar`) : the asymmetry of the label distribution. 45 | 46 | ```python 47 | {'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_skew': 0.7417688338666573} 48 | ``` 49 | 50 | If skewness is 0, the dataset is perfectly balanced; if it is less than -1 or greater than 1, the distribution is highly skewed; anything in between can be considered moderately skewed. 51 | 52 | #### Values from Popular Papers 53 | 54 | 55 | ### Examples 56 | Calculating the label distribution of a dataset with binary labels: 57 | 58 | ```python 59 | >>> data = [1, 0, 1, 1, 0, 1, 0] 60 | >>> distribution = evaluate.load("label_distribution") 61 | >>> results = distribution.compute(data=data) 62 | >>> print(results) 63 | {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}} 64 | ``` 65 | 66 | Calculating the label distribution of the test subset of the [IMDb dataset](https://huggingface.co/datasets/imdb): 67 | ```python 68 | >>> from datasets import load_dataset 69 | >>> imdb = load_dataset('imdb', split = 'test') 70 | >>> distribution = evaluate.load("label_distribution") 71 | >>> results = distribution.compute(data=imdb['label']) 72 | >>> print(results) 73 | {'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_skew': 0.0} 74 | ``` 75 | N.B. The IMDb dataset is perfectly balanced. 76 | 77 | The output of the measurement can easily be passed to matplotlib to plot a histogram of each label: 78 | 79 | ```python 80 | >>> data = [1, 0, 2, 2, 0, 0, 0, 0, 0, 2] 81 | >>> distribution = evaluate.load("label_distribution") 82 | >>> results = distribution.compute(data=data) 83 | >>> plt.bar(results['label_distribution']['labels'], results['label_distribution']['fractions']) 84 | >>> plt.show() 85 | ``` 86 | 87 | ## Limitations and Bias 88 | While label distribution can be a useful signal for analyzing datasets and choosing metrics for measuring model performance, it can be useful to accompany it with additional data exploration to better understand each subset of the dataset and how they differ. 89 | 90 | ## Citation 91 | 92 | ## Further References 93 | - [Facing Imbalanced Data Recommendations for the Use of Performance Metrics](https://sites.pitt.edu/~jeffcohn/skew/PID2829477.pdf) 94 | - [Scipy Stats Skew Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html#scipy-stats-skew) 95 | -------------------------------------------------------------------------------- /measurements/label_distribution/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("label_distribution", module_type="measurement") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/label_distribution/label_distribution.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Label Distribution Measurement.""" 15 | 16 | from collections import Counter 17 | 18 | import datasets 19 | import pandas as pd 20 | from scipy import stats 21 | 22 | import evaluate 23 | 24 | 25 | _DESCRIPTION = """ 26 | Returns the label ratios of the dataset labels, as well as a scalar for skewness. 27 | """ 28 | 29 | _KWARGS_DESCRIPTION = """ 30 | Args: 31 | `data`: a list containing the data labels 32 | 33 | Returns: 34 | `label_distribution` (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label. 35 | `label_skew` (`scalar`) : the asymmetry of the label distribution. 36 | Examples: 37 | >>> data = [1, 0, 1, 1, 0, 1, 0] 38 | >>> distribution = evaluate.load("label_distribution") 39 | >>> results = distribution.compute(data=data) 40 | >>> print(results) 41 | {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_skew': -0.2886751345948127} 42 | """ 43 | 44 | _CITATION = """\ 45 | @ARTICLE{2020SciPy-NMeth, 46 | author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and 47 | Haberland, Matt and Reddy, Tyler and Cournapeau, David and 48 | Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and 49 | Bright, Jonathan and {van der Walt}, St{\'e}fan J. and 50 | Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and 51 | Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and 52 | Kern, Robert and Larson, Eric and Carey, C J and 53 | Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and 54 | {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and 55 | Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and 56 | Harris, Charles R. and Archibald, Anne M. and 57 | Ribeiro, Ant{\^o}nio H. 
and Pedregosa, Fabian and 58 | {van Mulbregt}, Paul and {SciPy 1.0 Contributors}}, 59 | title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific 60 | Computing in Python}}, 61 | journal = {Nature Methods}, 62 | year = {2020}, 63 | volume = {17}, 64 | pages = {261--272}, 65 | adsurl = {https://rdcu.be/b08Wh}, 66 | doi = {10.1038/s41592-019-0686-2}, 67 | } 68 | """ 69 | 70 | 71 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 72 | class LabelDistribution(evaluate.Measurement): 73 | def _info(self): 74 | return evaluate.MeasurementInfo( 75 | module_type="measurement", 76 | description=_DESCRIPTION, 77 | citation=_CITATION, 78 | inputs_description=_KWARGS_DESCRIPTION, 79 | features=[ 80 | datasets.Features({"data": datasets.Value("int32")}), 81 | datasets.Features({"data": datasets.Value("string")}), 82 | ], 83 | ) 84 | 85 | def _compute(self, data): 86 | """Returns the fraction of each label present in the data""" 87 | c = Counter(data) 88 | label_distribution = {"labels": [k for k in c.keys()], "fractions": [f / len(data) for f in c.values()]} 89 | if isinstance(data[0], str): 90 | label2id = {label: id for id, label in enumerate(label_distribution["labels"])} 91 | data = [label2id[d] for d in data] 92 | skew = stats.skew(data) 93 | return {"label_distribution": label_distribution, "label_skew": skew} 94 | -------------------------------------------------------------------------------- /measurements/label_distribution/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy 3 | -------------------------------------------------------------------------------- /measurements/perplexity/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("perplexity", module_type="measurement") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/perplexity/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | torch 3 | transformers -------------------------------------------------------------------------------- /measurements/regard/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("regard") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/regard/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER} 2 | transformers 3 | torch 4 | -------------------------------------------------------------------------------- /measurements/text_duplicates/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Text Duplicates 3 | emoji: 🤗 4 | colorFrom: green 5 | colorTo: purple 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - measurement 13 | description: >- 14 | Returns the duplicate fraction of duplicate strings in the input. 
15 | --- 16 | 17 | # Measurement Card for Text Duplicates 18 | 19 | ## Measurement Description 20 | 21 | The `text_duplicates` measurement returns the fraction of duplicated strings in the input data. 22 | 23 | ## How to Use 24 | 25 | This measurement requires a list of strings as input: 26 | 27 | ```python 28 | >>> data = ["hello sun","hello moon", "hello sun"] 29 | >>> duplicates = evaluate.load("text_duplicates") 30 | >>> results = duplicates.compute(data=data) 31 | ``` 32 | 33 | ### Inputs 34 | - **data** (list of `str`): The input list of strings for which the duplicates are calculated. 35 | 36 | ### Output Values 37 | - **duplicate_fraction**(`float`): the fraction of duplicates in the input string(s). 38 | - **duplicates_dict**(`list`): (optional) a list of tuples with the duplicate strings and the number of times they are repeated. 39 | 40 | By default, this measurement outputs a dictionary containing the fraction of duplicates in the input string(s) (`duplicate_fraction`): 41 | ) 42 | ```python 43 | {'duplicate_fraction': 0.33333333333333337} 44 | ``` 45 | 46 | With the `list_duplicates=True` option, this measurement will also output a dictionary of tuples with duplicate strings and their counts. 47 | 48 | ```python 49 | {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}} 50 | ``` 51 | 52 | Warning: the `list_duplicates=True` function can be memory-intensive for large datasets. 53 | 54 | ### Examples 55 | 56 | Example with no duplicates 57 | 58 | ```python 59 | >>> data = ["foo", "bar", "foobar"] 60 | >>> duplicates = evaluate.load("text_duplicates") 61 | >>> results = duplicates.compute(data=data) 62 | >>> print(results) 63 | {'duplicate_fraction': 0.0} 64 | ``` 65 | 66 | Example with multiple duplicates and `list_duplicates=True`: 67 | ```python 68 | >>> data = ["hello sun", "goodbye moon", "hello sun", "foo bar", "foo bar"] 69 | >>> duplicates = evaluate.load("text_duplicates") 70 | >>> results = duplicates.compute(data=data, list_duplicates=True) 71 | >>> print(results) 72 | {'duplicate_fraction': 0.4, 'duplicates_dict': {'hello sun': 2, 'foo bar': 2}} 73 | ``` 74 | 75 | ## Citation(s) 76 | 77 | 78 | ## Further References 79 | - [`hashlib` library](https://docs.python.org/3/library/hashlib.html) 80 | -------------------------------------------------------------------------------- /measurements/text_duplicates/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("text_duplicates") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/text_duplicates/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER} 2 | -------------------------------------------------------------------------------- /measurements/text_duplicates/text_duplicates.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import hashlib 16 | from collections import Counter 17 | 18 | import datasets 19 | 20 | import evaluate 21 | 22 | 23 | logger = evaluate.logging.get_logger(__name__) 24 | 25 | _DESCRIPTION = """ 26 | Returns the duplicate fraction of duplicate strings in the input. 27 | """ 28 | 29 | _KWARGS_DESCRIPTION = """ 30 | Args: 31 | `data`: a list of `str` to be checked for duplicates. 32 | 33 | Returns: 34 | `duplicate_fraction` (`float`) : the fraction of strings that are duplicated. 35 | `duplicates_dict` (`dict`) (optional) : a dictionary containing tuples with the duplicate strings and the number of times they are repeated. 36 | 37 | Examples: 38 | >>> data = ["hello sun","hello moon", "hello sun"] 39 | >>> duplicates = evaluate.load("text_duplicates") 40 | >>> results = duplicates.compute(data=data) 41 | >>> print(results) 42 | {'duplicate_fraction': 0.33333333333333337} 43 | 44 | >>> data = ["hello sun","hello moon", "hello sun"] 45 | >>> duplicates = evaluate.load("text_duplicates") 46 | >>> results = duplicates.compute(data=data, list_duplicates=True) 47 | >>> print(results) 48 | {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}} 49 | """ 50 | 51 | # TODO: Add BibTeX citation 52 | _CITATION = "" 53 | 54 | 55 | def get_hash(example): 56 | """Get the hash of a string""" 57 | return hashlib.md5(example.strip().encode("utf-8")).hexdigest() 58 | 59 | 60 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 61 | class TextDuplicates(evaluate.Measurement): 62 | """This measurement returns the duplicate strings contained in the input(s).""" 63 | 64 | def _info(self): 65 | # TODO: Specifies the evaluate.MeasurementInfo object 66 | return evaluate.MeasurementInfo( 67 | # This is the description that will appear on the modules page. 
68 | module_type="measurement", 69 | description=_DESCRIPTION, 70 | citation=_CITATION, 71 | inputs_description=_KWARGS_DESCRIPTION, 72 | # This defines the format of each prediction and reference 73 | features=datasets.Features( 74 | { 75 | "data": datasets.Value("string"), 76 | } 77 | ), 78 | ) 79 | 80 | def _compute(self, data, list_duplicates=False): 81 | """Returns the duplicates contained in the input data and the number of times they are repeated.""" 82 | if list_duplicates == True: 83 | logger.warning("This functionality can be memory-intensive for large datasets!") 84 | n_dedup = len(set([get_hash(d) for d in data])) 85 | c = Counter(data) 86 | duplicates = {k: v for k, v in c.items() if v > 1} 87 | return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_dict": duplicates} 88 | else: 89 | n_dedup = len(set([get_hash(d) for d in data])) 90 | return {"duplicate_fraction": 1 - (n_dedup / len(data))} 91 | -------------------------------------------------------------------------------- /measurements/toxicity/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("toxicity") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/toxicity/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | transformers 3 | torch 4 | -------------------------------------------------------------------------------- /measurements/word_count/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Word Count 3 | emoji: 🤗 4 | colorFrom: green 5 | colorTo: purple 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - measurement 13 | description: >- 14 | Returns the total number of words, and the number of unique words in the input data. 15 | --- 16 | 17 | # Measurement Card for Word Count 18 | 19 | ## Measurement Description 20 | 21 | The `word_count` measurement returns the total number of word count of the input string, using the sklearn's [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) 22 | 23 | ## How to Use 24 | 25 | This measurement requires a list of strings as input: 26 | 27 | ```python 28 | >>> data = ["hello world and hello moon"] 29 | >>> wordcount= evaluate.load("word_count") 30 | >>> results = wordcount.compute(data=data) 31 | ``` 32 | 33 | ### Inputs 34 | - **data** (list of `str`): The input list of strings for which the word length is calculated. 35 | - **max_vocab** (`int`): (optional) the top number of words to consider (can be specified if dataset is too large) 36 | 37 | ### Output Values 38 | - **total_word_count** (`int`): the total number of words in the input string(s). 39 | - **unique_words** (`int`): the number of unique words in the input string(s). 
40 | 41 | Output Example(s): 42 | 43 | ```python 44 | {'total_word_count': 5, 'unique_words': 4} 45 | ``` 46 | 47 | ### Examples 48 | 49 | Example for a single string: 50 | 51 | ```python 52 | >>> data = ["hello sun and goodbye moon"] 53 | >>> wordcount = evaluate.load("word_count") 54 | >>> results = wordcount.compute(data=data) 55 | >>> print(results) 56 | {'total_word_count': 5, 'unique_words': 5} 57 | ``` 58 | 59 | Example with multiple strings: 60 | ```python 61 | >>> data = ["hello sun and goodbye moon", "foo bar foo bar"] 62 | >>> wordcount = evaluate.load("word_count") 63 | >>> results = wordcount.compute(data=data) 64 | >>> print(results) 65 | {'total_word_count': 9, 'unique_words': 7} 66 | ``` 67 | 68 | Example for a dataset from 🤗 Datasets: 69 | 70 | ```python 71 | >>> imdb = datasets.load_dataset('imdb', split='train') 72 | >>> wordcount = evaluate.load("word_count") 73 | >>> results = wordcount.compute(data=imdb['text']) 74 | >>> print(results) 75 | {'total_word_count': 5678573, 'unique_words': 74849} 76 | ``` 77 | 78 | ## Citation(s) 79 | 80 | 81 | ## Further References 82 | - [Sklearn `CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html) 83 | -------------------------------------------------------------------------------- /measurements/word_count/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("word_count") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/word_count/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER} 2 | scikit-learn~=0.0 3 | -------------------------------------------------------------------------------- /measurements/word_count/word_count.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import datasets 16 | from sklearn.feature_extraction.text import CountVectorizer 17 | 18 | import evaluate 19 | 20 | 21 | _DESCRIPTION = """ 22 | Returns the total number of words, and the number of unique words in the input data. 23 | """ 24 | 25 | _KWARGS_DESCRIPTION = """ 26 | Args: 27 | `data`: a list of `str` for which the words are counted. 28 | `max_vocab` (optional): the top number of words to consider (can be specified if dataset is too large) 29 | 30 | Returns: 31 | `total_word_count` (`int`) : the total number of words in the input string(s) 32 | `unique_words` (`int`) : the number of unique words in the input list of strings.
33 | 34 | Examples: 35 | >>> data = ["hello world and hello moon"] 36 | >>> wordcount= evaluate.load("word_count") 37 | >>> results = wordcount.compute(data=data) 38 | >>> print(results) 39 | {'total_word_count': 5, 'unique_words': 4} 40 | """ 41 | _CITATION = "" 42 | 43 | 44 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 45 | class WordCount(evaluate.Measurement): 46 | """This measurement returns the total number of words and the number of unique words 47 | in the input string(s).""" 48 | 49 | def _info(self): 50 | return evaluate.MeasurementInfo( 51 | # This is the description that will appear on the modules page. 52 | module_type="measurement", 53 | description=_DESCRIPTION, 54 | citation=_CITATION, 55 | inputs_description=_KWARGS_DESCRIPTION, 56 | features=datasets.Features( 57 | { 58 | "data": datasets.Value("string"), 59 | } 60 | ), 61 | ) 62 | 63 | def _compute(self, data, max_vocab=None): 64 | """Returns the number of unique words in the input data""" 65 | count_vectorizer = CountVectorizer(max_features=max_vocab) 66 | document_matrix = count_vectorizer.fit_transform(data) 67 | word_count = document_matrix.sum() 68 | unique_words = document_matrix.shape[1] 69 | return {"total_word_count": word_count, "unique_words": unique_words} 70 | -------------------------------------------------------------------------------- /measurements/word_length/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Word Length 3 | emoji: 🤗 4 | colorFrom: green 5 | colorTo: purple 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - measurement 13 | description: >- 14 | Returns the average length (in terms of the number of words) of the input data. 15 | --- 16 | 17 | # Measurement Card for Word Length 18 | 19 | 20 | ## Measurement Description 21 | 22 | The `word_length` measurement returns the average word count of the input strings, based on tokenization using [NLTK word_tokenize](https://www.nltk.org/api/nltk.tokenize.html). 23 | 24 | ## How to Use 25 | 26 | This measurement requires a list of strings as input: 27 | 28 | ```python 29 | >>> data = ["hello world"] 30 | >>> wordlength = evaluate.load("word_length", module_type="measurement") 31 | >>> results = wordlength.compute(data=data) 32 | ``` 33 | 34 | ### Inputs 35 | - **data** (list of `str`): The input list of strings for which the word length is calculated. 36 | - **tokenizer** (`Callable`) : approach used for tokenizing `data` (optional). The default tokenizer is [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html). This can be replaced by any function that takes a string as input and returns a list of tokens as output. 37 | 38 | ### Output Values 39 | - **average_word_length**(`float`): the average number of words in the input string(s). 40 | 41 | Output Example(s): 42 | 43 | ```python 44 | {"average_word_length": 245} 45 | ``` 46 | 47 | This metric outputs a dictionary containing the number of words in the input string (`word length`). 
48 | 49 | ### Examples 50 | 51 | Example for a single string: 52 | 53 | ```python 54 | >>> data = ["hello sun and goodbye moon"] 55 | >>> wordlength = evaluate.load("word_length", module_type="measurement") 56 | >>> results = wordlength.compute(data=data) 57 | >>> print(results) 58 | {'average_word_length': 5} 59 | ``` 60 | 61 | Example with multiple strings: 62 | ```python 63 | >>> data = ["hello sun and goodbye moon", "foo bar foo bar"] 64 | >>> wordlength = evaluate.load("word_length", module_type="measurement") 65 | >>> wordlength.compute(data=data) 66 | {'average_word_length': 4.5} 67 | ``` 68 | 69 | ## Citation(s) 70 | 71 | 72 | ## Further References 73 | - [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html) 74 | -------------------------------------------------------------------------------- /measurements/word_length/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("word_length", module_type="measurement") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /measurements/word_length/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER} 2 | nltk~=3.7 3 | -------------------------------------------------------------------------------- /measurements/word_length/word_length.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from statistics import mean 16 | 17 | import datasets 18 | from nltk import word_tokenize 19 | from packaging import version 20 | 21 | import evaluate 22 | 23 | 24 | if evaluate.config.PY_VERSION < version.parse("3.8"): 25 | import importlib_metadata 26 | else: 27 | import importlib.metadata as importlib_metadata 28 | 29 | 30 | NLTK_VERSION = version.parse(importlib_metadata.version("nltk")) 31 | 32 | _DESCRIPTION = """ 33 | Returns the average length (in terms of the number of words) of the input data. 34 | """ 35 | 36 | _KWARGS_DESCRIPTION = """ 37 | Args: 38 | `data`: a list of `str` for which the word length is calculated. 39 | `tokenizer` (`Callable`) : the approach used for tokenizing `data` (optional). 40 | The default tokenizer is `word_tokenize` from NLTK: https://www.nltk.org/api/nltk.tokenize.html 41 | This can be replaced by any function that takes a string as input and returns a list of tokens as output. 42 | 43 | Returns: 44 | `average_word_length` (`float`) : the average number of words in the input list of strings.
45 | 46 | Examples: 47 | >>> data = ["hello world"] 48 | >>> wordlength = evaluate.load("word_length", module_type="measurement") 49 | >>> results = wordlength.compute(data=data) 50 | >>> print(results) 51 | {'average_word_length': 2} 52 | """ 53 | 54 | # TODO: Add BibTeX citation 55 | _CITATION = """\ 56 | @InProceedings{huggingface:module, 57 | title = {A great new module}, 58 | authors={huggingface, Inc.}, 59 | year={2020} 60 | } 61 | """ 62 | 63 | 64 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 65 | class WordLength(evaluate.Measurement): 66 | """This measurement returns the average number of words in the input string(s).""" 67 | 68 | def _info(self): 69 | # TODO: Specifies the evaluate.MeasurementInfo object 70 | return evaluate.MeasurementInfo( 71 | # This is the description that will appear on the modules page. 72 | module_type="measurement", 73 | description=_DESCRIPTION, 74 | citation=_CITATION, 75 | inputs_description=_KWARGS_DESCRIPTION, 76 | # This defines the format of each prediction and reference 77 | features=datasets.Features( 78 | { 79 | "data": datasets.Value("string"), 80 | } 81 | ), 82 | ) 83 | 84 | def _download_and_prepare(self, dl_manager): 85 | import nltk 86 | 87 | if NLTK_VERSION >= version.Version("3.9.0"): 88 | nltk.download("punkt_tab") 89 | else: 90 | nltk.download("punkt") 91 | 92 | def _compute(self, data, tokenizer=word_tokenize): 93 | """Returns the average word length of the input data""" 94 | lengths = [len(tokenizer(d)) for d in data] 95 | average_length = mean(lengths) 96 | return {"average_word_length": average_length} 97 | -------------------------------------------------------------------------------- /metrics/accuracy/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Accuracy 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: 15 | Accuracy = (TP + TN) / (TP + TN + FP + FN) 16 | Where: 17 | TP: True positive 18 | TN: True negative 19 | FP: False positive 20 | FN: False negative 21 | --- 22 | 23 | # Metric Card for Accuracy 24 | 25 | 26 | ## Metric Description 27 | 28 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: 29 | Accuracy = (TP + TN) / (TP + TN + FP + FN) 30 | Where: 31 | TP: True positive 32 | TN: True negative 33 | FP: False positive 34 | FN: False negative 35 | 36 | 37 | ## How to Use 38 | 39 | At minimum, this metric requires predictions and references as inputs. 40 | 41 | ```python 42 | >>> accuracy_metric = evaluate.load("accuracy") 43 | >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1]) 44 | >>> print(results) 45 | {'accuracy': 1.0} 46 | ``` 47 | 48 | 49 | ### Inputs 50 | - **predictions** (`list` of `int`): Predicted labels. 51 | - **references** (`list` of `int`): Ground truth labels. 52 | - **normalize** (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. 53 | - **sample_weight** (`list` of `float`): Sample weights Defaults to None. 54 | 55 | 56 | ### Output Values 57 | - **accuracy**(`float` or `int`): Accuracy score. Minimum possible value is 0. 
Maximum possible value is 1.0 if `normalize` is set to `True`, or the number of examples input if `normalize` is set to `False`. A higher score means higher accuracy. 58 | 59 | Output Example(s): 60 | ```python 61 | {'accuracy': 1.0} 62 | ``` 63 | 64 | This metric outputs a dictionary containing the accuracy score. 65 | 66 | 67 | #### Values from Popular Papers 68 | 69 | Top-1 or top-5 accuracy is often used to report performance on supervised classification tasks such as image classification (e.g. on [ImageNet](https://paperswithcode.com/sota/image-classification-on-imagenet)) or sentiment analysis (e.g. on [IMDB](https://paperswithcode.com/sota/text-classification-on-imdb)). 70 | 71 | 72 | ### Examples 73 | 74 | Example 1: A simple example 75 | ```python 76 | >>> accuracy_metric = evaluate.load("accuracy") 77 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) 78 | >>> print(results) 79 | {'accuracy': 0.5} 80 | ``` 81 | 82 | Example 2: The same as Example 1, except with `normalize` set to `False`. 83 | ```python 84 | >>> accuracy_metric = evaluate.load("accuracy") 85 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) 86 | >>> print(results) 87 | {'accuracy': 3.0} 88 | ``` 89 | 90 | Example 3: The same as Example 1, except with `sample_weight` set. 91 | ```python 92 | >>> accuracy_metric = evaluate.load("accuracy") 93 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) 94 | >>> print(results) 95 | {'accuracy': 0.8778625954198473} 96 | ``` 97 | 98 | 99 | ## Limitations and Bias 100 | This metric can be easily misleading, especially in the case of unbalanced classes. For example, a high accuracy might be because a model is doing well, but if the data is unbalanced, it might also be because the model is only accurately labeling the high-frequency class. In such cases, a more detailed analysis of the model's behavior, or the use of a different metric entirely, is necessary to determine how well the model is actually performing. 101 | 102 | 103 | ## Citation(s) 104 | ```bibtex 105 | @article{scikit-learn, 106 | title={Scikit-learn: Machine Learning in {P}ython}, 107 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 108 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 109 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 110 | Cournapeau, D. and Brucher, M. and Perrot, M.
and Duchesnay, E.}, 111 | journal={Journal of Machine Learning Research}, 112 | volume={12}, 113 | pages={2825--2830}, 114 | year={2011} 115 | } 116 | ``` 117 | 118 | 119 | ## Further References 120 | -------------------------------------------------------------------------------- /metrics/accuracy/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("accuracy") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/accuracy/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/bertscore/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("bertscore") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/bertscore/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | bert_score -------------------------------------------------------------------------------- /metrics/bleu/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("bleu") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/bleu/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/bleu/tokenizer_13a.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py 2 | # Copyright 2020 SacreBLEU Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import re 17 | from functools import lru_cache 18 | 19 | 20 | class BaseTokenizer: 21 | """A base dummy tokenizer to derive from.""" 22 | 23 | def signature(self): 24 | """ 25 | Returns a signature for the tokenizer. 26 | :return: signature string 27 | """ 28 | return "none" 29 | 30 | def __call__(self, line): 31 | """ 32 | Tokenizes an input line with the tokenizer. 
33 | :param line: a segment to tokenize 34 | :return: the tokenized line 35 | """ 36 | return line 37 | 38 | 39 | class TokenizerRegexp(BaseTokenizer): 40 | def signature(self): 41 | return "re" 42 | 43 | def __init__(self): 44 | self._re = [ 45 | # language-dependent part (assuming Western languages) 46 | (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "), 47 | # tokenize period and comma unless preceded by a digit 48 | (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "), 49 | # tokenize period and comma unless followed by a digit 50 | (re.compile(r"([\.,])([^0-9])"), r" \1 \2"), 51 | # tokenize dash when preceded by a digit 52 | (re.compile(r"([0-9])(-)"), r"\1 \2 "), 53 | # one space only between words 54 | # NOTE: Doing this in Python (below) is faster 55 | # (re.compile(r'\s+'), r' '), 56 | ] 57 | 58 | @lru_cache(maxsize=2**16) 59 | def __call__(self, line): 60 | """Common post-processing tokenizer for `13a` and `zh` tokenizers. 61 | :param line: a segment to tokenize 62 | :return: the tokenized line 63 | """ 64 | for (_re, repl) in self._re: 65 | line = _re.sub(repl, line) 66 | 67 | # no leading or trailing spaces, single space within words 68 | # return ' '.join(line.split()) 69 | # This line is changed with regards to the original tokenizer (seen above) to return individual words 70 | return line.split() 71 | 72 | 73 | class Tokenizer13a(BaseTokenizer): 74 | def signature(self): 75 | return "13a" 76 | 77 | def __init__(self): 78 | self._post_tokenizer = TokenizerRegexp() 79 | 80 | @lru_cache(maxsize=2**16) 81 | def __call__(self, line): 82 | """Tokenizes an input line using a relatively minimal tokenization 83 | that is however equivalent to mteval-v13a, used by WMT. 84 | 85 | :param line: a segment to tokenize 86 | :return: the tokenized line 87 | """ 88 | 89 | # language-independent part: 90 | line = line.replace("<skipped>", "") 91 | line = line.replace("-\n", "") 92 | line = line.replace("\n", " ") 93 | 94 | if "&" in line: 95 | line = line.replace("&quot;", '"') 96 | line = line.replace("&amp;", "&") 97 | line = line.replace("&lt;", "<") 98 | line = line.replace("&gt;", ">") 99 | 100 | return self._post_tokenizer(f" {line} ") 101 | -------------------------------------------------------------------------------- /metrics/bleurt/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import evaluate 4 | from evaluate.utils import launch_gradio_widget 5 | 6 | 7 | sys.path = [p for p in sys.path if p != "/home/user/app"] 8 | module = evaluate.load("bleurt") 9 | sys.path = ["/home/user/app"] + sys.path 10 | 11 | launch_gradio_widget(module) 12 | -------------------------------------------------------------------------------- /metrics/bleurt/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | git+https://github.com/google-research/bleurt.git -------------------------------------------------------------------------------- /metrics/brier_score/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Brier Score 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | The Brier score is a measure of the error between two probability distributions.
15 | --- 16 | 17 | # Metric Card for Brier Score 18 | 19 | 20 | ## Metric Description 21 | Brier score is a type of evaluation metric for classification tasks, where you predict outcomes such as win/lose, spam/ham, click/no-click etc. 22 | `BrierScore = 1/N * sum( (p_i - o_i)^2 )` 23 | 24 | Where `p_i` is the prediction probability of occurrence of the event, and the term `o_i` is equal to 1 if the event occurred and 0 if not. Which means: the lower the value of this score, the better the prediction. 25 | ## How to Use 26 | 27 | At minimum, this metric requires predictions and references as inputs. 28 | 29 | ```python 30 | >>> brier_score = evaluate.load("brier_score") 31 | >>> predictions = np.array([0, 0, 1, 1]) 32 | >>> references = np.array([0.1, 0.9, 0.8, 0.3]) 33 | >>> results = brier_score.compute(predictions=predictions, references=references) 34 | ``` 35 | 36 | ### Inputs 37 | 38 | Mandatory inputs: 39 | - `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values. 40 | 41 | - `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values. 42 | 43 | Optional arguments: 44 | - `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`. 45 | - `pos_label`: the label of the positive class. The default is `1`. 46 | 47 | 48 | ### Output Values 49 | This metric returns a dictionary with the following keys: 50 | - `brier_score (float)`: the computed Brier score. 51 | 52 | 53 | Output Example(s): 54 | ```python 55 | {'brier_score': 0.5} 56 | ``` 57 | 58 | #### Values from Popular Papers 59 | 60 | 61 | ### Examples 62 | ```python 63 | >>> brier_score = evaluate.load("brier_score") 64 | >>> predictions = np.array([0, 0, 1, 1]) 65 | >>> references = np.array([0.1, 0.9, 0.8, 0.3]) 66 | >>> results = brier_score.compute(predictions=predictions, references=references) 67 | >>> print(results) 68 | {'brier_score': 0.3375} 69 | ``` 70 | Example with `y_true` contains string, an error will be raised and `pos_label` should be explicitly specified. 71 | ```python 72 | >>> brier_score_metric = evaluate.load("brier_score") 73 | >>> predictions = np.array(["spam", "ham", "ham", "spam"]) 74 | >>> references = np.array([0.1, 0.9, 0.8, 0.3]) 75 | >>> results = brier_score.compute(predictions, references, pos_label="ham") 76 | >>> print(results) 77 | {'brier_score': 0.0374} 78 | ``` 79 | ## Limitations and Bias 80 | The [brier_score](https://huggingface.co/metrics/brier_score) is appropriate for binary and categorical outcomes that can be structured as true or false, but it is inappropriate for ordinal variables which can take on three or more values. 81 | ## Citation(s) 82 | ```bibtex 83 | @article{scikit-learn, 84 | title={Scikit-learn: Machine Learning in {P}ython}, 85 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 86 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 87 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 88 | Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, 89 | journal={Journal of Machine Learning Research}, 90 | volume={12}, 91 | pages={2825--2830}, 92 | year={2011} 93 | } 94 | 95 | @Article{brier1950verification, 96 | title={Verification of forecasts expressed in terms of probability}, 97 | author={Brier, Glenn W and others}, 98 | journal={Monthly weather review}, 99 | volume={78}, 100 | number={1}, 101 | pages={1--3}, 102 | year={1950} 103 | } 104 | ``` 105 | ## Further References 106 | - [Brier Score - Wikipedia](https://en.wikipedia.org/wiki/Brier_score) -------------------------------------------------------------------------------- /metrics/brier_score/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("brier_score") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/brier_score/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/cer/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("cer") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/cer/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | jiwer -------------------------------------------------------------------------------- /metrics/character/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: CharacTER 3 | emoji: 🔤 4 | colorFrom: orange 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | - machine-translation 14 | description: >- 15 | CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER). 16 | --- 17 | 18 | # Metric Card for CharacTER 19 | 20 | ## Metric Description 21 | CharacTer is a character-level metric inspired by the translation edit rate (TER) metric. It is 22 | defined as the minimum number of character edits required to adjust a hypothesis, until it completely matches the 23 | reference, normalized by the length of the hypothesis sentence. CharacTer calculates the character level edit 24 | distance while performing the shift edit on word level. Unlike the strict matching criterion in TER, a hypothesis 25 | word is considered to match a reference word and could be shifted, if the edit distance between them is below a 26 | threshold value. The Levenshtein distance between the reference and the shifted hypothesis sequence is computed on the 27 | character level. In addition, the lengths of hypothesis sequences instead of reference sequences are used for 28 | normalizing the edit distance, which effectively counters the issue that shorter translations normally achieve lower 29 | TER. 30 | 31 | ## Intended Uses 32 | CharacTER was developed for machine translation evaluation. 
33 | 34 | ## How to Use 35 | 36 | ```python 37 | import evaluate 38 | character = evaluate.load("character") 39 | 40 | # Single hyp/ref 41 | preds = ["this week the saudis denied information published in the new york times"] 42 | refs = ["saudi arabia denied this week information published in the american new york times"] 43 | results = character.compute(references=refs, predictions=preds) 44 | 45 | # Corpus example 46 | preds = ["this week the saudis denied information published in the new york times", 47 | "this is in fact an estimate"] 48 | refs = ["saudi arabia denied this week information published in the american new york times", 49 | "this is actually an estimate"] 50 | results = character.compute(references=refs, predictions=preds) 51 | ``` 52 | 53 | ### Inputs 54 | - **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with 55 | tokens separated by spaces. 56 | - **references**: a single reference or a list of reference for each prediction. Each reference should be a string with 57 | tokens separated by spaces. 58 | 59 | 60 | ### Output Values 61 | 62 | *=only when a list of references/hypotheses are given 63 | 64 | - **count** (*): how many parallel sentences were processed 65 | - **mean** (*): the mean CharacTER score 66 | - **median** (*): the median score 67 | - **std** (*): standard deviation of the score 68 | - **min** (*): smallest score 69 | - **max** (*): largest score 70 | - **cer_scores**: all scores, one per ref/hyp pair 71 | 72 | ### Output Example 73 | ```python 74 | { 75 | 'count': 2, 76 | 'mean': 0.3127282211789254, 77 | 'median': 0.3127282211789254, 78 | 'std': 0.07561653111280243, 79 | 'min': 0.25925925925925924, 80 | 'max': 0.36619718309859156, 81 | 'cer_scores': [0.36619718309859156, 0.25925925925925924] 82 | } 83 | ``` 84 | 85 | ## Citation 86 | ```bibtex 87 | @inproceedings{wang-etal-2016-character, 88 | title = "{C}harac{T}er: Translation Edit Rate on Character Level", 89 | author = "Wang, Weiyue and 90 | Peter, Jan-Thorsten and 91 | Rosendahl, Hendrik and 92 | Ney, Hermann", 93 | booktitle = "Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers", 94 | month = aug, 95 | year = "2016", 96 | address = "Berlin, Germany", 97 | publisher = "Association for Computational Linguistics", 98 | url = "https://aclanthology.org/W16-2342", 99 | doi = "10.18653/v1/W16-2342", 100 | pages = "505--510", 101 | } 102 | ``` 103 | 104 | ## Further References 105 | - Repackaged version that is used in this HF implementation: [https://github.com/bramvanroy/CharacTER](https://github.com/bramvanroy/CharacTER) 106 | - Original version: [https://github.com/rwth-i6/CharacTER](https://github.com/rwth-i6/CharacTER) 107 | -------------------------------------------------------------------------------- /metrics/character/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("character") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/character/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | cer>=1.2.0 3 | -------------------------------------------------------------------------------- /metrics/charcut_mt/README.md: 
-------------------------------------------------------------------------------- 1 | --- 2 | title: CharCut 3 | emoji: 🔤 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | - machine-translation 14 | description: >- 15 | CharCut is a character-based machine translation evaluation metric. 16 | --- 17 | 18 | # Metric Card for CharCut 19 | 20 | ## Metric Description 21 | CharCut compares outputs of MT systems with reference translations. The matching algorithm is based on an iterative 22 | search for longest common substrings, combined with a length-based threshold that limits short and noisy character 23 | matches. As a similarity metric this is not new, but to the best of our knowledge it was never applied to highlighting 24 | and scoring of MT outputs. It has the neat effect of keeping character-based differences readable by humans. 25 | 26 | ## Intended Uses 27 | CharCut was developed for machine translation evaluation. 28 | 29 | ## How to Use 30 | 31 | ```python 32 | import evaluate 33 | charcut = evaluate.load("charcut_mt") 34 | preds = ["this week the saudis denied information published in the new york times", 35 | "this is in fact an estimate"] 36 | refs = ["saudi arabia denied this week information published in the american new york times", 37 | "this is actually an estimate"] 38 | results = charcut.compute(references=refs, predictions=preds) 39 | print(results) 40 | # {'charcut_mt': 0.1971153846153846} 41 | 42 | ``` 43 | ### Inputs 44 | - **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with 45 | tokens separated by spaces. 46 | - **references**: a single reference or a list of references, one for each prediction. Each reference should be a string with 47 | tokens separated by spaces. 48 | 49 | 50 | ### Output Values 51 | - **charcut_mt**: the CharCut evaluation score (lower is better) 52 | 53 | ### Output Example 54 | ```python 55 | {'charcut_mt': 0.1971153846153846} 56 | ``` 57 | 58 | ## Citation 59 | ```bibtex 60 | @inproceedings{lardilleux-lepage-2017-charcut, 61 | title = "{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences", 62 | author = "Lardilleux, Adrien and 63 | Lepage, Yves", 64 | booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation", 65 | month = dec # " 14-15", 66 | year = "2017", 67 | address = "Tokyo, Japan", 68 | publisher = "International Workshop on Spoken Language Translation", 69 | url = "https://aclanthology.org/2017.iwslt-1.20", 70 | pages = "146--153", 71 | abstract = "We present CHARCUT, a character-based machine translation evaluation metric derived from a human-targeted segment difference visualisation algorithm. It combines an iterative search for longest common substrings between the candidate and the reference translation with a simple length-based threshold, enabling loose differences that limit noisy character matches. Its main advantage is to produce scores that directly reflect human-readable string differences, making it a useful support tool for the manual analysis of MT output and its display to end users.
Experiments on WMT16 metrics task data show that it is on par with the best {``}un-trained{''} metrics in terms of correlation with human judgement, well above BLEU and TER baselines, on both system and segment tasks.", 72 | } 73 | ``` 74 | 75 | ## Further References 76 | - Repackaged version that is used in this HF implementation: [https://github.com/BramVanroy/CharCut](https://github.com/BramVanroy/CharCut) 77 | - Original version: [https://github.com/alardill/CharCut](https://github.com/alardill/CharCut) 78 | -------------------------------------------------------------------------------- /metrics/charcut_mt/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("charcut_mt") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/charcut_mt/charcut_mt.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """An implementation for calculating CharCut, a character-based machine translation evaluation metric.""" 15 | from typing import Iterable, Union 16 | 17 | import datasets 18 | from charcut import calculate_charcut 19 | from datasets import Sequence, Value 20 | 21 | import evaluate 22 | 23 | 24 | _CITATION = """\ 25 | @inproceedings{lardilleux-lepage-2017-charcut, 26 | title = "{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences", 27 | author = "Lardilleux, Adrien and 28 | Lepage, Yves", 29 | booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation", 30 | month = dec # " 14-15", 31 | year = "2017", 32 | address = "Tokyo, Japan", 33 | publisher = "International Workshop on Spoken Language Translation", 34 | url = "https://aclanthology.org/2017.iwslt-1.20", 35 | pages = "146--153" 36 | } 37 | """ 38 | 39 | _DESCRIPTION = """\ 40 | CharCut compares outputs of MT systems with reference translations. The matching algorithm is based on an iterative 41 | search for longest common substrings, combined with a length-based threshold that limits short and noisy character 42 | matches. As a similarity metric this is not new, but to the best of our knowledge it was never applied to highlighting 43 | and scoring of MT outputs. It has the neat effect of keeping character-based differences readable by humans.""" 44 | 45 | _KWARGS_DESCRIPTION = """ 46 | Calculates how good predictions are given some references. 47 | Args: 48 | predictions: a list of predictions to score. Each prediction should be a string with 49 | tokens separated by spaces. 50 | references: a list of reference for each prediction. Each reference should be a string with 51 | tokens separated by spaces. 
52 | Returns: 53 | charcut_mt: the CharCut score 54 | Examples: 55 | >>> charcut_mt = evaluate.load("charcut_mt") 56 | >>> preds = ["this week the saudis denied information published in the new york times", 57 | ... "this is in fact an estimate"] 58 | >>> refs = ["saudi arabia denied this week information published in the american new york times", 59 | ... "this is actually an estimate"] 60 | >>> charcut_mt.compute(references=refs, predictions=preds) 61 | {'charcut_mt': 0.1971153846153846} 62 | """ 63 | 64 | 65 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 66 | class Charcut(evaluate.Metric): 67 | """Character-based MT evaluation.""" 68 | 69 | def _info(self): 70 | return evaluate.MetricInfo( 71 | # This is the description that will appear on the modules page. 72 | module_type="metric", 73 | description=_DESCRIPTION, 74 | citation=_CITATION, 75 | inputs_description=_KWARGS_DESCRIPTION, 76 | # This defines the format of each prediction and reference 77 | features=[ 78 | datasets.Features( 79 | {"predictions": Value("string", id="prediction"), "references": Value("string", id="reference")} 80 | ), 81 | ], 82 | # Homepage of the module for documentation 83 | homepage="https://github.com/BramVanroy/CharCut", 84 | # Additional links to the codebase or references 85 | codebase_urls=["https://github.com/BramVanroy/CharCut", "https://github.com/alardill/CharCut"], 86 | ) 87 | 88 | def _compute(self, predictions: Iterable[str], references: Iterable[str]): 89 | return {"charcut_mt": calculate_charcut(predictions, references)[0]} 90 | -------------------------------------------------------------------------------- /metrics/charcut_mt/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | charcut>=1.1.1 3 | -------------------------------------------------------------------------------- /metrics/chrf/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("chrf") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/chrf/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | sacrebleu -------------------------------------------------------------------------------- /metrics/code_eval/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("code_eval") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/code_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/comet/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import evaluate 4 | from evaluate.utils import launch_gradio_widget 5 | 6 | 7 | sys.path = [p for p in sys.path if p != "/home/user/app"] 8 | module = evaluate.load("comet") 9 | sys.path = ["/home/user/app"] + sys.path 10 | 11 | 
launch_gradio_widget(module) 12 | -------------------------------------------------------------------------------- /metrics/comet/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | unbabel-comet 3 | torch -------------------------------------------------------------------------------- /metrics/competition_math/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Competition MATH 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | This metric is used to assess performance on the Mathematics Aptitude Test of Heuristics (MATH) dataset. 15 | It first canonicalizes the inputs (e.g., converting "1/2" to "\frac{1}{2}") and then computes accuracy. 16 | --- 17 | 18 | # Metric Card for Competition MATH 19 | 20 | ## Metric description 21 | 22 | This metric is used to assess performance on the [Mathematics Aptitude Test of Heuristics (MATH) dataset](https://huggingface.co/datasets/competition_math). 23 | 24 | It first canonicalizes the inputs (e.g., converting `1/2` to `\\frac{1}{2}`) and then computes accuracy. 25 | 26 | ## How to use 27 | 28 | This metric takes two arguments: 29 | 30 | `predictions`: a list of predictions to score. Each prediction is a string that contains natural language and LaTeX. 31 | 32 | `references`: list of reference for each prediction. Each reference is a string that contains natural language and LaTeX. 33 | 34 | 35 | ```python 36 | >>> from evaluate import load 37 | >>> math = load("competition_math") 38 | >>> references = ["\\frac{1}{2}"] 39 | >>> predictions = ["1/2"] 40 | >>> results = math.compute(references=references, predictions=predictions) 41 | ``` 42 | 43 | N.B. To be able to use Competition MATH, you need to install the `math_equivalence` dependency using `pip install git+https://github.com/hendrycks/math.git`. 44 | 45 | 46 | ## Output values 47 | 48 | This metric returns a dictionary that contains the [accuracy](https://huggingface.co/metrics/accuracy) after canonicalizing inputs, on a scale between 0.0 and 1.0. 49 | 50 | ### Values from popular papers 51 | The [original MATH dataset paper](https://arxiv.org/abs/2103.03874) reported accuracies ranging from 3.0% to 6.9% by different large language models. 52 | 53 | More recent progress on the dataset can be found on the [dataset leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math). 
54 | 55 | ## Examples 56 | 57 | Maximal values (full match): 58 | 59 | ```python 60 | >>> from evaluate import load 61 | >>> math = load("competition_math") 62 | >>> references = ["\\frac{1}{2}"] 63 | >>> predictions = ["1/2"] 64 | >>> results = math.compute(references=references, predictions=predictions) 65 | >>> print(results) 66 | {'accuracy': 1.0} 67 | ``` 68 | 69 | Minimal values (no match): 70 | 71 | ```python 72 | >>> from evaluate import load 73 | >>> math = load("competition_math") 74 | >>> references = ["\\frac{1}{2}"] 75 | >>> predictions = ["3/4"] 76 | >>> results = math.compute(references=references, predictions=predictions) 77 | >>> print(results) 78 | {'accuracy': 0.0} 79 | ``` 80 | 81 | Partial match: 82 | 83 | ```python 84 | >>> from evaluate import load 85 | >>> math = load("competition_math") 86 | >>> references = ["\\frac{1}{2}","\\frac{3}{4}"] 87 | >>> predictions = ["1/5", "3/4"] 88 | >>> results = math.compute(references=references, predictions=predictions) 89 | >>> print(results) 90 | {'accuracy': 0.5} 91 | ``` 92 | 93 | ## Limitations and bias 94 | 95 | This metric is limited to datasets with the same format as the [Mathematics Aptitude Test of Heuristics (MATH) dataset](https://huggingface.co/datasets/competition_math), and is meant to evaluate the performance of large language models at solving mathematical problems. 96 | 97 | N.B. The MATH dataset also assigns levels of difficulty to different problems, so disaggregating model performance by difficulty level (similarly to what was done in the [original paper](https://arxiv.org/abs/2103.03874)) can give a better indication of how a given model does on a given difficulty of math problem, compared to overall accuracy. 98 | 99 | ## Citation 100 | 101 | ```bibtex 102 | @article{hendrycksmath2021, 103 | title={Measuring Mathematical Problem Solving With the MATH Dataset}, 104 | author={Dan Hendrycks 105 | and Collin Burns 106 | and Saurav Kadavath 107 | and Akul Arora 108 | and Steven Basart 109 | and Eric Tang 110 | and Dawn Song 111 | and Jacob Steinhardt}, 112 | journal={arXiv preprint arXiv:2103.03874}, 113 | year={2021} 114 | } 115 | ``` 116 | 117 | ## Further References 118 | - [MATH dataset](https://huggingface.co/datasets/competition_math) 119 | - [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math) 120 | - [MATH paper](https://arxiv.org/abs/2103.03874) -------------------------------------------------------------------------------- /metrics/competition_math/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("competition_math") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/competition_math/competition_math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Accuracy metric for the Mathematics Aptitude Test of Heuristics (MATH) dataset.""" 15 | 16 | import datasets 17 | import math_equivalence # From: git+https://github.com/hendrycks/math.git 18 | 19 | import evaluate 20 | 21 | 22 | _CITATION = """\ 23 | @article{hendrycksmath2021, 24 | title={Measuring Mathematical Problem Solving With the MATH Dataset}, 25 | author={Dan Hendrycks 26 | and Collin Burns 27 | and Saurav Kadavath 28 | and Akul Arora 29 | and Steven Basart 30 | and Eric Tang 31 | and Dawn Song 32 | and Jacob Steinhardt}, 33 | journal={arXiv preprint arXiv:2103.03874}, 34 | year={2021} 35 | } 36 | """ 37 | 38 | 39 | _DESCRIPTION = """\ 40 | This metric is used to assess performance on the Mathematics Aptitude Test of Heuristics (MATH) dataset. 41 | It first canonicalizes the inputs (e.g., converting "1/2" to "\\frac{1}{2}") and then computes accuracy. 42 | """ 43 | 44 | 45 | _KWARGS_DESCRIPTION = r""" 46 | Calculates accuracy after canonicalizing inputs. 47 | 48 | Args: 49 | predictions: list of predictions to score. Each prediction 50 | is a string that contains natural language and LaTex. 51 | references: list of reference for each prediction. Each 52 | reference is a string that contains natural language 53 | and LaTex. 54 | Returns: 55 | accuracy: accuracy after canonicalizing inputs 56 | (e.g., converting "1/2" to "\\frac{1}{2}") 57 | 58 | Examples: 59 | >>> metric = evaluate.load("competition_math") 60 | >>> results = metric.compute(references=["\\frac{1}{2}"], predictions=["1/2"]) 61 | >>> print(results) 62 | {'accuracy': 1.0} 63 | """ 64 | 65 | 66 | @datasets.utils.file_utils.add_end_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 67 | class CompetitionMathMetric(evaluate.Metric): 68 | """Accuracy metric for the MATH dataset.""" 69 | 70 | def _info(self): 71 | return evaluate.MetricInfo( 72 | description=_DESCRIPTION, 73 | citation=_CITATION, 74 | inputs_description=_KWARGS_DESCRIPTION, 75 | features=datasets.Features( 76 | { 77 | "predictions": datasets.Value("string"), 78 | "references": datasets.Value("string"), 79 | } 80 | ), 81 | # Homepage of the metric for documentation 82 | homepage="https://github.com/hendrycks/math", 83 | # Additional links to the codebase or references 84 | codebase_urls=["https://github.com/hendrycks/math"], 85 | ) 86 | 87 | def _compute(self, predictions, references): 88 | """Returns the scores""" 89 | n_correct = 0.0 90 | for i, j in zip(predictions, references): 91 | n_correct += 1.0 if math_equivalence.is_equiv(i, j) else 0.0 92 | accuracy = n_correct / len(predictions) 93 | return { 94 | "accuracy": accuracy, 95 | } 96 | -------------------------------------------------------------------------------- /metrics/competition_math/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | git+https://github.com/hendrycks/math.git -------------------------------------------------------------------------------- /metrics/confusion_matrix/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Confusion Matrix 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | The confusion matrix evaluates classification accuracy. 
15 | 16 | Each row in a confusion matrix represents a true class and each column represents the instances in a predicted class. 17 | --- 18 | 19 | # Metric Card for Confusion Matrix 20 | 21 | 22 | ## Metric Description 23 | 24 | The confusion matrix evaluates classification accuracy. Each row in a confusion matrix represents a true class and each column represents the instances in a predicted class. Let's look at an example: 25 | 26 | | | setosa | versicolor | virginica | 27 | | ---------- | ------ | ---------- | --------- | 28 | | setosa | 13 | 0 | 0 | 29 | | versicolor | 0 | 10 | 6 | 30 | | virginica | 0 | 0 | 9 | 31 | 32 | What information does this confusion matrix provide? 33 | 34 | * All setosa instances were properly predicted as such (true positives). 35 | * The model always correctly classifies the setosa class (there are no false positives). 36 | * 10 versicolor instances were properly classified, but 6 instances were misclassified as virginica. 37 | * All virginica instances were properly classified as such. 38 | 39 | 40 | ## How to Use 41 | 42 | At minimum, this metric requires predictions and references as inputs. 43 | 44 | ```python 45 | >>> confusion_metric = evaluate.load("confusion_matrix") 46 | >>> results = confusion_metric.compute(references=[0, 1, 1, 2, 0, 2, 2], predictions=[0, 2, 1, 1, 0, 2, 0]) 47 | >>> print(results) 48 | {'confusion_matrix': [[2, 0, 0], [0, 1, 1], [1, 1, 1]]} 49 | ``` 50 | 51 | 52 | ### Inputs 53 | - **predictions** (`list` of `int`): Predicted labels. 54 | - **references** (`list` of `int`): Ground truth labels. 55 | - **labels** (`list` of `int`): List of labels to index the matrix. This may be used to reorder or select a subset of labels. 56 | - **sample_weight** (`list` of `float`): Sample weights. 57 | - **normalize** (`str`): Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population. A hedged usage sketch for this argument appears further below, after the `exact_match` files. 58 | 59 | 60 | ### Output Values 61 | - **confusion_matrix** (`list` of `list` of `int`): Confusion matrix. The minimum possible value of each entry is 0; the maximum possible value is the total number of examples, or 1.0 if `normalize` is set. 62 | 63 | Output Example(s): 64 | ```python 65 | {'confusion_matrix': [[2, 0, 0], [0, 1, 1], [1, 1, 1]]} 66 | ``` 67 | 68 | This metric outputs a dictionary, containing the confusion matrix. 69 | 70 | 71 | ### Examples 72 | 73 | Example 1 - A simple example 74 | 75 | ```python 76 | >>> confusion_metric = evaluate.load("confusion_matrix") 77 | >>> results = confusion_metric.compute(references=[0, 1, 1, 2, 0, 2, 2], predictions=[0, 2, 1, 1, 0, 2, 0]) 78 | >>> print(results) 79 | {'confusion_matrix': [[2, 0, 0], [0, 1, 1], [1, 1, 1]]} 80 | ``` 81 | 82 | ## Citation(s) 83 | ```bibtex 84 | @article{scikit-learn, 85 | title={Scikit-learn: Machine Learning in {P}ython}, 86 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 87 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 88 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 89 | Cournapeau, D. and Brucher, M. and Perrot, M.
and Duchesnay, E.}, 90 | journal={Journal of Machine Learning Research}, 91 | volume={12}, 92 | pages={2825--2830}, 93 | year={2011} 94 | } 95 | ``` 96 | 97 | 98 | ## Further References 99 | 100 | * https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html 101 | * https://en.wikipedia.org/wiki/Confusion_matrix -------------------------------------------------------------------------------- /metrics/confusion_matrix/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("confusion_matrix") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/confusion_matrix/confusion_matrix.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Confusion Matrix.""" 15 | 16 | import datasets 17 | from sklearn.metrics import confusion_matrix 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | The confusion matrix evaluates classification accuracy. Each row in a confusion matrix represents a true class and each column represents the instances in a predicted class 24 | """ 25 | 26 | _KWARGS_DESCRIPTION = """ 27 | Args: 28 | predictions (`list` of `int`): Predicted labels. 29 | references (`list` of `int`): Ground truth labels. 30 | labels (`list` of `int`): List of labels to index the matrix. This may be used to reorder or select a subset of labels. 31 | sample_weight (`list` of `float`): Sample weights. 32 | normalize (`str`): Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population. 33 | 34 | Returns: 35 | confusion_matrix (`list` of `list` of `int`): Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class. 36 | 37 | Examples: 38 | 39 | Example 1-A simple example 40 | >>> confusion_matrix_metric = evaluate.load("confusion_matrix") 41 | >>> results = confusion_matrix_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) 42 | >>> print(results) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE 43 | {'confusion_matrix': array([[1, 0, 1], [0, 2, 0], [1, 1, 0]][...])} 44 | """ 45 | 46 | 47 | _CITATION = """ 48 | @article{scikit-learn, 49 | title={Scikit-learn: Machine Learning in {P}ython}, 50 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. 51 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. 52 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and 53 | Cournapeau, D. and Brucher, M. and Perrot, M. 
and Duchesnay, E.}, 54 | journal={Journal of Machine Learning Research}, 55 | volume={12}, 56 | pages={2825--2830}, 57 | year={2011} 58 | } 59 | """ 60 | 61 | 62 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 63 | class ConfusionMatrix(evaluate.Metric): 64 | def _info(self): 65 | return evaluate.MetricInfo( 66 | description=_DESCRIPTION, 67 | citation=_CITATION, 68 | inputs_description=_KWARGS_DESCRIPTION, 69 | features=datasets.Features( 70 | { 71 | "predictions": datasets.Sequence(datasets.Value("int32")), 72 | "references": datasets.Sequence(datasets.Value("int32")), 73 | } 74 | if self.config_name == "multilabel" 75 | else { 76 | "predictions": datasets.Value("int32"), 77 | "references": datasets.Value("int32"), 78 | } 79 | ), 80 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html"], 81 | ) 82 | 83 | def _compute(self, predictions, references, labels=None, sample_weight=None, normalize=None): 84 | return { 85 | "confusion_matrix": confusion_matrix( 86 | references, predictions, labels=labels, sample_weight=sample_weight, normalize=normalize 87 | ) 88 | } 89 | -------------------------------------------------------------------------------- /metrics/confusion_matrix/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/coval/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import evaluate 4 | from evaluate.utils import launch_gradio_widget 5 | 6 | 7 | sys.path = [p for p in sys.path if p != "/home/user/app"] 8 | module = evaluate.load("coval") 9 | sys.path = ["/home/user/app"] + sys.path 10 | 11 | launch_gradio_widget(module) 12 | -------------------------------------------------------------------------------- /metrics/coval/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | git+https://github.com/ns-moosavi/coval.git -------------------------------------------------------------------------------- /metrics/cuad/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("cuad") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/cuad/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/exact_match/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("exact_match") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/exact_match/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- 
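A brief, hedged aside on the `confusion_matrix` card shown above: its `normalize` argument is forwarded to scikit-learn's `confusion_matrix`, which accepts `"true"`, `"pred"` or `"all"`. Assuming those scikit-learn semantics, a row-normalized matrix could be requested like this (a sketch, not part of the original card):

```python
import evaluate

confusion_metric = evaluate.load("confusion_matrix")

# normalize="true" divides each row by its total, so every true class sums to 1.0.
results = confusion_metric.compute(
    references=[0, 1, 1, 2, 0, 2, 2],
    predictions=[0, 2, 1, 1, 0, 2, 0],
    normalize="true",
)
print(results["confusion_matrix"])
```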
/metrics/f1/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("f1") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/f1/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/frugalscore/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("frugalscore") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/frugalscore/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | torch 3 | transformers -------------------------------------------------------------------------------- /metrics/glue/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("glue", "sst2") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/glue/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy 3 | scikit-learn -------------------------------------------------------------------------------- /metrics/google_bleu/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("google_bleu") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/google_bleu/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | nltk -------------------------------------------------------------------------------- /metrics/google_bleu/tokenizer_13a.py: -------------------------------------------------------------------------------- 1 | # Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py 2 | # Copyright 2020 SacreBLEU Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | import re 17 | from functools import lru_cache 18 | 19 | 20 | class BaseTokenizer: 21 | """A base dummy tokenizer to derive from.""" 22 | 23 | def signature(self): 24 | """ 25 | Returns a signature for the tokenizer. 26 | :return: signature string 27 | """ 28 | return "none" 29 | 30 | def __call__(self, line): 31 | """ 32 | Tokenizes an input line with the tokenizer. 33 | :param line: a segment to tokenize 34 | :return: the tokenized line 35 | """ 36 | return line 37 | 38 | 39 | class TokenizerRegexp(BaseTokenizer): 40 | def signature(self): 41 | return "re" 42 | 43 | def __init__(self): 44 | self._re = [ 45 | # language-dependent part (assuming Western languages) 46 | (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "), 47 | # tokenize period and comma unless preceded by a digit 48 | (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "), 49 | # tokenize period and comma unless followed by a digit 50 | (re.compile(r"([\.,])([^0-9])"), r" \1 \2"), 51 | # tokenize dash when preceded by a digit 52 | (re.compile(r"([0-9])(-)"), r"\1 \2 "), 53 | # one space only between words 54 | # NOTE: Doing this in Python (below) is faster 55 | # (re.compile(r'\s+'), r' '), 56 | ] 57 | 58 | @lru_cache(maxsize=2**16) 59 | def __call__(self, line): 60 | """Common post-processing tokenizer for `13a` and `zh` tokenizers. 61 | :param line: a segment to tokenize 62 | :return: the tokenized line 63 | """ 64 | for (_re, repl) in self._re: 65 | line = _re.sub(repl, line) 66 | 67 | # no leading or trailing spaces, single space within words 68 | # return ' '.join(line.split()) 69 | # This line is changed with regards to the original tokenizer (seen above) to return individual words 70 | return line.split() 71 | 72 | 73 | class Tokenizer13a(BaseTokenizer): 74 | def signature(self): 75 | return "13a" 76 | 77 | def __init__(self): 78 | self._post_tokenizer = TokenizerRegexp() 79 | 80 | @lru_cache(maxsize=2**16) 81 | def __call__(self, line): 82 | """Tokenizes an input line using a relatively minimal tokenization 83 | that is however equivalent to mteval-v13a, used by WMT. 
84 | 85 | :param line: a segment to tokenize 86 | :return: the tokenized line 87 | """ 88 | 89 | # language-independent part: 90 | line = line.replace("", "") 91 | line = line.replace("-\n", "") 92 | line = line.replace("\n", " ") 93 | 94 | if "&" in line: 95 | line = line.replace(""", '"') 96 | line = line.replace("&", "&") 97 | line = line.replace("<", "<") 98 | line = line.replace(">", ">") 99 | 100 | return self._post_tokenizer(f" {line} ") 101 | -------------------------------------------------------------------------------- /metrics/indic_glue/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("indic_glue", "wnli") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/indic_glue/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy 3 | scikit-learn -------------------------------------------------------------------------------- /metrics/mae/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("mae") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/mae/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/mahalanobis/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: Mahalanobis Distance 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | Compute the Mahalanobis Distance 15 | 16 | Mahalonobis distance is the distance between a point and a distribution. 17 | And not between two distinct points. It is effectively a multivariate equivalent of the Euclidean distance. 18 | It was introduced by Prof. P. C. Mahalanobis in 1936 19 | and has been used in various statistical applications ever since 20 | [source: https://www.machinelearningplus.com/statistics/mahalanobis-distance/] 21 | --- 22 | 23 | # Metric Card for Mahalanobis Distance 24 | 25 | ## Metric Description 26 | Mahalonobis distance is the distance between a point and a distribution (as opposed to the distance between two points), making it the multivariate equivalent of the Euclidean distance. 27 | 28 | It is often used in multivariate anomaly detection, classification on highly imbalanced datasets and one-class classification. 29 | 30 | ## How to Use 31 | At minimum, this metric requires two `list`s of datapoints: 32 | 33 | ```python 34 | >>> mahalanobis_metric = evaluate.load("mahalanobis") 35 | >>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]]) 36 | ``` 37 | 38 | ### Inputs 39 | - `X` (`list`): data points to be compared with the `reference_distribution`. 40 | - `reference_distribution` (`list`): data points from the reference distribution that we want to compare to. 
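For context on the output described next, here is a minimal NumPy sketch of the computation these inputs feed. It is an illustration, not the module's code: the module's own implementation appears in `mahalanobis.py` further down, subtracts the overall mean of the reference array rather than a per-feature mean (the two coincide in the example below), and likewise returns the squared distance without a square root.

```python
import numpy as np

def squared_mahalanobis(X, reference_distribution):
    # Squared Mahalanobis distance of each row of X to the reference distribution.
    X = np.asarray(X, dtype=float)
    ref = np.asarray(reference_distribution, dtype=float)
    delta = X - ref.mean(axis=0)      # deviation from the per-feature mean
    cov = np.cov(ref.T)               # feature covariance of the reference points
    inv_cov = np.linalg.pinv(cov)     # pseudo-inverse copes with singular covariances
    return np.einsum("ij,jk,ik->i", delta, inv_cov, delta)

print(squared_mahalanobis(X=[[0, 1]], reference_distribution=[[0, 1], [1, 0]]))  # [0.5]
```

The printed value matches the `{'mahalanobis': array([0.5])}` example in this card.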
41 | 42 | ### Output Values 43 | `mahalanobis` (`array`): the Mahalonobis distance for each data point in `X`. 44 | 45 | ```python 46 | >>> print(results) 47 | {'mahalanobis': array([0.5])} 48 | ``` 49 | 50 | #### Values from Popular Papers 51 | *N/A* 52 | 53 | ### Example 54 | 55 | ```python 56 | >>> mahalanobis_metric = evaluate.load("mahalanobis") 57 | >>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]]) 58 | >>> print(results) 59 | {'mahalanobis': array([0.5])} 60 | ``` 61 | 62 | ## Limitations and Bias 63 | 64 | The Mahalanobis distance is only able to capture linear relationships between the variables, which means it cannot capture all types of outliers. Mahalanobis distance also fails to faithfully represent data that is highly skewed or multimodal. 65 | 66 | ## Citation 67 | ```bibtex 68 | @inproceedings{mahalanobis1936generalized, 69 | title={On the generalized distance in statistics}, 70 | author={Mahalanobis, Prasanta Chandra}, 71 | year={1936}, 72 | organization={National Institute of Science of India} 73 | } 74 | ``` 75 | 76 | ```bibtex 77 | @article{de2000mahalanobis, 78 | title={The Mahalanobis distance}, 79 | author={De Maesschalck, Roy and Jouan-Rimbaud, Delphine and Massart, D{\'e}sir{\'e} L}, 80 | journal={Chemometrics and intelligent laboratory systems}, 81 | volume={50}, 82 | number={1}, 83 | pages={1--18}, 84 | year={2000}, 85 | publisher={Elsevier} 86 | } 87 | ``` 88 | 89 | ## Further References 90 | -[Wikipedia -- Mahalanobis Distance](https://en.wikipedia.org/wiki/Mahalanobis_distance) 91 | 92 | -[Machine Learning Plus -- Mahalanobis Distance](https://www.machinelearningplus.com/statistics/mahalanobis-distance/) 93 | -------------------------------------------------------------------------------- /metrics/mahalanobis/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("mahalanobis") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/mahalanobis/mahalanobis.py: -------------------------------------------------------------------------------- 1 | # Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Mahalanobis metric.""" 15 | 16 | import datasets 17 | import numpy as np 18 | 19 | import evaluate 20 | 21 | 22 | _DESCRIPTION = """ 23 | Compute the Mahalanobis Distance 24 | 25 | Mahalonobis distance is the distance between a point and a distribution. 26 | And not between two distinct points. It is effectively a multivariate equivalent of the Euclidean distance. 27 | It was introduced by Prof. P. C. 
Mahalanobis in 1936 28 | and has been used in various statistical applications ever since 29 | [source: https://www.machinelearningplus.com/statistics/mahalanobis-distance/] 30 | """ 31 | 32 | _CITATION = """\ 33 | @article{de2000mahalanobis, 34 | title={The mahalanobis distance}, 35 | author={De Maesschalck, Roy and Jouan-Rimbaud, Delphine and Massart, D{\'e}sir{\'e} L}, 36 | journal={Chemometrics and intelligent laboratory systems}, 37 | volume={50}, 38 | number={1}, 39 | pages={1--18}, 40 | year={2000}, 41 | publisher={Elsevier} 42 | } 43 | """ 44 | 45 | _KWARGS_DESCRIPTION = """ 46 | Args: 47 | X: List of datapoints to be compared with the `reference_distribution`. 48 | reference_distribution: List of datapoints from the reference distribution we want to compare to. 49 | Returns: 50 | mahalanobis: The Mahalonobis distance for each datapoint in `X`. 51 | Examples: 52 | 53 | >>> mahalanobis_metric = evaluate.load("mahalanobis") 54 | >>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]]) 55 | >>> print(results) 56 | {'mahalanobis': array([0.5])} 57 | """ 58 | 59 | 60 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 61 | class Mahalanobis(evaluate.Metric): 62 | def _info(self): 63 | return evaluate.MetricInfo( 64 | description=_DESCRIPTION, 65 | citation=_CITATION, 66 | inputs_description=_KWARGS_DESCRIPTION, 67 | features=datasets.Features( 68 | { 69 | "X": datasets.Sequence(datasets.Value("float", id="sequence"), id="X"), 70 | } 71 | ), 72 | ) 73 | 74 | def _compute(self, X, reference_distribution): 75 | 76 | # convert to numpy arrays 77 | X = np.array(X) 78 | reference_distribution = np.array(reference_distribution) 79 | 80 | # Assert that arrays are 2D 81 | if len(X.shape) != 2: 82 | raise ValueError("Expected `X` to be a 2D vector") 83 | if len(reference_distribution.shape) != 2: 84 | raise ValueError("Expected `reference_distribution` to be a 2D vector") 85 | if reference_distribution.shape[0] < 2: 86 | raise ValueError( 87 | "Expected `reference_distribution` to be a 2D vector with more than one element in the first dimension" 88 | ) 89 | 90 | # Get mahalanobis distance for each prediction 91 | X_minus_mu = X - np.mean(reference_distribution) 92 | cov = np.cov(reference_distribution.T) 93 | try: 94 | inv_covmat = np.linalg.inv(cov) 95 | except np.linalg.LinAlgError: 96 | inv_covmat = np.linalg.pinv(cov) 97 | left_term = np.dot(X_minus_mu, inv_covmat) 98 | mahal_dist = np.dot(left_term, X_minus_mu.T).diagonal() 99 | 100 | return {"mahalanobis": mahal_dist} 101 | -------------------------------------------------------------------------------- /metrics/mahalanobis/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/mape/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("mape") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/mape/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn 3 | 
-------------------------------------------------------------------------------- /metrics/mase/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("mase") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/mase/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn 3 | -------------------------------------------------------------------------------- /metrics/matthews_correlation/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("matthews_correlation") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/matthews_correlation/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/mauve/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import evaluate 4 | from evaluate.utils import launch_gradio_widget 5 | 6 | 7 | sys.path = [p for p in sys.path if p != "/home/user/app"] 8 | module = evaluate.load("mauve") 9 | sys.path = ["/home/user/app"] + sys.path 10 | 11 | launch_gradio_widget(module) 12 | -------------------------------------------------------------------------------- /metrics/mauve/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | faiss-cpu 3 | scikit-learn 4 | mauve-text -------------------------------------------------------------------------------- /metrics/mean_iou/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("mean_iou") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/mean_iou/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/meteor/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("meteor") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/meteor/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | nltk -------------------------------------------------------------------------------- /metrics/mse/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 
| module = evaluate.load("mse") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/mse/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/nist_mt/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: NIST_MT 3 | emoji: 🤗 4 | colorFrom: purple 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | - machine-translation 14 | description: 15 | DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score. 16 | --- 17 | 18 | # Metric Card for NIST's MT metric 19 | 20 | 21 | ## Metric Description 22 | DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU 23 | score. The official script used by NIST to compute BLEU and NIST score is 24 | mteval-14.pl. The main differences are: 25 | 26 | - BLEU uses geometric mean of the ngram overlaps, NIST uses arithmetic mean. 27 | - NIST has a different brevity penalty 28 | - NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's 29 | implementation of the NIST-specific tokenizer) 30 | 31 | ## Intended Uses 32 | NIST was developed for machine translation evaluation. 33 | 34 | ## How to Use 35 | 36 | ```python 37 | import evaluate 38 | nist_mt = evaluate.load("nist_mt") 39 | hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party" 40 | reference1 = "It is a guide to action that ensures that the military will forever heed Party commands" 41 | reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party" 42 | nist_mt.compute(hypothesis1, [reference1, reference2]) 43 | # {'nist_mt': 3.3709935957649324} 44 | ``` 45 | 46 | ### Inputs 47 | - **predictions**: tokenized predictions to score. For sentence-level NIST, a list of tokens (str); 48 | for corpus-level NIST, a list (sentences) of lists of tokens (str) 49 | - **references**: potentially multiple tokenized references for each prediction. 
For sentence-level NIST, a 50 | list (multiple potential references) of list of tokens (str); for corpus-level NIST, a list (corpus) of lists 51 | (multiple potential references) of lists of tokens (str) 52 | - **n**: highest n-gram order 53 | - **tokenize_kwargs**: arguments passed to the tokenizer (see: https://github.com/nltk/nltk/blob/90fa546ea600194f2799ee51eaf1b729c128711e/nltk/tokenize/nist.py#L139) 54 | 55 | ### Output Values 56 | - **nist_mt** (`float`): NIST score 57 | 58 | Output Example: 59 | ```python 60 | {'nist_mt': 3.3709935957649324} 61 | ``` 62 | 63 | 64 | ## Citation 65 | ```bibtex 66 | @inproceedings{10.5555/1289189.1289273, 67 | author = {Doddington, George}, 68 | title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics}, 69 | year = {2002}, 70 | publisher = {Morgan Kaufmann Publishers Inc.}, 71 | address = {San Francisco, CA, USA}, 72 | booktitle = {Proceedings of the Second International Conference on Human Language Technology Research}, 73 | pages = {138–145}, 74 | numpages = {8}, 75 | location = {San Diego, California}, 76 | series = {HLT '02} 77 | } 78 | ``` 79 | 80 | ## Further References 81 | 82 | This Hugging Face implementation uses [the NLTK implementation](https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py) 83 | -------------------------------------------------------------------------------- /metrics/nist_mt/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("nist_mt") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/nist_mt/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | nltk 3 | -------------------------------------------------------------------------------- /metrics/nist_mt/tests.py: -------------------------------------------------------------------------------- 1 | from _pytest.fixtures import fixture 2 | from nist_mt import Nist_mt 3 | 4 | 5 | nist = Nist_mt() 6 | 7 | 8 | @fixture 9 | def hypothesis_sent(): 10 | return "It is a guide to action which ensures that the military always obeys the commands of the party" 11 | 12 | 13 | @fixture 14 | def reference_sent1(): 15 | return "It is a guide to action that ensures that the military will forever heed Party commands" 16 | 17 | 18 | @fixture 19 | def reference_sent2(): 20 | return ( 21 | "It is the guiding principle which guarantees the military forces always being under the command of the Party" 22 | ) 23 | 24 | 25 | @fixture 26 | def reference_sent3(): 27 | return "It is the practical guide for the army always to heed the directions of the party" 28 | 29 | 30 | def test_nist_sentence(hypothesis_sent, reference_sent1, reference_sent2, reference_sent3): 31 | nist_score = nist.compute( 32 | predictions=[hypothesis_sent], references=[[reference_sent1, reference_sent2, reference_sent3]] 33 | ) 34 | assert abs(nist_score["nist_mt"] - 3.3709935957649324) < 1e-6 35 | -------------------------------------------------------------------------------- /metrics/pearsonr/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("pearsonr") 6 | launch_gradio_widget(module) 7 | 
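Translating the fixture-based test in `metrics/nist_mt/tests.py` above into a plain usage example, a corpus-level call (one candidate per position in `predictions`, and a list of references per candidate in `references`) looks roughly like this:

```python
import evaluate

nist_mt = evaluate.load("nist_mt")

predictions = [
    "It is a guide to action which ensures that the military always obeys the commands of the party"
]
references = [
    [
        "It is a guide to action that ensures that the military will forever heed Party commands",
        "It is the guiding principle which guarantees the military forces always being under the command of the Party",
        "It is the practical guide for the army always to heed the directions of the party",
    ]
]
results = nist_mt.compute(predictions=predictions, references=references)
print(results)  # the test above asserts a score of about 3.371 for this input
```

In this shape, additional sentence pairs are appended in parallel to both lists, and each entry of `references` is itself a list even when there is only a single reference.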
-------------------------------------------------------------------------------- /metrics/pearsonr/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy -------------------------------------------------------------------------------- /metrics/perplexity/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("perplexity", module_type="metric") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/perplexity/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | torch 3 | torch 4 | transformers -------------------------------------------------------------------------------- /metrics/poseval/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("poseval") 6 | 7 | launch_gradio_widget(module) 8 | -------------------------------------------------------------------------------- /metrics/poseval/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/precision/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("precision") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/precision/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/r_squared/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: r_squared 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.0.2 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | The R^2 (R Squared) metric is a measure of the goodness of fit of a linear regression model. It is the proportion of the variance in the dependent variable that is predictable from the independent variable. 15 | --- 16 | 17 | # Metric Card for R^2 18 | 19 | ## Metric description 20 | 21 | R-squared is defined as 1 minus the ratio of the Sum of Squared Errors to the Sum of Squared Total. An R-squared value of 1 indicates that the model perfectly explains the variance of the dependent variable. A value of 0 means that the model does not explain any of the variance. Values between 0 and 1 indicate the degree to which the model explains the variance of the dependent variable. 22 | 23 | Here, the Sum of Squared Errors is the sum of the squared differences between the predicted values and the true values, and the Sum of Squared Total is the sum of the squared differences between the true values and the mean of the true values.
24 | 25 | For example, if an R-squared value for a model is 0.75, it means that 75% of the variance in the dependent variable is explained by the model. 26 | 27 | R-squared is not always a reliable measure of the quality of a regression model, particularly when you have a small sample size or there are multiple independent variables. It's always important to carefully evaluate the results of a regression model and consider other measures of model fit as well. 28 | 29 | R-squared can be calculated using the following formula: 30 | 31 | ```python 32 | r_squared = 1 - (Sum of Squared Errors / Sum of Squared Total) 33 | ``` 34 | 35 | * Calculate the residual sum of squares (RSS), which is the sum of the squared differences between the predicted values and the actual values. 36 | * Calculate the total sum of squares (TSS), which is the sum of the squared differences between the actual values and the mean of the actual values. 37 | * Calculate the R-squared value by taking 1 - (RSS / TSS). 38 | 39 | In terms of these quantities, the formula becomes: 40 | ```python 41 | r_squared = 1 - (RSS / TSS) 42 | ``` 43 | 44 | ### How to Use 45 | 46 | The `r_squared` metric can be used to compute the R^2 value for a given set of predictions and references. It takes two inputs: `predictions` (a list of predicted values) and `references` (a list of true values). 47 | 48 | ```python 49 | >>> import evaluate 50 | >>> r2_metric = evaluate.load("r_squared") 51 | >>> r_squared = r2_metric.compute(predictions=[1, 2, 3, 4], references=[0.9, 2.1, 3.2, 3.8]) 52 | >>> print(r_squared) 53 | 0.98 54 | ``` 55 | 56 | Alternatively, if you want to see an example where there is a perfect match between the prediction and reference: 57 | ```python 58 | >>> import evaluate 59 | >>> r2_metric = evaluate.load("r_squared") 60 | >>> r_squared = r2_metric.compute(predictions=[1, 2, 3, 4], references=[1, 2, 3, 4]) 61 | >>> print(r_squared) 62 | 1.0 63 | ``` 64 | 65 | ## Limitations and Bias 66 | R^2 is a statistical measure of the goodness of fit of a regression model. It represents the proportion of the variance in the dependent variable that is predictable from the independent variables. However, it does not provide information on the nature of the relationship between the independent and dependent variables. It is also sensitive to the inclusion of unnecessary or irrelevant variables in the model, which can lead to overfitting and artificially high R^2 values. 67 | 68 | ## Citation 69 | 70 | ```bibtex 71 | @article{r_squared_model, 72 | title={The R^2 Model Metric: A Comprehensive Guide}, 73 | author={John Doe}, 74 | journal={Journal of Model Evaluation}, 75 | volume={10}, 76 | number={2}, 77 | pages={101-112}, 78 | year={2022}, 79 | publisher={Model Evaluation Society}} 80 | ``` 81 | 82 | ## Further References 83 | 84 | - [The Open University: R-Squared](https://www.open.edu/openlearn/ocw/mod/oucontent/view.php?id=55450&section=3.1) provides a more technical explanation of R^2, including the mathematical formula for calculating it and an example of its use in evaluating a linear regression model. 85 | 86 | - [Khan Academy: R-Squared](https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/more-on-regression/v/r-squared-intuition) offers a visual explanation of R^2, including how it can be used to compare the fit of different regression models.
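As a cross-check of the RSS/TSS recipe in the card above, the short sketch below recomputes the 0.98 figure from the first example by hand. It uses `numpy`, which is an extra assumption here — the metric itself declares no additional requirements:

```python
import numpy as np

predictions = np.array([1, 2, 3, 4], dtype=float)
references = np.array([0.9, 2.1, 3.2, 3.8])

# Residual sum of squares: squared differences between predictions and true values.
rss = np.sum((references - predictions) ** 2)
# Total sum of squares: squared deviations of the true values from their mean.
tss = np.sum((references - references.mean()) ** 2)

r_squared = 1 - rss / tss
print(round(r_squared, 2))  # 0.98, matching the metric output shown above
```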
87 | -------------------------------------------------------------------------------- /metrics/r_squared/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("r_squared") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/r_squared/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | -------------------------------------------------------------------------------- /metrics/recall/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("recall") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/recall/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/rl_reliability/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("rl_reliability", "online") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/rl_reliability/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | git+https://github.com/google-research/rl-reliability-metrics 3 | scipy 4 | tensorflow 5 | gin-config -------------------------------------------------------------------------------- /metrics/roc_auc/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("roc_auc") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/roc_auc/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/rouge/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("rouge") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/rouge/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | absl-py 3 | nltk 4 | rouge_score>=0.1.2 -------------------------------------------------------------------------------- /metrics/sacrebleu/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import evaluate 4 | from evaluate.utils import launch_gradio_widget 5 | 6 | 7 | 
sys.path = [p for p in sys.path if p != "/home/user/app"] 8 | module = evaluate.load("sacrebleu") 9 | sys.path = ["/home/user/app"] + sys.path 10 | 11 | launch_gradio_widget(module) 12 | -------------------------------------------------------------------------------- /metrics/sacrebleu/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | sacrebleu -------------------------------------------------------------------------------- /metrics/sari/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("sari") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/sari/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | sacrebleu 3 | sacremoses -------------------------------------------------------------------------------- /metrics/seqeval/app.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import evaluate 4 | from evaluate.utils import launch_gradio_widget 5 | 6 | 7 | sys.path = [p for p in sys.path if p != "/home/user/app"] 8 | module = evaluate.load("seqeval") 9 | sys.path = ["/home/user/app"] + sys.path 10 | 11 | launch_gradio_widget(module) 12 | -------------------------------------------------------------------------------- /metrics/seqeval/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | seqeval -------------------------------------------------------------------------------- /metrics/smape/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: sMAPE 3 | emoji: 🤗 4 | colorFrom: blue 5 | colorTo: red 6 | sdk: gradio 7 | sdk_version: 3.19.1 8 | app_file: app.py 9 | pinned: false 10 | tags: 11 | - evaluate 12 | - metric 13 | description: >- 14 | Symmetric Mean Absolute Percentage Error (sMAPE) is the symmetric mean of the percentage differences between the predicted and actual values, as defined by Chen and Yang (2004). 15 | --- 16 | 17 | # Metric Card for sMAPE 18 | 19 | 20 | ## Metric Description 21 | 22 | Symmetric Mean Absolute Percentage Error (sMAPE) is the symmetric mean of the percentage difference between the predicted $x_i$ and actual $y_i$ numeric values: 23 | 24 | ![image](https://user-images.githubusercontent.com/8100/200009801-ae8be6c8-facf-401b-8df0-3f80a458b9f4.png) 25 | 26 | 27 | ## How to Use 28 | 29 | At minimum, this metric requires predictions and references as inputs. 30 | 31 | ```python 32 | >>> smape_metric = evaluate.load("smape") 33 | >>> predictions = [2.5, 0.0, 2, 8] 34 | >>> references = [3, -0.5, 2, 7] 35 | >>> results = smape_metric.compute(predictions=predictions, references=references) 36 | ``` 37 | 38 | ### Inputs 39 | 40 | Mandatory inputs: 41 | - `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values. 42 | - `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
43 | 44 | Optional arguments: 45 | - `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`. 46 | - `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`. 47 | - `raw_values` returns a full set of errors in case of multioutput input. 48 | - `uniform_average` means that the errors of all outputs are averaged with uniform weight. 49 | - the array-like value defines weights used to average errors. 50 | 51 | ### Output Values 52 | This metric outputs a dictionary, containing the symmetric mean absolute percentage error (sMAPE) score, which is of type: 53 | - `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned. 54 | - numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately. 55 | 56 | Each sMAPE `float` value ranges from `0.0` to `2.0`, with the best value being 0.0. 57 | 58 | Output Example(s): 59 | ```python 60 | {'smape': 0.5} 61 | ``` 62 | 63 | If `multioutput="raw_values"`: 64 | ```python 65 | {'smape': array([0.5, 1.5])} 66 | ``` 67 | 68 | #### Values from Popular Papers 69 | 70 | 71 | ### Examples 72 | 73 | Example with the `uniform_average` config: 74 | ```python 75 | >>> smape_metric = evaluate.load("smape") 76 | >>> predictions = [2.5, 0.0, 2, 8] 77 | >>> references = [3, -0.5, 2, 7] 78 | >>> results = smape_metric.compute(predictions=predictions, references=references) 79 | >>> print(results) 80 | {'smape': 0.5787...} 81 | ``` 82 | 83 | Example with multi-dimensional lists, and the `raw_values` config: 84 | ```python 85 | >>> smape_metric = evaluate.load("smape", "multilist") 86 | >>> predictions = [[0.5, 1], [-1, 1], [7, -6]] 87 | >>> references = [[0.1, 2], [-1, 2], [8, -5]] 88 | >>> results = smape_metric.compute(predictions=predictions, references=references) 89 | >>> print(results) 90 | {'smape': 0.4969...} 91 | >>> results = smape_metric.compute(predictions=predictions, references=references, multioutput='raw_values') 92 | >>> print(results) 93 | {'smape': array([0.4888..., 0.5050...])} 94 | ``` 95 | 96 | ## Limitations and Bias 97 | This metric is called a measure of "percentage error" even though there is no multiplier of 100. The score lies in the range [0, 2], with 0.0 being best; an individual term reaches the maximum of 2 when the prediction and the reference have opposite signs or exactly one of them is zero.
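To make the output range discussed above concrete, the sketch below evaluates the per-sample sMAPE terms directly for the first example in the card (plain `numpy`, an added assumption). It reproduces the 0.5787... figure and shows a term hitting the maximum of 2 where the prediction is zero and the reference is not:

```python
import numpy as np

predictions = np.array([2.5, 0.0, 2, 8])
references = np.array([3, -0.5, 2, 7])

# Per-sample terms 2 * |y - y_hat| / (|y| + |y_hat|), each bounded by [0, 2].
terms = 2 * np.abs(references - predictions) / (np.abs(references) + np.abs(predictions))

print(terms)         # the second term is exactly 2: prediction 0.0 vs reference -0.5
print(terms.mean())  # ~0.5787..., matching the metric output shown above
```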
98 | 99 | ## Citation(s) 100 | 101 | ```bibtex 102 | @article{article, 103 | author = {Chen, Zhuo and Yang, Yuhong}, 104 | year = {2004}, 105 | month = {04}, 106 | pages = {}, 107 | title = {Assessing forecast accuracy measures} 108 | } 109 | ``` 110 | 111 | ## Further References 112 | - [Symmetric Mean absolute percentage error - Wikipedia](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error) 113 | -------------------------------------------------------------------------------- /metrics/smape/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("smape") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/smape/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn 3 | -------------------------------------------------------------------------------- /metrics/spearmanr/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("spearmanr") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/spearmanr/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scipy -------------------------------------------------------------------------------- /metrics/squad/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("squad") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/squad/compute_score.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 2 | 3 | import argparse 4 | import json 5 | import re 6 | import string 7 | import sys 8 | from collections import Counter 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | 14 | def remove_articles(text): 15 | return re.sub(r"\b(a|an|the)\b", " ", text) 16 | 17 | def white_space_fix(text): 18 | return " ".join(text.split()) 19 | 20 | def remove_punc(text): 21 | exclude = set(string.punctuation) 22 | return "".join(ch for ch in text if ch not in exclude) 23 | 24 | def lower(text): 25 | return text.lower() 26 | 27 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 28 | 29 | 30 | def f1_score(prediction, ground_truth): 31 | prediction_tokens = normalize_answer(prediction).split() 32 | ground_truth_tokens = normalize_answer(ground_truth).split() 33 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 34 | num_same = sum(common.values()) 35 | if num_same == 0: 36 | return 0 37 | precision = 1.0 * num_same / len(prediction_tokens) 38 | recall = 1.0 * num_same / len(ground_truth_tokens) 39 | f1 = (2 * precision * recall) / (precision + recall) 40 | return f1 41 | 42 | 43 | def exact_match_score(prediction, ground_truth): 44 | return normalize_answer(prediction) == normalize_answer(ground_truth) 45 | 46 | 47 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 48 | scores_for_ground_truths = [] 49 | for ground_truth in ground_truths: 50 | score = metric_fn(prediction, ground_truth) 51 | scores_for_ground_truths.append(score) 52 | return max(scores_for_ground_truths) 53 | 54 | 55 | def compute_score(dataset, predictions): 56 | f1 = exact_match = total = 0 57 | for article in dataset: 58 | for paragraph in article["paragraphs"]: 59 | for qa in paragraph["qas"]: 60 | total += 1 61 | if qa["id"] not in predictions: 62 | message = "Unanswered question " + qa["id"] + " will receive score 0." 
63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 66 | prediction = predictions[qa["id"]] 67 | exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) 68 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 69 | 70 | exact_match = 100.0 * exact_match / total 71 | f1 = 100.0 * f1 / total 72 | 73 | return {"exact_match": exact_match, "f1": f1} 74 | 75 | 76 | if __name__ == "__main__": 77 | expected_version = "1.1" 78 | parser = argparse.ArgumentParser(description="Evaluation for SQuAD " + expected_version) 79 | parser.add_argument("dataset_file", help="Dataset file") 80 | parser.add_argument("prediction_file", help="Prediction File") 81 | args = parser.parse_args() 82 | with open(args.dataset_file) as dataset_file: 83 | dataset_json = json.load(dataset_file) 84 | if dataset_json["version"] != expected_version: 85 | print( 86 | "Evaluation expects v-" + expected_version + ", but got dataset with v-" + dataset_json["version"], 87 | file=sys.stderr, 88 | ) 89 | dataset = dataset_json["data"] 90 | with open(args.prediction_file) as prediction_file: 91 | predictions = json.load(prediction_file) 92 | print(json.dumps(compute_score(dataset, predictions))) 93 | -------------------------------------------------------------------------------- /metrics/squad/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/squad_v2/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("squad_v2") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/squad_v2/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/super_glue/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("super_glue", "copa") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/super_glue/record_evaluation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Official evaluation script for ReCoRD v1.0. 3 | (Some functions are adopted from the SQuAD evaluation script.) 
4 | """ 5 | 6 | 7 | import argparse 8 | import json 9 | import re 10 | import string 11 | import sys 12 | from collections import Counter 13 | 14 | 15 | def normalize_answer(s): 16 | """Lower text and remove punctuation, articles and extra whitespace.""" 17 | 18 | def remove_articles(text): 19 | return re.sub(r"\b(a|an|the)\b", " ", text) 20 | 21 | def white_space_fix(text): 22 | return " ".join(text.split()) 23 | 24 | def remove_punc(text): 25 | exclude = set(string.punctuation) 26 | return "".join(ch for ch in text if ch not in exclude) 27 | 28 | def lower(text): 29 | return text.lower() 30 | 31 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 32 | 33 | 34 | def f1_score(prediction, ground_truth): 35 | prediction_tokens = normalize_answer(prediction).split() 36 | ground_truth_tokens = normalize_answer(ground_truth).split() 37 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 38 | num_same = sum(common.values()) 39 | if num_same == 0: 40 | return 0 41 | precision = 1.0 * num_same / len(prediction_tokens) 42 | recall = 1.0 * num_same / len(ground_truth_tokens) 43 | f1 = (2 * precision * recall) / (precision + recall) 44 | return f1 45 | 46 | 47 | def exact_match_score(prediction, ground_truth): 48 | return normalize_answer(prediction) == normalize_answer(ground_truth) 49 | 50 | 51 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 52 | scores_for_ground_truths = [] 53 | for ground_truth in ground_truths: 54 | score = metric_fn(prediction, ground_truth) 55 | scores_for_ground_truths.append(score) 56 | return max(scores_for_ground_truths) 57 | 58 | 59 | def evaluate(dataset, predictions): 60 | f1 = exact_match = total = 0 61 | correct_ids = [] 62 | for passage in dataset: 63 | for qa in passage["qas"]: 64 | total += 1 65 | if qa["id"] not in predictions: 66 | message = f'Unanswered question {qa["id"]} will receive score 0.' 
67 | print(message, file=sys.stderr) 68 | continue 69 | 70 | ground_truths = list(map(lambda x: x["text"], qa["answers"])) 71 | prediction = predictions[qa["id"]] 72 | 73 | _exact_match = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths) 74 | if int(_exact_match) == 1: 75 | correct_ids.append(qa["id"]) 76 | exact_match += _exact_match 77 | 78 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths) 79 | 80 | exact_match = exact_match / total 81 | f1 = f1 / total 82 | 83 | return {"exact_match": exact_match, "f1": f1}, correct_ids 84 | 85 | 86 | if __name__ == "__main__": 87 | expected_version = "1.0" 88 | parser = argparse.ArgumentParser("Official evaluation script for ReCoRD v1.0.") 89 | parser.add_argument("data_file", help="The dataset file in JSON format.") 90 | parser.add_argument("pred_file", help="The model prediction file in JSON format.") 91 | parser.add_argument("--output_correct_ids", action="store_true", help="Output the correctly answered query IDs.") 92 | args = parser.parse_args() 93 | 94 | with open(args.data_file) as data_file: 95 | dataset_json = json.load(data_file) 96 | if dataset_json["version"] != expected_version: 97 | print( 98 | f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}', 99 | file=sys.stderr, 100 | ) 101 | dataset = dataset_json["data"] 102 | 103 | with open(args.pred_file) as pred_file: 104 | predictions = json.load(pred_file) 105 | 106 | metrics, correct_ids = evaluate(dataset, predictions) 107 | 108 | if args.output_correct_ids: 109 | print(f"Output {len(correct_ids)} correctly answered question IDs.") 110 | with open("correct_ids.json", "w") as f: 111 | json.dump(correct_ids, f) 112 | -------------------------------------------------------------------------------- /metrics/super_glue/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /metrics/ter/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("ter") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/ter/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | sacrebleu -------------------------------------------------------------------------------- /metrics/trec_eval/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("trec_eval") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/trec_eval/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | trectools -------------------------------------------------------------------------------- /metrics/wer/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = 
evaluate.load("wer") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/wer/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | jiwer -------------------------------------------------------------------------------- /metrics/wiki_split/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("wiki_split") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/wiki_split/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | sacrebleu 3 | sacremoses -------------------------------------------------------------------------------- /metrics/xnli/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("xnli") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/xnli/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} -------------------------------------------------------------------------------- /metrics/xnli/xnli.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Evaluate Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ XNLI benchmark metric. """ 15 | 16 | import datasets 17 | 18 | import evaluate 19 | 20 | 21 | _CITATION = """\ 22 | @InProceedings{conneau2018xnli, 23 | author = "Conneau, Alexis 24 | and Rinott, Ruty 25 | and Lample, Guillaume 26 | and Williams, Adina 27 | and Bowman, Samuel R. 28 | and Schwenk, Holger 29 | and Stoyanov, Veselin", 30 | title = "XNLI: Evaluating Cross-lingual Sentence Representations", 31 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods 32 | in Natural Language Processing", 33 | year = "2018", 34 | publisher = "Association for Computational Linguistics", 35 | location = "Brussels, Belgium", 36 | } 37 | """ 38 | 39 | _DESCRIPTION = """\ 40 | XNLI is a subset of a few thousand examples from MNLI which has been translated 41 | into a 14 different languages (some low-ish resource). As with MNLI, the goal is 42 | to predict textual entailment (does sentence A imply/contradict/neither sentence 43 | B) and is a classification task (given two sentences, predict one of three 44 | labels). 45 | """ 46 | 47 | _KWARGS_DESCRIPTION = """ 48 | Computes XNLI score which is just simple accuracy. 
49 | Args: 50 | predictions: Predicted labels. 51 | references: Ground truth labels. 52 | Returns: 53 | 'accuracy': accuracy 54 | Examples: 55 | 56 | >>> predictions = [0, 1] 57 | >>> references = [0, 1] 58 | >>> xnli_metric = evaluate.load("xnli") 59 | >>> results = xnli_metric.compute(predictions=predictions, references=references) 60 | >>> print(results) 61 | {'accuracy': 1.0} 62 | """ 63 | 64 | 65 | def simple_accuracy(preds, labels): 66 | return (preds == labels).mean() 67 | 68 | 69 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 70 | class Xnli(evaluate.Metric): 71 | def _info(self): 72 | return evaluate.MetricInfo( 73 | description=_DESCRIPTION, 74 | citation=_CITATION, 75 | inputs_description=_KWARGS_DESCRIPTION, 76 | features=datasets.Features( 77 | { 78 | "predictions": datasets.Value("int64" if self.config_name != "sts-b" else "float32"), 79 | "references": datasets.Value("int64" if self.config_name != "sts-b" else "float32"), 80 | } 81 | ), 82 | codebase_urls=[], 83 | reference_urls=[], 84 | format="numpy", 85 | ) 86 | 87 | def _compute(self, predictions, references): 88 | return {"accuracy": simple_accuracy(predictions, references)} 89 | -------------------------------------------------------------------------------- /metrics/xtreme_s/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("xtreme_s", "mls") 6 | launch_gradio_widget(module) 7 | -------------------------------------------------------------------------------- /metrics/xtreme_s/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER} 2 | scikit-learn -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | license_file = LICENSE 3 | 4 | [isort] 5 | ensure_newline_before_comments = True 6 | force_grid_wrap = 0 7 | include_trailing_comma = True 8 | line_length = 119 9 | lines_after_imports = 2 10 | multi_line_output = 3 11 | use_parentheses = True 12 | 13 | [flake8] 14 | ignore = E203, E501, W503 15 | max-line-length = 119 16 | exclude = 17 | src/datasets/datasets 18 | src/datasets/metrics 19 | per-file-ignores = 20 | metrics/*:F401 21 | -------------------------------------------------------------------------------- /src/evaluate/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # Copyright 2020 The HuggingFace Evaluate Authors and the TensorFlow Datasets Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | 16 | # Lint as: python3 17 | # pylint: enable=line-too-long 18 | # pylint: disable=g-import-not-at-top,g-bad-import-order,wrong-import-position 19 | 20 | __version__ = "0.4.4.dev0" 21 | 22 | from packaging import version 23 | 24 | 25 | SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__ 26 | 27 | del version 28 | 29 | from .evaluation_suite import EvaluationSuite 30 | from .evaluator import ( 31 | AudioClassificationEvaluator, 32 | AutomaticSpeechRecognitionEvaluator, 33 | Evaluator, 34 | ImageClassificationEvaluator, 35 | QuestionAnsweringEvaluator, 36 | SummarizationEvaluator, 37 | Text2TextGenerationEvaluator, 38 | TextClassificationEvaluator, 39 | TextGenerationEvaluator, 40 | TokenClassificationEvaluator, 41 | TranslationEvaluator, 42 | evaluator, 43 | ) 44 | from .hub import push_to_hub 45 | from .info import ComparisonInfo, EvaluationModuleInfo, MeasurementInfo, MetricInfo 46 | from .inspect import inspect_evaluation_module, list_evaluation_modules 47 | from .loading import load 48 | from .module import CombinedEvaluations, Comparison, EvaluationModule, Measurement, Metric, combine 49 | from .saving import save 50 | from .utils import * 51 | from .utils import gradio, logging 52 | -------------------------------------------------------------------------------- /src/evaluate/commands/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/evaluate/5aa3982a9a8c86e506860e381d428a64b0cce73b/src/evaluate/commands/__init__.py -------------------------------------------------------------------------------- /src/evaluate/evaluator/text_generation.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Evaluate Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Dict, Tuple 16 | 17 | from datasets import Dataset 18 | 19 | from .base import Evaluator 20 | from .utils import DatasetColumn 21 | 22 | 23 | TASK_DOCUMENTATION_KWARGS = r""" 24 | input_column (`str`, defaults to `"text"`): 25 | the name of the column containing the input text in the dataset specified by `data`. 26 | generation_kwargs (`Dict`, *optional*, defaults to `None`): 27 | The generation kwargs are passed to the pipeline and set the text generation strategy. 28 | """ 29 | 30 | 31 | class TextGenerationEvaluator(Evaluator): 32 | """ 33 | Text generation evaluator. 34 | This Text generation evaluator can currently be loaded from [`evaluator`] using the default task name 35 | `text-generation`. 36 | Methods in this class assume a data format compatible with the [`~transformers.TextGenerationPipeline`]. 37 | """ 38 | 39 | def predictions_processor(self, predictions, *args, **kwargs): 40 | """ 41 | Args: 42 | predictions: A list of lists of dicts 43 | 44 | Returns: 45 | `dict`: All the generated texts are flattened and stored under the "data" key. 
46 | """ 47 | return {"data": [pred[f"{self.predictions_prefix}_text"] for pred_list in predictions for pred in pred_list]} 48 | 49 | def __init__(self, task="text-generation", default_metric_name=None, predictions_prefix: str = "generated"): 50 | super().__init__(task=task, default_metric_name=default_metric_name) 51 | self.predictions_prefix = predictions_prefix 52 | 53 | def prepare_data(self, data: Dataset, input_column: str, *args, **kwargs) -> Tuple[Dict, DatasetColumn]: 54 | """ 55 | Prepare data. 56 | 57 | Args: 58 | data ([`Dataset`]): 59 | Specifies the dataset we will run evaluation on. 60 | input_column (`str`, defaults to `"text"`): 61 | The name of the column containing the text feature in the dataset specified by `data`. 62 | Returns: 63 | `dict`: metric inputs. 64 | `list`: pipeline inputs. 65 | """ 66 | 67 | self.check_required_columns(data, {"input_column": input_column}) 68 | 69 | return {}, DatasetColumn(data, input_column) 70 | -------------------------------------------------------------------------------- /src/evaluate/evaluator/utils.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset, get_dataset_split_names 2 | 3 | 4 | class DatasetColumn(list): 5 | """Helper class to avoid loading a dataset column into memory when accessing it.""" 6 | 7 | def __init__(self, dataset: Dataset, key: str): 8 | self.dataset = dataset 9 | self.key = key 10 | 11 | def __len__(self): 12 | return len(self.dataset) 13 | 14 | def __getitem__(self, i): 15 | return self.dataset[i][self.key] 16 | 17 | def __iter__(self): 18 | return (self.dataset[i][self.key] for i in range(len(self))) 19 | 20 | 21 | def choose_split(data, subset=None): 22 | available_splits = get_dataset_split_names(data, subset) 23 | preferred_split_order = [ 24 | "test", 25 | "testing", 26 | "eval", 27 | "evaluation", 28 | "validation", 29 | "val", 30 | "valid", 31 | "dev", 32 | "train", 33 | "training", 34 | ] 35 | for split in preferred_split_order: 36 | if split in available_splits: 37 | return split 38 | raise ValueError("No dataset split defined! 
Pass an explicit value to the `split` kwarg.") 39 | 40 | 41 | class DatasetColumnPair(list): 42 | """Helper class to avoid loading two dataset columns into memory when accessing it.""" 43 | 44 | def __init__( 45 | self, 46 | dataset: Dataset, 47 | first_col: str, 48 | second_col: str, 49 | first_key: str, 50 | second_key: str, 51 | ): 52 | """ 53 | Args: 54 | dataset (Dataset): dataset to build an iterator on 55 | first_col (str): first column name to use in the dataset 56 | second_col (str): second column name to use in the dataset 57 | first_key (str): key name used for the first column in the returned dictionary 58 | second_key (str): key name used for the second column in the returned dictionary 59 | """ 60 | self.dataset = dataset 61 | 62 | self.first_col = first_col 63 | self.second_col = second_col 64 | 65 | self.first_key = first_key 66 | self.second_key = second_key 67 | 68 | def __len__(self): 69 | return len(self.dataset) 70 | 71 | def __getitem__(self, i): 72 | return { 73 | self.first_key: self.dataset[i][self.first_col], 74 | self.second_key: self.dataset[i][self.second_col] if self.second_col else None, 75 | } 76 | 77 | def __iter__(self): 78 | return ( 79 | { 80 | self.first_key: self.dataset[i][self.first_col], 81 | self.second_key: self.dataset[i][self.second_col] if self.second_col else None, 82 | } 83 | for i in range(len(self)) 84 | ) 85 | -------------------------------------------------------------------------------- /src/evaluate/naming.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # Lint as: python3 16 | """Utilities for file names.""" 17 | 18 | import itertools 19 | import os 20 | import re 21 | 22 | 23 | _uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])") 24 | _lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])") 25 | 26 | _single_underscore_re = re.compile(r"(?>> import evaluate 26 | >>> result = {"bleu": 0.7} 27 | >>> params = {"model": "gpt-2"} 28 | >>> evaluate.save("./results/", **result, **params) 29 | ``` 30 | """ 31 | current_time = datetime.now() 32 | 33 | file_path = _setup_path(path_or_file, current_time) 34 | 35 | data["_timestamp"] = current_time.isoformat() 36 | data["_git_commit_hash"] = _git_commit_hash() 37 | data["_evaluate_version"] = __version__ 38 | data["_python_version"] = sys.version 39 | data["_interpreter_path"] = sys.executable 40 | 41 | with FileLock(str(file_path) + ".lock"): 42 | with open(file_path, "w") as f: 43 | json.dump(data, f) 44 | 45 | # cleanup lock file 46 | try: 47 | os.remove(str(file_path) + ".lock") 48 | except FileNotFoundError: 49 | pass 50 | 51 | return file_path 52 | 53 | 54 | def _setup_path(path_or_file, current_time): 55 | path_or_file = Path(path_or_file) 56 | is_file = len(path_or_file.suffix) > 0 57 | if is_file: 58 | folder = path_or_file.parent 59 | file_name = path_or_file.name 60 | else: 61 | folder = path_or_file 62 | file_name = "result-" + current_time.strftime("%Y_%m_%d-%H_%M_%S") + ".json" 63 | folder.mkdir(parents=True, exist_ok=True) 64 | return folder / file_name 65 | 66 | 67 | def _git_commit_hash(): 68 | res = subprocess.run("git rev-parse --is-inside-work-tree".split(), cwd="./", stdout=subprocess.PIPE) 69 | if res.stdout.decode().strip() == "true": 70 | res = subprocess.run("git rev-parse HEAD".split(), cwd=os.getcwd(), stdout=subprocess.PIPE) 71 | return res.stdout.decode().strip() 72 | else: 73 | return None 74 | -------------------------------------------------------------------------------- /src/evaluate/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | # flake8: noqa 16 | # Lint as: python3 17 | """Util import.""" 18 | 19 | __all__ = [ 20 | "disable_progress_bar", 21 | "enable_progress_bar", 22 | "is_progress_bar_enabled", 23 | "infer_gradio_input_types", 24 | "json_to_string_type", 25 | "parse_readme", 26 | "parse_gradio_data", 27 | "parse_test_cases", 28 | "launch_gradio_widget", 29 | ] 30 | 31 | from .gradio import ( 32 | infer_gradio_input_types, 33 | json_to_string_type, 34 | launch_gradio_widget, 35 | parse_gradio_data, 36 | parse_readme, 37 | parse_test_cases, 38 | ) 39 | from .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled 40 | -------------------------------------------------------------------------------- /templates/cookiecutter.json: -------------------------------------------------------------------------------- 1 | { 2 | "module_name": "Awesome Module", 3 | "module_type": "module", 4 | "module_description": "This new module is designed to solve this great ML task and is crafted with a lot of care and love.", 5 | "module_slug": "{{ cookiecutter.module_name|lower|replace(' ', '_') }}", 6 | "module_class_name": "{{ cookiecutter.module_name|replace(' ', '') }}", 7 | "namespace": "", 8 | "dataset_name": "" 9 | } -------------------------------------------------------------------------------- /templates/{{ cookiecutter.module_slug }}/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | title: {{ cookiecutter.module_name }} 3 | datasets: 4 | - {{ cookiecutter.dataset_name }} 5 | tags: 6 | - evaluate 7 | - {{ cookiecutter.module_type }} 8 | description: "TODO: add a description here" 9 | sdk: gradio 10 | sdk_version: 3.19.1 11 | app_file: app.py 12 | pinned: false 13 | --- 14 | 15 | # {{ cookiecutter.module_type|capitalize }} Card for {{ cookiecutter.module_name }} 16 | 17 | ***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing {{ cookiecutter.module_type }} cards if you'd like examples.* 18 | 19 | ## {{ cookiecutter.module_type|capitalize }} Description 20 | *Give a brief overview of this {{ cookiecutter.module_type }}, including what task(s) it is usually used for, if any.* 21 | 22 | ## How to Use 23 | *Give general statement of how to use the {{ cookiecutter.module_type }}* 24 | 25 | *Provide simplest possible example for using the {{ cookiecutter.module_type }}* 26 | 27 | ### Inputs 28 | *List all input arguments in the format below* 29 | - **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).* 30 | 31 | ### Output Values 32 | 33 | *Explain what this {{ cookiecutter.module_type }} outputs and provide an example of what the {{ cookiecutter.module_type }} output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}* 34 | 35 | *State the range of possible values that the {{ cookiecutter.module_type }}'s output can take, as well as what in that range is considered good. For example: "This {{ cookiecutter.module_type }} can take on any value between 0 and 100, inclusive. Higher scores are better."* 36 | 37 | #### Values from Popular Papers 38 | *Give examples, preferrably with links to leaderboards or publications, to papers that have reported this {{ cookiecutter.module_type }}, along with the values they have reported.* 39 | 40 | ### Examples 41 | *Give code examples of the {{ cookiecutter.module_type }} being used. 
Try to include examples that clear up any potential ambiguity left from the {{ cookiecutter.module_type }} description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.* 42 | 43 | ## Limitations and Bias 44 | *Note any known limitations or biases that the {{ cookiecutter.module_type }} has, with links and references if possible.* 45 | 46 | ## Citation 47 | *Cite the source where this {{ cookiecutter.module_type }} was introduced.* 48 | 49 | ## Further References 50 | *Add any useful further references.* 51 | -------------------------------------------------------------------------------- /templates/{{ cookiecutter.module_slug }}/app.py: -------------------------------------------------------------------------------- 1 | import evaluate 2 | from evaluate.utils import launch_gradio_widget 3 | 4 | 5 | module = evaluate.load("{{ cookiecutter.namespace }}/{{ cookiecutter.module_slug }}") 6 | launch_gradio_widget(module) -------------------------------------------------------------------------------- /templates/{{ cookiecutter.module_slug }}/requirements.txt: -------------------------------------------------------------------------------- 1 | git+https://github.com/huggingface/evaluate@main -------------------------------------------------------------------------------- /templates/{{ cookiecutter.module_slug }}/tests.py: -------------------------------------------------------------------------------- 1 | test_cases = [ 2 | { 3 | "predictions": [0, 0], 4 | "references": [1, 1], 5 | "result": {"metric_score": 0} 6 | }, 7 | { 8 | "predictions": [1, 1], 9 | "references": [1, 1], 10 | "result": {"metric_score": 1} 11 | }, 12 | { 13 | "predictions": [1, 0], 14 | "references": [1, 1], 15 | "result": {"metric_score": 0.5} 16 | } 17 | ] -------------------------------------------------------------------------------- /templates/{{ cookiecutter.module_slug }}/{{ cookiecutter.module_slug }}.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """TODO: Add a description here.""" 15 | 16 | import evaluate 17 | import datasets 18 | 19 | 20 | # TODO: Add BibTeX citation 21 | _CITATION = """\ 22 | @InProceedings{huggingface:module, 23 | title = {A great new module}, 24 | authors={huggingface, Inc.}, 25 | year={2020} 26 | } 27 | """ 28 | 29 | # TODO: Add description of the module here 30 | _DESCRIPTION = """\ 31 | This new module is designed to solve this great ML task and is crafted with a lot of care. 32 | """ 33 | 34 | 35 | # TODO: Add description of the arguments of the module here 36 | _KWARGS_DESCRIPTION = """ 37 | Calculates how good are predictions given some references, using certain scores 38 | Args: 39 | predictions: list of predictions to score. 
Each predictions 40 | should be a string with tokens separated by spaces. 41 | references: list of reference for each prediction. Each 42 | reference should be a string with tokens separated by spaces. 43 | Returns: 44 | accuracy: description of the first score, 45 | another_score: description of the second score, 46 | Examples: 47 | Examples should be written in doctest format, and should illustrate how 48 | to use the function. 49 | 50 | >>> my_new_module = evaluate.load("my_new_module") 51 | >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1]) 52 | >>> print(results) 53 | {'accuracy': 1.0} 54 | """ 55 | 56 | # TODO: Define external resources urls if needed 57 | BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt" 58 | 59 | 60 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) 61 | class {{ cookiecutter.module_class_name }}(evaluate.{{ cookiecutter.module_type | capitalize}}): 62 | """TODO: Short description of my evaluation module.""" 63 | 64 | def _info(self): 65 | # TODO: Specifies the evaluate.EvaluationModuleInfo object 66 | return evaluate.{{ cookiecutter.module_type | capitalize}}Info( 67 | # This is the description that will appear on the modules page. 68 | module_type="{{ cookiecutter.module_type}}", 69 | description=_DESCRIPTION, 70 | citation=_CITATION, 71 | inputs_description=_KWARGS_DESCRIPTION, 72 | # This defines the format of each prediction and reference 73 | features=datasets.Features({ 74 | 'predictions': datasets.Value('int64'), 75 | 'references': datasets.Value('int64'), 76 | }), 77 | # Homepage of the module for documentation 78 | homepage="http://module.homepage", 79 | # Additional links to the codebase or references 80 | codebase_urls=["http://github.com/path/to/codebase/of/new_module"], 81 | reference_urls=["http://path.to.reference.url/new_module"] 82 | ) 83 | 84 | def _download_and_prepare(self, dl_manager): 85 | """Optional: download external resources useful to compute the scores""" 86 | # TODO: Download external resources if needed 87 | pass 88 | 89 | def _compute(self, predictions, references): 90 | """Returns the scores""" 91 | # TODO: Compute the different scores of the module 92 | accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions) 93 | return { 94 | "accuracy": accuracy, 95 | } -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/evaluate/5aa3982a9a8c86e506860e381d428a64b0cce73b/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_evaluation_suite.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from evaluate import EvaluationSuite 4 | from tests.test_evaluator import DummyTextClassificationPipeline 5 | 6 | 7 | class TestEvaluationSuite(TestCase): 8 | def setUp(self): 9 | # Check that the EvaluationSuite loads successfully 10 | self.evaluation_suite = EvaluationSuite.load("evaluate/evaluation-suite-ci") 11 | 12 | # Setup a dummy model for usage with the EvaluationSuite 13 | self.dummy_model = DummyTextClassificationPipeline() 14 | 15 | def test_running_evaluation_suite(self): 16 | 17 | # Check that the evaluation suite successfully runs 18 | results = self.evaluation_suite.run(self.dummy_model) 19 | 20 | # Check that the results are correct 21 | for r 
in results: 22 | self.assertEqual(r["accuracy"], 0.5) 23 | 24 | # Check that correct number of tasks were run 25 | self.assertEqual(len(results), 2) 26 | 27 | def test_empty_suite(self): 28 | 29 | self.empty_suite = self.evaluation_suite 30 | self.empty_suite.suite = [] 31 | self.assertRaises(ValueError, self.empty_suite.run, self.dummy_model) 32 | -------------------------------------------------------------------------------- /tests/test_file_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from unittest.mock import patch 4 | 5 | import pytest 6 | 7 | from evaluate.utils.file_utils import OfflineModeIsEnabled, cached_path, ftp_get, ftp_head, http_get, http_head 8 | 9 | 10 | FILE_CONTENT = """\ 11 | Text data. 12 | Second line of data.""" 13 | 14 | 15 | def test_cached_path_local(text_file): 16 | # absolute path 17 | text_file = str(Path(text_file).resolve()) 18 | assert cached_path(text_file) == text_file 19 | # relative path 20 | text_file = str(Path(__file__).resolve().relative_to(Path(os.getcwd()))) 21 | assert cached_path(text_file) == text_file 22 | 23 | 24 | def test_cached_path_missing_local(tmp_path): 25 | # absolute path 26 | missing_file = str(tmp_path.resolve() / "__missing_file__.txt") 27 | with pytest.raises(FileNotFoundError): 28 | cached_path(missing_file) 29 | # relative path 30 | missing_file = "./__missing_file__.txt" 31 | with pytest.raises(FileNotFoundError): 32 | cached_path(missing_file) 33 | 34 | 35 | @patch("evaluate.config.HF_EVALUATE_OFFLINE", True) 36 | def test_cached_path_offline(): 37 | with pytest.raises(OfflineModeIsEnabled): 38 | cached_path("https://huggingface.co") 39 | 40 | 41 | @patch("evaluate.config.HF_EVALUATE_OFFLINE", True) 42 | def test_http_offline(tmp_path_factory): 43 | filename = tmp_path_factory.mktemp("data") / "file.html" 44 | with pytest.raises(OfflineModeIsEnabled): 45 | http_get("https://huggingface.co", temp_file=filename) 46 | with pytest.raises(OfflineModeIsEnabled): 47 | http_head("https://huggingface.co") 48 | 49 | 50 | @patch("evaluate.config.HF_EVALUATE_OFFLINE", True) 51 | def test_ftp_offline(tmp_path_factory): 52 | filename = tmp_path_factory.mktemp("data") / "file.html" 53 | with pytest.raises(OfflineModeIsEnabled): 54 | ftp_get("ftp://huggingface.co", temp_file=filename) 55 | with pytest.raises(OfflineModeIsEnabled): 56 | ftp_head("ftp://huggingface.co") 57 | -------------------------------------------------------------------------------- /tests/test_save.py: -------------------------------------------------------------------------------- 1 | import json 2 | import shutil 3 | import tempfile 4 | from pathlib import Path 5 | from unittest import TestCase 6 | 7 | import evaluate 8 | 9 | 10 | result_dict = {"metric": 1.0, "model_name": "x"} 11 | 12 | SAVE_EXTRA_KEYS = ["_timestamp", "_git_commit_hash", "_evaluate_version", "_python_version", "_interpreter_path"] 13 | 14 | 15 | class TestSave(TestCase): 16 | def setUp(self): 17 | self.save_path = Path(tempfile.mkdtemp()) 18 | 19 | def tearDown(self): 20 | shutil.rmtree(self.save_path) 21 | 22 | def test_save_to_folder(self): 23 | file_path = evaluate.save(self.save_path, **result_dict) 24 | with open(file_path, "r") as f: 25 | loaded_result_dict = json.load(f) 26 | for key in SAVE_EXTRA_KEYS: 27 | _ = loaded_result_dict.pop(key) 28 | self.assertDictEqual(result_dict, loaded_result_dict) 29 | 30 | def test_save_to_folder_nested(self): 31 | file_path = evaluate.save(self.save_path / 
"sub_dir1/sub_dir2", **result_dict) 32 | with open(file_path, "r") as f: 33 | loaded_result_dict = json.load(f) 34 | for key in SAVE_EXTRA_KEYS: 35 | _ = loaded_result_dict.pop(key) 36 | self.assertDictEqual(result_dict, loaded_result_dict) 37 | 38 | def test_save_to_file(self): 39 | _ = evaluate.save(self.save_path / "test.json", **result_dict) 40 | with open(self.save_path / "test.json", "r") as f: 41 | loaded_result_dict = json.load(f) 42 | for key in SAVE_EXTRA_KEYS: 43 | _ = loaded_result_dict.pop(key) 44 | self.assertDictEqual(result_dict, loaded_result_dict) 45 | -------------------------------------------------------------------------------- /tests/test_viz.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | import matplotlib.pyplot as plt 4 | 5 | from evaluate.visualization import radar_plot 6 | 7 | 8 | class TestViz(TestCase): 9 | def test_invert_range(self): 10 | data = [{"accuracy": 0.9, "precision": 0.8}, {"accuracy": 0.7, "precision": 0.6}] 11 | model_names = ["model1", "model2"] 12 | wrong_invert_range = ["latency_in_seconds"] # Value not present in data 13 | with self.assertRaises(ValueError): 14 | radar_plot(data, model_names, wrong_invert_range) 15 | 16 | def test_output_is_plot(self): 17 | data = [ 18 | {"accuracy": 0.9, "precision": 0.8, "latency_in_seconds": 48.1}, 19 | {"accuracy": 0.7, "precision": 0.6, "latency_in_seconds": 51.4}, 20 | ] 21 | model_names = ["model1", "model2"] 22 | invert_range = ["latency_in_seconds"] 23 | out_plt = radar_plot(data, model_names, invert_range) 24 | self.assertIsInstance(out_plt, plt.Figure) 25 | --------------------------------------------------------------------------------