├── .github
│   ├── hub
│   │   ├── push_evaluations_to_hub.py
│   │   └── requirements.txt
│   └── workflows
│       ├── build_documentation.yml
│       ├── build_pr_documentation.yml
│       ├── ci.yml
│       ├── delete_doc_comment.yml
│       ├── python-release.yml
│       ├── trufflehog.yml
│       └── update_spaces.yml
├── .gitignore
├── AUTHORS
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── Makefile
├── README.md
├── additional-tests-requirements.txt
├── comparisons
│   ├── exact_match
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── exact_match.py
│   │   └── requirements.txt
│   ├── mcnemar
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mcnemar.py
│   │   └── requirements.txt
│   └── wilcoxon
│       ├── README.md
│       ├── app.py
│       ├── requirements.txt
│       └── wilcoxon.py
├── docs
│   ├── README.md
│   └── source
│       ├── _toctree.yml
│       ├── a_quick_tour.mdx
│       ├── base_evaluator.mdx
│       ├── choosing_a_metric.mdx
│       ├── considerations.mdx
│       ├── creating_and_sharing.mdx
│       ├── custom_evaluator.mdx
│       ├── evaluation_suite.mdx
│       ├── index.mdx
│       ├── installation.mdx
│       ├── keras_integrations.md
│       ├── package_reference
│       │   ├── evaluator_classes.mdx
│       │   ├── hub_methods.mdx
│       │   ├── loading_methods.mdx
│       │   ├── logging_methods.mdx
│       │   ├── main_classes.mdx
│       │   ├── saving_methods.mdx
│       │   └── visualization_methods.mdx
│       ├── sklearn_integrations.mdx
│       ├── transformers_integrations.mdx
│       └── types_of_evaluations.mdx
├── measurements
│   ├── honest
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── honest.py
│   │   └── requirements.txt
│   ├── label_distribution
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── label_distribution.py
│   │   └── requirements.txt
│   ├── perplexity
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── perplexity.py
│   │   └── requirements.txt
│   ├── regard
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── regard.py
│   │   └── requirements.txt
│   ├── text_duplicates
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── text_duplicates.py
│   ├── toxicity
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── toxicity.py
│   ├── word_count
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── word_count.py
│   └── word_length
│       ├── README.md
│       ├── app.py
│       ├── requirements.txt
│       └── word_length.py
├── metrics
│   ├── accuracy
│   │   ├── README.md
│   │   ├── accuracy.py
│   │   ├── app.py
│   │   └── requirements.txt
│   ├── bertscore
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── bertscore.py
│   │   └── requirements.txt
│   ├── bleu
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── bleu.py
│   │   ├── requirements.txt
│   │   └── tokenizer_13a.py
│   ├── bleurt
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── bleurt.py
│   │   └── requirements.txt
│   ├── brier_score
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── brier_score.py
│   │   └── requirements.txt
│   ├── cer
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── cer.py
│   │   ├── requirements.txt
│   │   └── test_cer.py
│   ├── character
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── character.py
│   │   └── requirements.txt
│   ├── charcut_mt
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── charcut_mt.py
│   │   └── requirements.txt
│   ├── chrf
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── chrf.py
│   │   └── requirements.txt
│   ├── code_eval
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── code_eval.py
│   │   ├── execute.py
│   │   └── requirements.txt
│   ├── comet
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── comet.py
│   │   └── requirements.txt
│   ├── competition_math
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── competition_math.py
│   │   └── requirements.txt
│   ├── confusion_matrix
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── confusion_matrix.py
│   │   └── requirements.txt
│   ├── coval
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── coval.py
│   │   └── requirements.txt
│   ├── cuad
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── compute_score.py
│   │   ├── cuad.py
│   │   └── requirements.txt
│   ├── exact_match
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── exact_match.py
│   │   └── requirements.txt
│   ├── f1
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── f1.py
│   │   └── requirements.txt
│   ├── frugalscore
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── frugalscore.py
│   │   └── requirements.txt
│   ├── glue
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── glue.py
│   │   └── requirements.txt
│   ├── google_bleu
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── google_bleu.py
│   │   ├── requirements.txt
│   │   └── tokenizer_13a.py
│   ├── indic_glue
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── indic_glue.py
│   │   └── requirements.txt
│   ├── mae
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mae.py
│   │   └── requirements.txt
│   ├── mahalanobis
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mahalanobis.py
│   │   └── requirements.txt
│   ├── mape
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mape.py
│   │   └── requirements.txt
│   ├── mase
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mase.py
│   │   └── requirements.txt
│   ├── matthews_correlation
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── matthews_correlation.py
│   │   └── requirements.txt
│   ├── mauve
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mauve.py
│   │   └── requirements.txt
│   ├── mean_iou
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mean_iou.py
│   │   └── requirements.txt
│   ├── meteor
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── meteor.py
│   │   └── requirements.txt
│   ├── mse
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── mse.py
│   │   └── requirements.txt
│   ├── nist_mt
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── nist_mt.py
│   │   ├── requirements.txt
│   │   └── tests.py
│   ├── pearsonr
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── pearsonr.py
│   │   └── requirements.txt
│   ├── perplexity
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── perplexity.py
│   │   └── requirements.txt
│   ├── poseval
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── poseval.py
│   │   └── requirements.txt
│   ├── precision
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── precision.py
│   │   └── requirements.txt
│   ├── r_squared
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── r_squared.py
│   │   └── requirements.txt
│   ├── recall
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── recall.py
│   │   └── requirements.txt
│   ├── rl_reliability
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── rl_reliability.py
│   ├── roc_auc
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── roc_auc.py
│   ├── rouge
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── rouge.py
│   ├── sacrebleu
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── sacrebleu.py
│   ├── sari
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── sari.py
│   ├── seqeval
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── seqeval.py
│   ├── smape
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── smape.py
│   ├── spearmanr
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── spearmanr.py
│   ├── squad
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── compute_score.py
│   │   ├── requirements.txt
│   │   └── squad.py
│   ├── squad_v2
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── compute_score.py
│   │   ├── requirements.txt
│   │   └── squad_v2.py
│   ├── super_glue
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── record_evaluation.py
│   │   ├── requirements.txt
│   │   └── super_glue.py
│   ├── ter
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── ter.py
│   ├── trec_eval
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── trec_eval.py
│   ├── wer
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── wer.py
│   ├── wiki_split
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── wiki_split.py
│   ├── xnli
│   │   ├── README.md
│   │   ├── app.py
│   │   ├── requirements.txt
│   │   └── xnli.py
│   └── xtreme_s
│       ├── README.md
│       ├── app.py
│       ├── requirements.txt
│       └── xtreme_s.py
├── setup.cfg
├── setup.py
├── src
│   └── evaluate
│       ├── __init__.py
│       ├── commands
│       │   ├── __init__.py
│       │   └── evaluate_cli.py
│       ├── config.py
│       ├── evaluation_suite
│       │   └── __init__.py
│       ├── evaluator
│       │   ├── __init__.py
│       │   ├── audio_classification.py
│       │   ├── automatic_speech_recognition.py
│       │   ├── base.py
│       │   ├── image_classification.py
│       │   ├── question_answering.py
│       │   ├── text2text_generation.py
│       │   ├── text_classification.py
│       │   ├── text_generation.py
│       │   ├── token_classification.py
│       │   └── utils.py
│       ├── hub.py
│       ├── info.py
│       ├── inspect.py
│       ├── loading.py
│       ├── module.py
│       ├── naming.py
│       ├── saving.py
│       ├── utils
│       │   ├── __init__.py
│       │   ├── file_utils.py
│       │   ├── gradio.py
│       │   └── logging.py
│       └── visualization.py
├── templates
│   ├── cookiecutter.json
│   └── {{ cookiecutter.module_slug }}
│       ├── README.md
│       ├── app.py
│       ├── requirements.txt
│       ├── tests.py
│       └── {{ cookiecutter.module_slug }}.py
└── tests
    ├── __init__.py
    ├── conftest.py
    ├── test_evaluation_suite.py
    ├── test_evaluator.py
    ├── test_file_utils.py
    ├── test_hub.py
    ├── test_load.py
    ├── test_metric.py
    ├── test_metric_common.py
    ├── test_save.py
    ├── test_trainer_evaluator_parity.py
    ├── test_viz.py
    └── utils.py
/.github/hub/requirements.txt:
--------------------------------------------------------------------------------
1 | huggingface_hub
--------------------------------------------------------------------------------
/.github/workflows/build_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Build documentation
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - doc-builder*
8 | - v*-release
9 |
10 | jobs:
11 | build:
12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
13 | with:
14 | commit_sha: ${{ github.sha }}
15 | package: evaluate
16 | secrets:
17 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
18 |
--------------------------------------------------------------------------------
/.github/workflows/build_pr_documentation.yml:
--------------------------------------------------------------------------------
1 | name: Build PR Documentation
2 |
3 | on:
4 | pull_request:
5 |
6 | concurrency:
7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
8 | cancel-in-progress: true
9 |
10 | jobs:
11 | build:
12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13 | with:
14 | commit_sha: ${{ github.event.pull_request.head.sha }}
15 | pr_number: ${{ github.event.number }}
16 | package: evaluate
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: CI
2 |
3 | on:
4 | pull_request:
5 | branches:
6 | - main
7 | push:
8 | branches:
9 | - main
10 | - ci-*
11 |
12 | env:
13 | HF_ALLOW_CODE_EVAL: 1
14 |
15 | jobs:
16 |
17 | check_code_quality:
18 | runs-on: ubuntu-latest
19 | steps:
20 | - uses: actions/checkout@v3
21 | - name: Set up Python
22 | uses: actions/setup-python@v4
23 | with:
24 | python-version: "3.8"
25 | - name: Install dependencies
26 | run: |
27 | python -m pip install --upgrade pip
28 | pip install .[quality]
29 | - name: Check quality
30 | run: |
31 | black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
32 | isort --check-only tests src metrics comparisons measurements
33 | flake8 tests src metrics
34 |
35 | test:
36 | needs: check_code_quality
37 | strategy:
38 | fail-fast: false
39 | matrix:
40 | test: ['unit', 'parity']
41 | os: [ubuntu-latest, windows-latest]
42 | runs-on: ${{ matrix.os }}
43 | steps:
44 | - uses: actions/checkout@v3
45 | with:
46 | fetch-depth: 0
47 | - name: Set up Python 3.8
48 | uses: actions/setup-python@v4
49 | with:
50 | python-version: "3.8"
51 | - name: Upgrade pip
52 | run: python -m pip install --upgrade pip
53 | - name: Install dependencies
54 | run: |
55 | pip install .[tests]
56 | pip install -r additional-tests-requirements.txt --no-deps
57 | - name: Test with pytest
58 | if: ${{ matrix.test == 'unit' }}
59 | run: |
60 | python -m pytest -n 2 --dist loadfile -sv ./tests/ --ignore=./tests/test_trainer_evaluator_parity.py
61 | - name: Integration test with transformers
62 | if: ${{ matrix.test == 'parity' }}
63 | run: |
64 | python -m pytest -n 2 --dist loadfile -sv ./tests/test_trainer_evaluator_parity.py
65 |
--------------------------------------------------------------------------------
/.github/workflows/delete_doc_comment.yml:
--------------------------------------------------------------------------------
1 | name: Delete dev documentation
2 |
3 | on:
4 | pull_request:
5 | types: [ closed ]
6 |
7 |
8 | jobs:
9 | delete:
10 | uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
11 | with:
12 | pr_number: ${{ github.event.number }}
13 | package: evaluate
--------------------------------------------------------------------------------
/.github/workflows/python-release.yml:
--------------------------------------------------------------------------------
1 | name: Python release
2 |
3 | on:
4 | push:
5 | tags:
6 | - v*
7 |
8 | env:
9 | PYPI_TOKEN: ${{ secrets.PYPI_TOKEN_DIST }}
10 |
11 | jobs:
12 | python_release:
13 | runs-on: ubuntu-latest
14 |
15 | steps:
16 | - uses: actions/checkout@v2
17 | - name: Set up Python
18 | uses: actions/setup-python@v2
19 | with:
20 | python-version: 3.9
21 | - name: Install dependencies
22 | run: |
23 | pip install --upgrade pip
24 | pip install setuptools wheel
25 | - run: python setup.py sdist bdist_wheel
26 |
27 | - run: |
28 | pip install twine
29 | - name: Upload to PyPi
30 | run: |
31 | twine upload dist/* -u __token__ -p "$PYPI_TOKEN"
32 |
--------------------------------------------------------------------------------
/.github/workflows/trufflehog.yml:
--------------------------------------------------------------------------------
1 | on:
2 | push:
3 |
4 | name: Secret Leaks
5 |
6 | jobs:
7 | trufflehog:
8 | runs-on: ubuntu-latest
9 | steps:
10 | - name: Checkout code
11 | uses: actions/checkout@v4
12 | with:
13 | fetch-depth: 0
14 | - name: Secret Scanning
15 | uses: trufflesecurity/trufflehog@main
16 |
--------------------------------------------------------------------------------
/.github/workflows/update_spaces.yml:
--------------------------------------------------------------------------------
1 | name: Update Hub repositories
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | update-hub-repositories:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - name: Checkout repository
13 | uses: actions/checkout@v2
14 | with:
15 | fetch-depth: 0
16 | - name: Set up Python
17 | uses: actions/setup-python@v2
18 | with:
19 | python-version: "3.8"
20 | - name: Set up default Git config
21 | run: |
22 | git config --global user.name evaluate-bot
23 | git config --global user.email leandro@huggingface.co
24 | - name: Install dependencies
25 | working-directory: ./.github/hub
26 | run: |
27 | python -m pip install --upgrade pip
28 | pip install -r requirements.txt
29 | - name: Update Hub repositories
30 | working-directory: ./.github/hub
31 | run: |
32 | export HF_TOKEN=${{ secrets.HF_HUB_TOKEN }}
33 | export EVALUATE_LIB_PATH=$GITHUB_WORKSPACE
34 | export GIT_HASH=$GITHUB_SHA
35 | export GIT_LFS_SKIP_SMUDGE=1
36 | python push_evaluations_to_hub.py
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Locked files
2 | *.lock
3 | !dvc.lock
4 |
5 | # Extracted dummy data
6 | datasets/**/dummy_data-zip-extracted/
7 |
8 | # Compiled python modules.
9 | *.pyc
10 |
11 | # Byte-compiled
12 | __pycache__/
13 | .cache/
14 |
15 | # Python egg metadata, regenerated from source files by setuptools.
16 | *.egg-info
17 | .eggs/
18 |
19 | # PyPI distribution artifacts.
20 | build/
21 | dist/
22 |
23 | # Environments
24 | .env
25 | .venv
26 | env/
27 | venv/
28 | ENV/
29 | env.bak/
30 | venv.bak/
31 |
32 | # pyenv
33 | .python-version
34 |
35 | # Tests
36 | .pytest_cache/
37 |
38 | # Other
39 | *.DS_Store
40 |
41 | # PyCharm/vscode
42 | .idea
43 | .vscode
44 |
45 | # keep only the empty datasets and metrics directory with its __init__.py file
46 | /src/*/datasets/*
47 | !/src/*/datasets/__init__.py
48 |
49 | /src/*/metrics/*
50 | !/src/*/metrics/__init__.py
51 |
52 | # Vim
53 | .*.swp
54 |
55 | # playground
56 | /playground
57 |
58 | # Sphinx documentation
59 | docs/_build/
60 | docs/source/_build/
61 |
62 | # Benchmark results
63 | report.json
64 | report.md
--------------------------------------------------------------------------------
/AUTHORS:
--------------------------------------------------------------------------------
1 | # This is the list of HuggingFace Datasets authors for copyright purposes.
2 | #
3 | # This does not necessarily list everyone who has contributed code, since in
4 | # some cases, their employer may be the copyright holder. To see the full list
5 | # of contributors, see the revision history in source control.
6 |
7 | Google Inc.
8 | HuggingFace Inc.
9 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: quality style test
2 |
3 | # Check that source code meets quality standards
4 |
5 | quality:
6 | black --check --line-length 119 --target-version py36 tests src metrics comparisons measurements
7 | isort --check-only tests src metrics measurements
8 | flake8 tests src metrics
9 |
10 | # Format source code automatically
11 |
12 | style:
13 | black --line-length 119 --target-version py36 tests src metrics comparisons measurements
14 | isort tests src metrics measurements
15 |
16 | # Run tests for the library
17 |
18 | test:
19 | python -m pytest -n auto --dist=loadfile -s -v ./tests/
20 |
--------------------------------------------------------------------------------
/additional-tests-requirements.txt:
--------------------------------------------------------------------------------
1 | unbabel-comet>=1.0.0;python_version>'3.6'
2 | git+https://github.com/google-research/bleurt.git
3 | git+https://github.com/ns-moosavi/coval.git
4 | git+https://github.com/hendrycks/math.git
5 | git+https://github.com/google-research/rl-reliability-metrics
6 | gin-config
--------------------------------------------------------------------------------
/comparisons/exact_match/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Exact Match
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: green
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - comparison
13 | description: >-
14 | Returns the rate at which the predictions of one model exactly match those of another model.
15 | ---
16 |
17 |
18 | # Comparison Card for Exact Match
19 |
20 | ## Comparison description
21 |
22 | Given two model predictions, the exact match score is 1 if they are exactly the same and 0 otherwise. The overall exact match score is the average over all prediction pairs.
23 |
24 | - **Example 1**: The exact match score is 1.0 if prediction 1 is [0, 1] and prediction 2 is [0, 1].
25 | - **Example 2**: The exact match score is 0.0 if prediction 1 is [0, 1] and prediction 2 is [1, 0].
26 | - **Example 3**: The exact match score is 0.5 if prediction 1 is [0, 1] and prediction 2 is [1, 1].
27 |
28 | ## How to use
29 |
30 | At minimum, this comparison takes two lists of predictions as input:
31 | ```python
32 | >>> exact_match = evaluate.load("exact_match", module_type="comparison")
33 | >>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1])
34 | >>> print(results)
35 | {'exact_match': 0.66}
36 | ```
37 |
38 | ## Output values
39 |
40 | Returns a float between 0.0 and 1.0 inclusive.
41 |
42 | ## Examples
43 |
44 | ```python
45 | >>> exact_match = evaluate.load("exact_match", module_type="comparison")
46 | >>> results = exact_match.compute(predictions1=[0, 0, 0], predictions2=[1, 1, 1])
47 | >>> print(results)
48 | {'exact_match': 0.0}
49 | ```
50 |
51 | ```python
52 | >>> exact_match = evaluate.load("exact_match", module_type="comparison")
53 | >>> results = exact_match.compute(predictions1=[0, 1, 1], predictions2=[1, 1, 1])
54 | >>> print(results)
55 | {'exact_match': 0.66}
56 | ```
57 |
58 |
59 | ## Limitations and bias
60 |
61 | ## Citations
62 |
--------------------------------------------------------------------------------
/comparisons/exact_match/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("exact_match", module_type="comparison")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/comparisons/exact_match/exact_match.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Evaluate Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Exact match test for model comparison."""
15 |
16 | import datasets
17 | import numpy as np
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | Returns the rate at which the predictions of one model exactly match those of another model.
24 | """
25 |
26 |
27 | _KWARGS_DESCRIPTION = """
28 | Args:
29 | predictions1 (`list` of `int`): Predicted labels for model 1.
30 | predictions2 (`list` of `int`): Predicted labels for model 2.
31 |
32 | Returns:
33 | exact_match (`float`): Dictionary containing exact_match rate. Possible values are between 0.0 and 1.0, inclusive.
34 |
35 | Examples:
36 | >>> exact_match = evaluate.load("exact_match", module_type="comparison")
37 | >>> results = exact_match.compute(predictions1=[1, 1, 1], predictions2=[1, 1, 1])
38 | >>> print(results)
39 | {'exact_match': 1.0}
40 | """
41 |
42 |
43 | _CITATION = """
44 | """
45 |
46 |
47 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
48 | class ExactMatch(evaluate.Comparison):
49 | def _info(self):
50 | return evaluate.ComparisonInfo(
51 | module_type="comparison",
52 | description=_DESCRIPTION,
53 | citation=_CITATION,
54 | inputs_description=_KWARGS_DESCRIPTION,
55 | features=datasets.Features(
56 | {
57 | "predictions1": datasets.Value("int64"),
58 | "predictions2": datasets.Value("int64"),
59 | }
60 | ),
61 | )
62 |
63 | def _compute(self, predictions1, predictions2):
64 | score_list = [p1 == p2 for p1, p2 in zip(predictions1, predictions2)]
65 | return {"exact_match": np.mean(score_list)}
66 |
--------------------------------------------------------------------------------
/comparisons/exact_match/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
--------------------------------------------------------------------------------
/comparisons/mcnemar/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: McNemar
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: green
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - comparison
13 | description: >-
14 | McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same set of reference labels. It can be computed with:
15 | McNemar = (SE - SP)**2 / (SE + SP)
16 | Where:
17 | SE: Sensitivity (Test 1 positive; Test 2 negative)
18 | SP: Specificity (Test 1 negative; Test 2 positive)
19 | ---
20 |
21 |
22 | # Comparison Card for McNemar
23 |
24 | ## Comparison description
25 |
26 | McNemar's test is a non-parametric diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same set of reference labels. It can be computed with:
27 |
28 | McNemar = (SE - SP)**2 / (SE + SP)
29 |
30 | Where:
31 | * SE: Sensitivity (Test 1 positive; Test 2 negative)
32 | * SP: Specificity (Test 1 negative; Test 2 positive)
33 |
34 | In other words, SE and SP are the off-diagonal elements of the contingency table (the cases where the two classifiers disagree) built from the classifier predictions (`predictions1` and `predictions2`) with respect to the ground truth `references`.
35 |
36 | ## How to use
37 |
38 | The McNemar comparison calculates the proportions of responses that exhibit disagreement between two classifiers. It is used to analyze paired nominal data.
39 |
40 | ## Inputs
41 |
42 | Its arguments are:
43 |
44 | `predictions1`: a list of predictions from the first model.
45 |
46 | `predictions2`: a list of predictions from the second model.
47 |
48 | `references`: a list of the ground truth reference labels.
49 |
50 | ## Output values
51 |
52 | The McNemar comparison outputs two things:
53 |
54 | `stat`: The McNemar statistic.
55 |
56 | `p`: The p value.
57 |
58 | ## Examples
59 |
60 | Example comparison:
61 |
62 | ```python
63 | mcnemar = evaluate.load("mcnemar")
64 | results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1])
65 | print(results)
66 | {'stat': 1.0, 'p': 0.31731050786291115}
67 | ```
68 |
69 | ## Limitations and bias
70 |
71 | The McNemar test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired nominal data only.
72 |
73 | ## Citations
74 |
75 | ```bibtex
76 | @article{mcnemar1947note,
77 | title={Note on the sampling error of the difference between correlated proportions or percentages},
78 | author={McNemar, Quinn},
79 | journal={Psychometrika},
80 | volume={12},
81 | number={2},
82 | pages={153--157},
83 | year={1947},
84 | publisher={Springer-Verlag}
85 | }
86 | ```
87 |
--------------------------------------------------------------------------------
/comparisons/mcnemar/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mcnemar", module_type="comparison")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/comparisons/mcnemar/mcnemar.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Evaluate Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """McNemar test for model comparison."""
15 |
16 | import datasets
17 | from scipy.stats import chi2
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | McNemar's test is a diagnostic test over a contingency table resulting from the predictions of two classifiers. The test compares the sensitivity and specificity of the diagnostic tests on the same set of reference labels. It can be computed with:
24 | McNemar = (SE - SP)**2 / (SE + SP)
25 | Where:
26 | SE: Sensitivity (Test 1 positive; Test 2 negative)
27 | SP: Specificity (Test 1 negative; Test 2 positive)
28 | """
29 |
30 |
31 | _KWARGS_DESCRIPTION = """
32 | Args:
33 | predictions1 (`list` of `int`): Predicted labels for model 1.
34 | predictions2 (`list` of `int`): Predicted labels for model 2.
35 | references (`list` of `int`): Ground truth labels.
36 |
37 | Returns:
38 | stat (`float`): McNemar test score.
39 | p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference.
40 |
41 | Examples:
42 | >>> mcnemar = evaluate.load("mcnemar")
43 | >>> results = mcnemar.compute(references=[1, 0, 1], predictions1=[1, 1, 1], predictions2=[1, 0, 1])
44 | >>> print(results)
45 | {'stat': 1.0, 'p': 0.31731050786291115}
46 | """
47 |
48 |
49 | _CITATION = """
50 | @article{mcnemar1947note,
51 | title={Note on the sampling error of the difference between correlated proportions or percentages},
52 | author={McNemar, Quinn},
53 | journal={Psychometrika},
54 | volume={12},
55 | number={2},
56 | pages={153--157},
57 | year={1947},
58 | publisher={Springer-Verlag}
59 | }
60 | """
61 |
62 |
63 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
64 | class McNemar(evaluate.Comparison):
65 | def _info(self):
66 | return evaluate.ComparisonInfo(
67 | module_type="comparison",
68 | description=_DESCRIPTION,
69 | citation=_CITATION,
70 | inputs_description=_KWARGS_DESCRIPTION,
71 | features=datasets.Features(
72 | {
73 | "predictions1": datasets.Value("int64"),
74 | "predictions2": datasets.Value("int64"),
75 | "references": datasets.Value("int64"),
76 | }
77 | ),
78 | )
79 |
80 | def _compute(self, predictions1, predictions2, references):
81 | # construct contingency table
82 | tbl = [[0, 0], [0, 0]]
83 | for gt, p1, p2 in zip(references, predictions1, predictions2):
84 | if p1 == gt and p2 == gt:
85 | tbl[0][0] += 1
86 | elif p1 == gt:
87 | tbl[0][1] += 1
88 | elif p2 == gt:
89 | tbl[1][0] += 1
90 | else:
91 | tbl[1][1] += 1
92 |
93 | # compute statistic
94 | b, c = tbl[0][1], tbl[1][0]
95 | statistic = abs(b - c) ** 2 / (1.0 * (b + c))
96 | df = 1
97 | pvalue = chi2.sf(statistic, df)
98 | return {"stat": statistic, "p": pvalue}
99 |
--------------------------------------------------------------------------------
/comparisons/mcnemar/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
--------------------------------------------------------------------------------
/comparisons/wilcoxon/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Wilcoxon
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: green
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - comparison
13 | description: >-
14 | Wilcoxon's test is a signed-rank test for comparing paired samples.
15 | ---
16 |
17 |
18 | # Comparison Card for Wilcoxon
19 |
20 | ## Comparison description
21 |
22 | Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models.
23 |
24 | ## How to use
25 |
26 | The Wilcoxon comparison is used to analyze paired ordinal data.
27 |
28 | ## Inputs
29 |
30 | Its arguments are:
31 |
32 | `predictions1`: a list of predictions from the first model.
33 |
34 | `predictions2`: a list of predictions from the second model.
35 |
36 | ## Output values
37 |
38 | The Wilcoxon comparison outputs two things:
39 |
40 | `stat`: The Wilcoxon statistic.
41 |
42 | `p`: The p value.
43 |
44 | ## Examples
45 |
46 | Example comparison:
47 |
48 | ```python
49 | wilcoxon = evaluate.load("wilcoxon")
50 | results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
51 | print(results)
52 | {'stat': 5.0, 'p': 0.625}
53 | ```
54 |
55 | ## Limitations and bias
56 |
57 | The Wilcoxon test is a non-parametric test, so it has relatively few assumptions (basically only that the observations are independent). It should be used to analyze paired ordinal data only.
58 |
59 | ## Citations
60 |
61 | ```bibtex
62 | @incollection{wilcoxon1992individual,
63 | title={Individual comparisons by ranking methods},
64 | author={Wilcoxon, Frank},
65 | booktitle={Breakthroughs in statistics},
66 | pages={196--202},
67 | year={1992},
68 | publisher={Springer}
69 | }
70 | ```
71 |
--------------------------------------------------------------------------------
/comparisons/wilcoxon/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("wilcoxon", module_type="comparison")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/comparisons/wilcoxon/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@a45df1eb9996eec64ec3282ebe554061cb366388
2 | datasets~=2.0
3 | scipy
4 |
--------------------------------------------------------------------------------
/comparisons/wilcoxon/wilcoxon.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Evaluate Authors
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Wilcoxon test for model comparison."""
15 |
16 | import datasets
17 | from scipy.stats import wilcoxon
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | Wilcoxon's test is a non-parametric signed-rank test that tests whether the distribution of the differences is symmetric about zero. It can be used to compare the predictions of two models.
24 | """
25 |
26 |
27 | _KWARGS_DESCRIPTION = """
28 | Args:
29 | predictions1 (`list` of `float`): Predictions for model 1.
30 | predictions2 (`list` of `float`): Predictions for model 2.
31 |
32 | Returns:
33 | stat (`float`): Wilcoxon test score.
34 | p (`float`): The p value. Minimum possible value is 0. Maximum possible value is 1.0. A lower p value means a more significant difference.
35 |
36 | Examples:
37 | >>> wilcoxon = evaluate.load("wilcoxon")
38 | >>> results = wilcoxon.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
39 | >>> print(results)
40 | {'stat': 5.0, 'p': 0.625}
41 | """
42 |
43 |
44 | _CITATION = """
45 | @incollection{wilcoxon1992individual,
46 | title={Individual comparisons by ranking methods},
47 | author={Wilcoxon, Frank},
48 | booktitle={Breakthroughs in statistics},
49 | pages={196--202},
50 | year={1992},
51 | publisher={Springer}
52 | }
53 | """
54 |
55 |
56 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
57 | class Wilcoxon(evaluate.Comparison):
58 | def _info(self):
59 | return evaluate.ComparisonInfo(
60 | module_type="comparison",
61 | description=_DESCRIPTION,
62 | citation=_CITATION,
63 | inputs_description=_KWARGS_DESCRIPTION,
64 | features=datasets.Features(
65 | {
66 | "predictions1": datasets.Value("float"),
67 | "predictions2": datasets.Value("float"),
68 | }
69 | ),
70 | )
71 |
72 | def _compute(self, predictions1, predictions2):
73 | # calculate difference
74 | d = [p1 - p2 for (p1, p2) in zip(predictions1, predictions2)]
75 |
76 | # compute statistic
77 | res = wilcoxon(d)
78 | return {"stat": res.statistic, "p": res.pvalue}
79 |
--------------------------------------------------------------------------------
/docs/source/_toctree.yml:
--------------------------------------------------------------------------------
1 | - sections:
2 | - local: index
3 | title: 🤗 Evaluate
4 | title: Get started
5 | - sections:
6 | - local: installation
7 | title: Installation
8 | - local: a_quick_tour
9 | title: A quick tour
10 | title: Tutorials
11 | - sections:
12 | - local: choosing_a_metric
13 | title: Choosing the right metric
14 | - local: creating_and_sharing
15 | title: Adding new evaluations
16 | - local: base_evaluator
17 | title: Using the evaluator
18 | - local: custom_evaluator
19 | title: Using the evaluator with custom pipelines
20 | - local: evaluation_suite
21 | title: Creating an EvaluationSuite
22 | - sections:
23 | - local: transformers_integrations
24 | title: Transformers
25 | - local: keras_integrations
26 | title: Keras and Tensorflow
27 | - local: sklearn_integrations
28 | title: scikit-learn
29 | title: Using 🤗 Evaluate with other ML frameworks
30 | title: "How-to guides"
31 | - sections:
32 | - local: types_of_evaluations
33 | title: Types of evaluations
34 | - local: considerations
35 | title: Considerations for model evaluation
36 | title: "Conceptual guides"
37 | - sections:
38 | - local: package_reference/main_classes
39 | title: Main classes
40 | - local: package_reference/loading_methods
41 | title: Loading methods
42 | - local: package_reference/saving_methods
43 | title: Saving methods
44 | - local: package_reference/hub_methods
45 | title: Hub methods
46 | - local: package_reference/evaluator_classes
47 | title: Evaluator classes
48 | - local: package_reference/visualization_methods
49 | title: Visualization methods
50 | - local: package_reference/logging_methods
51 | title: Logging methods
52 | title: "Reference"
53 |
--------------------------------------------------------------------------------
/docs/source/evaluation_suite.mdx:
--------------------------------------------------------------------------------
1 | # Creating an EvaluationSuite
2 |
3 | It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. Assessing the model on several types of tasks can reveal gaps in performance along some axis. For example, when training a language model, it is often useful to measure perplexity on an in-domain corpus, but also to concurrently evaluate on tasks which test for general language capabilities like natural language entailment or question-answering, or tasks designed to probe the model along fairness and bias dimensions.
4 |
5 | The `EvaluationSuite` provides a way to compose any number of ([evaluator](base_evaluator), dataset, metric) tuples as a SubTask to evaluate a model on a collection of several evaluation tasks. See the [evaluator documentation](base_evaluator) for a list of currently supported tasks.
6 |
7 | A new `EvaluationSuite` is made up of a list of `SubTask` classes, each defining an evaluation task. The Python file containing the definition can be uploaded to a Space on the Hugging Face Hub so it can be shared with the community or saved/loaded locally as a Python script.
8 |
9 | Some datasets require additional preprocessing before passing them to an `Evaluator`. You can set a `data_preprocessor` for each `SubTask` which is applied via a `map` operation using the `datasets` library. Keyword arguments for the `Evaluator` can be passed down through the `args_for_task` attribute.
10 |
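For example, a preprocessor that lowercases a text column can be attached to a single task (a minimal sketch; the dataset and column names are placeholders):

```python
from evaluate.evaluation_suite import SubTask

# hypothetical subtask: lowercase the "text" column before evaluation
lowercase_task = SubTask(
    task_type="text-classification",
    data="imdb",
    split="test[:10]",
    data_preprocessor=lambda x: {"text": x["text"].lower()},
    args_for_task={"metric": "accuracy"},
)
```
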
11 | To create a new `EvaluationSuite`, create a [new Space](https://huggingface.co/new-space) with a .py file which matches the name of the Space, add the below template to a Python file, and fill in the attributes for a new task.
12 |
13 | The mandatory attributes for a new `SubTask` are `task_type` and `data`.
14 | 1. [`task_type`] maps to the tasks currently supported by the Evaluator.
15 | 2. [`data`] can be an instantiated Hugging Face dataset object or the name of a dataset.
16 | 3. [`subset`] and [`split`] can be used to define which name and split of the dataset should be used for evaluation.
17 | 4. [`args_for_task`] should be a dictionary with kwargs to be passed to the Evaluator.
18 |
19 | ```python
20 | import evaluate
21 | from evaluate.evaluation_suite import SubTask
22 |
23 | class Suite(evaluate.EvaluationSuite):
24 |
25 | def __init__(self, name):
26 | super().__init__(name)
27 | self.preprocessor = lambda x: {"text": x["text"].lower()}
28 | self.suite = [
29 | SubTask(
30 | task_type="text-classification",
31 | data="glue",
32 | subset="sst2",
33 | split="validation[:10]",
34 | args_for_task={
35 | "metric": "accuracy",
36 | "input_column": "sentence",
37 | "label_column": "label",
38 | "label_mapping": {
39 | "LABEL_0": 0.0,
40 | "LABEL_1": 1.0
41 | }
42 | }
43 | ),
44 | SubTask(
45 | task_type="text-classification",
46 | data="glue",
47 | subset="rte",
48 | split="validation[:10]",
49 | args_for_task={
50 | "metric": "accuracy",
51 | "input_column": "sentence1",
52 | "second_input_column": "sentence2",
53 | "label_column": "label",
54 | "label_mapping": {
55 | "LABEL_0": 0,
56 | "LABEL_1": 1
57 | }
58 | }
59 | )
60 | ]
61 | ```
62 |
63 | An `EvaluationSuite` can be loaded by name from the Hugging Face Hub, or locally by providing a path, and run with the `run(model_or_pipeline)` method. The evaluation results are returned along with their task names and information about the time it took to obtain predictions through the pipeline. These can be easily displayed with a `pandas.DataFrame`:
64 |
65 | ```
66 | >>> from evaluate import EvaluationSuite
67 | >>> suite = EvaluationSuite.load('mathemakitten/glue-evaluation-suite')
68 | >>> results = suite.run("gpt2")
69 | ```
70 |
71 | | accuracy | total_time_in_seconds | samples_per_second | latency_in_seconds | task_name |
72 | |-----------:|------------------------:|---------------------:|---------------------:|:------------|
73 | | 0.5 | 0.740811 | 13.4987 | 0.0740811 | glue/sst2 |
74 | | 0.4 | 1.67552 | 5.9683 | 0.167552 | glue/rte |
75 |
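For instance, the list of per-task result dictionaries returned by `run` can be rendered as a table like the one above (a small sketch, assuming `pandas` is installed and `suite` is loaded as shown above):

```python
import pandas as pd

results = suite.run("gpt2")  # one result dictionary per SubTask
print(pd.DataFrame(results))
```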
--------------------------------------------------------------------------------
/docs/source/index.mdx:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | # 🤗 Evaluate
8 |
9 | A library for easily evaluating machine learning models and datasets.
10 |
11 | With a single line of code, you get access to dozens of evaluation methods for different domains (NLP, Computer Vision, Reinforcement Learning, and more!). Be it on your local machine or in a distributed training setup, you can evaluate your models in a consistent and reproducible way!
12 |
13 | Visit the 🤗 Evaluate [organization](https://huggingface.co/evaluate-metric) for a full list of available metrics. Each metric has a dedicated Space with an interactive demo for how to use the metric, and a documentation card detailing the metric's limitations and usage.
14 |
15 | > **Tip:** For more recent evaluation approaches, for example for evaluating LLMs, we recommend our newer and more actively maintained library [LightEval](https://github.com/huggingface/lighteval).
16 |
17 |
37 |
--------------------------------------------------------------------------------
/docs/source/installation.mdx:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | Before you start, you will need to set up your environment and install the appropriate packages. 🤗 Evaluate is tested on **Python 3.7+**.
4 |
5 | ## Virtual environment
6 |
7 | You should install 🤗 Evaluate in a [virtual environment](https://docs.python.org/3/library/venv.html) to keep everything neat and tidy.
8 |
9 | 1. Create and navigate to your project directory:
10 |
11 | ```bash
12 | mkdir ~/my-project
13 | cd ~/my-project
14 | ```
15 |
16 | 2. Start a virtual environment inside the directory:
17 |
18 | ```bash
19 | python -m venv .env
20 | ```
21 |
22 | 3. Activate and deactivate the virtual environment with the following commands:
23 |
24 | ```bash
25 | # Activate the virtual environment
26 | source .env/bin/activate
27 |
28 | # Deactivate the virtual environment
29 | deactivate
30 | ```
31 |
32 | Once you have created your virtual environment, you can install 🤗 Evaluate in it.
33 |
34 | ## pip
35 |
36 | The most straightforward way to install 🤗 Evaluate is with pip:
37 |
38 | ```bash
39 | pip install evaluate
40 | ```
41 |
42 | Run the following command to check if 🤗 Evaluate has been properly installed:
43 |
44 | ```bash
45 | python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))"
46 | ```
47 |
48 | This should return:
49 |
50 | ```bash
51 | {'exact_match': 1.0}
52 | ```
53 |
54 | ## source
55 |
56 | Building 🤗 Evaluate from source lets you make changes to the code base. To install from source, clone the repository and install with the following commands:
57 |
58 | ```bash
59 | git clone https://github.com/huggingface/evaluate.git
60 | cd evaluate
61 | pip install -e .
62 | ```
63 |
64 | Again, you can check if 🤗 Evaluate has been properly installed with:
65 |
66 | ```bash
67 | python -c "import evaluate; print(evaluate.load('exact_match').compute(references=['hello'], predictions=['hello']))"
68 | ```
--------------------------------------------------------------------------------
/docs/source/keras_integrations.md:
--------------------------------------------------------------------------------
1 | # Working with Keras and Tensorflow
2 |
3 |
4 |
5 | Evaluate can be easily integrated into your Keras and Tensorflow workflow. We'll demonstrate two ways of incorporating Evaluate into model training, using the Fashion MNIST example dataset. We'll train a standard classifier to predict two classes from this dataset, and show how to use a metric as a callback during training or afterwards for evaluation.
6 |
7 |
8 | ```python
9 | import numpy as np
10 | from tensorflow import keras
11 | from tensorflow.keras import layers
12 | import evaluate
13 |
14 | # We pull example code from Keras.io's guide on classifying with MNIST
15 | # Located here: https://keras.io/examples/vision/mnist_convnet/
16 |
17 | # Model / data parameters
18 | input_shape = (28, 28, 1)
19 |
20 | # Load the data and split it between train and test sets
21 | (x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data()
22 |
23 |
24 | # Only select tshirts/tops and trousers, classes 0 and 1
25 | def get_tshirts_tops_and_trouser(x_vals, y_vals):
26 | mask = np.where((y_vals == 0) | (y_vals == 1))
27 | return x_vals[mask], y_vals[mask]
28 |
29 | x_train, y_train = get_tshirts_tops_and_trouser(x_train, y_train)
30 | x_test, y_test = get_tshirts_tops_and_trouser(x_test, y_test)
31 |
32 |
33 | # Scale images to the [0, 1] range
34 | x_train = x_train.astype("float32") / 255
35 | x_test = x_test.astype("float32") / 255
36 |
37 | x_train = np.expand_dims(x_train, -1)
38 | x_test = np.expand_dims(x_test, -1)
39 |
40 |
41 | model = keras.Sequential(
42 | [
43 | keras.Input(shape=input_shape),
44 | layers.Conv2D(32, kernel_size=(3, 3), activation="relu"),
45 | layers.MaxPooling2D(pool_size=(2, 2)),
46 | layers.Conv2D(64, kernel_size=(3, 3), activation="relu"),
47 | layers.MaxPooling2D(pool_size=(2, 2)),
48 | layers.Flatten(),
49 | layers.Dropout(0.5),
50 | layers.Dense(1, activation="sigmoid"),
51 | ]
52 | )
53 | ```
54 |
55 | ## Callbacks
56 |
57 | Suppose we want to keep track of model metrics while a model is training. We can use a Callback in order to calculate this metric during training, after an epoch ends.
58 |
59 | We'll define a callback here that will take a metric name and our training data, and have it calculate a metric after the epoch ends.
60 |
61 |
62 | ```python
63 | class MetricsCallback(keras.callbacks.Callback):
64 |
65 | def __init__(self, metric_name, x_data, y_data) -> None:
66 | super(MetricsCallback, self).__init__()
67 |
68 | self.x_data = x_data
69 | self.y_data = y_data
70 | self.metric_name = metric_name
71 | self.metric = evaluate.load(metric_name)
72 |
73 | def on_epoch_end(self, epoch, logs=dict()):
74 | m = self.model
75 | # Ensure we get labels of "1" or "0"
76 | training_preds = np.round(m.predict(self.x_data))
77 | training_labels = self.y_data
78 |
79 | # Compute score and save
80 | score = self.metric.compute(predictions = training_preds, references = training_labels)
81 |
82 | logs.update(score)
83 | ```
84 |
85 | We can pass this class to the `callbacks` keyword-argument to use it during training:
86 |
87 |
88 | ```python
89 | batch_size = 128
90 | epochs = 2
91 |
92 | model.compile(loss="binary_crossentropy", optimizer="adam")
93 |
94 | model_history = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1,
95 | callbacks = [MetricsCallback(x_data = x_train, y_data = y_train, metric_name = "accuracy")])
96 | ```
97 |
98 | ## Using an Evaluate Metric for... Evaluation!
99 |
100 | We can also use the same metric after model training! Here, we show how to check accuracy of the model after training on the test set:
101 |
102 |
103 | ```python
104 | acc = evaluate.load("accuracy")
105 | # Round the predictions to turn them into "0" or "1" labels
106 | test_preds = np.round(model.predict(x_test))
107 | test_labels = y_test
108 | ```
109 |
110 | ```python
111 | print("Test accuracy is : ", acc.compute(predictions = test_preds, references = test_labels))
112 | # Test accuracy is : {'accuracy': 0.9855}
113 | ```
114 |
--------------------------------------------------------------------------------
/docs/source/package_reference/evaluator_classes.mdx:
--------------------------------------------------------------------------------
1 | # Evaluator
2 |
3 | The evaluator classes for automatic evaluation.
4 |
5 | ## Evaluator classes
6 |
7 | The main entry point for using the evaluator:
8 |
9 | [[autodoc]] evaluate.evaluator
10 |
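For example, a task evaluator can be created and run end to end (a minimal sketch; the model, dataset, and metric names are placeholders):

```python
from evaluate import evaluator

task_evaluator = evaluator("text-classification")
results = task_evaluator.compute(
    model_or_pipeline="distilbert-base-uncased-finetuned-sst-2-english",
    data="glue",
    subset="sst2",
    split="validation[:40]",
    metric="accuracy",
    label_mapping={"NEGATIVE": 0, "POSITIVE": 1},
)
```
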
11 | The base class for all evaluator classes:
12 |
13 | [[autodoc]] evaluate.Evaluator
14 |
15 | ## The task specific evaluators
16 |
17 | ### ImageClassificationEvaluator
18 |
19 | [[autodoc]] evaluate.ImageClassificationEvaluator
20 |
21 | ### QuestionAnsweringEvaluator
22 |
23 | [[autodoc]] evaluate.QuestionAnsweringEvaluator
24 | - compute
25 |
26 | ### TextClassificationEvaluator
27 |
28 | [[autodoc]] evaluate.TextClassificationEvaluator
29 |
30 | ### TokenClassificationEvaluator
31 |
32 | [[autodoc]] evaluate.TokenClassificationEvaluator
33 | - compute
34 |
35 | ### TextGenerationEvaluator
36 |
37 | [[autodoc]] evaluate.TextGenerationEvaluator
38 | - compute
39 |
40 | ### Text2TextGenerationEvaluator
41 |
42 | [[autodoc]] evaluate.Text2TextGenerationEvaluator
43 | - compute
44 |
45 | ### SummarizationEvaluator
46 |
47 | [[autodoc]] evaluate.SummarizationEvaluator
48 | - compute
49 |
50 | ### TranslationEvaluator
51 |
52 | [[autodoc]] evaluate.TranslationEvaluator
53 | - compute
54 |
55 | ### AutomaticSpeechRecognitionEvaluator
56 |
57 | [[autodoc]] evaluate.AutomaticSpeechRecognitionEvaluator
58 | - compute
59 |
60 | ### AudioClassificationEvaluator
61 |
62 | [[autodoc]] evaluate.AudioClassificationEvaluator
63 | - compute
--------------------------------------------------------------------------------
/docs/source/package_reference/hub_methods.mdx:
--------------------------------------------------------------------------------
1 | # Hub methods
2 |
3 | Methods for using the Hugging Face Hub:
4 |
5 | ## Push to hub
6 |
7 | [[autodoc]] evaluate.push_to_hub
8 |
9 |
--------------------------------------------------------------------------------
/docs/source/package_reference/loading_methods.mdx:
--------------------------------------------------------------------------------
1 | # Loading methods
2 |
3 | Methods for listing and loading evaluation modules:
4 |
5 | ## List
6 |
7 | [[autodoc]] evaluate.list_evaluation_modules
8 |
9 | ## Load
10 |
11 | [[autodoc]] evaluate.load
12 |
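For example, the two methods above can be used together to discover and load modules (a short sketch):

```python
import evaluate

# list the available comparison modules
print(evaluate.list_evaluation_modules(module_type="comparison"))

# load a metric and a measurement
accuracy = evaluate.load("accuracy")
word_length = evaluate.load("word_length", module_type="measurement")
```
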
--------------------------------------------------------------------------------
/docs/source/package_reference/logging_methods.mdx:
--------------------------------------------------------------------------------
1 | # Logging methods
2 |
3 | 🤗 Evaluate strives to be transparent and explicit about how it works, but this can be quite verbose at times. We have included a series of logging methods which allow you to easily adjust the level of verbosity of the entire library. Currently the default verbosity of the library is set to `WARNING`.
4 |
5 | To change the level of verbosity, use one of the direct setters. For instance, here is how to change the verbosity to the `INFO` level:
6 |
7 | ```py
8 | import evaluate
9 | evaluate.logging.set_verbosity_info()
10 | ```
11 |
12 | You can also use the environment variable `EVALUATE_VERBOSITY` to override the default verbosity, and set it to one of the following: `debug`, `info`, `warning`, `error`, `critical`:
13 |
14 | ```bash
15 | EVALUATE_VERBOSITY=error ./myprogram.py
16 | ```
17 |
18 | All the methods of this logging module are documented below. The main ones are:
19 |
20 | - [`logging.get_verbosity`] to get the current level of verbosity in the logger
21 | - [`logging.set_verbosity`] to set the verbosity to the level of your choice
22 |
23 | In order from the least to the most verbose (with their corresponding `int` values):
24 |
25 | 1. `logging.CRITICAL` or `logging.FATAL` (int value, 50): only report the most critical errors.
26 | 2. `logging.ERROR` (int value, 40): only report errors.
27 | 3. `logging.WARNING` or `logging.WARN` (int value, 30): only reports errors and warnings. This is the default level used by the library.
28 | 4. `logging.INFO` (int value, 20): reports errors, warnings, and basic information.
29 | 5. `logging.DEBUG` (int value, 10): reports all information.
30 |
31 | By default, `tqdm` progress bars are displayed while evaluation modules are downloaded and processed. [`logging.disable_progress_bar`] and [`logging.enable_progress_bar`] can be used to disable or re-enable this behavior.
32 |
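For example:

```py
import evaluate

evaluate.logging.disable_progress_bar()  # hide tqdm progress bars
evaluate.logging.enable_progress_bar()   # show them again
```
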
33 | ## Functions
34 |
35 | [[autodoc]] evaluate.logging.get_verbosity
36 |
37 | [[autodoc]] evaluate.logging.set_verbosity
38 |
39 | [[autodoc]] evaluate.logging.set_verbosity_info
40 |
41 | [[autodoc]] evaluate.logging.set_verbosity_warning
42 |
43 | [[autodoc]] evaluate.logging.set_verbosity_debug
44 |
45 | [[autodoc]] evaluate.logging.set_verbosity_error
46 |
47 | [[autodoc]] evaluate.logging.disable_propagation
48 |
49 | [[autodoc]] evaluate.logging.enable_propagation
50 |
51 | [[autodoc]] evaluate.logging.get_logger
52 |
53 | [[autodoc]] evaluate.logging.enable_progress_bar
54 |
55 | [[autodoc]] evaluate.logging.disable_progress_bar
56 |
57 | ## Levels
58 |
59 | ### evaluate.logging.CRITICAL
60 |
61 | evaluate.logging.CRITICAL = 50
62 |
63 | ### evaluate.logging.DEBUG
64 |
65 | evaluate.logging.DEBUG = 10
66 |
67 | ### evaluate.logging.ERROR
68 |
69 | evaluate.logging.ERROR = 40
70 |
71 | ### evaluate.logging.FATAL
72 |
73 | evaluate.logging.FATAL = 50
74 |
75 | ### evaluate.logging.INFO
76 |
77 | evaluate.logging.INFO = 20
78 |
79 | ### evaluate.logging.NOTSET
80 |
81 | evaluate.logging.NOTSET = 0
82 |
83 | ### evaluate.logging.WARN
84 |
85 | evaluate.logging.WARN = 30
86 |
87 | ### evaluate.logging.WARNING
88 |
89 | evaluate.logging.WARNING = 30
90 |
--------------------------------------------------------------------------------
/docs/source/package_reference/main_classes.mdx:
--------------------------------------------------------------------------------
1 | # Main classes
2 |
3 | ## EvaluationModuleInfo
4 |
5 | The base class `EvaluationModuleInfo` implements the logic for the subclasses `MetricInfo`, `ComparisonInfo`, and `MeasurementInfo`.
6 |
7 | [[autodoc]] evaluate.EvaluationModuleInfo
8 |
9 | [[autodoc]] evaluate.MetricInfo
10 |
11 | [[autodoc]] evaluate.ComparisonInfo
12 |
13 | [[autodoc]] evaluate.MeasurementInfo
14 |
15 | ## EvaluationModule
16 |
17 | The base class `EvaluationModule` implements the logic for the subclasses `Metric`, `Comparison`, and `Measurement`.
18 |
19 | [[autodoc]] evaluate.EvaluationModule
20 |
21 | [[autodoc]] evaluate.Metric
22 |
23 | [[autodoc]] evaluate.Comparison
24 |
25 | [[autodoc]] evaluate.Measurement
26 |
27 | ## CombinedEvaluations
28 |
29 | The `combine` function lets you combine multiple `EvaluationModule`s into a single `CombinedEvaluations`.
30 |
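For example, several classification metrics can be bundled and computed in one call (a brief sketch):

```python
import evaluate

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])
clf_metrics.compute(predictions=[0, 1, 0], references=[0, 1, 1])
```
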
31 | [[autodoc]] evaluate.combine
32 |
33 | [[autodoc]] CombinedEvaluations
34 |
--------------------------------------------------------------------------------
/docs/source/package_reference/saving_methods.mdx:
--------------------------------------------------------------------------------
1 | # Saving methods
2 |
3 | Methods for saving evaluations results:
4 |
5 | ## Save
6 |
7 | [[autodoc]] evaluate.save
8 |
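For example, a result dictionary can be stored on disk together with basic run information (a minimal sketch):

```python
import evaluate

result = {"accuracy": 0.9}
evaluate.save("./results/", experiment="run 42", **result)
```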
9 |
--------------------------------------------------------------------------------
/docs/source/package_reference/visualization_methods.mdx:
--------------------------------------------------------------------------------
1 | # Visualization methods
2 |
3 | Methods for visualizing evaluations results:
4 |
5 | ## Radar Plot
6 |
7 | [[autodoc]] evaluate.visualization.radar_plot
8 |
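For example (a small sketch with made-up scores):

```python
from evaluate.visualization import radar_plot

data = [
    {"accuracy": 0.9, "precision": 0.8, "f1": 0.85, "latency_in_seconds": 0.6},
    {"accuracy": 0.7, "precision": 0.9, "f1": 0.78, "latency_in_seconds": 0.2},
]
plot = radar_plot(data=data, model_names=["model_a", "model_b"])
plot.show()
```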
--------------------------------------------------------------------------------
/docs/source/sklearn_integrations.mdx:
--------------------------------------------------------------------------------
1 | # Scikit-Learn
2 |
3 | To run the scikit-learn examples, make sure you have installed the following library:
4 |
5 | ```bash
6 | pip install -U scikit-learn
7 | ```
8 |
9 | The metrics in `evaluate` can be easily integrated with an Scikit-Learn estimator or [pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html#sklearn.pipeline.Pipeline).
10 |
However, these metrics require that we first generate the predictions from the model. The predictions and labels from the estimators can then be passed to `evaluate` metrics to compute the required values.
12 |
13 | ```python
14 | import numpy as np
15 | np.random.seed(0)
16 | import evaluate
17 | from sklearn.compose import ColumnTransformer
18 | from sklearn.datasets import fetch_openml
19 | from sklearn.pipeline import Pipeline
20 | from sklearn.impute import SimpleImputer
21 | from sklearn.preprocessing import StandardScaler, OneHotEncoder
22 | from sklearn.linear_model import LogisticRegression
23 | from sklearn.model_selection import train_test_split
24 | ```
25 |
26 | Load data from https://www.openml.org/d/40945:
27 |
28 | ```python
29 | X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)
30 | ```
31 |
Alternatively, `X` and `y` can be obtained directly from the `frame` attribute if the dataset is loaded as a single Bunch object (without `return_X_y=True`):

```python
titanic = fetch_openml("titanic", version=1, as_frame=True)
X = titanic.frame.drop('survived', axis=1)
y = titanic.frame['survived']
```
38 |
39 | We create the preprocessing pipelines for both numeric and categorical data. Note that pclass could either be treated as a categorical or numeric feature.
40 |
41 | ```python
42 | numeric_features = ["age", "fare"]
43 | numeric_transformer = Pipeline(
44 | steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
45 | )
46 |
47 | categorical_features = ["embarked", "sex", "pclass"]
48 | categorical_transformer = OneHotEncoder(handle_unknown="ignore")
49 |
50 | preprocessor = ColumnTransformer(
51 | transformers=[
52 | ("num", numeric_transformer, numeric_features),
53 | ("cat", categorical_transformer, categorical_features),
54 | ]
55 | )
56 | ```
57 |
58 | Append classifier to preprocessing pipeline. Now we have a full prediction pipeline.
59 |
60 | ```python
61 | clf = Pipeline(
62 | steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
63 | )
64 |
65 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
66 |
67 | clf.fit(X_train, y_train)
68 | y_pred = clf.predict(X_test)
69 | ```
70 |
71 | As `Evaluate` metrics use lists as inputs for references and predictions, we need to convert them to Python lists.
72 |
73 |
74 | ```python
75 | # Evaluate metrics accept lists as inputs for values of references and predictions
76 |
77 | y_test = y_test.tolist()
78 | y_pred = y_pred.tolist()
79 |
80 | # Accuracy
81 |
82 | accuracy_metric = evaluate.load("accuracy")
83 | accuracy = accuracy_metric.compute(references=y_test, predictions=y_pred)
84 | print("Accuracy:", accuracy)
# Accuracy: {'accuracy': 0.79}
86 | ```
87 |
88 | You can use any suitable `evaluate` metric with the estimators as long as they are compatible with the task and predictions.
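
For instance, a rough sketch of computing F1 on the same predictions (the labels are cast to integers here so that the default binary F1 settings apply cleanly):

```python
# Cast the labels to integers before passing them to the F1 metric
y_test_int = [int(label) for label in y_test]
y_pred_int = [int(label) for label in y_pred]

f1_metric = evaluate.load("f1")
f1 = f1_metric.compute(references=y_test_int, predictions=y_pred_int)
print("F1:", f1)
```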
89 |
--------------------------------------------------------------------------------
/docs/source/types_of_evaluations.mdx:
--------------------------------------------------------------------------------
1 | # Types of Evaluations in 🤗 Evaluate
2 |
3 | The goal of the 🤗 Evaluate library is to support different types of evaluation, depending on different goals, datasets and models.
4 |
5 | Here are the types of evaluations that are currently supported with a few examples for each:
6 |
7 | ## Metrics
8 | A metric measures the performance of a model on a given dataset. This is often based on an existing ground truth (i.e. a set of references), but there are also *referenceless metrics* which allow evaluating generated text by leveraging a pretrained model such as [GPT-2](https://huggingface.co/gpt2).
9 |
10 | Examples of metrics include:
- [Accuracy](https://huggingface.co/metrics/accuracy): the proportion of correct predictions among the total number of cases processed.
- [Exact Match](https://huggingface.co/metrics/exact_match): the rate at which the input predicted strings exactly match their references.
- [Mean Intersection over Union (IoU)](https://huggingface.co/metrics/mean_iou): the area of overlap between the predicted segmentation of an image and the ground truth divided by the area of union between the predicted segmentation and the ground truth.
14 |
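As a quick illustration, a metric is typically loaded by name and then called on predictions and references:

```python
import evaluate

accuracy = evaluate.load("accuracy")
print(accuracy.compute(references=[0, 1, 1], predictions=[0, 1, 0]))
# {'accuracy': 0.6666666666666666}
```
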
15 | Metrics are often used to track model performance on benchmark datasets, and to report progress on tasks such as [machine translation](https://huggingface.co/tasks/translation) and [image classification](https://huggingface.co/tasks/image-classification).
16 |
17 | ## Comparisons
18 |
19 | Comparisons can be useful to compare the performance of two or more models on a single test dataset.
20 |
For instance, the [McNemar Test](https://github.com/huggingface/evaluate/tree/main/comparisons/mcnemar) is a paired nonparametric statistical hypothesis test that takes the predictions of two models and compares them, aiming to measure whether the models' predictions diverge or not. The p value it outputs, which ranges from `0.0` to `1.0`, indicates the difference between the two models' predictions, with a lower p value indicating a more significant difference.
22 |
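A minimal sketch of running such a comparison (assuming the `mcnemar` module is loaded from the Hub as a comparison):

```python
import evaluate

mcnemar = evaluate.load("mcnemar", module_type="comparison")
results = mcnemar.compute(
    predictions1=[1, 0, 1, 1],  # predictions of model A
    predictions2=[1, 1, 1, 1],  # predictions of model B
    references=[1, 0, 1, 0],    # ground truth labels
)
print(results)  # a dictionary with the test statistic and the p value
```
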
Comparisons have yet to be used systematically when comparing and reporting model performance; however, they are useful tools for going beyond simply comparing leaderboard scores and for getting more information on the way model predictions differ.
24 |
25 | ## Measurements
26 |
27 | In the 🤗 Evaluate library, measurements are tools for gaining more insights on datasets and model predictions.
28 |
For instance, in the case of datasets, it can be useful to calculate the [average word length](https://github.com/huggingface/evaluate/tree/main/measurements/word_length) of a dataset's entries, and how it is distributed -- this can help when choosing the maximum input length for a [Tokenizer](https://huggingface.co/docs/transformers/main_classes/tokenizer).
30 |
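For example, a small sketch of the `word_length` measurement on a handful of strings:

```python
import evaluate

word_length = evaluate.load("word_length", module_type="measurement")
results = word_length.compute(data=["hello world", "foo bar foobar"])
print(results)  # {'average_word_length': ...}
```
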
31 | In the case of model predictions, it can help to calculate the average [perplexity](https://huggingface.co/metrics/perplexity) of model predictions using different models such as [GPT-2](https://huggingface.co/gpt2) and [BERT](https://huggingface.co/bert-base-uncased), which can indicate the quality of generated text when no reference is available.
32 |
33 | All three types of evaluation supported by the 🤗 Evaluate library are meant to be mutually complementary, and help our community carry out more mindful and responsible evaluation.
34 |
35 | We will continue adding more types of metrics, measurements and comparisons in coming months, and are counting on community involvement (via [PRs](https://github.com/huggingface/evaluate/compare) and [issues](https://github.com/huggingface/evaluate/issues/new/choose)) to make the library as extensive and inclusive as possible!
36 |
--------------------------------------------------------------------------------
/measurements/honest/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("honest", "en")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/honest/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | transformers
3 | unidecode==1.3.4
4 | torch
5 |
--------------------------------------------------------------------------------
/measurements/label_distribution/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Label Distribution
3 | emoji: 🤗
4 | colorFrom: green
5 | colorTo: purple
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - measurement
13 | description: >-
14 | Returns the label distribution and skew of the input data.
15 | ---
16 |
17 | # Measurement Card for Label Distribution
18 |
19 | ## Measurement Description
The label distribution measurement returns the fraction of each label represented in the dataset.
21 |
22 | ## Intended Uses
23 |
Calculating the distribution of labels in a dataset allows you to see how balanced the labels in your dataset are, which
can help when choosing a relevant metric (e.g. accuracy when the dataset is balanced, versus F1 score when there is an
imbalance).
27 |
28 | ## How to Use
29 |
30 | The measurement takes a list of labels as input:
31 |
32 | ```python
33 | >>> distribution = evaluate.load("label_distribution")
34 | >>> data = [1, 0, 2, 2, 0, 0, 0, 0, 0, 2]
35 | >>> results = distribution.compute(data=data)
36 | ```
37 |
38 | ### Inputs
39 | - **data** (`list`): a list of integers or strings containing the data labels.
40 |
41 | ### Output Values
By default, this measurement outputs a dictionary that contains:
- **label_distribution** (`dict`): a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label.
- **label_skew** (`scalar`): the asymmetry of the label distribution.
45 |
46 | ```python
47 | {'label_distribution': {'labels': [1, 0, 2], 'fractions': [0.1, 0.6, 0.3]}, 'label_skew': 0.7417688338666573}
48 | ```
49 |
50 | If skewness is 0, the dataset is perfectly balanced; if it is less than -1 or greater than 1, the distribution is highly skewed; anything in between can be considered moderately skewed.
51 |
52 | #### Values from Popular Papers
53 |
54 |
55 | ### Examples
56 | Calculating the label distribution of a dataset with binary labels:
57 |
58 | ```python
59 | >>> data = [1, 0, 1, 1, 0, 1, 0]
60 | >>> distribution = evaluate.load("label_distribution")
61 | >>> results = distribution.compute(data=data)
62 | >>> print(results)
{'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_skew': -0.2886751345948127}
64 | ```
65 |
66 | Calculating the label distribution of the test subset of the [IMDb dataset](https://huggingface.co/datasets/imdb):
67 | ```python
68 | >>> from datasets import load_dataset
69 | >>> imdb = load_dataset('imdb', split = 'test')
70 | >>> distribution = evaluate.load("label_distribution")
71 | >>> results = distribution.compute(data=imdb['label'])
72 | >>> print(results)
73 | {'label_distribution': {'labels': [0, 1], 'fractions': [0.5, 0.5]}, 'label_skew': 0.0}
74 | ```
75 | N.B. The IMDb dataset is perfectly balanced.
76 |
77 | The output of the measurement can easily be passed to matplotlib to plot a histogram of each label:
78 |
79 | ```python
>>> import matplotlib.pyplot as plt
>>> data = [1, 0, 2, 2, 0, 0, 0, 0, 0, 2]
81 | >>> distribution = evaluate.load("label_distribution")
82 | >>> results = distribution.compute(data=data)
83 | >>> plt.bar(results['label_distribution']['labels'], results['label_distribution']['fractions'])
84 | >>> plt.show()
85 | ```
86 |
87 | ## Limitations and Bias
While the label distribution can be a useful signal for analyzing datasets and choosing metrics for measuring model performance, it is best accompanied by additional data exploration to better understand each subset of the dataset and how the subsets differ.
89 |
90 | ## Citation
91 |
92 | ## Further References
93 | - [Facing Imbalanced Data Recommendations for the Use of Performance Metrics](https://sites.pitt.edu/~jeffcohn/skew/PID2829477.pdf)
94 | - [Scipy Stats Skew Documentation](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html#scipy-stats-skew)
95 |
--------------------------------------------------------------------------------
/measurements/label_distribution/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("label_distribution", module_type="measurement")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/label_distribution/label_distribution.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Label Distribution Measurement."""
15 |
16 | from collections import Counter
17 |
18 | import datasets
19 | import pandas as pd
20 | from scipy import stats
21 |
22 | import evaluate
23 |
24 |
25 | _DESCRIPTION = """
26 | Returns the label ratios of the dataset labels, as well as a scalar for skewness.
27 | """
28 |
29 | _KWARGS_DESCRIPTION = """
30 | Args:
31 | `data`: a list containing the data labels
32 |
33 | Returns:
34 | `label_distribution` (`dict`) : a dictionary containing two sets of keys and values: `labels`, which includes the list of labels contained in the dataset, and `fractions`, which includes the fraction of each label.
35 | `label_skew` (`scalar`) : the asymmetry of the label distribution.
36 | Examples:
37 | >>> data = [1, 0, 1, 1, 0, 1, 0]
38 | >>> distribution = evaluate.load("label_distribution")
39 | >>> results = distribution.compute(data=data)
40 | >>> print(results)
41 | {'label_distribution': {'labels': [1, 0], 'fractions': [0.5714285714285714, 0.42857142857142855]}, 'label_skew': -0.2886751345948127}
42 | """
43 |
44 | _CITATION = """\
45 | @ARTICLE{2020SciPy-NMeth,
46 | author = {Virtanen, Pauli and Gommers, Ralf and Oliphant, Travis E. and
47 | Haberland, Matt and Reddy, Tyler and Cournapeau, David and
48 | Burovski, Evgeni and Peterson, Pearu and Weckesser, Warren and
49 | Bright, Jonathan and {van der Walt}, St{\'e}fan J. and
50 | Brett, Matthew and Wilson, Joshua and Millman, K. Jarrod and
51 | Mayorov, Nikolay and Nelson, Andrew R. J. and Jones, Eric and
52 | Kern, Robert and Larson, Eric and Carey, C J and
53 | Polat, {\.I}lhan and Feng, Yu and Moore, Eric W. and
54 | {VanderPlas}, Jake and Laxalde, Denis and Perktold, Josef and
55 | Cimrman, Robert and Henriksen, Ian and Quintero, E. A. and
56 | Harris, Charles R. and Archibald, Anne M. and
57 | Ribeiro, Ant{\^o}nio H. and Pedregosa, Fabian and
58 | {van Mulbregt}, Paul and {SciPy 1.0 Contributors}},
59 | title = {{{SciPy} 1.0: Fundamental Algorithms for Scientific
60 | Computing in Python}},
61 | journal = {Nature Methods},
62 | year = {2020},
63 | volume = {17},
64 | pages = {261--272},
65 | adsurl = {https://rdcu.be/b08Wh},
66 | doi = {10.1038/s41592-019-0686-2},
67 | }
68 | """
69 |
70 |
71 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
72 | class LabelDistribution(evaluate.Measurement):
73 | def _info(self):
74 | return evaluate.MeasurementInfo(
75 | module_type="measurement",
76 | description=_DESCRIPTION,
77 | citation=_CITATION,
78 | inputs_description=_KWARGS_DESCRIPTION,
79 | features=[
80 | datasets.Features({"data": datasets.Value("int32")}),
81 | datasets.Features({"data": datasets.Value("string")}),
82 | ],
83 | )
84 |
85 | def _compute(self, data):
86 | """Returns the fraction of each label present in the data"""
87 | c = Counter(data)
88 | label_distribution = {"labels": [k for k in c.keys()], "fractions": [f / len(data) for f in c.values()]}
89 | if isinstance(data[0], str):
90 | label2id = {label: id for id, label in enumerate(label_distribution["labels"])}
91 | data = [label2id[d] for d in data]
92 | skew = stats.skew(data)
93 | return {"label_distribution": label_distribution, "label_skew": skew}
94 |
--------------------------------------------------------------------------------
/measurements/label_distribution/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
3 |
--------------------------------------------------------------------------------
/measurements/perplexity/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("perplexity", module_type="measurement")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/perplexity/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | torch
3 | transformers
--------------------------------------------------------------------------------
/measurements/regard/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("regard")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/regard/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
2 | transformers
3 | torch
4 |
--------------------------------------------------------------------------------
/measurements/text_duplicates/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Text Duplicates
3 | emoji: 🤗
4 | colorFrom: green
5 | colorTo: purple
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - measurement
13 | description: >-
  Returns the fraction of duplicated strings in the input.
15 | ---
16 |
17 | # Measurement Card for Text Duplicates
18 |
19 | ## Measurement Description
20 |
21 | The `text_duplicates` measurement returns the fraction of duplicated strings in the input data.
22 |
23 | ## How to Use
24 |
25 | This measurement requires a list of strings as input:
26 |
27 | ```python
28 | >>> data = ["hello sun","hello moon", "hello sun"]
29 | >>> duplicates = evaluate.load("text_duplicates")
30 | >>> results = duplicates.compute(data=data)
31 | ```
32 |
33 | ### Inputs
34 | - **data** (list of `str`): The input list of strings for which the duplicates are calculated.
35 |
36 | ### Output Values
- **duplicate_fraction** (`float`): the fraction of duplicates in the input string(s).
- **duplicates_dict** (`dict`): (optional) a dictionary mapping each duplicated string to the number of times it is repeated.
39 |
By default, this measurement outputs a dictionary containing the fraction of duplicates in the input string(s) (`duplicate_fraction`):
42 | ```python
43 | {'duplicate_fraction': 0.33333333333333337}
44 | ```
45 |
With the `list_duplicates=True` option, this measurement will also output a dictionary mapping the duplicated strings to their counts.
47 |
48 | ```python
49 | {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
50 | ```
51 |
Warning: the `list_duplicates=True` option can be memory-intensive for large datasets.
53 |
54 | ### Examples
55 |
Example with no duplicates:
57 |
58 | ```python
59 | >>> data = ["foo", "bar", "foobar"]
60 | >>> duplicates = evaluate.load("text_duplicates")
61 | >>> results = duplicates.compute(data=data)
62 | >>> print(results)
63 | {'duplicate_fraction': 0.0}
64 | ```
65 |
66 | Example with multiple duplicates and `list_duplicates=True`:
67 | ```python
68 | >>> data = ["hello sun", "goodbye moon", "hello sun", "foo bar", "foo bar"]
69 | >>> duplicates = evaluate.load("text_duplicates")
70 | >>> results = duplicates.compute(data=data, list_duplicates=True)
71 | >>> print(results)
72 | {'duplicate_fraction': 0.4, 'duplicates_dict': {'hello sun': 2, 'foo bar': 2}}
73 | ```
74 |
75 | ## Citation(s)
76 |
77 |
78 | ## Further References
79 | - [`hashlib` library](https://docs.python.org/3/library/hashlib.html)
80 |
--------------------------------------------------------------------------------
/measurements/text_duplicates/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("text_duplicates")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/text_duplicates/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
2 |
--------------------------------------------------------------------------------
/measurements/text_duplicates/text_duplicates.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import hashlib
16 | from collections import Counter
17 |
18 | import datasets
19 |
20 | import evaluate
21 |
22 |
23 | logger = evaluate.logging.get_logger(__name__)
24 |
25 | _DESCRIPTION = """
Returns the fraction of duplicated strings in the input.
27 | """
28 |
29 | _KWARGS_DESCRIPTION = """
30 | Args:
31 | `data`: a list of `str` to be checked for duplicates.
32 |
33 | Returns:
34 | `duplicate_fraction` (`float`) : the fraction of strings that are duplicated.
    `duplicates_dict` (`dict`) (optional) : a dictionary mapping each duplicated string to the number of times it is repeated.
36 |
37 | Examples:
38 | >>> data = ["hello sun","hello moon", "hello sun"]
39 | >>> duplicates = evaluate.load("text_duplicates")
40 | >>> results = duplicates.compute(data=data)
41 | >>> print(results)
42 | {'duplicate_fraction': 0.33333333333333337}
43 |
44 | >>> data = ["hello sun","hello moon", "hello sun"]
45 | >>> duplicates = evaluate.load("text_duplicates")
46 | >>> results = duplicates.compute(data=data, list_duplicates=True)
47 | >>> print(results)
48 | {'duplicate_fraction': 0.33333333333333337, 'duplicates_dict': {'hello sun': 2}}
49 | """
50 |
51 | # TODO: Add BibTeX citation
52 | _CITATION = ""
53 |
54 |
55 | def get_hash(example):
56 | """Get the hash of a string"""
57 | return hashlib.md5(example.strip().encode("utf-8")).hexdigest()
58 |
59 |
60 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61 | class TextDuplicates(evaluate.Measurement):
62 | """This measurement returns the duplicate strings contained in the input(s)."""
63 |
64 | def _info(self):
65 | # TODO: Specifies the evaluate.MeasurementInfo object
66 | return evaluate.MeasurementInfo(
67 | # This is the description that will appear on the modules page.
68 | module_type="measurement",
69 | description=_DESCRIPTION,
70 | citation=_CITATION,
71 | inputs_description=_KWARGS_DESCRIPTION,
72 | # This defines the format of each prediction and reference
73 | features=datasets.Features(
74 | {
75 | "data": datasets.Value("string"),
76 | }
77 | ),
78 | )
79 |
80 | def _compute(self, data, list_duplicates=False):
81 | """Returns the duplicates contained in the input data and the number of times they are repeated."""
        if list_duplicates:
83 | logger.warning("This functionality can be memory-intensive for large datasets!")
84 | n_dedup = len(set([get_hash(d) for d in data]))
85 | c = Counter(data)
86 | duplicates = {k: v for k, v in c.items() if v > 1}
87 | return {"duplicate_fraction": 1 - (n_dedup / len(data)), "duplicates_dict": duplicates}
88 | else:
89 | n_dedup = len(set([get_hash(d) for d in data]))
90 | return {"duplicate_fraction": 1 - (n_dedup / len(data))}
91 |
--------------------------------------------------------------------------------
/measurements/toxicity/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("toxicity")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/toxicity/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | transformers
3 | torch
4 |
--------------------------------------------------------------------------------
/measurements/word_count/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Word Count
3 | emoji: 🤗
4 | colorFrom: green
5 | colorTo: purple
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - measurement
13 | description: >-
14 | Returns the total number of words, and the number of unique words in the input data.
15 | ---
16 |
17 | # Measurement Card for Word Count
18 |
19 | ## Measurement Description
20 |
The `word_count` measurement returns the total number of words in the input string(s), as well as the number of unique words, using scikit-learn's [`CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html).
22 |
23 | ## How to Use
24 |
25 | This measurement requires a list of strings as input:
26 |
27 | ```python
28 | >>> data = ["hello world and hello moon"]
>>> wordcount = evaluate.load("word_count")
30 | >>> results = wordcount.compute(data=data)
31 | ```
32 |
33 | ### Inputs
- **data** (list of `str`): The input list of strings for which the words are counted.
35 | - **max_vocab** (`int`): (optional) the top number of words to consider (can be specified if dataset is too large)
36 |
37 | ### Output Values
38 | - **total_word_count** (`int`): the total number of words in the input string(s).
39 | - **unique_words** (`int`): the number of unique words in the input string(s).
40 |
41 | Output Example(s):
42 |
```python
{'total_word_count': 5, 'unique_words': 4}
```
46 |
47 | ### Examples
48 |
49 | Example for a single string
50 |
51 | ```python
52 | >>> data = ["hello sun and goodbye moon"]
53 | >>> wordcount = evaluate.load("word_count")
54 | >>> results = wordcount.compute(data=data)
55 | >>> print(results)
56 | {'total_word_count': 5, 'unique_words': 5}
57 | ```
58 |
Example for multiple strings:
60 | ```python
61 | >>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
62 | >>> wordcount = evaluate.load("word_count")
63 | >>> results = wordcount.compute(data=data)
64 | >>> print(results)
65 | {'total_word_count': 9, 'unique_words': 7}
66 | ```
67 |
68 | Example for a dataset from 🤗 Datasets:
69 |
70 | ```python
>>> import datasets
>>> imdb = datasets.load_dataset('imdb', split = 'train')
72 | >>> wordcount = evaluate.load("word_count")
73 | >>> results = wordcount.compute(data=imdb['text'])
74 | >>> print(results)
75 | {'total_word_count': 5678573, 'unique_words': 74849}
76 | ```
77 |
78 | ## Citation(s)
79 |
80 |
81 | ## Further References
82 | - [Sklearn `CountVectorizer`](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)
83 |
--------------------------------------------------------------------------------
/measurements/word_count/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("word_count")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/word_count/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
2 | scikit-learn~=0.0
3 |
--------------------------------------------------------------------------------
/measurements/word_count/word_count.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | import datasets
16 | from sklearn.feature_extraction.text import CountVectorizer
17 |
18 | import evaluate
19 |
20 |
21 | _DESCRIPTION = """
22 | Returns the total number of words, and the number of unique words in the input data.
23 | """
24 |
25 | _KWARGS_DESCRIPTION = """
26 | Args:
27 | `data`: a list of `str` for which the words are counted.
28 | `max_vocab` (optional): the top number of words to consider (can be specified if dataset is too large)
29 |
30 | Returns:
31 | `total_word_count` (`int`) : the total number of words in the input string(s)
32 | `unique_words` (`int`) : the number of unique words in the input list of strings.
33 |
34 | Examples:
35 | >>> data = ["hello world and hello moon"]
    >>> wordcount = evaluate.load("word_count")
37 | >>> results = wordcount.compute(data=data)
38 | >>> print(results)
39 | {'total_word_count': 5, 'unique_words': 4}
40 | """
41 | _CITATION = ""
42 |
43 |
44 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
45 | class WordCount(evaluate.Measurement):
46 | """This measurement returns the total number of words and the number of unique words
47 | in the input string(s)."""
48 |
49 | def _info(self):
50 | return evaluate.MeasurementInfo(
51 | # This is the description that will appear on the modules page.
52 | module_type="measurement",
53 | description=_DESCRIPTION,
54 | citation=_CITATION,
55 | inputs_description=_KWARGS_DESCRIPTION,
56 | features=datasets.Features(
57 | {
58 | "data": datasets.Value("string"),
59 | }
60 | ),
61 | )
62 |
63 | def _compute(self, data, max_vocab=None):
64 | """Returns the number of unique words in the input data"""
65 | count_vectorizer = CountVectorizer(max_features=max_vocab)
66 | document_matrix = count_vectorizer.fit_transform(data)
67 | word_count = document_matrix.sum()
68 | unique_words = document_matrix.shape[1]
69 | return {"total_word_count": word_count, "unique_words": unique_words}
70 |
--------------------------------------------------------------------------------
/measurements/word_length/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Word Length
3 | emoji: 🤗
4 | colorFrom: green
5 | colorTo: purple
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - measurement
13 | description: >-
14 | Returns the average length (in terms of the number of words) of the input data.
15 | ---
16 |
17 | # Measurement Card for Word Length
18 |
19 |
20 | ## Measurement Description
21 |
22 | The `word_length` measurement returns the average word count of the input strings, based on tokenization using [NLTK word_tokenize](https://www.nltk.org/api/nltk.tokenize.html).
23 |
24 | ## How to Use
25 |
26 | This measurement requires a list of strings as input:
27 |
28 | ```python
29 | >>> data = ["hello world"]
30 | >>> wordlength = evaluate.load("word_length", module_type="measurement")
31 | >>> results = wordlength.compute(data=data)
32 | ```
33 |
34 | ### Inputs
35 | - **data** (list of `str`): The input list of strings for which the word length is calculated.
36 | - **tokenizer** (`Callable`) : approach used for tokenizing `data` (optional). The default tokenizer is [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html). This can be replaced by any function that takes a string as input and returns a list of tokens as output.
37 |
38 | ### Output Values
- **average_word_length** (`float`): the average number of words in the input string(s).
40 |
41 | Output Example(s):
42 |
43 | ```python
44 | {"average_word_length": 245}
45 | ```
46 |
This measurement outputs a dictionary containing the average number of words in the input string(s) (`average_word_length`).
48 |
49 | ### Examples
50 |
51 | Example for a single string
52 |
53 | ```python
54 | >>> data = ["hello sun and goodbye moon"]
55 | >>> wordlength = evaluate.load("word_length", module_type="measurement")
56 | >>> results = wordlength.compute(data=data)
57 | >>> print(results)
58 | {'average_word_length': 5}
59 | ```
60 |
Example for multiple strings:
62 | ```python
63 | >>> data = ["hello sun and goodbye moon", "foo bar foo bar"]
64 | >>> wordlength = evaluate.load("word_length", module_type="measurement")
>>> results = wordlength.compute(data=data)
>>> print(results)
{'average_word_length': 4.5}
67 | ```
68 |
69 | ## Citation(s)
70 |
71 |
72 | ## Further References
73 | - [NLTK's `word_tokenize`](https://www.nltk.org/api/nltk.tokenize.html)
74 |
--------------------------------------------------------------------------------
/measurements/word_length/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("word_length", module_type="measurement")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/measurements/word_length/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate.git@{COMMIT_PLACEHOLDER}
2 | nltk~=3.7
3 |
--------------------------------------------------------------------------------
/measurements/word_length/word_length.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from statistics import mean
16 |
17 | import datasets
18 | from nltk import word_tokenize
19 | from packaging import version
20 |
21 | import evaluate
22 |
23 |
24 | if evaluate.config.PY_VERSION < version.parse("3.8"):
25 | import importlib_metadata
26 | else:
27 | import importlib.metadata as importlib_metadata
28 |
29 |
30 | NLTK_VERSION = version.parse(importlib_metadata.version("nltk"))
31 |
32 | _DESCRIPTION = """
33 | Returns the average length (in terms of the number of words) of the input data.
34 | """
35 |
36 | _KWARGS_DESCRIPTION = """
37 | Args:
38 | `data`: a list of `str` for which the word length is calculated.
39 | `tokenizer` (`Callable`) : the approach used for tokenizing `data` (optional).
40 | The default tokenizer is `word_tokenize` from NLTK: https://www.nltk.org/api/nltk.tokenize.html
41 | This can be replaced by any function that takes a string as input and returns a list of tokens as output.
42 |
43 | Returns:
44 | `average_word_length` (`float`) : the average number of words in the input list of strings.
45 |
46 | Examples:
47 | >>> data = ["hello world"]
48 | >>> wordlength = evaluate.load("word_length", module_type="measurement")
49 | >>> results = wordlength.compute(data=data)
50 | >>> print(results)
51 | {'average_word_length': 2}
52 | """
53 |
54 | # TODO: Add BibTeX citation
55 | _CITATION = """\
56 | @InProceedings{huggingface:module,
57 | title = {A great new module},
58 | authors={huggingface, Inc.},
59 | year={2020}
60 | }
61 | """
62 |
63 |
64 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
65 | class WordLength(evaluate.Measurement):
66 | """This measurement returns the average number of words in the input string(s)."""
67 |
68 | def _info(self):
69 | # TODO: Specifies the evaluate.MeasurementInfo object
70 | return evaluate.MeasurementInfo(
71 | # This is the description that will appear on the modules page.
72 | module_type="measurement",
73 | description=_DESCRIPTION,
74 | citation=_CITATION,
75 | inputs_description=_KWARGS_DESCRIPTION,
76 | # This defines the format of each prediction and reference
77 | features=datasets.Features(
78 | {
79 | "data": datasets.Value("string"),
80 | }
81 | ),
82 | )
83 |
84 | def _download_and_prepare(self, dl_manager):
85 | import nltk
86 |
87 | if NLTK_VERSION >= version.Version("3.9.0"):
88 | nltk.download("punkt_tab")
89 | else:
90 | nltk.download("punkt")
91 |
92 | def _compute(self, data, tokenizer=word_tokenize):
93 | """Returns the average word length of the input data"""
94 | lengths = [len(tokenizer(d)) for d in data]
95 | average_length = mean(lengths)
96 | return {"average_word_length": average_length}
97 |
--------------------------------------------------------------------------------
/metrics/accuracy/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Accuracy
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
15 | Accuracy = (TP + TN) / (TP + TN + FP + FN)
16 | Where:
17 | TP: True positive
18 | TN: True negative
19 | FP: False positive
20 | FN: False negative
21 | ---
22 |
23 | # Metric Card for Accuracy
24 |
25 |
26 | ## Metric Description
27 |
28 | Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
29 | Accuracy = (TP + TN) / (TP + TN + FP + FN)
30 | Where:
31 | TP: True positive
32 | TN: True negative
33 | FP: False positive
34 | FN: False negative
35 |
36 |
37 | ## How to Use
38 |
39 | At minimum, this metric requires predictions and references as inputs.
40 |
41 | ```python
42 | >>> accuracy_metric = evaluate.load("accuracy")
43 | >>> results = accuracy_metric.compute(references=[0, 1], predictions=[0, 1])
44 | >>> print(results)
45 | {'accuracy': 1.0}
46 | ```
47 |
48 |
49 | ### Inputs
50 | - **predictions** (`list` of `int`): Predicted labels.
51 | - **references** (`list` of `int`): Ground truth labels.
52 | - **normalize** (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True.
- **sample_weight** (`list` of `float`): Sample weights. Defaults to None.
54 |
55 |
56 | ### Output Values
- **accuracy** (`float` or `int`): Accuracy score. The minimum possible value is 0. The maximum possible value is 1.0 if `normalize` is set to `True`, or the number of examples input if `normalize` is set to `False`. A higher score means higher accuracy.
58 |
59 | Output Example(s):
60 | ```python
61 | {'accuracy': 1.0}
62 | ```
63 |
64 | This metric outputs a dictionary, containing the accuracy score.
65 |
66 |
67 | #### Values from Popular Papers
68 |
69 | Top-1 or top-5 accuracy is often used to report performance on supervised classification tasks such as image classification (e.g. on [ImageNet](https://paperswithcode.com/sota/image-classification-on-imagenet)) or sentiment analysis (e.g. on [IMDB](https://paperswithcode.com/sota/text-classification-on-imdb)).
70 |
71 |
72 | ### Examples
73 |
Example 1: A simple example
75 | ```python
76 | >>> accuracy_metric = evaluate.load("accuracy")
77 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
78 | >>> print(results)
79 | {'accuracy': 0.5}
80 | ```
81 |
Example 2: The same as Example 1, except with `normalize` set to `False`.
83 | ```python
84 | >>> accuracy_metric = evaluate.load("accuracy")
85 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False)
86 | >>> print(results)
87 | {'accuracy': 3.0}
88 | ```
89 |
Example 3: The same as Example 1, except with `sample_weight` set.
91 | ```python
92 | >>> accuracy_metric = evaluate.load("accuracy")
93 | >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4])
94 | >>> print(results)
95 | {'accuracy': 0.8778625954198473}
96 | ```
97 |
98 |
99 | ## Limitations and Bias
100 | This metric can be easily misleading, especially in the case of unbalanced classes. For example, a high accuracy might be because a model is doing well, but if the data is unbalanced, it might also be because the model is only accurately labeling the high-frequency class. In such cases, a more detailed analysis of the model's behavior, or the use of a different metric entirely, is necessary to determine how well the model is actually performing.
101 |
102 |
103 | ## Citation(s)
104 | ```bibtex
105 | @article{scikit-learn,
106 | title={Scikit-learn: Machine Learning in {P}ython},
107 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
108 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
109 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
110 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
111 | journal={Journal of Machine Learning Research},
112 | volume={12},
113 | pages={2825--2830},
114 | year={2011}
115 | }
116 | ```
117 |
118 |
119 | ## Further References
120 |
--------------------------------------------------------------------------------
/metrics/accuracy/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("accuracy")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/accuracy/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/bertscore/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("bertscore")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/bertscore/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | bert_score
--------------------------------------------------------------------------------
/metrics/bleu/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("bleu")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/bleu/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/bleu/tokenizer_13a.py:
--------------------------------------------------------------------------------
1 | # Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
2 | # Copyright 2020 SacreBLEU Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import re
17 | from functools import lru_cache
18 |
19 |
20 | class BaseTokenizer:
21 | """A base dummy tokenizer to derive from."""
22 |
23 | def signature(self):
24 | """
25 | Returns a signature for the tokenizer.
26 | :return: signature string
27 | """
28 | return "none"
29 |
30 | def __call__(self, line):
31 | """
32 | Tokenizes an input line with the tokenizer.
33 | :param line: a segment to tokenize
34 | :return: the tokenized line
35 | """
36 | return line
37 |
38 |
39 | class TokenizerRegexp(BaseTokenizer):
40 | def signature(self):
41 | return "re"
42 |
43 | def __init__(self):
44 | self._re = [
45 | # language-dependent part (assuming Western languages)
46 | (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
47 | # tokenize period and comma unless preceded by a digit
48 | (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
49 | # tokenize period and comma unless followed by a digit
50 | (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
51 | # tokenize dash when preceded by a digit
52 | (re.compile(r"([0-9])(-)"), r"\1 \2 "),
53 | # one space only between words
54 | # NOTE: Doing this in Python (below) is faster
55 | # (re.compile(r'\s+'), r' '),
56 | ]
57 |
58 | @lru_cache(maxsize=2**16)
59 | def __call__(self, line):
60 | """Common post-processing tokenizer for `13a` and `zh` tokenizers.
61 | :param line: a segment to tokenize
62 | :return: the tokenized line
63 | """
64 | for (_re, repl) in self._re:
65 | line = _re.sub(repl, line)
66 |
67 | # no leading or trailing spaces, single space within words
68 | # return ' '.join(line.split())
69 | # This line is changed with regards to the original tokenizer (seen above) to return individual words
70 | return line.split()
71 |
72 |
73 | class Tokenizer13a(BaseTokenizer):
74 | def signature(self):
75 | return "13a"
76 |
77 | def __init__(self):
78 | self._post_tokenizer = TokenizerRegexp()
79 |
80 | @lru_cache(maxsize=2**16)
81 | def __call__(self, line):
82 | """Tokenizes an input line using a relatively minimal tokenization
83 | that is however equivalent to mteval-v13a, used by WMT.
84 |
85 | :param line: a segment to tokenize
86 | :return: the tokenized line
87 | """
88 |
89 | # language-independent part:
        line = line.replace("<skipped>", "")
91 | line = line.replace("-\n", "")
92 | line = line.replace("\n", " ")
93 |
94 | if "&" in line:
            line = line.replace("&quot;", '"')
            line = line.replace("&amp;", "&")
            line = line.replace("&lt;", "<")
            line = line.replace("&gt;", ">")
99 |
100 | return self._post_tokenizer(f" {line} ")
101 |
--------------------------------------------------------------------------------
/metrics/bleurt/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import evaluate
4 | from evaluate.utils import launch_gradio_widget
5 |
6 |
7 | sys.path = [p for p in sys.path if p != "/home/user/app"]
8 | module = evaluate.load("bleurt")
9 | sys.path = ["/home/user/app"] + sys.path
10 |
11 | launch_gradio_widget(module)
12 |
--------------------------------------------------------------------------------
/metrics/bleurt/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | git+https://github.com/google-research/bleurt.git
--------------------------------------------------------------------------------
/metrics/brier_score/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Brier Score
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | The Brier score is a measure of the error between two probability distributions.
15 | ---
16 |
17 | # Metric Card for Brier Score
18 |
19 |
20 | ## Metric Description
21 | Brier score is a type of evaluation metric for classification tasks, where you predict outcomes such as win/lose, spam/ham, click/no-click etc.
22 | `BrierScore = 1/N * sum( (p_i - o_i)^2 )`
23 |
Where `p_i` is the predicted probability of the event occurring, and `o_i` is equal to 1 if the event occurred and 0 if not. This means that the lower the value of the score, the better the prediction.
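
As a quick worked example of the formula (independent of the metric's API), for probabilities `[0.1, 0.9, 0.8, 0.3]` and outcomes `[0, 0, 1, 1]`:

```python
import numpy as np

# Worked example of the Brier score formula above
p = np.array([0.1, 0.9, 0.8, 0.3])  # predicted probabilities p_i
o = np.array([0, 0, 1, 1])          # observed outcomes o_i
print(np.mean((p - o) ** 2))        # 0.3375
```
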
25 | ## How to Use
26 |
27 | At minimum, this metric requires predictions and references as inputs.
28 |
29 | ```python
>>> import numpy as np
>>> brier_score = evaluate.load("brier_score")
31 | >>> predictions = np.array([0, 0, 1, 1])
32 | >>> references = np.array([0.1, 0.9, 0.8, 0.3])
33 | >>> results = brier_score.compute(predictions=predictions, references=references)
34 | ```
35 |
36 | ### Inputs
37 |
38 | Mandatory inputs:
39 | - `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
40 |
41 | - `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
42 |
43 | Optional arguments:
44 | - `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
45 | - `pos_label`: the label of the positive class. The default is `1`.
46 |
47 |
48 | ### Output Values
49 | This metric returns a dictionary with the following keys:
50 | - `brier_score (float)`: the computed Brier score.
51 |
52 |
53 | Output Example(s):
54 | ```python
55 | {'brier_score': 0.5}
56 | ```
57 |
58 | #### Values from Popular Papers
59 |
60 |
61 | ### Examples
62 | ```python
>>> import numpy as np
>>> brier_score = evaluate.load("brier_score")
64 | >>> predictions = np.array([0, 0, 1, 1])
65 | >>> references = np.array([0.1, 0.9, 0.8, 0.3])
66 | >>> results = brier_score.compute(predictions=predictions, references=references)
67 | >>> print(results)
68 | {'brier_score': 0.3375}
69 | ```
Example where `y_true` contains strings: an error will be raised unless `pos_label` is explicitly specified.
71 | ```python
>>> import numpy as np
>>> brier_score = evaluate.load("brier_score")
>>> predictions = np.array(["spam", "ham", "ham", "spam"])
>>> references = np.array([0.1, 0.9, 0.8, 0.3])
>>> results = brier_score.compute(predictions=predictions, references=references, pos_label="ham")
76 | >>> print(results)
77 | {'brier_score': 0.0374}
78 | ```
## Limitations and Bias

The [brier_score](https://huggingface.co/metrics/brier_score) is appropriate for binary and categorical outcomes that can be structured as true or false, but it is inappropriate for ordinal variables which can take on three or more values.

## Citation(s)
82 | ```bibtex
83 | @article{scikit-learn,
84 | title={Scikit-learn: Machine Learning in {P}ython},
85 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
86 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
87 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
88 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
89 | journal={Journal of Machine Learning Research},
90 | volume={12},
91 | pages={2825--2830},
92 | year={2011}
93 | }
94 |
95 | @Article{brier1950verification,
96 | title={Verification of forecasts expressed in terms of probability},
97 | author={Brier, Glenn W and others},
98 | journal={Monthly weather review},
99 | volume={78},
100 | number={1},
101 | pages={1--3},
102 | year={1950}
103 | }
104 | ```
105 | ## Further References
106 | - [Brier Score - Wikipedia](https://en.wikipedia.org/wiki/Brier_score)
--------------------------------------------------------------------------------
/metrics/brier_score/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("brier_score")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/brier_score/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/cer/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("cer")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/cer/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | jiwer
--------------------------------------------------------------------------------
/metrics/character/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: CharacTER
3 | emoji: 🔤
4 | colorFrom: orange
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | - machine-translation
14 | description: >-
15 | CharacTer is a character-level metric inspired by the commonly applied translation edit rate (TER).
16 | ---
17 |
18 | # Metric Card for CharacTER
19 |
20 | ## Metric Description
21 | CharacTer is a character-level metric inspired by the translation edit rate (TER) metric. It is
22 | defined as the minimum number of character edits required to adjust a hypothesis, until it completely matches the
23 | reference, normalized by the length of the hypothesis sentence. CharacTer calculates the character level edit
24 | distance while performing the shift edit on word level. Unlike the strict matching criterion in TER, a hypothesis
25 | word is considered to match a reference word and could be shifted, if the edit distance between them is below a
26 | threshold value. The Levenshtein distance between the reference and the shifted hypothesis sequence is computed on the
27 | character level. In addition, the lengths of hypothesis sequences instead of reference sequences are used for
28 | normalizing the edit distance, which effectively counters the issue that shorter translations normally achieve lower
29 | TER.
30 |
31 | ## Intended Uses
32 | CharacTER was developed for machine translation evaluation.
33 |
34 | ## How to Use
35 |
36 | ```python
37 | import evaluate
38 | character = evaluate.load("character")
39 |
40 | # Single hyp/ref
41 | preds = ["this week the saudis denied information published in the new york times"]
42 | refs = ["saudi arabia denied this week information published in the american new york times"]
43 | results = character.compute(references=refs, predictions=preds)
44 |
45 | # Corpus example
46 | preds = ["this week the saudis denied information published in the new york times",
47 | "this is in fact an estimate"]
48 | refs = ["saudi arabia denied this week information published in the american new york times",
49 | "this is actually an estimate"]
50 | results = character.compute(references=refs, predictions=preds)
51 | ```
52 |
53 | ### Inputs
54 | - **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with
55 | tokens separated by spaces.
56 | - **references**: a single reference or a list of references, one for each prediction. Each reference should be a string with
57 | tokens separated by spaces.
58 |
59 |
60 | ### Output Values
61 |
62 | (*) = only returned when a list of references/hypotheses is given
63 |
64 | - **count** (*): how many parallel sentences were processed
65 | - **mean** (*): the mean CharacTER score
66 | - **median** (*): the median score
67 | - **std** (*): standard deviation of the score
68 | - **min** (*): smallest score
69 | - **max** (*): largest score
70 | - **cer_scores**: all scores, one per ref/hyp pair
71 |
72 | ### Output Example
73 | ```python
74 | {
75 | 'count': 2,
76 | 'mean': 0.3127282211789254,
77 | 'median': 0.3127282211789254,
78 | 'std': 0.07561653111280243,
79 | 'min': 0.25925925925925924,
80 | 'max': 0.36619718309859156,
81 | 'cer_scores': [0.36619718309859156, 0.25925925925925924]
82 | }
83 | ```
84 |
85 | ## Citation
86 | ```bibtex
87 | @inproceedings{wang-etal-2016-character,
88 | title = "{C}harac{T}er: Translation Edit Rate on Character Level",
89 | author = "Wang, Weiyue and
90 | Peter, Jan-Thorsten and
91 | Rosendahl, Hendrik and
92 | Ney, Hermann",
93 | booktitle = "Proceedings of the First Conference on Machine Translation: Volume 2, Shared Task Papers",
94 | month = aug,
95 | year = "2016",
96 | address = "Berlin, Germany",
97 | publisher = "Association for Computational Linguistics",
98 | url = "https://aclanthology.org/W16-2342",
99 | doi = "10.18653/v1/W16-2342",
100 | pages = "505--510",
101 | }
102 | ```
103 |
104 | ## Further References
105 | - Repackaged version that is used in this HF implementation: [https://github.com/bramvanroy/CharacTER](https://github.com/bramvanroy/CharacTER)
106 | - Original version: [https://github.com/rwth-i6/CharacTER](https://github.com/rwth-i6/CharacTER)
107 |
--------------------------------------------------------------------------------
/metrics/character/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("character")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/character/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | cer>=1.2.0
3 |
--------------------------------------------------------------------------------
/metrics/charcut_mt/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: CharCut
3 | emoji: 🔤
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | - machine-translation
14 | description: >-
15 | CharCut is a character-based machine translation evaluation metric.
16 | ---
17 |
18 | # Metric Card for CharCut
19 |
20 | ## Metric Description
21 | CharCut compares outputs of MT systems with reference translations. The matching algorithm is based on an iterative
22 | search for longest common substrings, combined with a length-based threshold that limits short and noisy character
23 | matches. As a similarity metric this is not new, but to the best of our knowledge it was never applied to highlighting
24 | and scoring of MT outputs. It has the neat effect of keeping character-based differences readable by humans.
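
As a rough illustration of that matching idea (and not the official CharCut algorithm), the sketch below greedily removes the longest common substrings above a minimum length and scores the characters left unmatched; the helper name and threshold are made up for the example.

```python
# Illustrative greedy variant of the CharCut idea: iteratively remove the longest
# common substring above a minimum length, then count the characters left over.
from difflib import SequenceMatcher

def greedy_unmatched_ratio(pred: str, ref: str, min_len: int = 3) -> float:
    total = len(pred) + len(ref)
    if total == 0:
        return 0.0
    while True:
        match = SequenceMatcher(None, pred, ref, autojunk=False).find_longest_match(
            0, len(pred), 0, len(ref)
        )
        if match.size < min_len:
            break
        # Drop the matched span from both strings and look for the next match.
        pred = pred[:match.a] + pred[match.a + match.size:]
        ref = ref[:match.b] + ref[match.b + match.size:]
    return (len(pred) + len(ref)) / total  # lower is better, like charcut_mt

print(greedy_unmatched_ratio("this is in fact an estimate",
                             "this is actually an estimate"))
```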
25 |
26 | ## Intended Uses
27 | CharCut was developed for machine translation evaluation.
28 |
29 | ## How to Use
30 |
31 | ```python
32 | import evaluate
33 | charcut = evaluate.load("charcut_mt")
34 | preds = ["this week the saudis denied information published in the new york times",
35 | "this is in fact an estimate"]
36 | refs = ["saudi arabia denied this week information published in the american new york times",
37 | "this is actually an estimate"]
38 | results = charcut.compute(references=refs, predictions=preds)
39 | print(results)
40 | # {'charcut_mt': 0.1971153846153846}
41 |
42 | ```
43 | ### Inputs
44 | - **predictions**: a single prediction or a list of predictions to score. Each prediction should be a string with
45 | tokens separated by spaces.
46 | - **references**: a single reference or a list of references, one for each prediction. Each reference should be a string with
47 | tokens separated by spaces.
48 |
49 |
50 | ### Output Values
51 | - **charcut_mt**: the CharCut evaluation score (lower is better)
52 |
53 | ### Output Example
54 | ```python
55 | {'charcut_mt': 0.1971153846153846}
56 | ```
57 |
58 | ## Citation
59 | ```bibtex
60 | @inproceedings{lardilleux-lepage-2017-charcut,
61 | title = "{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences",
62 | author = "Lardilleux, Adrien and
63 | Lepage, Yves",
64 | booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation",
65 | month = dec # " 14-15",
66 | year = "2017",
67 | address = "Tokyo, Japan",
68 | publisher = "International Workshop on Spoken Language Translation",
69 | url = "https://aclanthology.org/2017.iwslt-1.20",
70 | pages = "146--153",
71 | abstract = "We present CHARCUT, a character-based machine translation evaluation metric derived from a human-targeted segment difference visualisation algorithm. It combines an iterative search for longest common substrings between the candidate and the reference translation with a simple length-based threshold, enabling loose differences that limit noisy character matches. Its main advantage is to produce scores that directly reflect human-readable string differences, making it a useful support tool for the manual analysis of MT output and its display to end users. Experiments on WMT16 metrics task data show that it is on par with the best {``}un-trained{''} metrics in terms of correlation with human judgement, well above BLEU and TER baselines, on both system and segment tasks.",
72 | }
73 | ```
74 |
75 | ## Further References
76 | - Repackaged version that is used in this HF implementation: [https://github.com/BramVanroy/CharCut](https://github.com/BramVanroy/CharCut)
77 | - Original version: [https://github.com/alardill/CharCut](https://github.com/alardill/CharCut)
78 |
--------------------------------------------------------------------------------
/metrics/charcut_mt/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("charcut_mt")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/charcut_mt/charcut_mt.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """An implementation for calculating CharCut, a character-based machine translation evaluation metric."""
15 | from typing import Iterable, Union
16 |
17 | import datasets
18 | from charcut import calculate_charcut
19 | from datasets import Sequence, Value
20 |
21 | import evaluate
22 |
23 |
24 | _CITATION = """\
25 | @inproceedings{lardilleux-lepage-2017-charcut,
26 | title = "{CHARCUT}: Human-Targeted Character-Based {MT} Evaluation with Loose Differences",
27 | author = "Lardilleux, Adrien and
28 | Lepage, Yves",
29 | booktitle = "Proceedings of the 14th International Conference on Spoken Language Translation",
30 | month = dec # " 14-15",
31 | year = "2017",
32 | address = "Tokyo, Japan",
33 | publisher = "International Workshop on Spoken Language Translation",
34 | url = "https://aclanthology.org/2017.iwslt-1.20",
35 | pages = "146--153"
36 | }
37 | """
38 |
39 | _DESCRIPTION = """\
40 | CharCut compares outputs of MT systems with reference translations. The matching algorithm is based on an iterative
41 | search for longest common substrings, combined with a length-based threshold that limits short and noisy character
42 | matches. As a similarity metric this is not new, but to the best of our knowledge it was never applied to highlighting
43 | and scoring of MT outputs. It has the neat effect of keeping character-based differences readable by humans."""
44 |
45 | _KWARGS_DESCRIPTION = """
46 | Calculates how good predictions are given some references.
47 | Args:
48 | predictions: a list of predictions to score. Each prediction should be a string with
49 | tokens separated by spaces.
50 | references: a list of reference for each prediction. Each reference should be a string with
51 | tokens separated by spaces.
52 | Returns:
53 | charcut_mt: the CharCut score
54 | Examples:
55 | >>> charcut_mt = evaluate.load("charcut_mt")
56 | >>> preds = ["this week the saudis denied information published in the new york times",
57 | ... "this is in fact an estimate"]
58 | >>> refs = ["saudi arabia denied this week information published in the american new york times",
59 | ... "this is actually an estimate"]
60 | >>> charcut_mt.compute(references=refs, predictions=preds)
61 | {'charcut_mt': 0.1971153846153846}
62 | """
63 |
64 |
65 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
66 | class Charcut(evaluate.Metric):
67 | """Character-based MT evaluation."""
68 |
69 | def _info(self):
70 | return evaluate.MetricInfo(
71 | # This is the description that will appear on the modules page.
72 | module_type="metric",
73 | description=_DESCRIPTION,
74 | citation=_CITATION,
75 | inputs_description=_KWARGS_DESCRIPTION,
76 | # This defines the format of each prediction and reference
77 | features=[
78 | datasets.Features(
79 | {"predictions": Value("string", id="prediction"), "references": Value("string", id="reference")}
80 | ),
81 | ],
82 | # Homepage of the module for documentation
83 | homepage="https://github.com/BramVanroy/CharCut",
84 | # Additional links to the codebase or references
85 | codebase_urls=["https://github.com/BramVanroy/CharCut", "https://github.com/alardill/CharCut"],
86 | )
87 |
88 | def _compute(self, predictions: Iterable[str], references: Iterable[str]):
89 | return {"charcut_mt": calculate_charcut(predictions, references)[0]}
90 |
--------------------------------------------------------------------------------
/metrics/charcut_mt/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | charcut>=1.1.1
3 |
--------------------------------------------------------------------------------
/metrics/chrf/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("chrf")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/chrf/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | sacrebleu
--------------------------------------------------------------------------------
/metrics/code_eval/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("code_eval")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/code_eval/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/comet/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import evaluate
4 | from evaluate.utils import launch_gradio_widget
5 |
6 |
7 | sys.path = [p for p in sys.path if p != "/home/user/app"]
8 | module = evaluate.load("comet")
9 | sys.path = ["/home/user/app"] + sys.path
10 |
11 | launch_gradio_widget(module)
12 |
--------------------------------------------------------------------------------
/metrics/comet/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | unbabel-comet
3 | torch
--------------------------------------------------------------------------------
/metrics/competition_math/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Competition MATH
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | This metric is used to assess performance on the Mathematics Aptitude Test of Heuristics (MATH) dataset.
15 | It first canonicalizes the inputs (e.g., converting "1/2" to "\frac{1}{2}") and then computes accuracy.
16 | ---
17 |
18 | # Metric Card for Competition MATH
19 |
20 | ## Metric description
21 |
22 | This metric is used to assess performance on the [Mathematics Aptitude Test of Heuristics (MATH) dataset](https://huggingface.co/datasets/competition_math).
23 |
24 | It first canonicalizes the inputs (e.g., converting `1/2` to `\\frac{1}{2}`) and then computes accuracy.
25 |
26 | ## How to use
27 |
28 | This metric takes two arguments:
29 |
30 | `predictions`: a list of predictions to score. Each prediction is a string that contains natural language and LaTeX.
31 |
32 | `references`: a list of references, one for each prediction. Each reference is a string that contains natural language and LaTeX.
33 |
34 |
35 | ```python
36 | >>> from evaluate import load
37 | >>> math = load("competition_math")
38 | >>> references = ["\\frac{1}{2}"]
39 | >>> predictions = ["1/2"]
40 | >>> results = math.compute(references=references, predictions=predictions)
41 | ```
42 |
43 | N.B. To be able to use Competition MATH, you need to install the `math_equivalence` dependency using `pip install git+https://github.com/hendrycks/math.git`.
44 |
45 |
46 | ## Output values
47 |
48 | This metric returns a dictionary that contains the [accuracy](https://huggingface.co/metrics/accuracy) after canonicalizing inputs, on a scale between 0.0 and 1.0.
49 |
50 | ### Values from popular papers
51 | The [original MATH dataset paper](https://arxiv.org/abs/2103.03874) reported accuracies ranging from 3.0% to 6.9% by different large language models.
52 |
53 | More recent progress on the dataset can be found on the [dataset leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math).
54 |
55 | ## Examples
56 |
57 | Maximal values (full match):
58 |
59 | ```python
60 | >>> from evaluate import load
61 | >>> math = load("competition_math")
62 | >>> references = ["\\frac{1}{2}"]
63 | >>> predictions = ["1/2"]
64 | >>> results = math.compute(references=references, predictions=predictions)
65 | >>> print(results)
66 | {'accuracy': 1.0}
67 | ```
68 |
69 | Minimal values (no match):
70 |
71 | ```python
72 | >>> from evaluate import load
73 | >>> math = load("competition_math")
74 | >>> references = ["\\frac{1}{2}"]
75 | >>> predictions = ["3/4"]
76 | >>> results = math.compute(references=references, predictions=predictions)
77 | >>> print(results)
78 | {'accuracy': 0.0}
79 | ```
80 |
81 | Partial match:
82 |
83 | ```python
84 | >>> from evaluate import load
85 | >>> math = load("competition_math")
86 | >>> references = ["\\frac{1}{2}","\\frac{3}{4}"]
87 | >>> predictions = ["1/5", "3/4"]
88 | >>> results = math.compute(references=references, predictions=predictions)
89 | >>> print(results)
90 | {'accuracy': 0.5}
91 | ```
92 |
93 | ## Limitations and bias
94 |
95 | This metric is limited to datasets with the same format as the [Mathematics Aptitude Test of Heuristics (MATH) dataset](https://huggingface.co/datasets/competition_math), and is meant to evaluate the performance of large language models at solving mathematical problems.
96 |
97 | N.B. The MATH dataset also assigns a difficulty level to each problem, so disaggregating model performance by difficulty level (similarly to what was done in the [original paper](https://arxiv.org/abs/2103.03874)) can give a better indication of how a model does at each level of difficulty than overall accuracy alone, as sketched below.
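
A minimal sketch of such a per-level breakdown is shown below. It assumes the `math_equivalence` dependency mentioned above is installed; the `level` labels and the toy examples are purely illustrative (the MATH dataset provides a comparable difficulty field per problem).

```python
from collections import defaultdict
from evaluate import load

math = load("competition_math")

# Toy examples; in practice these would come from the MATH dataset, which
# annotates each problem with a difficulty level.
examples = [
    {"level": "Level 1", "prediction": "1/2", "reference": "\\frac{1}{2}"},
    {"level": "Level 5", "prediction": "3/4", "reference": "\\frac{1}{2}"},
]

# Group predictions and references by difficulty level.
by_level = defaultdict(lambda: {"predictions": [], "references": []})
for example in examples:
    by_level[example["level"]]["predictions"].append(example["prediction"])
    by_level[example["level"]]["references"].append(example["reference"])

# Report accuracy separately for each level.
for level, batch in sorted(by_level.items()):
    print(level, math.compute(predictions=batch["predictions"], references=batch["references"]))
```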
98 |
99 | ## Citation
100 |
101 | ```bibtex
102 | @article{hendrycksmath2021,
103 | title={Measuring Mathematical Problem Solving With the MATH Dataset},
104 | author={Dan Hendrycks
105 | and Collin Burns
106 | and Saurav Kadavath
107 | and Akul Arora
108 | and Steven Basart
109 | and Eric Tang
110 | and Dawn Song
111 | and Jacob Steinhardt},
112 | journal={arXiv preprint arXiv:2103.03874},
113 | year={2021}
114 | }
115 | ```
116 |
117 | ## Further References
118 | - [MATH dataset](https://huggingface.co/datasets/competition_math)
119 | - [MATH leaderboard](https://paperswithcode.com/sota/math-word-problem-solving-on-math)
120 | - [MATH paper](https://arxiv.org/abs/2103.03874)
--------------------------------------------------------------------------------
/metrics/competition_math/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("competition_math")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/competition_math/competition_math.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Accuracy metric for the Mathematics Aptitude Test of Heuristics (MATH) dataset."""
15 |
16 | import datasets
17 | import math_equivalence # From: git+https://github.com/hendrycks/math.git
18 |
19 | import evaluate
20 |
21 |
22 | _CITATION = """\
23 | @article{hendrycksmath2021,
24 | title={Measuring Mathematical Problem Solving With the MATH Dataset},
25 | author={Dan Hendrycks
26 | and Collin Burns
27 | and Saurav Kadavath
28 | and Akul Arora
29 | and Steven Basart
30 | and Eric Tang
31 | and Dawn Song
32 | and Jacob Steinhardt},
33 | journal={arXiv preprint arXiv:2103.03874},
34 | year={2021}
35 | }
36 | """
37 |
38 |
39 | _DESCRIPTION = """\
40 | This metric is used to assess performance on the Mathematics Aptitude Test of Heuristics (MATH) dataset.
41 | It first canonicalizes the inputs (e.g., converting "1/2" to "\\frac{1}{2}") and then computes accuracy.
42 | """
43 |
44 |
45 | _KWARGS_DESCRIPTION = r"""
46 | Calculates accuracy after canonicalizing inputs.
47 |
48 | Args:
49 |     predictions: list of predictions to score. Each prediction
50 |         is a string that contains natural language and LaTeX.
51 |     references: list of references, one for each prediction. Each
52 |         reference is a string that contains natural language
53 |         and LaTeX.
54 | Returns:
55 | accuracy: accuracy after canonicalizing inputs
56 | (e.g., converting "1/2" to "\\frac{1}{2}")
57 |
58 | Examples:
59 | >>> metric = evaluate.load("competition_math")
60 | >>> results = metric.compute(references=["\\frac{1}{2}"], predictions=["1/2"])
61 | >>> print(results)
62 | {'accuracy': 1.0}
63 | """
64 |
65 |
66 | @datasets.utils.file_utils.add_end_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
67 | class CompetitionMathMetric(evaluate.Metric):
68 | """Accuracy metric for the MATH dataset."""
69 |
70 | def _info(self):
71 | return evaluate.MetricInfo(
72 | description=_DESCRIPTION,
73 | citation=_CITATION,
74 | inputs_description=_KWARGS_DESCRIPTION,
75 | features=datasets.Features(
76 | {
77 | "predictions": datasets.Value("string"),
78 | "references": datasets.Value("string"),
79 | }
80 | ),
81 | # Homepage of the metric for documentation
82 | homepage="https://github.com/hendrycks/math",
83 | # Additional links to the codebase or references
84 | codebase_urls=["https://github.com/hendrycks/math"],
85 | )
86 |
87 | def _compute(self, predictions, references):
88 | """Returns the scores"""
89 | n_correct = 0.0
90 | for i, j in zip(predictions, references):
91 | n_correct += 1.0 if math_equivalence.is_equiv(i, j) else 0.0
92 | accuracy = n_correct / len(predictions)
93 | return {
94 | "accuracy": accuracy,
95 | }
96 |
--------------------------------------------------------------------------------
/metrics/competition_math/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | git+https://github.com/hendrycks/math.git
--------------------------------------------------------------------------------
/metrics/confusion_matrix/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Confusion Matrix
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | The confusion matrix evaluates classification accuracy.
15 |
16 | Each row in a confusion matrix represents a true class and each column represents the instances in a predicted class.
17 | ---
18 |
19 | # Metric Card for Confusion Matrix
20 |
21 |
22 | ## Metric Description
23 |
24 | The confusion matrix evaluates classification accuracy. Each row in a confusion matrix represents a true class and each column represents the instances in a predicted class. Let's look at an example:
25 |
26 | | | setosa | versicolor | virginica |
27 | | ---------- | ------ | ---------- | --------- |
28 | | setosa | 13 | 0 | 0 |
29 | | versicolor | 0 | 10 | 6 |
30 | | virginica | 0 | 0 | 9 |
31 |
32 | What information does this confusion matrix provide?
33 |
34 | * All setosa instances were properly predicted as such (true positives).
35 | * No instance of another class was misclassified as setosa (there are no false positives for setosa).
36 | * 10 versicolor instances were properly classified, but 6 instances were misclassified as virginica.
37 | * All virginica instances were properly classified as such.
38 |
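To make the row/column convention concrete, here is a small, purely illustrative numpy check that reads these counts straight off the table above:

```python
import numpy as np

# Rows are true classes, columns are predicted classes (setosa, versicolor, virginica).
cm = np.array([[13, 0, 0],
               [0, 10, 6],
               [0, 0, 9]])

setosa = 0
true_positives = cm[setosa, setosa]                     # 13 setosa predicted as setosa
false_negatives = cm[setosa, :].sum() - true_positives  # 0: no setosa instance was missed
false_positives = cm[:, setosa].sum() - true_positives  # 0: nothing else was predicted as setosa
print(true_positives, false_negatives, false_positives)
```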
39 |
40 | ## How to Use
41 |
42 | At minimum, this metric requires predictions and references as inputs.
43 |
44 | ```python
45 | >>> confusion_metric = evaluate.load("confusion_matrix")
46 | >>> results = confusion_metric.compute(references=[0, 1, 1, 2, 0, 2, 2], predictions=[0, 2, 1, 1, 0, 2, 0])
47 | >>> print(results)
48 | {'confusion_matrix': [[2, 0, 0], [0, 1, 1], [1, 1, 1]]}
49 | ```
50 |
51 |
52 | ### Inputs
53 | - **predictions** (`list` of `int`): Predicted labels.
54 | - **references** (`list` of `int`): Ground truth labels.
55 | - **labels** (`list` of `int`): List of labels to index the matrix. This may be used to reorder or select a subset of labels.
56 | - **sample_weight** (`list` of `float`): Sample weights.
57 | - **normalize** (`str`): Normalizes the confusion matrix over the true labels (rows), the predicted labels (columns), or the whole population; see the example below.
58 |
59 |
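For instance, since `normalize` is passed through to scikit-learn, a row-normalized matrix can be requested as sketched below (reusing the inputs from the example above; the exact array formatting of the output may differ):

```python
import evaluate

confusion_metric = evaluate.load("confusion_matrix")

# normalize="true" divides each row by the number of true instances of that class,
# so every row sums to 1; "pred" and "all" normalize by column and grand total instead.
results = confusion_metric.compute(
    references=[0, 1, 1, 2, 0, 2, 2],
    predictions=[0, 2, 1, 1, 0, 2, 0],
    normalize="true",
)
print(results["confusion_matrix"])  # e.g. the first row becomes [1.0, 0.0, 0.0]
```
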
60 | ### Output Values
61 | - **confusion_matrix** (`list` of `list` of `int`, or of `float` when `normalize` is set): Confusion matrix. Each entry counts the examples with the corresponding true (row) and predicted (column) label, so values range from 0 up to the number of examples, or from 0.0 to 1.0 when `normalize` is used.
62 |
63 | Output Example(s):
64 | ```python
65 | {'confusion_matrix': [[2, 0, 0], [0, 1, 1], [1, 1, 1]]}
66 | ```
67 |
68 | This metric outputs a dictionary, containing the confusion matrix.
69 |
70 |
71 | ### Examples
72 |
73 | Example 1 - A simple example
74 |
75 | ```python
76 | >>> confusion_metric = evaluate.load("confusion_matrix")
77 | >>> results = confusion_metric.compute(references=[0, 1, 1, 2, 0, 2, 2], predictions=[0, 2, 1, 1, 0, 2, 0])
78 | >>> print(results)
79 | {'confusion_matrix': [[2, 0, 0], [0, 1, 1], [1, 1, 1]]}
80 | ```
81 |
82 | ## Citation(s)
83 | ```bibtex
84 | @article{scikit-learn,
85 | title={Scikit-learn: Machine Learning in {P}ython},
86 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
87 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
88 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
89 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
90 | journal={Journal of Machine Learning Research},
91 | volume={12},
92 | pages={2825--2830},
93 | year={2011}
94 | }
95 | ```
96 |
97 |
98 | ## Further References
99 |
100 | * https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html
101 | * https://en.wikipedia.org/wiki/Confusion_matrix
--------------------------------------------------------------------------------
/metrics/confusion_matrix/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("confusion_matrix")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/confusion_matrix/confusion_matrix.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Confusion Matrix."""
15 |
16 | import datasets
17 | from sklearn.metrics import confusion_matrix
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | The confusion matrix evaluates classification accuracy. Each row in a confusion matrix represents a true class and each column represents the instances in a predicted class
24 | """
25 |
26 | _KWARGS_DESCRIPTION = """
27 | Args:
28 | predictions (`list` of `int`): Predicted labels.
29 | references (`list` of `int`): Ground truth labels.
30 | labels (`list` of `int`): List of labels to index the matrix. This may be used to reorder or select a subset of labels.
31 | sample_weight (`list` of `float`): Sample weights.
32 | normalize (`str`): Normalizes confusion matrix over the true (rows), predicted (columns) conditions or all the population.
33 |
34 | Returns:
35 | confusion_matrix (`list` of `list` of `int`): Confusion matrix whose i-th row and j-th column entry indicates the number of samples with true label being i-th class and predicted label being j-th class.
36 |
37 | Examples:
38 |
39 | Example 1-A simple example
40 | >>> confusion_matrix_metric = evaluate.load("confusion_matrix")
41 | >>> results = confusion_matrix_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
42 | >>> print(results) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
43 | {'confusion_matrix': array([[1, 0, 1], [0, 2, 0], [1, 1, 0]][...])}
44 | """
45 |
46 |
47 | _CITATION = """
48 | @article{scikit-learn,
49 | title={Scikit-learn: Machine Learning in {P}ython},
50 | author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V.
51 | and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P.
52 | and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and
53 | Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.},
54 | journal={Journal of Machine Learning Research},
55 | volume={12},
56 | pages={2825--2830},
57 | year={2011}
58 | }
59 | """
60 |
61 |
62 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
63 | class ConfusionMatrix(evaluate.Metric):
64 | def _info(self):
65 | return evaluate.MetricInfo(
66 | description=_DESCRIPTION,
67 | citation=_CITATION,
68 | inputs_description=_KWARGS_DESCRIPTION,
69 | features=datasets.Features(
70 | {
71 | "predictions": datasets.Sequence(datasets.Value("int32")),
72 | "references": datasets.Sequence(datasets.Value("int32")),
73 | }
74 | if self.config_name == "multilabel"
75 | else {
76 | "predictions": datasets.Value("int32"),
77 | "references": datasets.Value("int32"),
78 | }
79 | ),
80 | reference_urls=["https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html"],
81 | )
82 |
83 | def _compute(self, predictions, references, labels=None, sample_weight=None, normalize=None):
84 | return {
85 | "confusion_matrix": confusion_matrix(
86 | references, predictions, labels=labels, sample_weight=sample_weight, normalize=normalize
87 | )
88 | }
89 |
--------------------------------------------------------------------------------
/metrics/confusion_matrix/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/coval/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import evaluate
4 | from evaluate.utils import launch_gradio_widget
5 |
6 |
7 | sys.path = [p for p in sys.path if p != "/home/user/app"]
8 | module = evaluate.load("coval")
9 | sys.path = ["/home/user/app"] + sys.path
10 |
11 | launch_gradio_widget(module)
12 |
--------------------------------------------------------------------------------
/metrics/coval/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | git+https://github.com/ns-moosavi/coval.git
--------------------------------------------------------------------------------
/metrics/cuad/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("cuad")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/cuad/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/exact_match/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("exact_match")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/exact_match/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/f1/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("f1")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/f1/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/frugalscore/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("frugalscore")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/frugalscore/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | torch
3 | transformers
--------------------------------------------------------------------------------
/metrics/glue/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("glue", "sst2")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/glue/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
3 | scikit-learn
--------------------------------------------------------------------------------
/metrics/google_bleu/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("google_bleu")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/google_bleu/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | nltk
--------------------------------------------------------------------------------
/metrics/google_bleu/tokenizer_13a.py:
--------------------------------------------------------------------------------
1 | # Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
2 | # Copyright 2020 SacreBLEU Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | import re
17 | from functools import lru_cache
18 |
19 |
20 | class BaseTokenizer:
21 | """A base dummy tokenizer to derive from."""
22 |
23 | def signature(self):
24 | """
25 | Returns a signature for the tokenizer.
26 | :return: signature string
27 | """
28 | return "none"
29 |
30 | def __call__(self, line):
31 | """
32 | Tokenizes an input line with the tokenizer.
33 | :param line: a segment to tokenize
34 | :return: the tokenized line
35 | """
36 | return line
37 |
38 |
39 | class TokenizerRegexp(BaseTokenizer):
40 | def signature(self):
41 | return "re"
42 |
43 | def __init__(self):
44 | self._re = [
45 | # language-dependent part (assuming Western languages)
46 | (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
47 | # tokenize period and comma unless preceded by a digit
48 | (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
49 | # tokenize period and comma unless followed by a digit
50 | (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
51 | # tokenize dash when preceded by a digit
52 | (re.compile(r"([0-9])(-)"), r"\1 \2 "),
53 | # one space only between words
54 | # NOTE: Doing this in Python (below) is faster
55 | # (re.compile(r'\s+'), r' '),
56 | ]
57 |
58 | @lru_cache(maxsize=2**16)
59 | def __call__(self, line):
60 | """Common post-processing tokenizer for `13a` and `zh` tokenizers.
61 | :param line: a segment to tokenize
62 | :return: the tokenized line
63 | """
64 | for (_re, repl) in self._re:
65 | line = _re.sub(repl, line)
66 |
67 | # no leading or trailing spaces, single space within words
68 | # return ' '.join(line.split())
69 | # This line is changed with regards to the original tokenizer (seen above) to return individual words
70 | return line.split()
71 |
72 |
73 | class Tokenizer13a(BaseTokenizer):
74 | def signature(self):
75 | return "13a"
76 |
77 | def __init__(self):
78 | self._post_tokenizer = TokenizerRegexp()
79 |
80 | @lru_cache(maxsize=2**16)
81 | def __call__(self, line):
82 | """Tokenizes an input line using a relatively minimal tokenization
83 | that is however equivalent to mteval-v13a, used by WMT.
84 |
85 | :param line: a segment to tokenize
86 | :return: the tokenized line
87 | """
88 |
89 | # language-independent part:
90 |         line = line.replace("<skipped>", "")
91 | line = line.replace("-\n", "")
92 | line = line.replace("\n", " ")
93 |
94 | if "&" in line:
95 |             line = line.replace("&quot;", '"')
96 |             line = line.replace("&amp;", "&")
97 |             line = line.replace("&lt;", "<")
98 |             line = line.replace("&gt;", ">")
99 |
100 | return self._post_tokenizer(f" {line} ")
101 |
--------------------------------------------------------------------------------
/metrics/indic_glue/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("indic_glue", "wnli")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/indic_glue/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
3 | scikit-learn
--------------------------------------------------------------------------------
/metrics/mae/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mae")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/mae/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/mahalanobis/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: Mahalanobis Distance
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | Compute the Mahalanobis Distance
15 |
16 |   Mahalanobis distance is the distance between a point and a distribution,
17 |   not between two distinct points. It is effectively a multivariate equivalent of the Euclidean distance.
18 | It was introduced by Prof. P. C. Mahalanobis in 1936
19 | and has been used in various statistical applications ever since
20 | [source: https://www.machinelearningplus.com/statistics/mahalanobis-distance/]
21 | ---
22 |
23 | # Metric Card for Mahalanobis Distance
24 |
25 | ## Metric Description
26 | Mahalanobis distance is the distance between a point and a distribution (as opposed to the distance between two points), making it the multivariate equivalent of the Euclidean distance.
27 |
28 | It is often used in multivariate anomaly detection, classification on highly imbalanced datasets and one-class classification.
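
For reference, here is a small numpy sketch of the underlying formula, D^2(x) = (x - mu)^T Sigma^{-1} (x - mu), using the same toy data as the example below (this is not the module's own code):

```python
import numpy as np

# Toy data matching the example below.
reference_distribution = np.array([[0.0, 1.0], [1.0, 0.0]])
x = np.array([0.0, 1.0])

mu = reference_distribution.mean(axis=0)   # mean vector of the distribution
cov = np.cov(reference_distribution.T)     # covariance matrix of the distribution
inv_cov = np.linalg.pinv(cov)              # pseudo-inverse (this toy covariance is singular)
d_squared = (x - mu) @ inv_cov @ (x - mu)  # squared Mahalanobis distance
print(d_squared)                           # approximately 0.5, matching the module's output below
```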
29 |
30 | ## How to Use
31 | At minimum, this metric requires two `list`s of datapoints:
32 |
33 | ```python
34 | >>> mahalanobis_metric = evaluate.load("mahalanobis")
35 | >>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]])
36 | ```
37 |
38 | ### Inputs
39 | - `X` (`list`): data points to be compared with the `reference_distribution`.
40 | - `reference_distribution` (`list`): data points from the reference distribution that we want to compare to.
41 |
42 | ### Output Values
43 | `mahalanobis` (`array`): the Mahalanobis distance for each data point in `X`.
44 |
45 | ```python
46 | >>> print(results)
47 | {'mahalanobis': array([0.5])}
48 | ```
49 |
50 | #### Values from Popular Papers
51 | *N/A*
52 |
53 | ### Example
54 |
55 | ```python
56 | >>> mahalanobis_metric = evaluate.load("mahalanobis")
57 | >>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]])
58 | >>> print(results)
59 | {'mahalanobis': array([0.5])}
60 | ```
61 |
62 | ## Limitations and Bias
63 |
64 | The Mahalanobis distance is only able to capture linear relationships between the variables, which means it cannot capture all types of outliers. Mahalanobis distance also fails to faithfully represent data that is highly skewed or multimodal.
65 |
66 | ## Citation
67 | ```bibtex
68 | @inproceedings{mahalanobis1936generalized,
69 | title={On the generalized distance in statistics},
70 | author={Mahalanobis, Prasanta Chandra},
71 | year={1936},
72 | organization={National Institute of Science of India}
73 | }
74 | ```
75 |
76 | ```bibtex
77 | @article{de2000mahalanobis,
78 | title={The Mahalanobis distance},
79 | author={De Maesschalck, Roy and Jouan-Rimbaud, Delphine and Massart, D{\'e}sir{\'e} L},
80 | journal={Chemometrics and intelligent laboratory systems},
81 | volume={50},
82 | number={1},
83 | pages={1--18},
84 | year={2000},
85 | publisher={Elsevier}
86 | }
87 | ```
88 |
89 | ## Further References
90 | - [Wikipedia -- Mahalanobis Distance](https://en.wikipedia.org/wiki/Mahalanobis_distance)
91 | - [Machine Learning Plus -- Mahalanobis Distance](https://www.machinelearningplus.com/statistics/mahalanobis-distance/)
93 |
--------------------------------------------------------------------------------
/metrics/mahalanobis/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mahalanobis")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/mahalanobis/mahalanobis.py:
--------------------------------------------------------------------------------
1 | # Copyright 2021 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """Mahalanobis metric."""
15 |
16 | import datasets
17 | import numpy as np
18 |
19 | import evaluate
20 |
21 |
22 | _DESCRIPTION = """
23 | Compute the Mahalanobis Distance
24 |
25 | Mahalanobis distance is the distance between a point and a distribution,
26 | not between two distinct points. It is effectively a multivariate equivalent of the Euclidean distance.
27 | It was introduced by Prof. P. C. Mahalanobis in 1936
28 | and has been used in various statistical applications ever since
29 | [source: https://www.machinelearningplus.com/statistics/mahalanobis-distance/]
30 | """
31 |
32 | _CITATION = """\
33 | @article{de2000mahalanobis,
34 | title={The mahalanobis distance},
35 | author={De Maesschalck, Roy and Jouan-Rimbaud, Delphine and Massart, D{\'e}sir{\'e} L},
36 | journal={Chemometrics and intelligent laboratory systems},
37 | volume={50},
38 | number={1},
39 | pages={1--18},
40 | year={2000},
41 | publisher={Elsevier}
42 | }
43 | """
44 |
45 | _KWARGS_DESCRIPTION = """
46 | Args:
47 | X: List of datapoints to be compared with the `reference_distribution`.
48 | reference_distribution: List of datapoints from the reference distribution we want to compare to.
49 | Returns:
50 |     mahalanobis: The Mahalanobis distance for each datapoint in `X`.
51 | Examples:
52 |
53 | >>> mahalanobis_metric = evaluate.load("mahalanobis")
54 | >>> results = mahalanobis_metric.compute(reference_distribution=[[0, 1], [1, 0]], X=[[0, 1]])
55 | >>> print(results)
56 | {'mahalanobis': array([0.5])}
57 | """
58 |
59 |
60 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61 | class Mahalanobis(evaluate.Metric):
62 | def _info(self):
63 | return evaluate.MetricInfo(
64 | description=_DESCRIPTION,
65 | citation=_CITATION,
66 | inputs_description=_KWARGS_DESCRIPTION,
67 | features=datasets.Features(
68 | {
69 | "X": datasets.Sequence(datasets.Value("float", id="sequence"), id="X"),
70 | }
71 | ),
72 | )
73 |
74 | def _compute(self, X, reference_distribution):
75 |
76 | # convert to numpy arrays
77 | X = np.array(X)
78 | reference_distribution = np.array(reference_distribution)
79 |
80 | # Assert that arrays are 2D
81 | if len(X.shape) != 2:
82 | raise ValueError("Expected `X` to be a 2D vector")
83 | if len(reference_distribution.shape) != 2:
84 | raise ValueError("Expected `reference_distribution` to be a 2D vector")
85 | if reference_distribution.shape[0] < 2:
86 | raise ValueError(
87 | "Expected `reference_distribution` to be a 2D vector with more than one element in the first dimension"
88 | )
89 |
90 | # Get mahalanobis distance for each prediction
91 |         X_minus_mu = X - np.mean(reference_distribution, axis=0)  # subtract the per-feature mean of the reference distribution
92 | cov = np.cov(reference_distribution.T)
93 | try:
94 | inv_covmat = np.linalg.inv(cov)
95 | except np.linalg.LinAlgError:
96 | inv_covmat = np.linalg.pinv(cov)
97 | left_term = np.dot(X_minus_mu, inv_covmat)
98 | mahal_dist = np.dot(left_term, X_minus_mu.T).diagonal()
99 |
100 | return {"mahalanobis": mahal_dist}
101 |
--------------------------------------------------------------------------------
/metrics/mahalanobis/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/mape/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mape")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/mape/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
3 |
--------------------------------------------------------------------------------
/metrics/mase/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mase")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/mase/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
3 |
--------------------------------------------------------------------------------
/metrics/matthews_correlation/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("matthews_correlation")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/matthews_correlation/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/mauve/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import evaluate
4 | from evaluate.utils import launch_gradio_widget
5 |
6 |
7 | sys.path = [p for p in sys.path if p != "/home/user/app"]
8 | module = evaluate.load("mauve")
9 | sys.path = ["/home/user/app"] + sys.path
10 |
11 | launch_gradio_widget(module)
12 |
--------------------------------------------------------------------------------
/metrics/mauve/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | faiss-cpu
3 | scikit-learn
4 | mauve-text
--------------------------------------------------------------------------------
/metrics/mean_iou/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mean_iou")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/mean_iou/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/meteor/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("meteor")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/meteor/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | nltk
--------------------------------------------------------------------------------
/metrics/mse/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("mse")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/mse/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/nist_mt/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: NIST_MT
3 | emoji: 🤗
4 | colorFrom: purple
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | - machine-translation
14 | description:
15 | DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU score.
16 | ---
17 |
18 | # Metric Card for NIST's MT metric
19 |
20 |
21 | ## Metric Description
22 | DARPA commissioned NIST to develop an MT evaluation facility based on the BLEU
23 | score. The official script used by NIST to compute BLEU and NIST score is
24 | mteval-14.pl. The main differences are:
25 |
26 | - BLEU uses the geometric mean of the n-gram overlaps, while NIST uses the arithmetic mean (see the toy comparison after this list).
27 | - NIST has a different brevity penalty
28 | - NIST score from mteval-14.pl has a self-contained tokenizer (in the Hugging Face implementation we rely on NLTK's
29 | implementation of the NIST-specific tokenizer)
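
The toy comparison below only illustrates the first bullet (arithmetic vs. geometric mean of n-gram precisions) with made-up numbers; it is not the NIST formula, which additionally weights n-grams by their information gain.

```python
# Made-up n-gram precisions for orders 1-4.
precisions = [0.8, 0.5, 0.25, 0.0]

arithmetic_mean = sum(precisions) / len(precisions)

geometric_mean = 1.0
for p in precisions:
    geometric_mean *= p
geometric_mean **= 1.0 / len(precisions)

# A single zero precision collapses the geometric mean (as in BLEU),
# while the arithmetic mean (as in NIST) degrades gracefully.
print(arithmetic_mean, geometric_mean)  # 0.3875 0.0
```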
30 |
31 | ## Intended Uses
32 | NIST was developed for machine translation evaluation.
33 |
34 | ## How to Use
35 |
36 | ```python
37 | import evaluate
38 | nist_mt = evaluate.load("nist_mt")
39 | hypothesis1 = "It is a guide to action which ensures that the military always obeys the commands of the party"
40 | reference1 = "It is a guide to action that ensures that the military will forever heed Party commands"
41 | reference2 = "It is the guiding principle which guarantees the military forces always being under the command of the Party"
42 | nist_mt.compute(predictions=[hypothesis1], references=[[reference1, reference2]])
43 | # {'nist_mt': 3.3709935957649324}
44 | ```
45 |
46 | ### Inputs
47 | - **predictions**: tokenized predictions to score. For sentence-level NIST, a list of tokens (str);
48 | for corpus-level NIST, a list (sentences) of lists of tokens (str)
49 | - **references**: potentially multiple tokenized references for each prediction. For sentence-level NIST, a
50 | list (multiple potential references) of list of tokens (str); for corpus-level NIST, a list (corpus) of lists
51 | (multiple potential references) of lists of tokens (str)
52 | - **n**: highest n-gram order
53 | - **tokenize_kwargs**: arguments passed to the tokenizer (see: https://github.com/nltk/nltk/blob/90fa546ea600194f2799ee51eaf1b729c128711e/nltk/tokenize/nist.py#L139)
54 |
55 | ### Output Values
56 | - **nist_mt** (`float`): NIST score
57 |
58 | Output Example:
59 | ```python
60 | {'nist_mt': 3.3709935957649324}
61 | ```
62 |
63 |
64 | ## Citation
65 | ```bibtex
66 | @inproceedings{10.5555/1289189.1289273,
67 | author = {Doddington, George},
68 | title = {Automatic Evaluation of Machine Translation Quality Using N-Gram Co-Occurrence Statistics},
69 | year = {2002},
70 | publisher = {Morgan Kaufmann Publishers Inc.},
71 | address = {San Francisco, CA, USA},
72 | booktitle = {Proceedings of the Second International Conference on Human Language Technology Research},
73 | pages = {138–145},
74 | numpages = {8},
75 | location = {San Diego, California},
76 | series = {HLT '02}
77 | }
78 | ```
79 |
80 | ## Further References
81 |
82 | This Hugging Face implementation uses [the NLTK implementation](https://github.com/nltk/nltk/blob/develop/nltk/translate/nist_score.py)
83 |
--------------------------------------------------------------------------------
/metrics/nist_mt/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("nist_mt")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/nist_mt/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | nltk
3 |
--------------------------------------------------------------------------------
/metrics/nist_mt/tests.py:
--------------------------------------------------------------------------------
1 | from _pytest.fixtures import fixture
2 | from nist_mt import Nist_mt
3 |
4 |
5 | nist = Nist_mt()
6 |
7 |
8 | @fixture
9 | def hypothesis_sent():
10 | return "It is a guide to action which ensures that the military always obeys the commands of the party"
11 |
12 |
13 | @fixture
14 | def reference_sent1():
15 | return "It is a guide to action that ensures that the military will forever heed Party commands"
16 |
17 |
18 | @fixture
19 | def reference_sent2():
20 | return (
21 | "It is the guiding principle which guarantees the military forces always being under the command of the Party"
22 | )
23 |
24 |
25 | @fixture
26 | def reference_sent3():
27 | return "It is the practical guide for the army always to heed the directions of the party"
28 |
29 |
30 | def test_nist_sentence(hypothesis_sent, reference_sent1, reference_sent2, reference_sent3):
31 | nist_score = nist.compute(
32 | predictions=[hypothesis_sent], references=[[reference_sent1, reference_sent2, reference_sent3]]
33 | )
34 | assert abs(nist_score["nist_mt"] - 3.3709935957649324) < 1e-6
35 |
--------------------------------------------------------------------------------
/metrics/pearsonr/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("pearsonr")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/pearsonr/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
--------------------------------------------------------------------------------
/metrics/perplexity/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("perplexity", module_type="metric")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/perplexity/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | torch
3 | transformers
--------------------------------------------------------------------------------
/metrics/poseval/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("poseval")
6 |
7 | launch_gradio_widget(module)
8 |
--------------------------------------------------------------------------------
/metrics/poseval/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/precision/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("precision")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/precision/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/r_squared/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: r_squared
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.0.2
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | The R^2 (R Squared) metric is a measure of the goodness of fit of a linear regression model. It is the proportion of the variance in the dependent variable that is predictable from the independent variable.
15 | ---
16 |
17 | # Metric Card for R^2
18 |
19 | ## Metric description
20 |
21 | An R-squared value of 1 indicates that the model perfectly explains the variance of the dependent variable. A value of 0 means that the model does not explain any of the variance. Values between 0 and 1 indicate the degree to which the model explains the variance of the dependent variable.
22 |
23 | It is computed as one minus the ratio of the Sum of Squared Errors to the Sum of Squared Total, where the Sum of Squared Errors is the sum of the squared differences between the predicted values and the true values, and the Sum of Squared Total is the sum of the squared differences between the true values and the mean of the true values.
24 |
25 | For example, if an R-squared value for a model is 0.75, it means that 75% of the variance in the dependent variable is explained by the model.
26 |
27 | R-squared is not always a reliable measure of the quality of a regression model, particularly when you have a small sample size or there are multiple independent variables. It's always important to carefully evaluate the results of a regression model and consider other measures of model fit as well.
28 |
29 | R-squared can be calculated using the following formula:
30 |
31 | ```python
32 | r_squared = 1 - (Sum of Squared Errors / Sum of Squared Total)
33 | ```
34 |
35 | * Calculate the residual sum of squares (RSS), which is the sum of the squared differences between the predicted values and the actual values.
36 | * Calculate the total sum of squares (TSS), which is the sum of the squared differences between the actual values and the mean of the actual values.
37 | * Calculate the R-squared value by taking 1 - (RSS / TSS).
38 |
39 | In terms of these quantities, the formula can equivalently be written as:
40 | ```python
41 | r_squared = 1 - (RSS / TSS)
42 | ```
43 |
44 | ### How to Use
45 | 
46 | The `r_squared` metric can be loaded with `evaluate.load` and used to compute the R^2 value for a given set of predictions and references. It takes two inputs: `predictions` (a list of predicted values) and `references` (a list of true values).
47 |
48 | ```python
49 | >>> import evaluate
50 | >>> r2_metric = evaluate.load("r_squared")
51 | >>> r_squared = r2_metric.compute(predictions=[1, 2, 3, 4], references=[0.9, 2.1, 3.2, 3.8])
52 | >>> print(r_squared)
53 | 0.98
54 | ```
55 |
56 | Alternatively, if you want to see an example where there is a perfect match between the prediction and reference:
57 | ```python
58 | >>> import evaluate
59 | >>> r2_metric = evaluate.load("r_squared")
60 | >>> r_squared = r2_metric.compute(predictions=[1, 2, 3, 4], references=[1, 2, 3, 4])
61 | >>> print(r_squared)
62 | 1.0
63 | ```
64 |
65 | ## Limitations and Bias
66 | R^2 is a statistical measure of the goodness of fit of a regression model. It represents the proportion of the variance in the dependent variable that is predictable from the independent variables. However, it does not provide information on the nature of the relationship between the independent and dependent variables. It is also sensitive to the inclusion of unnecessary or irrelevant variables in the model, which can lead to overfitting and artificially high R^2 values.
67 |
68 | ## Citation
69 |
70 | ```bibtex
71 | @article{r_squared_model,
72 | title={The R^2 Model Metric: A Comprehensive Guide},
73 | author={John Doe},
74 | journal={Journal of Model Evaluation},
75 | volume={10},
76 | number={2},
77 | pages={101-112},
78 | year={2022},
79 | publisher={Model Evaluation Society}}
80 | ```
81 |
82 | ## Further References
83 |
84 | - [The Open University: R-Squared](https://www.open.edu/openlearn/ocw/mod/oucontent/view.php?id=55450&section=3.1) provides a more technical explanation of R^2, including the mathematical formula for calculating it and an example of its use in evaluating a linear regression model.
85 |
86 | - [Khan Academy: R-Squared](https://www.khanacademy.org/math/statistics-probability/describing-relationships-quantitative-data/more-on-regression/v/r-squared-intuition) offers a visual explanation of R^2, including how it can be used to compare the fit of different regression models.
87 |
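88 | To make the RSS/TSS steps described above concrete, the score can also be computed by hand in plain Python. The helper below (`r_squared_manual`, a name used only for this illustration) is a minimal sketch of the formula, not the implementation used by `evaluate`:
89 | 
90 | ```python
91 | # Illustrative helper, not part of the evaluate API
92 | def r_squared_manual(predictions, references):
93 |     # Residual sum of squares: squared differences between predictions and true values
94 |     rss = sum((p - r) ** 2 for p, r in zip(predictions, references))
95 |     # Total sum of squares: squared differences between true values and their mean
96 |     mean_ref = sum(references) / len(references)
97 |     tss = sum((r - mean_ref) ** 2 for r in references)
98 |     return 1 - rss / tss
99 | 
100 | # Same data as the first example above; prints 0.98
101 | print(round(r_squared_manual([1, 2, 3, 4], [0.9, 2.1, 3.2, 3.8]), 2))
102 | ```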
--------------------------------------------------------------------------------
/metrics/r_squared/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("r_squared")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/r_squared/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 |
--------------------------------------------------------------------------------
/metrics/recall/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("recall")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/recall/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/rl_reliability/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("rl_reliability", "online")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/rl_reliability/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | git+https://github.com/google-research/rl-reliability-metrics
3 | scipy
4 | tensorflow
5 | gin-config
--------------------------------------------------------------------------------
/metrics/roc_auc/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("roc_auc")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/roc_auc/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/rouge/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("rouge")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/rouge/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | absl-py
3 | nltk
4 | rouge_score>=0.1.2
--------------------------------------------------------------------------------
/metrics/sacrebleu/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import evaluate
4 | from evaluate.utils import launch_gradio_widget
5 |
6 |
7 | sys.path = [p for p in sys.path if p != "/home/user/app"]
8 | module = evaluate.load("sacrebleu")
9 | sys.path = ["/home/user/app"] + sys.path
10 |
11 | launch_gradio_widget(module)
12 |
--------------------------------------------------------------------------------
/metrics/sacrebleu/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | sacrebleu
--------------------------------------------------------------------------------
/metrics/sari/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("sari")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/sari/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | sacrebleu
3 | sacremoses
--------------------------------------------------------------------------------
/metrics/seqeval/app.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import evaluate
4 | from evaluate.utils import launch_gradio_widget
5 |
6 |
7 | sys.path = [p for p in sys.path if p != "/home/user/app"]
8 | module = evaluate.load("seqeval")
9 | sys.path = ["/home/user/app"] + sys.path
10 |
11 | launch_gradio_widget(module)
12 |
--------------------------------------------------------------------------------
/metrics/seqeval/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | seqeval
--------------------------------------------------------------------------------
/metrics/smape/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: sMAPE
3 | emoji: 🤗
4 | colorFrom: blue
5 | colorTo: red
6 | sdk: gradio
7 | sdk_version: 3.19.1
8 | app_file: app.py
9 | pinned: false
10 | tags:
11 | - evaluate
12 | - metric
13 | description: >-
14 | Symmetric Mean Absolute Percentage Error (sMAPE) is the symmetric mean of the percentage errors between the predicted and actual values, as defined by Chen and Yang (2004).
15 | ---
16 |
17 | # Metric Card for sMAPE
18 |
19 |
20 | ## Metric Description
21 |
22 | Symmetric Mean Absolute Percentage Error (sMAPE) is the symmetric mean of the absolute percentage errors between the predicted values $x_i$ and the actual values $y_i$:
23 |
24 | 
25 |
26 |
27 | ## How to Use
28 |
29 | At minimum, this metric requires predictions and references as inputs.
30 |
31 | ```python
32 | >>> smape_metric = evaluate.load("smape")
33 | >>> predictions = [2.5, 0.0, 2, 8]
34 | >>> references = [3, -0.5, 2, 7]
35 | >>> results = smape_metric.compute(predictions=predictions, references=references)
36 | ```
37 |
38 | ### Inputs
39 |
40 | Mandatory inputs:
41 | - `predictions`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the estimated target values.
42 | - `references`: numeric array-like of shape (`n_samples,`) or (`n_samples`, `n_outputs`), representing the ground truth (correct) target values.
43 |
44 | Optional arguments:
45 | - `sample_weight`: numeric array-like of shape (`n_samples,`) representing sample weights. The default is `None`.
46 | - `multioutput`: `raw_values`, `uniform_average` or numeric array-like of shape (`n_outputs,`), which defines the aggregation of multiple output values. The default value is `uniform_average`.
47 | - `raw_values` returns a full set of errors in case of multioutput input.
48 | - `uniform_average` means that the errors of all outputs are averaged with uniform weight.
49 | - the array-like value defines weights used to average errors.
50 |
51 | ### Output Values
52 | This metric outputs a dictionary, containing the symmetric mean absolute percentage error score, which is of type:
53 | - `float`: if multioutput is `uniform_average` or an ndarray of weights, then the weighted average of all output errors is returned.
54 | - numeric array-like of shape (`n_outputs,`): if multioutput is `raw_values`, then the score is returned for each output separately.
55 |
56 | Each sMAPE `float` value ranges from `0.0` to `2.0`, with the best value being 0.0.
57 |
58 | Output Example(s):
59 | ```python
60 | {'smape': 0.5}
61 | ```
62 |
63 | If `multioutput="raw_values"`:
64 | ```python
65 | {'smape': array([0.5, 1.5])}
66 | ```
67 |
68 | #### Values from Popular Papers
69 |
70 |
71 | ### Examples
72 |
73 | Example with the `uniform_average` config:
74 | ```python
75 | >>> smape_metric = evaluate.load("smape")
76 | >>> predictions = [2.5, 0.0, 2, 8]
77 | >>> references = [3, -0.5, 2, 7]
78 | >>> results = smape_metric.compute(predictions=predictions, references=references)
79 | >>> print(results)
80 | {'smape': 0.5787...}
81 | ```
82 |
83 | Example with multi-dimensional lists, and the `raw_values` config:
84 | ```python
85 | >>> smape_metric = evaluate.load("smape", "multilist")
86 | >>> predictions = [[0.5, 1], [-1, 1], [7, -6]]
87 | >>> references = [[0.1, 2], [-1, 2], [8, -5]]
88 | >>> results = smape_metric.compute(predictions=predictions, references=references)
89 | >>> print(results)
90 | {'smape': 0.8874...}
91 | >>> results = smape_metric.compute(predictions=predictions, references=references, multioutput='raw_values')
92 | >>> print(results)
93 | {'smape': array([1.3749..., 0.4])}
94 | ```
95 |
96 | ## Limitations and Bias
97 | This metric is called a measure of "percentage error" even though there is no multiplier of 100. The range is between (0, 2), with a single sample attaining the maximum value of two when the target and the prediction have opposite signs, or when exactly one of them is zero.
98 |
99 | ## Citation(s)
100 |
101 | ```bibtex
102 | @article{article,
103 | author = {Chen, Zhuo and Yang, Yuhong},
104 | year = {2004},
105 | month = {04},
106 | pages = {},
107 | title = {Assessing forecast accuracy measures}
108 | }
109 | ```
110 |
111 | ## Further References
112 | - [Symmetric Mean absolute percentage error - Wikipedia](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error)
113 |
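114 | For reference, the formula above can be sketched directly with NumPy. This is only an illustration: the helper name `smape_manual` and the `eps` guard (which makes a sample where both values are zero contribute zero) are assumptions of the sketch, not necessarily how the packaged implementation handles that case.
115 | 
116 | ```python
117 | import numpy as np
118 | 
119 | def smape_manual(predictions, references, eps=np.finfo(np.float64).eps):
120 |     # Illustrative sketch of the sMAPE formula, not the packaged implementation
121 |     x = np.asarray(predictions, dtype=float)
122 |     y = np.asarray(references, dtype=float)
123 |     return float(np.mean(2 * np.abs(x - y) / np.maximum(np.abs(x) + np.abs(y), eps)))
124 | 
125 | # Same data as the uniform_average example above; prints approximately 0.5787
126 | print(smape_manual([2.5, 0.0, 2, 8], [3, -0.5, 2, 7]))
127 | ```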
--------------------------------------------------------------------------------
/metrics/smape/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("smape")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/smape/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
3 |
--------------------------------------------------------------------------------
/metrics/spearmanr/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("spearmanr")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/spearmanr/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scipy
--------------------------------------------------------------------------------
/metrics/squad/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("squad")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/squad/compute_score.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 |
3 | import argparse
4 | import json
5 | import re
6 | import string
7 | import sys
8 | from collections import Counter
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 |
14 | def remove_articles(text):
15 | return re.sub(r"\b(a|an|the)\b", " ", text)
16 |
17 | def white_space_fix(text):
18 | return " ".join(text.split())
19 |
20 | def remove_punc(text):
21 | exclude = set(string.punctuation)
22 | return "".join(ch for ch in text if ch not in exclude)
23 |
24 | def lower(text):
25 | return text.lower()
26 |
27 | return white_space_fix(remove_articles(remove_punc(lower(s))))
28 |
29 |
30 | def f1_score(prediction, ground_truth):
31 | prediction_tokens = normalize_answer(prediction).split()
32 | ground_truth_tokens = normalize_answer(ground_truth).split()
33 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
34 | num_same = sum(common.values())
35 | if num_same == 0:
36 | return 0
37 | precision = 1.0 * num_same / len(prediction_tokens)
38 | recall = 1.0 * num_same / len(ground_truth_tokens)
39 | f1 = (2 * precision * recall) / (precision + recall)
40 | return f1
41 |
42 |
43 | def exact_match_score(prediction, ground_truth):
44 | return normalize_answer(prediction) == normalize_answer(ground_truth)
45 |
46 |
47 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
48 | scores_for_ground_truths = []
49 | for ground_truth in ground_truths:
50 | score = metric_fn(prediction, ground_truth)
51 | scores_for_ground_truths.append(score)
52 | return max(scores_for_ground_truths)
53 |
54 |
55 | def compute_score(dataset, predictions):
56 | f1 = exact_match = total = 0
57 | for article in dataset:
58 | for paragraph in article["paragraphs"]:
59 | for qa in paragraph["qas"]:
60 | total += 1
61 | if qa["id"] not in predictions:
62 | message = "Unanswered question " + qa["id"] + " will receive score 0."
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x["text"], qa["answers"]))
66 | prediction = predictions[qa["id"]]
67 | exact_match += metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
68 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
69 |
70 | exact_match = 100.0 * exact_match / total
71 | f1 = 100.0 * f1 / total
72 |
73 | return {"exact_match": exact_match, "f1": f1}
74 |
75 |
76 | if __name__ == "__main__":
77 | expected_version = "1.1"
78 | parser = argparse.ArgumentParser(description="Evaluation for SQuAD " + expected_version)
79 | parser.add_argument("dataset_file", help="Dataset file")
80 | parser.add_argument("prediction_file", help="Prediction File")
81 | args = parser.parse_args()
82 | with open(args.dataset_file) as dataset_file:
83 | dataset_json = json.load(dataset_file)
84 | if dataset_json["version"] != expected_version:
85 | print(
86 | "Evaluation expects v-" + expected_version + ", but got dataset with v-" + dataset_json["version"],
87 | file=sys.stderr,
88 | )
89 | dataset = dataset_json["data"]
90 | with open(args.prediction_file) as prediction_file:
91 | predictions = json.load(prediction_file)
92 | print(json.dumps(compute_score(dataset, predictions)))
93 |
--------------------------------------------------------------------------------
/metrics/squad/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/squad_v2/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("squad_v2")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/squad_v2/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/super_glue/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("super_glue", "copa")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/super_glue/record_evaluation.py:
--------------------------------------------------------------------------------
1 | """
2 | Official evaluation script for ReCoRD v1.0.
3 | (Some functions are adopted from the SQuAD evaluation script.)
4 | """
5 |
6 |
7 | import argparse
8 | import json
9 | import re
10 | import string
11 | import sys
12 | from collections import Counter
13 |
14 |
15 | def normalize_answer(s):
16 | """Lower text and remove punctuation, articles and extra whitespace."""
17 |
18 | def remove_articles(text):
19 | return re.sub(r"\b(a|an|the)\b", " ", text)
20 |
21 | def white_space_fix(text):
22 | return " ".join(text.split())
23 |
24 | def remove_punc(text):
25 | exclude = set(string.punctuation)
26 | return "".join(ch for ch in text if ch not in exclude)
27 |
28 | def lower(text):
29 | return text.lower()
30 |
31 | return white_space_fix(remove_articles(remove_punc(lower(s))))
32 |
33 |
34 | def f1_score(prediction, ground_truth):
35 | prediction_tokens = normalize_answer(prediction).split()
36 | ground_truth_tokens = normalize_answer(ground_truth).split()
37 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
38 | num_same = sum(common.values())
39 | if num_same == 0:
40 | return 0
41 | precision = 1.0 * num_same / len(prediction_tokens)
42 | recall = 1.0 * num_same / len(ground_truth_tokens)
43 | f1 = (2 * precision * recall) / (precision + recall)
44 | return f1
45 |
46 |
47 | def exact_match_score(prediction, ground_truth):
48 | return normalize_answer(prediction) == normalize_answer(ground_truth)
49 |
50 |
51 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
52 | scores_for_ground_truths = []
53 | for ground_truth in ground_truths:
54 | score = metric_fn(prediction, ground_truth)
55 | scores_for_ground_truths.append(score)
56 | return max(scores_for_ground_truths)
57 |
58 |
59 | def evaluate(dataset, predictions):
60 | f1 = exact_match = total = 0
61 | correct_ids = []
62 | for passage in dataset:
63 | for qa in passage["qas"]:
64 | total += 1
65 | if qa["id"] not in predictions:
66 | message = f'Unanswered question {qa["id"]} will receive score 0.'
67 | print(message, file=sys.stderr)
68 | continue
69 |
70 | ground_truths = list(map(lambda x: x["text"], qa["answers"]))
71 | prediction = predictions[qa["id"]]
72 |
73 | _exact_match = metric_max_over_ground_truths(exact_match_score, prediction, ground_truths)
74 | if int(_exact_match) == 1:
75 | correct_ids.append(qa["id"])
76 | exact_match += _exact_match
77 |
78 | f1 += metric_max_over_ground_truths(f1_score, prediction, ground_truths)
79 |
80 | exact_match = exact_match / total
81 | f1 = f1 / total
82 |
83 | return {"exact_match": exact_match, "f1": f1}, correct_ids
84 |
85 |
86 | if __name__ == "__main__":
87 | expected_version = "1.0"
88 | parser = argparse.ArgumentParser("Official evaluation script for ReCoRD v1.0.")
89 | parser.add_argument("data_file", help="The dataset file in JSON format.")
90 | parser.add_argument("pred_file", help="The model prediction file in JSON format.")
91 | parser.add_argument("--output_correct_ids", action="store_true", help="Output the correctly answered query IDs.")
92 | args = parser.parse_args()
93 |
94 | with open(args.data_file) as data_file:
95 | dataset_json = json.load(data_file)
96 | if dataset_json["version"] != expected_version:
97 | print(
98 | f'Evaluation expects v-{expected_version}, but got dataset with v-{dataset_json["version"]}',
99 | file=sys.stderr,
100 | )
101 | dataset = dataset_json["data"]
102 |
103 | with open(args.pred_file) as pred_file:
104 | predictions = json.load(pred_file)
105 |
106 | metrics, correct_ids = evaluate(dataset, predictions)
107 |
108 | if args.output_correct_ids:
109 | print(f"Output {len(correct_ids)} correctly answered question IDs.")
110 | with open("correct_ids.json", "w") as f:
111 | json.dump(correct_ids, f)
112 |
--------------------------------------------------------------------------------
/metrics/super_glue/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/metrics/ter/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("ter")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/ter/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | sacrebleu
--------------------------------------------------------------------------------
/metrics/trec_eval/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("trec_eval")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/trec_eval/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | trectools
--------------------------------------------------------------------------------
/metrics/wer/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("wer")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/wer/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | jiwer
--------------------------------------------------------------------------------
/metrics/wiki_split/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("wiki_split")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/wiki_split/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | sacrebleu
3 | sacremoses
--------------------------------------------------------------------------------
/metrics/xnli/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("xnli")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/xnli/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
--------------------------------------------------------------------------------
/metrics/xnli/xnli.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Evaluate Authors.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """ XNLI benchmark metric. """
15 |
16 | import datasets
17 |
18 | import evaluate
19 |
20 |
21 | _CITATION = """\
22 | @InProceedings{conneau2018xnli,
23 | author = "Conneau, Alexis
24 | and Rinott, Ruty
25 | and Lample, Guillaume
26 | and Williams, Adina
27 | and Bowman, Samuel R.
28 | and Schwenk, Holger
29 | and Stoyanov, Veselin",
30 | title = "XNLI: Evaluating Cross-lingual Sentence Representations",
31 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods
32 | in Natural Language Processing",
33 | year = "2018",
34 | publisher = "Association for Computational Linguistics",
35 | location = "Brussels, Belgium",
36 | }
37 | """
38 |
39 | _DESCRIPTION = """\
40 | XNLI is a subset of a few thousand examples from MNLI which has been translated
41 | into 14 different languages (some low-ish resource). As with MNLI, the goal is
42 | to predict textual entailment (does sentence A imply/contradict/neither sentence
43 | B) and is a classification task (given two sentences, predict one of three
44 | labels).
45 | """
46 |
47 | _KWARGS_DESCRIPTION = """
48 | Computes XNLI score which is just simple accuracy.
49 | Args:
50 | predictions: Predicted labels.
51 | references: Ground truth labels.
52 | Returns:
53 | 'accuracy': accuracy
54 | Examples:
55 |
56 | >>> predictions = [0, 1]
57 | >>> references = [0, 1]
58 | >>> xnli_metric = evaluate.load("xnli")
59 | >>> results = xnli_metric.compute(predictions=predictions, references=references)
60 | >>> print(results)
61 | {'accuracy': 1.0}
62 | """
63 |
64 |
65 | def simple_accuracy(preds, labels):
66 | return (preds == labels).mean()
67 |
68 |
69 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
70 | class Xnli(evaluate.Metric):
71 | def _info(self):
72 | return evaluate.MetricInfo(
73 | description=_DESCRIPTION,
74 | citation=_CITATION,
75 | inputs_description=_KWARGS_DESCRIPTION,
76 | features=datasets.Features(
77 | {
78 | "predictions": datasets.Value("int64" if self.config_name != "sts-b" else "float32"),
79 | "references": datasets.Value("int64" if self.config_name != "sts-b" else "float32"),
80 | }
81 | ),
82 | codebase_urls=[],
83 | reference_urls=[],
84 | format="numpy",
85 | )
86 |
87 | def _compute(self, predictions, references):
88 | return {"accuracy": simple_accuracy(predictions, references)}
89 |
--------------------------------------------------------------------------------
/metrics/xtreme_s/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("xtreme_s", "mls")
6 | launch_gradio_widget(module)
7 |
--------------------------------------------------------------------------------
/metrics/xtreme_s/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@{COMMIT_PLACEHOLDER}
2 | scikit-learn
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | license_file = LICENSE
3 |
4 | [isort]
5 | ensure_newline_before_comments = True
6 | force_grid_wrap = 0
7 | include_trailing_comma = True
8 | line_length = 119
9 | lines_after_imports = 2
10 | multi_line_output = 3
11 | use_parentheses = True
12 |
13 | [flake8]
14 | ignore = E203, E501, W503
15 | max-line-length = 119
16 | exclude =
17 | src/datasets/datasets
18 | src/datasets/metrics
19 | per-file-ignores =
20 | metrics/*:F401
21 |
--------------------------------------------------------------------------------
/src/evaluate/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # Copyright 2020 The HuggingFace Evaluate Authors and the TensorFlow Datasets Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | # Lint as: python3
17 | # pylint: enable=line-too-long
18 | # pylint: disable=g-import-not-at-top,g-bad-import-order,wrong-import-position
19 |
20 | __version__ = "0.4.4.dev0"
21 |
22 | from packaging import version
23 |
24 |
25 | SCRIPTS_VERSION = "main" if version.parse(__version__).is_devrelease else __version__
26 |
27 | del version
28 |
29 | from .evaluation_suite import EvaluationSuite
30 | from .evaluator import (
31 | AudioClassificationEvaluator,
32 | AutomaticSpeechRecognitionEvaluator,
33 | Evaluator,
34 | ImageClassificationEvaluator,
35 | QuestionAnsweringEvaluator,
36 | SummarizationEvaluator,
37 | Text2TextGenerationEvaluator,
38 | TextClassificationEvaluator,
39 | TextGenerationEvaluator,
40 | TokenClassificationEvaluator,
41 | TranslationEvaluator,
42 | evaluator,
43 | )
44 | from .hub import push_to_hub
45 | from .info import ComparisonInfo, EvaluationModuleInfo, MeasurementInfo, MetricInfo
46 | from .inspect import inspect_evaluation_module, list_evaluation_modules
47 | from .loading import load
48 | from .module import CombinedEvaluations, Comparison, EvaluationModule, Measurement, Metric, combine
49 | from .saving import save
50 | from .utils import *
51 | from .utils import gradio, logging
52 |
--------------------------------------------------------------------------------
/src/evaluate/commands/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/evaluate/5aa3982a9a8c86e506860e381d428a64b0cce73b/src/evaluate/commands/__init__.py
--------------------------------------------------------------------------------
/src/evaluate/evaluator/text_generation.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Evaluate Authors.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from typing import Dict, Tuple
16 |
17 | from datasets import Dataset
18 |
19 | from .base import Evaluator
20 | from .utils import DatasetColumn
21 |
22 |
23 | TASK_DOCUMENTATION_KWARGS = r"""
24 | input_column (`str`, defaults to `"text"`):
25 | the name of the column containing the input text in the dataset specified by `data`.
26 | generation_kwargs (`Dict`, *optional*, defaults to `None`):
27 | The generation kwargs are passed to the pipeline and set the text generation strategy.
28 | """
29 |
30 |
31 | class TextGenerationEvaluator(Evaluator):
32 | """
33 | Text generation evaluator.
34 | This Text generation evaluator can currently be loaded from [`evaluator`] using the default task name
35 | `text-generation`.
36 | Methods in this class assume a data format compatible with the [`~transformers.TextGenerationPipeline`].
37 | """
38 |
39 | def predictions_processor(self, predictions, *args, **kwargs):
40 | """
41 | Args:
42 | predictions: A list of lists of dicts
43 |
44 | Returns:
45 | `dict`: All the generated texts are flattened and stored under the "data" key.
46 | """
47 | return {"data": [pred[f"{self.predictions_prefix}_text"] for pred_list in predictions for pred in pred_list]}
48 |
49 | def __init__(self, task="text-generation", default_metric_name=None, predictions_prefix: str = "generated"):
50 | super().__init__(task=task, default_metric_name=default_metric_name)
51 | self.predictions_prefix = predictions_prefix
52 |
53 | def prepare_data(self, data: Dataset, input_column: str, *args, **kwargs) -> Tuple[Dict, DatasetColumn]:
54 | """
55 | Prepare data.
56 |
57 | Args:
58 | data ([`Dataset`]):
59 | Specifies the dataset we will run evaluation on.
60 | input_column (`str`, defaults to `"text"`):
61 | The name of the column containing the text feature in the dataset specified by `data`.
62 | Returns:
63 | `dict`: metric inputs.
64 | `list`: pipeline inputs.
65 | """
66 |
67 | self.check_required_columns(data, {"input_column": input_column})
68 |
69 | return {}, DatasetColumn(data, input_column)
70 |
--------------------------------------------------------------------------------
/src/evaluate/evaluator/utils.py:
--------------------------------------------------------------------------------
1 | from datasets import Dataset, get_dataset_split_names
2 |
3 |
4 | class DatasetColumn(list):
5 | """Helper class to avoid loading a dataset column into memory when accessing it."""
6 |
7 | def __init__(self, dataset: Dataset, key: str):
8 | self.dataset = dataset
9 | self.key = key
10 |
11 | def __len__(self):
12 | return len(self.dataset)
13 |
14 | def __getitem__(self, i):
15 | return self.dataset[i][self.key]
16 |
17 | def __iter__(self):
18 | return (self.dataset[i][self.key] for i in range(len(self)))
19 |
20 |
21 | def choose_split(data, subset=None):
22 | available_splits = get_dataset_split_names(data, subset)
23 | preferred_split_order = [
24 | "test",
25 | "testing",
26 | "eval",
27 | "evaluation",
28 | "validation",
29 | "val",
30 | "valid",
31 | "dev",
32 | "train",
33 | "training",
34 | ]
35 | for split in preferred_split_order:
36 | if split in available_splits:
37 | return split
38 | raise ValueError("No dataset split defined! Pass an explicit value to the `split` kwarg.")
39 |
40 |
41 | class DatasetColumnPair(list):
42 |     """Helper class to avoid loading two dataset columns into memory when accessing them."""
43 |
44 | def __init__(
45 | self,
46 | dataset: Dataset,
47 | first_col: str,
48 | second_col: str,
49 | first_key: str,
50 | second_key: str,
51 | ):
52 | """
53 | Args:
54 | dataset (Dataset): dataset to build an iterator on
55 | first_col (str): first column name to use in the dataset
56 | second_col (str): second column name to use in the dataset
57 | first_key (str): key name used for the first column in the returned dictionary
58 | second_key (str): key name used for the second column in the returned dictionary
59 | """
60 | self.dataset = dataset
61 |
62 | self.first_col = first_col
63 | self.second_col = second_col
64 |
65 | self.first_key = first_key
66 | self.second_key = second_key
67 |
68 | def __len__(self):
69 | return len(self.dataset)
70 |
71 | def __getitem__(self, i):
72 | return {
73 | self.first_key: self.dataset[i][self.first_col],
74 | self.second_key: self.dataset[i][self.second_col] if self.second_col else None,
75 | }
76 |
77 | def __iter__(self):
78 | return (
79 | {
80 | self.first_key: self.dataset[i][self.first_col],
81 | self.second_key: self.dataset[i][self.second_col] if self.second_col else None,
82 | }
83 | for i in range(len(self))
84 | )
85 |
--------------------------------------------------------------------------------
/src/evaluate/naming.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # Lint as: python3
16 | """Utilities for file names."""
17 |
18 | import itertools
19 | import os
20 | import re
21 |
22 |
23 | _uppercase_uppercase_re = re.compile(r"([A-Z]+)([A-Z][a-z])")
24 | _lowercase_uppercase_re = re.compile(r"([a-z\d])([A-Z])")
25 |
26 | _single_underscore_re = re.compile(r"(?<!_)_(?!_)")
--------------------------------------------------------------------------------
/src/evaluate/saving.py:
--------------------------------------------------------------------------------
25 | >>> import evaluate
26 | >>> result = {"bleu": 0.7}
27 | >>> params = {"model": "gpt-2"}
28 | >>> evaluate.save("./results/", **result, **params)
29 | ```
30 | """
31 | current_time = datetime.now()
32 |
33 | file_path = _setup_path(path_or_file, current_time)
34 |
35 | data["_timestamp"] = current_time.isoformat()
36 | data["_git_commit_hash"] = _git_commit_hash()
37 | data["_evaluate_version"] = __version__
38 | data["_python_version"] = sys.version
39 | data["_interpreter_path"] = sys.executable
40 |
41 | with FileLock(str(file_path) + ".lock"):
42 | with open(file_path, "w") as f:
43 | json.dump(data, f)
44 |
45 | # cleanup lock file
46 | try:
47 | os.remove(str(file_path) + ".lock")
48 | except FileNotFoundError:
49 | pass
50 |
51 | return file_path
52 |
53 |
54 | def _setup_path(path_or_file, current_time):
55 | path_or_file = Path(path_or_file)
56 | is_file = len(path_or_file.suffix) > 0
57 | if is_file:
58 | folder = path_or_file.parent
59 | file_name = path_or_file.name
60 | else:
61 | folder = path_or_file
62 | file_name = "result-" + current_time.strftime("%Y_%m_%d-%H_%M_%S") + ".json"
63 | folder.mkdir(parents=True, exist_ok=True)
64 | return folder / file_name
65 |
66 |
67 | def _git_commit_hash():
68 | res = subprocess.run("git rev-parse --is-inside-work-tree".split(), cwd="./", stdout=subprocess.PIPE)
69 | if res.stdout.decode().strip() == "true":
70 | res = subprocess.run("git rev-parse HEAD".split(), cwd=os.getcwd(), stdout=subprocess.PIPE)
71 | return res.stdout.decode().strip()
72 | else:
73 | return None
74 |
--------------------------------------------------------------------------------
/src/evaluate/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the TensorFlow Datasets Authors.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | # flake8: noqa
16 | # Lint as: python3
17 | """Util import."""
18 |
19 | __all__ = [
20 | "disable_progress_bar",
21 | "enable_progress_bar",
22 | "is_progress_bar_enabled",
23 | "infer_gradio_input_types",
24 | "json_to_string_type",
25 | "parse_readme",
26 | "parse_gradio_data",
27 | "parse_test_cases",
28 | "launch_gradio_widget",
29 | ]
30 |
31 | from .gradio import (
32 | infer_gradio_input_types,
33 | json_to_string_type,
34 | launch_gradio_widget,
35 | parse_gradio_data,
36 | parse_readme,
37 | parse_test_cases,
38 | )
39 | from .logging import disable_progress_bar, enable_progress_bar, is_progress_bar_enabled
40 |
--------------------------------------------------------------------------------
/templates/cookiecutter.json:
--------------------------------------------------------------------------------
1 | {
2 | "module_name": "Awesome Module",
3 | "module_type": "module",
4 | "module_description": "This new module is designed to solve this great ML task and is crafted with a lot of care and love.",
5 | "module_slug": "{{ cookiecutter.module_name|lower|replace(' ', '_') }}",
6 | "module_class_name": "{{ cookiecutter.module_name|replace(' ', '') }}",
7 | "namespace": "",
8 | "dataset_name": ""
9 | }
--------------------------------------------------------------------------------
/templates/{{ cookiecutter.module_slug }}/README.md:
--------------------------------------------------------------------------------
1 | ---
2 | title: {{ cookiecutter.module_name }}
3 | datasets:
4 | - {{ cookiecutter.dataset_name }}
5 | tags:
6 | - evaluate
7 | - {{ cookiecutter.module_type }}
8 | description: "TODO: add a description here"
9 | sdk: gradio
10 | sdk_version: 3.19.1
11 | app_file: app.py
12 | pinned: false
13 | ---
14 |
15 | # {{ cookiecutter.module_type|capitalize }} Card for {{ cookiecutter.module_name }}
16 |
17 | ***Module Card Instructions:*** *Fill out the following subsections. Feel free to take a look at existing {{ cookiecutter.module_type }} cards if you'd like examples.*
18 |
19 | ## {{ cookiecutter.module_type|capitalize }} Description
20 | *Give a brief overview of this {{ cookiecutter.module_type }}, including what task(s) it is usually used for, if any.*
21 |
22 | ## How to Use
23 | *Give general statement of how to use the {{ cookiecutter.module_type }}*
24 |
25 | *Provide simplest possible example for using the {{ cookiecutter.module_type }}*
26 |
27 | ### Inputs
28 | *List all input arguments in the format below*
29 | - **input_field** *(type): Definition of input, with explanation if necessary. State any default value(s).*
30 |
31 | ### Output Values
32 |
33 | *Explain what this {{ cookiecutter.module_type }} outputs and provide an example of what the {{ cookiecutter.module_type }} output looks like. Modules should return a dictionary with one or multiple key-value pairs, e.g. {"bleu" : 6.02}*
34 |
35 | *State the range of possible values that the {{ cookiecutter.module_type }}'s output can take, as well as what in that range is considered good. For example: "This {{ cookiecutter.module_type }} can take on any value between 0 and 100, inclusive. Higher scores are better."*
36 |
37 | #### Values from Popular Papers
38 | *Give examples, preferably with links to leaderboards or publications, to papers that have reported this {{ cookiecutter.module_type }}, along with the values they have reported.*
39 |
40 | ### Examples
41 | *Give code examples of the {{ cookiecutter.module_type }} being used. Try to include examples that clear up any potential ambiguity left from the {{ cookiecutter.module_type }} description above. If possible, provide a range of examples that show both typical and atypical results, as well as examples where a variety of input parameters are passed.*
42 |
43 | ## Limitations and Bias
44 | *Note any known limitations or biases that the {{ cookiecutter.module_type }} has, with links and references if possible.*
45 |
46 | ## Citation
47 | *Cite the source where this {{ cookiecutter.module_type }} was introduced.*
48 |
49 | ## Further References
50 | *Add any useful further references.*
51 |
--------------------------------------------------------------------------------
/templates/{{ cookiecutter.module_slug }}/app.py:
--------------------------------------------------------------------------------
1 | import evaluate
2 | from evaluate.utils import launch_gradio_widget
3 |
4 |
5 | module = evaluate.load("{{ cookiecutter.namespace }}/{{ cookiecutter.module_slug }}")
6 | launch_gradio_widget(module)
--------------------------------------------------------------------------------
/templates/{{ cookiecutter.module_slug }}/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/huggingface/evaluate@main
--------------------------------------------------------------------------------
/templates/{{ cookiecutter.module_slug }}/tests.py:
--------------------------------------------------------------------------------
1 | test_cases = [
2 | {
3 | "predictions": [0, 0],
4 | "references": [1, 1],
5 | "result": {"metric_score": 0}
6 | },
7 | {
8 | "predictions": [1, 1],
9 | "references": [1, 1],
10 | "result": {"metric_score": 1}
11 | },
12 | {
13 | "predictions": [1, 0],
14 | "references": [1, 1],
15 | "result": {"metric_score": 0.5}
16 | }
17 | ]
--------------------------------------------------------------------------------
/templates/{{ cookiecutter.module_slug }}/{{ cookiecutter.module_slug }}.py:
--------------------------------------------------------------------------------
1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """TODO: Add a description here."""
15 |
16 | import evaluate
17 | import datasets
18 |
19 |
20 | # TODO: Add BibTeX citation
21 | _CITATION = """\
22 | @InProceedings{huggingface:module,
23 | title = {A great new module},
24 | authors={huggingface, Inc.},
25 | year={2020}
26 | }
27 | """
28 |
29 | # TODO: Add description of the module here
30 | _DESCRIPTION = """\
31 | This new module is designed to solve this great ML task and is crafted with a lot of care.
32 | """
33 |
34 |
35 | # TODO: Add description of the arguments of the module here
36 | _KWARGS_DESCRIPTION = """
37 | Calculates how good the predictions are given some references, using certain scores
38 | Args:
39 |     predictions: list of predictions to score. Each prediction
40 | should be a string with tokens separated by spaces.
41 |     references: list of references for each prediction. Each
42 | reference should be a string with tokens separated by spaces.
43 | Returns:
44 | accuracy: description of the first score,
45 | another_score: description of the second score,
46 | Examples:
47 | Examples should be written in doctest format, and should illustrate how
48 | to use the function.
49 |
50 | >>> my_new_module = evaluate.load("my_new_module")
51 | >>> results = my_new_module.compute(references=[0, 1], predictions=[0, 1])
52 | >>> print(results)
53 | {'accuracy': 1.0}
54 | """
55 |
56 | # TODO: Define external resources urls if needed
57 | BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"
58 |
59 |
60 | @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
61 | class {{ cookiecutter.module_class_name }}(evaluate.{{ cookiecutter.module_type | capitalize}}):
62 | """TODO: Short description of my evaluation module."""
63 |
64 | def _info(self):
65 | # TODO: Specifies the evaluate.EvaluationModuleInfo object
66 | return evaluate.{{ cookiecutter.module_type | capitalize}}Info(
67 | # This is the description that will appear on the modules page.
68 | module_type="{{ cookiecutter.module_type}}",
69 | description=_DESCRIPTION,
70 | citation=_CITATION,
71 | inputs_description=_KWARGS_DESCRIPTION,
72 | # This defines the format of each prediction and reference
73 | features=datasets.Features({
74 | 'predictions': datasets.Value('int64'),
75 | 'references': datasets.Value('int64'),
76 | }),
77 | # Homepage of the module for documentation
78 | homepage="http://module.homepage",
79 | # Additional links to the codebase or references
80 | codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
81 | reference_urls=["http://path.to.reference.url/new_module"]
82 | )
83 |
84 | def _download_and_prepare(self, dl_manager):
85 | """Optional: download external resources useful to compute the scores"""
86 | # TODO: Download external resources if needed
87 | pass
88 |
89 | def _compute(self, predictions, references):
90 | """Returns the scores"""
91 | # TODO: Compute the different scores of the module
92 | accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
93 | return {
94 | "accuracy": accuracy,
95 | }
--------------------------------------------------------------------------------
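Note: a rendered copy of the template above can be loaded and run like any other module. The following is a minimal sketch, assuming the cookiecutter output was written to a local directory named my_new_module (a hypothetical path, not part of the repository):

import evaluate

# Hypothetical local path produced by rendering the cookiecutter template.
my_new_module = evaluate.load("./my_new_module/my_new_module.py")

# With the default int64 features and the placeholder _compute above,
# two matches out of three predictions give an accuracy of ~0.67.
results = my_new_module.compute(predictions=[0, 1, 1], references=[0, 1, 0])
print(results)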
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/huggingface/evaluate/5aa3982a9a8c86e506860e381d428a64b0cce73b/tests/__init__.py
--------------------------------------------------------------------------------
/tests/test_evaluation_suite.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | from evaluate import EvaluationSuite
4 | from tests.test_evaluator import DummyTextClassificationPipeline
5 |
6 |
7 | class TestEvaluationSuite(TestCase):
8 | def setUp(self):
9 | # Check that the EvaluationSuite loads successfully
10 | self.evaluation_suite = EvaluationSuite.load("evaluate/evaluation-suite-ci")
11 |
12 | # Setup a dummy model for usage with the EvaluationSuite
13 | self.dummy_model = DummyTextClassificationPipeline()
14 |
15 | def test_running_evaluation_suite(self):
16 |
17 | # Check that the evaluation suite successfully runs
18 | results = self.evaluation_suite.run(self.dummy_model)
19 |
20 | # Check that the results are correct
21 | for r in results:
22 | self.assertEqual(r["accuracy"], 0.5)
23 |
24 | # Check that correct number of tasks were run
25 | self.assertEqual(len(results), 2)
26 |
27 | def test_empty_suite(self):
28 |
29 | self.empty_suite = self.evaluation_suite
30 | self.empty_suite.suite = []
31 | self.assertRaises(ValueError, self.empty_suite.run, self.dummy_model)
32 |
--------------------------------------------------------------------------------
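Note: evaluate/evaluation-suite-ci, loaded in setUp above, is a suite definition hosted on the Hub. A suite script subclasses evaluate.EvaluationSuite and fills self.suite with SubTask entries; the sketch below is illustrative only (the dataset, split, and label mapping are assumptions, not the actual contents of that Space):

from evaluate import EvaluationSuite
from evaluate.evaluation_suite import SubTask


class Suite(EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        # Illustrative sub-task: the dataset name, split and label mapping here
        # are assumptions, not the real contents of evaluate/evaluation-suite-ci.
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="imdb",
                split="test[:10]",
                args_for_task={
                    "metric": "accuracy",
                    "input_column": "text",
                    "label_column": "label",
                    "label_mapping": {"LABEL_0": 0.0, "LABEL_1": 1.0},
                },
            ),
        ]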
/tests/test_file_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from unittest.mock import patch
4 |
5 | import pytest
6 |
7 | from evaluate.utils.file_utils import OfflineModeIsEnabled, cached_path, ftp_get, ftp_head, http_get, http_head
8 |
9 |
10 | FILE_CONTENT = """\
11 | Text data.
12 | Second line of data."""
13 |
14 |
15 | def test_cached_path_local(text_file):
16 | # absolute path
17 | text_file = str(Path(text_file).resolve())
18 | assert cached_path(text_file) == text_file
19 | # relative path
20 | text_file = str(Path(__file__).resolve().relative_to(Path(os.getcwd())))
21 | assert cached_path(text_file) == text_file
22 |
23 |
24 | def test_cached_path_missing_local(tmp_path):
25 | # absolute path
26 | missing_file = str(tmp_path.resolve() / "__missing_file__.txt")
27 | with pytest.raises(FileNotFoundError):
28 | cached_path(missing_file)
29 | # relative path
30 | missing_file = "./__missing_file__.txt"
31 | with pytest.raises(FileNotFoundError):
32 | cached_path(missing_file)
33 |
34 |
35 | @patch("evaluate.config.HF_EVALUATE_OFFLINE", True)
36 | def test_cached_path_offline():
37 | with pytest.raises(OfflineModeIsEnabled):
38 | cached_path("https://huggingface.co")
39 |
40 |
41 | @patch("evaluate.config.HF_EVALUATE_OFFLINE", True)
42 | def test_http_offline(tmp_path_factory):
43 | filename = tmp_path_factory.mktemp("data") / "file.html"
44 | with pytest.raises(OfflineModeIsEnabled):
45 | http_get("https://huggingface.co", temp_file=filename)
46 | with pytest.raises(OfflineModeIsEnabled):
47 | http_head("https://huggingface.co")
48 |
49 |
50 | @patch("evaluate.config.HF_EVALUATE_OFFLINE", True)
51 | def test_ftp_offline(tmp_path_factory):
52 | filename = tmp_path_factory.mktemp("data") / "file.html"
53 | with pytest.raises(OfflineModeIsEnabled):
54 | ftp_get("ftp://huggingface.co", temp_file=filename)
55 | with pytest.raises(OfflineModeIsEnabled):
56 | ftp_head("ftp://huggingface.co")
57 |
--------------------------------------------------------------------------------
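Note: the offline tests above patch evaluate.config.HF_EVALUATE_OFFLINE directly; outside the test suite the same switch is normally driven by the HF_EVALUATE_OFFLINE environment variable. A minimal sketch:

import os

# Must be set before evaluate is imported, since evaluate.config reads it at import time.
os.environ["HF_EVALUATE_OFFLINE"] = "1"

import evaluate
from evaluate.utils.file_utils import OfflineModeIsEnabled, cached_path

try:
    cached_path("https://huggingface.co")
except OfflineModeIsEnabled:
    print("offline mode is active; remote downloads are blocked")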
/tests/test_save.py:
--------------------------------------------------------------------------------
1 | import json
2 | import shutil
3 | import tempfile
4 | from pathlib import Path
5 | from unittest import TestCase
6 |
7 | import evaluate
8 |
9 |
10 | result_dict = {"metric": 1.0, "model_name": "x"}
11 |
12 | SAVE_EXTRA_KEYS = ["_timestamp", "_git_commit_hash", "_evaluate_version", "_python_version", "_interpreter_path"]
13 |
14 |
15 | class TestSave(TestCase):
16 | def setUp(self):
17 | self.save_path = Path(tempfile.mkdtemp())
18 |
19 | def tearDown(self):
20 | shutil.rmtree(self.save_path)
21 |
22 | def test_save_to_folder(self):
23 | file_path = evaluate.save(self.save_path, **result_dict)
24 | with open(file_path, "r") as f:
25 | loaded_result_dict = json.load(f)
26 | for key in SAVE_EXTRA_KEYS:
27 | _ = loaded_result_dict.pop(key)
28 | self.assertDictEqual(result_dict, loaded_result_dict)
29 |
30 | def test_save_to_folder_nested(self):
31 | file_path = evaluate.save(self.save_path / "sub_dir1/sub_dir2", **result_dict)
32 | with open(file_path, "r") as f:
33 | loaded_result_dict = json.load(f)
34 | for key in SAVE_EXTRA_KEYS:
35 | _ = loaded_result_dict.pop(key)
36 | self.assertDictEqual(result_dict, loaded_result_dict)
37 |
38 | def test_save_to_file(self):
39 | _ = evaluate.save(self.save_path / "test.json", **result_dict)
40 | with open(self.save_path / "test.json", "r") as f:
41 | loaded_result_dict = json.load(f)
42 | for key in SAVE_EXTRA_KEYS:
43 | _ = loaded_result_dict.pop(key)
44 | self.assertDictEqual(result_dict, loaded_result_dict)
45 |
--------------------------------------------------------------------------------
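Note: outside the tests, evaluate.save is typically handed the output of a metric's compute call plus any run metadata. The path and keyword arguments below are illustrative:

import evaluate

accuracy = evaluate.load("accuracy")
result = accuracy.compute(predictions=[0, 1, 1], references=[0, 1, 0])

# Writes a timestamped JSON file into ./results/ and adds the bookkeeping keys
# checked above (_timestamp, _git_commit_hash, _evaluate_version, ...).
saved_path = evaluate.save("./results/", **result, model_name="my-baseline")
print(saved_path)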
/tests/test_viz.py:
--------------------------------------------------------------------------------
1 | from unittest import TestCase
2 |
3 | import matplotlib.pyplot as plt
4 |
5 | from evaluate.visualization import radar_plot
6 |
7 |
8 | class TestViz(TestCase):
9 | def test_invert_range(self):
10 | data = [{"accuracy": 0.9, "precision": 0.8}, {"accuracy": 0.7, "precision": 0.6}]
11 | model_names = ["model1", "model2"]
12 | wrong_invert_range = ["latency_in_seconds"] # Value not present in data
13 | with self.assertRaises(ValueError):
14 | radar_plot(data, model_names, wrong_invert_range)
15 |
16 | def test_output_is_plot(self):
17 | data = [
18 | {"accuracy": 0.9, "precision": 0.8, "latency_in_seconds": 48.1},
19 | {"accuracy": 0.7, "precision": 0.6, "latency_in_seconds": 51.4},
20 | ]
21 | model_names = ["model1", "model2"]
22 | invert_range = ["latency_in_seconds"]
23 | out_plt = radar_plot(data, model_names, invert_range)
24 | self.assertIsInstance(out_plt, plt.Figure)
25 |
--------------------------------------------------------------------------------
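Note: since radar_plot returns a matplotlib Figure (as the last test asserts), typical usage outside the test suite simply saves or shows the result; the values below are illustrative:

from evaluate.visualization import radar_plot

data = [
    {"accuracy": 0.9, "precision": 0.8, "latency_in_seconds": 48.1},
    {"accuracy": 0.7, "precision": 0.6, "latency_in_seconds": 51.4},
]
# The third argument marks metrics where lower is better, as in the tests above.
fig = radar_plot(data, ["model1", "model2"], ["latency_in_seconds"])
fig.savefig("radar.png", bbox_inches="tight")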