├── src
└── lighteval
│ ├── py.typed
│ ├── tasks
│ ├── tasks
│ │ ├── jeopardy.py
│ │ ├── pubmedqa.py
│ │ ├── quac.py
│ │ ├── natural_questions.py
│ │ ├── toxigen.py
│ │ ├── coqa.py
│ │ ├── real_toxicity_prompts.py
│ │ ├── prost.py
│ │ ├── narrativeqa.py
│ │ ├── legalsupport.py
│ │ ├── sciq.py
│ │ ├── qasper.py
│ │ ├── webqs.py
│ │ ├── aimo.py
│ │ ├── asdiv.py
│ │ ├── twitterAAE.py
│ │ ├── logiqa.py
│ │ ├── winogrande.py
│ │ ├── swag.py
│ │ ├── med_dialog.py
│ │ ├── piqa.py
│ │ ├── hellaswag.py
│ │ ├── storycloze.py
│ │ ├── squad_v2.py
│ │ ├── mathqa.py
│ │ ├── triviaqa.py
│ │ ├── simpleqa.py
│ │ ├── entity_data_imputation.py
│ │ ├── covid_dialogue.py
│ │ ├── babi_qa.py
│ │ └── openbookqa.py
│ ├── templates
│ │ ├── __init__.py
│ │ └── utils
│ │ │ ├── __init__.py
│ │ │ └── adapter_utils.py
│ ├── multilingual
│ │ ├── __init__.py
│ │ ├── utils
│ │ │ ├── __init__.py
│ │ │ └── task_utils.py
│ │ └── tasks
│ │ │ ├── cmath.py
│ │ │ ├── chegeka.py
│ │ │ ├── french_triviqa.py
│ │ │ ├── tquad_v2.py
│ │ │ ├── thaiqa.py
│ │ │ ├── kenswquad.py
│ │ │ ├── french_boolq.py
│ │ │ ├── fquad_v2.py
│ │ │ ├── cmrc2018.py
│ │ │ ├── sber_squad.py
│ │ │ ├── chinese_squad.py
│ │ │ ├── squad_it.py
│ │ │ ├── arcd.py
│ │ │ ├── squad_es.py
│ │ │ ├── faquad.py
│ │ │ ├── germanquad.py
│ │ │ ├── hindi_boolq.py
│ │ │ ├── mintaka.py
│ │ │ ├── mgsm.py
│ │ │ ├── soqal.py
│ │ │ ├── thai_exams.py
│ │ │ ├── hellaswag_tel.py
│ │ │ ├── arabic_arc.py
│ │ │ ├── hellaswag_hin.py
│ │ │ ├── tydiqa.py
│ │ │ ├── afri_mgsm.py
│ │ │ └── openbook_es.py
│ └── __init__.py
│ ├── utils
│ └── __init__.py
│ ├── metrics
│ └── imports
│ │ └── __init__.py
│ └── __init__.py
├── docs
└── source
│ ├── package_reference
│ ├── doc.mdx
│ ├── evaluation_tracker.mdx
│ ├── models_outputs.mdx
│ ├── pipeline.mdx
│ ├── logging.mdx
│ ├── tasks.mdx
│ └── models.mdx
│ ├── available-tasks.mdx
│ ├── _toctree.yml
│ └── caching.mdx
├── MANIFEST.in
├── examples
├── tasks
│ ├── serbian_task_group
│ │ ├── sr_all_inclusive.txt
│ │ ├── sr_custom_task.txt
│ │ ├── sr_qa_knowledge.txt
│ │ ├── sr_arc.txt
│ │ ├── sr_mmlu_business_professional.txt
│ │ ├── sr_commonsense_reasoning.txt
│ │ ├── sr_mmlu_social_sciences.txt
│ │ ├── sr_mmlu_ethics_philosophy.txt
│ │ ├── sr_misc.txt
│ │ ├── sr_mmlu_math_logic.txt
│ │ ├── sr_mmlu_college_level.txt
│ │ └── sr_mmlu_high_school_level.txt
│ ├── all_german_rag_evals.txt
│ ├── fine_tasks
│ │ ├── cf
│ │ │ ├── th.txt
│ │ │ ├── te.txt
│ │ │ ├── fr.txt
│ │ │ ├── tr.txt
│ │ │ ├── sw.txt
│ │ │ ├── hi.txt
│ │ │ ├── ru.txt
│ │ │ ├── zh.txt
│ │ │ └── ar.txt
│ │ └── mcf
│ │ │ ├── th.txt
│ │ │ ├── te.txt
│ │ │ ├── fr.txt
│ │ │ ├── tr.txt
│ │ │ ├── sw.txt
│ │ │ ├── hi.txt
│ │ │ ├── ru.txt
│ │ │ ├── zh.txt
│ │ │ └── ar.txt
│ ├── all_filipino_tasks.txt
│ └── bbh.txt
├── model_configs
│ ├── inference_providers.yaml
│ ├── transformers_vlm_model.yaml
│ ├── tgi_model.yaml
│ ├── litellm_model.yaml
│ ├── quantized_model.yaml
│ ├── transformers_model.yaml
│ ├── sglang_model_config.yaml
│ ├── vllm_model_config.yaml
│ ├── peft_model.yaml
│ └── endpoint_model.yaml
├── nanotron
│ └── lighteval_config_override_template.yaml
└── test_tasks.txt
├── .gitattributes
├── Makefile
├── tests
├── reference_scores
│ ├── harness_metrics.json
│ ├── harness_prompts.json
│ ├── Qwen2.5-VL-3B-Instruct-results-vlm.json
│ ├── Qwen2.5-VL-7B-Instruct-results-vlm.json
│ ├── SmolLM2-1.7B-Instruct-results-vllm.json
│ └── SmolLM2-1.7B-Instruct-results-accelerate.json
├── reference_details
│ ├── SmolLM2-1.7B-Instruct-vllm
│ │ ├── details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_hellaswag|10_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet
│ │ ├── details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet
│ │ └── details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet
│ ├── SmolLM2-1.7B-Instruct-transformers
│ │ ├── details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_hellaswag|10_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet
│ │ ├── details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet
│ │ └── details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet
│ └── Qwen2.5-VL-3B-Instruct-vlm
│ │ └── details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet
├── unit
│ ├── metrics
│ │ ├── pytest.ini
│ │ └── test_cases
│ │ │ ├── rouge1.json
│ │ │ ├── simpleqa_judge.json
│ │ │ ├── bert_score.json
│ │ │ ├── bits_per_byte.json
│ │ │ ├── byte_perplexity.json
│ │ │ ├── expr_gold_metric.json
│ │ │ ├── prediction_perplexity.json
│ │ │ ├── mcc.json
│ │ │ ├── exact_match.json
│ │ │ ├── acc_golds_likelihood.json
│ │ │ ├── avg_at_k_math.json
│ │ │ ├── pass_at_k_math.json
│ │ │ ├── avg_at_k.json
│ │ │ └── drop.json
│ └── models
│ │ ├── test_base_model.py
│ │ └── test_abstract_model.py
├── conftest.py
├── __init__.py
└── slow_tests
│ └── __init__.py
├── .github
├── workflows
│ ├── pr_style_bot.yaml
│ ├── trufflehog.yml
│ ├── doc-build.yml
│ ├── doc-pr-build.yml
│ ├── doc-pr-upload.yml
│ ├── quality.yaml
│ ├── slow_tests.yaml
│ └── tests.yaml
├── ISSUE_TEMPLATE
│ ├── evaluation-task-request.md
│ ├── feature-request.md
│ └── bug_report.md
└── release.yml
├── LICENSE
├── setup.py
└── .pre-commit-config.yaml
/src/lighteval/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/package_reference/doc.mdx:
--------------------------------------------------------------------------------
1 | # Doc
2 |
3 | [[autodoc]] tasks.requests.Doc
4 |
--------------------------------------------------------------------------------
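The `Doc` request object documented above is what each prompt function in `src/lighteval/tasks/tasks/` builds from a dataset row. A minimal sketch of such a prompt function, assuming the usual `task_name` / `query` / `choices` / `gold_index` fields and a hypothetical dataset row layout (neither is confirmed by this dump):

```python
# Hypothetical sketch only: the Doc field names and the `line` keys are assumptions.
from lighteval.tasks.requests import Doc


def mcq_prompt(line: dict, task_name: str) -> Doc:
    # Turn one dataset row into a multiple-choice request.
    return Doc(
        task_name=task_name,
        query=f"Question: {line['question']}\nAnswer:",
        choices=[f" {choice}" for choice in line["choices"]],
        gold_index=line["gold_index"],
    )
```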
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/lighteval/tasks/tasks_table.jsonl
2 | include src/lighteval/metrics/*.jsonl
3 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_all_inclusive.txt:
--------------------------------------------------------------------------------
1 | # MMLU (All-inclusive Task Entry)
2 | community|serbian_evals:mmlu|0
3 |
--------------------------------------------------------------------------------
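Task-group files such as this one hold one evaluation per line in the `suite|task|num_fewshot` format used throughout `examples/tasks/`, with `#` lines as comments; these files are meant to be passed to the CLI in place of an inline task string. A purely illustrative decomposition of one entry (this is not lighteval's actual parser):

```python
# Illustrative only: split a task spec of the form "suite|task|num_fewshot".
spec = "community|serbian_evals:mmlu|0"
suite, task, num_fewshot = spec.split("|")
print(suite)             # community
print(task)              # serbian_evals:mmlu
print(int(num_fewshot))  # 0
```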
/examples/tasks/serbian_task_group/sr_custom_task.txt:
--------------------------------------------------------------------------------
1 | # Serbian Evaluations - Custom/Other Task
2 | community|serbian_evals:oz_eval|0
3 |
--------------------------------------------------------------------------------
/docs/source/package_reference/evaluation_tracker.mdx:
--------------------------------------------------------------------------------
1 | # EvaluationTracker
2 |
3 | [[autodoc]] logging.evaluation_tracker.EvaluationTracker
4 |
--------------------------------------------------------------------------------
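`EvaluationTracker` is the object that gathers results and per-sample details during a run (the individual loggers it wraps are listed in `logging.mdx`). A minimal construction sketch, assuming `output_dir` and `save_details` keyword arguments (names not confirmed by this dump):

```python
# Hypothetical sketch: the keyword argument names are assumptions.
from lighteval.logging.evaluation_tracker import EvaluationTracker

tracker = EvaluationTracker(
    output_dir="./results",  # where result JSON and detail files would be written
    save_details=True,       # also keep per-sample details, like those under tests/reference_details
)
```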
/examples/tasks/serbian_task_group/sr_qa_knowledge.txt:
--------------------------------------------------------------------------------
1 | # Question Answering and Knowledge
2 | community|serbian_evals:boolq|0
3 | community|serbian_evals:openbook|0
4 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.json filter=lfs diff=lfs merge=lfs -text
2 | tests/unit/metrics/test_cases/*.json -filter -diff -merge text
3 | *.parquet filter=lfs diff=lfs merge=lfs -text
4 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: style quality
2 |
3 |
4 | style:
5 | 	ruff format .
6 | 	ruff check --fix .
7 |
8 |
9 | quality:
10 | 	ruff format --check .
11 | 	ruff check .
12 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_arc.txt:
--------------------------------------------------------------------------------
1 | # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
2 | community|serbian_evals:arc_easy|0
3 | community|serbian_evals:arc_challenge|0
4 |
--------------------------------------------------------------------------------
/tests/reference_scores/harness_metrics.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c2080305011a7ac8b0895ec1fbb26b45af4e3dced6272abf67156ebf57656f88
3 | size 48360080
4 |
--------------------------------------------------------------------------------
/tests/reference_scores/harness_prompts.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:059a48631d4243cda36d067db50350639c12b0a88fb209f76bbcd0c3ff266ffb
3 | size 20244711
4 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Business Professional)
2 | community|serbian_evals:mmlu_marketing|0
3 | community|serbian_evals:mmlu_manadzment|0
4 |
--------------------------------------------------------------------------------
/tests/reference_scores/Qwen2.5-VL-3B-Instruct-results-vlm.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5b38f65703ddd426111ba45e6f6b8b82ee2049c7e754e977a5c6269aa2d94ade
3 | size 3968
4 |
--------------------------------------------------------------------------------
/tests/reference_scores/Qwen2.5-VL-7B-Instruct-results-vlm.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d07d8341188999f359a530e1dae4cd8ec3936d4046232a68b90a56c9f2994b3c
3 | size 3083
4 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt:
--------------------------------------------------------------------------------
1 | # Commonsense Reasoning
2 | community|serbian_evals:hellaswag|0
3 | community|serbian_evals:piqa|0
4 | community|serbian_evals:winogrande|0
5 |
--------------------------------------------------------------------------------
/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4dcc899c5963df3e98cc9d144f3c582edda227d8d9e2c24fabc1f794a4fab524
3 | size 47986
4 |
--------------------------------------------------------------------------------
/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:55b420e5ff6061d2d2d66c3f9ce8cab541820766b1bd7becc0d7b290b99144b6
3 | size 47858
4 |
--------------------------------------------------------------------------------
/docs/source/package_reference/models_outputs.mdx:
--------------------------------------------------------------------------------
1 | # Model's Output
2 |
3 | All models will generate an output per Doc supplied to the `generation` or `loglikelihood` functions.
4 |
5 | [[autodoc]] lighteval.models.model_output.ModelResponse
6 |
--------------------------------------------------------------------------------
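`ModelResponse` is the per-`Doc` return value referenced above. A minimal sketch of consuming one, assuming a `text` attribute that holds the list of generated strings (the attribute name is an assumption, not taken from this dump):

```python
# Hypothetical sketch: the .text attribute is an assumption.
from lighteval.models.model_output import ModelResponse


def first_generation(response: ModelResponse) -> str:
    # Return the first generated string, or an empty string if nothing was produced.
    return response.text[0] if response.text else ""
```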
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df34c40c43eeea4355e86ec505b053db421189b2082c670409e66d93defdd0d1
3 | size 39054
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag|10_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:973fa1740490bf212831075ac9842dd88a31db7aa422e240c01eafb840979207
3 | size 88599
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:29588411e9390fe550e3ca353e0d7c89e381d25673ced35399f5896e0c613216
3 | size 50719
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:450f5a17118613189a749f0fd9f7807265b43733482367e969b04ae7971a749c
3 | size 55774
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bd4eed73faaf58a18a302a1d2f0c8b8b8e1fbd482a5fd4ca77c375f1e3082f0e
3 | size 109931
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7bab826310f526d7aaa9c5e15ff50314524d54847de37699b762adec3c57fb78
3 | size 144793
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ec46468169068183da1c57ace7064fcfa8664e4acad3a76f2d37e260468b67ee
3 | size 32367
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:419f252eddb158e185b515b39ca9e1784f7b4122a620a2a67034178bb1ea6abb
3 | size 35694
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag|10_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2c74caa34cd1c6b227a1d66dcda7a0c61986435f925f17cf81e676f8c542d146
3 | size 67250
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6337b98efe5a6d10a02d4c13d78bcff056797c65999dbfb8ef5ab128f88fe4cf
3 | size 26482
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1a998148dbb28f6826861479e8d9fc7bf7f73b0ab6921dc9a6da811e70eddba4
3 | size 45688
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5fab7e1da1cdd0e8f66831b57bcda28b6350c7cf16c9905417746278a6f30f31
3 | size 148786
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:00a0ea645c10c6a8e0d55408f20bf59f95c3cd135afaebc5df0d1fbb89c3b93d
3 | size 37857
4 |
--------------------------------------------------------------------------------
/tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5eda1dbcf8c9604005ce8c27239a57e5f41f852dbd3da13656d94b01b325f16c
3 | size 11538690
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ef968329bee498b3387ec8df3677ca9bbac72e90599efbe7f78db23f4227b2f6
3 | size 21785
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5bb00dd8c872d95dd5b2999d788ece8c34d43b5b5ea4ef8f0859ba535d7b8cbf
3 | size 34021
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:161eb467544ca6273231945067a8d70aedb0e4e6c3eba4e8b40532cd1c37e6b0
3 | size 30662
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:da3a3dcc9ae24c6f3bc80f0ce72b64d09a5ab19d7803b168a8f1ad540f7f66c1
3 | size 39332
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fb3787ff3e796b49199e8d438a30ce768438b6d4fc5df5941e393bfd1fdf2ae6
3 | size 74124
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8a20696f4036b3a6e2b41b309f405d9a5445c4573150fc7e7bec15f28fd77bdf
3 | size 72441
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:803a445f6d563c2d2097df7c7772cd81b3df0bacc9e702caaab5f0dde7fe5b25
3 | size 87624
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d0886aca82f5687e1fd742c0c5b9fe046fb20622d9e67818d21d7300de27e746
3 | size 26034
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e62a25828dddf557091b9dcbc065f2c9e36fdf0c8d365dd1976584fc3f516eae
3 | size 34538
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:480bb8570ec01a173ebe85649989dc9a8ab64a4a2de2152d82bc344787bfffee
3 | size 34511
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fa71775ee3b585b1612f2bbbd8327ba4f564c9eddcdce63533fd6d11c67c5d95
3 | size 50977
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c5cae6c52cee4a49e9cf28920ec303235a15f2b9f075b4f2deff65bd220aea77
3 | size 52510
4 |
--------------------------------------------------------------------------------
/examples/tasks/all_german_rag_evals.txt:
--------------------------------------------------------------------------------
1 | community|german_rag_eval:choose_question_by_context|0
2 | community|german_rag_eval:choose_context_by_question|0
3 | community|german_rag_eval:question_answer_match|0
4 | community|german_rag_eval:context_question_match|0
5 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6baa807a3c85b3ca43708f5810f3898dd517db806ce4106124d4913b0fcea8b0
3 | size 28834
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:13659b3b6bcd5c2744fc3b33d8752589a1f6c52b2ed8ee17c6a3a4f28cd46908
3 | size 29149
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:25ea151a418829d528da4f762102052308b1cbb15b00d7190d5d0b9fd033436d
3 | size 37684
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df6206dba3812d089f03078b97d067eb851eee03fd2aa295cbac2551f82837c0
3 | size 38453
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:61c1a8942675c523f499e19b01432fb0ce7b0cb8bbd910fe0a16b2b60bb7e80c
3 | size 32704
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4b020f10c882cb482d6e2ac597e14d662a5d38f873f5f7eada8c839c5e13b476
3 | size 72052
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5ecdb28b097666cd4ccb0a208708a2076779e06145719ebd24e75060a272bdcf
3 | size 49571
4 |
--------------------------------------------------------------------------------
/examples/model_configs/inference_providers.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "meta-llama/Llama-3.1-8B-Instruct"
3 |   provider: "nebius"
4 |   timeout: null
5 |   proxies: null
6 |   parallel_calls_count: 20
7 |   generation_parameters:
8 |     temperature: 0.4
9 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7fe3c64179df68407c6fe6c6ad75b3f4a83a1edd286ca81da3fe96bcb5b21e9b
3 | size 27971
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a8a1f1ef9ba8e3a58d2538f2f2e016769155f2b6c18da49454849c8b276cd398
3 | size 39423
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:13fa9f4b64152c7140112b607f6dfddb5f9f755646bbef0b9cc5516a9c0e6de4
3 | size 38263
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:88e7d66c2396ab8a3f56ae9f4a342037a0f13f4ed83982312fdc7424eb74f60b
3 | size 36502
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:226b2d3fc783dcfecf3c634558746b1314f9f80a32a259c9fe174332fb1e3173
3 | size 50277
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:47204bfa1d6843f06ef6c08bb66d85adceab6457295f03303b7cd39bc7e4dd37
3 | size 25864
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:eb844c74f574b377e4b27110dbdf0c28c227a96f4e8d1c0eac52578f4608bc49
3 | size 47558
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b5ce79da0c3657667830882fa28ce623cb463bf5fb3c5e1367d6a5c13c480973
3 | size 30006
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4d4919aa444d52a1589883329eb3fdbb583b029a6213d4af13aa17c11a835399
3 | size 30932
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0dffe2c495874fa53e0289b2307161107c54e9d15c9a8aa39016c990f7d62f8f
3 | size 32464
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:23ffe95306670d3a737b30bf34866734dcba717742011a424cc0230377f52363
3 | size 34393
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:64975666dfc61cd3a3a515a88134775c0f90cff1e1b9120a8ab9c8861c68bb99
3 | size 29221
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4582e35192caeae218a50aa76738582d360914fd96cc9a4c3608d3683c44c33a
3 | size 47557
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1a4b6292fb5df093df5ac43fba76b0af5b00337e0d2579a9c2b2f6398007b842
3 | size 56164
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed82ceb8a3c05ae2c47b1769b333173e15069eb83710bc5d66918abb4ef4b7e7
3 | size 69137
4 |
--------------------------------------------------------------------------------
/docs/source/package_reference/pipeline.mdx:
--------------------------------------------------------------------------------
1 | # Pipeline
2 |
3 | ## Pipeline
4 |
5 | [[autodoc]] pipeline.Pipeline
6 |
7 | ## PipelineParameters
8 |
9 | [[autodoc]] pipeline.PipelineParameters
10 |
11 | ## ParallelismManager
12 |
13 | [[autodoc]] pipeline.ParallelismManager
14 |
--------------------------------------------------------------------------------
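The three classes above are normally wired together with the `EvaluationTracker` from the logging reference: `PipelineParameters` picks a launcher through `ParallelismManager`, and `Pipeline` takes the task string, the tracker, and a model config. A hedged end-to-end sketch (keyword arguments and method names beyond the documented class names are assumptions):

```python
# Hypothetical wiring sketch: argument and method names are assumptions;
# only the class names come from the package reference pages in this repo.
from lighteval.logging.evaluation_tracker import EvaluationTracker
from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters

tracker = EvaluationTracker(output_dir="./results")
params = PipelineParameters(launcher_type=ParallelismManager.ACCELERATE)

pipeline = Pipeline(
    tasks="community|serbian_evals:mmlu|0",  # or a path to an examples/tasks/*.txt file
    pipeline_parameters=params,
    evaluation_tracker=tracker,
    model_config=None,  # replace with a real model config (see examples/model_configs/)
)
pipeline.evaluate()
pipeline.save_and_push_results()
```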
/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Social Sciences)
2 | community|serbian_evals:mmlu_globalne_cinjenice|0
3 | community|serbian_evals:mmlu_logicke_zablude|0
4 | community|serbian_evals:mmlu_sociologija|0
5 | community|serbian_evals:mmlu_human_aging|0
6 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1d10ce12e8f76b5ce3113273124e7683e5c5bddde6063cd3cbf25d495cffa6ba
3 | size 34653
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e15bcad77e0453d7e987b4bf5216639b625f9df63341dfce4246dab88b87ca35
3 | size 38176
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f6dd8f8d104f1a4252685019e5413ce9ecfc4611bb819ff627e77be296afc581
3 | size 52493
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9651280724d245b37a7c3dde465c5a384de7b12055b9474696d533d58330b240
3 | size 59838
4 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Ethics, Philosophy)
2 | community|serbian_evals:mmlu_moralni_sporovi|0
3 | community|serbian_evals:mmlu_moralne_dileme|0
4 | community|serbian_evals:mmlu_filozofija|0
5 | community|serbian_evals:mmlu_svetska_religija|0
6 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e34151ea0415cb442b47d334448abf127c8f1747da78a5a9977ff78ed2d831b5
3 | size 49337
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1d7e589d611391395b2990a29e55bdd856ab440d45cba22fcd190936daf391dd
3 | size 34842
4 |
--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8e116d939941d57db2e5515114bec4890b56b6a35a5a2e49c809e6361b947337
3 | size 37387
4 |
--------------------------------------------------------------------------------
/examples/model_configs/transformers_vlm_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "Qwen/Qwen2.5-VL-3B-Instruct"
3 |   revision: "main"
4 |   dtype: "float16"
5 |   compile: false
6 |   model_parallel: false
7 |   batch_size: 1
8 |   use_fast_image_processor: true
9 |   generation_parameters:
10 |     temperature: 0.0
11 |     top_p: 0.9
12 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_misc.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Miscellaneous)
2 | community|serbian_evals:mmlu_anatomija|0
3 | community|serbian_evals:mmlu_astronomija|0
4 | community|serbian_evals:mmlu_poslovna_etika|0
5 | community|serbian_evals:mmlu_kliničko_znanje|0
6 | community|serbian_evals:mmlu_razno|0
7 | community|serbian_evals:mmlu_elektrotehnika|0
8 |
--------------------------------------------------------------------------------
/examples/model_configs/tgi_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   inference_server_address: "http://localhost:8080" # Replace with your actual TGI server address
3 |   inference_server_auth: null
4 |   model_name: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
5 |   generation_parameters:
6 |     temperature: 0.1
7 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/th.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|meta_mmlu_tha_cf|0
3 | lighteval|m3exams_tha_cf|0
4 |
5 | # Reading Comprehension (RC)
6 | lighteval|belebele_tha_Thai_cf|0
7 | lighteval|thaiqa_tha|0
8 | lighteval|xquad_tha|0
9 |
10 | # Natural Language Understanding (NLU)
11 | lighteval|community_hellaswag_tha_cf|0
12 | lighteval|xnli2.0_tha_cf|0
13 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/th.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|meta_mmlu_tha_mcf|5
3 | lighteval|m3exams_tha_mcf|5
4 |
5 | # Reading Comprehension (RC)
6 | lighteval|belebele_tha_Thai_mcf|5
7 | lighteval|thaiqa_tha|5
8 | lighteval|xquad_tha|5
9 |
10 | # Natural Language Understanding (NLU)
11 | lighteval|community_hellaswag_tha_mcf|5
12 | lighteval|xnli2.0_tha_mcf|5
13 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Math, Logic)
2 | community|serbian_evals:mmlu_abstract_algebra|0
3 | community|serbian_evals:mmlu_osnovna_matematika|0
4 | community|serbian_evals:mmlu_formalna_logika|0
5 | community|serbian_evals:mmlu_konceptualna_fizika|0
6 | community|serbian_evals:mmlu_metrika_ekonomije|0
7 | community|serbian_evals:mmlu_masinsko_ucenje|0
8 |
--------------------------------------------------------------------------------
/examples/model_configs/litellm_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
3 |   provider: "openai"
4 |   base_url: "https://router.huggingface.co/hf-inference/v1"
5 |   generation_parameters:
6 |     temperature: 0.5
7 |     max_new_tokens: 256
8 |     top_p: 0.9
9 |     seed: 0
10 |     repetition_penalty: 1.0
11 |     frequency_penalty: 0.0
12 |
--------------------------------------------------------------------------------
/.github/workflows/pr_style_bot.yaml:
--------------------------------------------------------------------------------
1 | name: PR Style Bot
2 |
3 | on:
4 |   issue_comment:
5 |     types: [created]
6 |
7 | permissions:
8 |   pull-requests: write
9 |
10 | jobs:
11 |   style:
12 |     uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
13 |     with:
14 |       python_quality_dependencies: "[quality]"
15 |     secrets:
16 |       bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }}
17 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/te.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|mlmm_mmlu_tel_cf|0
3 |
4 | # Reading Comprehension (RC)
5 | lighteval|belebele_tel_Telu_cf|0
6 | lighteval|indicqa_tel|0
7 |
8 | # Reasoning (RES)
9 | lighteval|indicxcopa_tel_cf|0
10 |
11 | # Natural Language Understanding (NLU)
12 | lighteval|community_hellaswag_tel_cf|0
13 | lighteval|indicnxnli_tel_cf|0
14 | lighteval|xstory_cloze_tel_cf|0
15 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/te.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|mlmm_mmlu_tel_mcf|5
3 |
4 | # Reading Comprehension (RC)
5 | lighteval|belebele_tel_Telu_mcf|5
6 | lighteval|indicqa_tel|5
7 |
8 | # Reasoning (RES)
9 | lighteval|indicxcopa_tel_mcf|5
10 |
11 | # Natural Language Understanding (NLU)
12 | lighteval|community_hellaswag_tel_mcf|5
13 | lighteval|indicnxnli_tel_mcf|5
14 | lighteval|xstory_cloze_tel_mcf|5
15 |
--------------------------------------------------------------------------------
/examples/model_configs/quantized_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "HuggingFaceH4/zephyr-7b-beta" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
3 |   revision: "main" # revision to use
4 |   dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
5 |   compile: true
6 |   batch_size: 1 # batch size to use
7 |
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt:
--------------------------------------------------------------------------------
1 | # MMLU (College Level Tasks)
2 | community|serbian_evals:mmlu_fakultet_biologija|0
3 | community|serbian_evals:mmlu_fakultet_hemija|0
4 | community|serbian_evals:mmlu_fakultet_racunari|0
5 | community|serbian_evals:mmlu_fakultet_matematika|0
6 | community|serbian_evals:mmlu_fakultet_medicina|0
7 | community|serbian_evals:mmlu_fakultet_fizika|0
8 | community|serbian_evals:mmlu_sigurnost_racunara|0
9 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/evaluation-task-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Evaluation/task request
3 | about: Suggest a new evaluation you want us to add
4 | title: "[EVAL]"
5 | labels: new task
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Evaluation short description
11 | - Why is this evaluation interesting?
12 | - How used is it in the community?
13 |
14 | ## Evaluation metadata
15 | Provide all available
16 | - Paper url:
17 | - Github url:
18 | - Dataset url:
19 |
--------------------------------------------------------------------------------
/.github/workflows/trufflehog.yml:
--------------------------------------------------------------------------------
1 | on:
2 |   push:
3 |
4 | name: Scan Secret Leaks
5 |
6 | permissions:
7 |   contents: read
8 |
9 | jobs:
10 |   trufflehog:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |       - name: Checkout code
14 |         uses: actions/checkout@v4
15 |         with:
16 |           fetch-depth: 0
17 |       - name: Secret Scanning
18 |         uses: trufflesecurity/trufflehog@main
19 |         with:
20 |           extra_args: --only-verified
21 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/fr.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|meta_mmlu_fra_cf|0
3 | lighteval|mlmm_arc_fra_cf:challenge|0
4 | lighteval|mintaka_fra|0
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_fra_Latn_cf|0
8 | lighteval|fquadv2_fra|0
9 |
10 | # Reasoning (RES)
11 | lighteval|xcodah_fra_cf|0
12 | lighteval|xcsqa_fra_cf|0
13 |
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_fra_cf|0
16 | lighteval|xnli2.0_fra_cf|0
17 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/tr.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|community_arc_tur_cf:easy|0
3 | lighteval|exams_tur_cf|0
4 | lighteval|community_mmlu_tur_cf|0
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_tur_Latn_cf|0
8 | lighteval|tquadv2_tur|0
9 | lighteval|xquad_tur|0
10 |
11 | # Reasoning (RES)
12 | lighteval|xcopa_tur_cf|0
13 |
14 | # Natural Language Understanding (NLU)
15 | lighteval|community_hellaswag_tur_cf|0
16 | lighteval|xnli2.0_tur_cf|0
17 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/fr.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|meta_mmlu_fra_mcf|5
3 | lighteval|mlmm_arc_fra_mcf:challenge|5
4 | lighteval|mintaka_fra|5
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_fra_Latn_mcf|5
8 | lighteval|fquadv2_fra|5
9 |
10 | # Reasoning (RES)
11 | lighteval|xcodah_fra_mcf|5
12 | lighteval|xcsqa_fra_mcf|5
13 |
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_fra_mcf|5
16 | lighteval|xnli2.0_fra_mcf|5
17 |
--------------------------------------------------------------------------------
/.github/workflows/doc-build.yml:
--------------------------------------------------------------------------------
1 | name: Build Documentation
2 |
3 | on:
4 |   push:
5 |     branches:
6 |       - main
7 |       - doc-builder*
8 |       - v*-release
9 |
10 | jobs:
11 |   build:
12 |     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
13 |     with:
14 |       commit_sha: ${{ github.sha }}
15 |       package: lighteval
16 |     secrets:
17 |       token: ${{ secrets.HUGGINGFACE_PUSH }}
18 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
19 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/tr.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|community_arc_tur_mcf:easy|5
3 | lighteval|exams_tur_mcf|5
4 | lighteval|community_mmlu_tur_mcf|5
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_tur_Latn_mcf|5
8 | lighteval|tquadv2_tur|5
9 | lighteval|xquad_tur|5
10 |
11 | # Reasoning (RES)
12 | lighteval|xcopa_tur_mcf|5
13 |
14 | # Natural Language Understanding (NLU)
15 | lighteval|community_hellaswag_tur_mcf|5
16 | lighteval|xnli2.0_tur_mcf|5
17 |
--------------------------------------------------------------------------------
/.github/workflows/doc-pr-build.yml:
--------------------------------------------------------------------------------
1 | name: Build PR Documentation
2 |
3 | on:
4 |   pull_request:
5 |
6 | concurrency:
7 |   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
8 |   cancel-in-progress: true
9 |
10 | jobs:
11 |   build:
12 |     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13 |     with:
14 |       commit_sha: ${{ github.event.pull_request.head.sha }}
15 |       pr_number: ${{ github.event.number }}
16 |       package: lighteval
17 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/sw.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|community_arc_swa_cf:easy|0
3 | lighteval|m3exams_swa_cf|0
4 | lighteval|openai_mmlu_swa_cf|0
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_swh_Latn_cf|0
8 | lighteval|kenswquad_swa|0
9 | lighteval|tydiqa_swa|0
10 |
11 | # Reasoning (RES)
12 | lighteval|xcsqa_swa_cf|0
13 | lighteval|xcopa_swa_cf|0
14 |
15 | # Natural Language Understanding (NLU)
16 | lighteval|xnli2.0_swa_cf|0
17 | lighteval|xstory_cloze_swa_cf|0
18 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/sw.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|community_arc_swa_mcf:easy|5
3 | lighteval|m3exams_swa_mcf|5
4 | lighteval|openai_mmlu_swa_mcf|5
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_swh_Latn_mcf|5
8 | lighteval|kenswquad_swa|5
9 | lighteval|tydiqa_swa|5
10 |
11 | # Reasoning (RES)
12 | lighteval|xcsqa_swa_mcf|5
13 | lighteval|xcopa_swa_mcf|5
14 |
15 | # Natural Language Understanding (NLU)
16 | lighteval|xnli2.0_swa_mcf|5
17 | lighteval|xstory_cloze_swa_mcf|5
18 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/hi.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|meta_mmlu_hin_cf|0
3 | lighteval|community_arc_hin_cf:easy|0
4 |
5 | # Reading Comprehension (RC)
6 | lighteval|belebele_hin_Deva_cf|0
7 | lighteval|indicqa_hin|0
8 |
9 | # Reasoning (RES)
10 | lighteval|xcodah_hin_cf|0
11 | lighteval|indicxcopa_hin_cf|0
12 | lighteval|xcsqa_hin_cf|0
13 |
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_hin_cf|0
16 | lighteval|indicnxnli_hin_cf|0
17 | lighteval|xstory_cloze_hin_cf|0
18 |
--------------------------------------------------------------------------------
/docs/source/package_reference/logging.mdx:
--------------------------------------------------------------------------------
1 | # Logging
2 |
3 | ## EvaluationTracker
4 | [[autodoc]] logging.evaluation_tracker.EvaluationTracker
5 |
6 | ## GeneralConfigLogger
7 | [[autodoc]] logging.info_loggers.GeneralConfigLogger
8 | ## DetailsLogger
9 | [[autodoc]] logging.info_loggers.DetailsLogger
10 | ## MetricsLogger
11 | [[autodoc]] logging.info_loggers.MetricsLogger
12 | ## VersionsLogger
13 | [[autodoc]] logging.info_loggers.VersionsLogger
14 | ## TaskConfigLogger
15 | [[autodoc]] logging.info_loggers.TaskConfigLogger
16 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/hi.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|meta_mmlu_hin_mcf|5
3 | lighteval|community_arc_hin_mcf:easy|5
4 |
5 | # Reading Comprehension (RC)
6 | lighteval|belebele_hin_Deva_mcf|5
7 | lighteval|indicqa_hin|5
8 |
9 | # Reasoning (RES)
10 | lighteval|xcodah_hin_mcf|5
11 | lighteval|indicxcopa_hin_mcf|5
12 | lighteval|xcsqa_hin_mcf|5
13 |
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_hin_mcf|5
16 | lighteval|indicnxnli_hin_mcf|5
17 | lighteval|xstory_cloze_hin_mcf|5
18 |
--------------------------------------------------------------------------------
/examples/model_configs/transformers_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 | model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct"
3 | revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3"
4 | dtype: "float16"
5 | compile: false
6 | model_parallel: false
7 | batch_size: 1
8 | continuous_batching: false
9 | model_loading_kwargs:
10 | attn_implementation: "eager"
11 | #tp_plan: "auto"
12 | generation_parameters:
13 | #num_blocks: 4096
14 | #block_size: 64
15 | #max_new_tokens: 256
16 | temperature: 0.0
17 | top_p: 0.9
18 |
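19 | # Usage sketch (an assumption, not part of the original config: recent CLI versions
20 | #   accept this file as the model argument; adjust the task spec to your needs):
21 | #   lighteval accelerate examples/model_configs/transformers_model.yaml "lighteval|gsm8k|5"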
--------------------------------------------------------------------------------
/.github/workflows/doc-pr-upload.yml:
--------------------------------------------------------------------------------
1 | name: Upload PR Documentation
2 |
3 | on:
4 | workflow_run:
5 | workflows: ["Build PR Documentation"]
6 | types:
7 | - completed
8 |
9 | jobs:
10 | build:
11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
12 | with:
13 | package_name: lighteval
14 | secrets:
15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
16 | comment_bot_app_id: ${{ secrets.COMMENT_BOT_APP_ID }}
17 | comment_bot_secret_pem: ${{ secrets.COMMENT_BOT_SECRET_PEM }}
18 |
--------------------------------------------------------------------------------
/tests/unit/metrics/pytest.ini:
--------------------------------------------------------------------------------
1 | [tool:pytest]
2 | testpaths = .
3 | python_files = test_*.py
4 | python_classes = Test*
5 | python_functions = test_*
6 | addopts =
7 | -v
8 | --tb=short
9 | --strict-markers
10 | --disable-warnings
11 | markers =
12 | slow: marks tests as slow (deselect with '-m "not slow"')
13 | unit: marks tests as unit tests
14 | integration: marks tests as integration tests
15 | automated: marks tests as automated metric tests
16 | filterwarnings =
17 | ignore::DeprecationWarning
18 | ignore::PendingDeprecationWarning
19 |
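20 | # Usage sketch: select tests by the markers defined above with standard pytest flags, e.g.
21 | #   pytest -m "not slow"    # skip slow tests
22 | #   pytest -m automated     # run only the automated metric tests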
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/ru.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|mlmm_arc_rus_cf:challenge|0
3 | lighteval|rummlu_rus_cf|0
4 | lighteval|mera_openbookqa_rus_cf|0
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_rus_Cyrl_cf|0
8 | lighteval|tydiqa_rus|0
9 | lighteval|sber_squad_rus|0
10 | lighteval|xquad_rus|0
11 |
12 | # Reasoning (RES)
13 | lighteval|parus_rus_cf|0
14 | lighteval|xcodah_rus_cf|0
15 | lighteval|xcsqa_rus_cf|0
16 |
17 | # Natural Language Understanding (NLU)
18 | lighteval|mlmm_hellaswag_rus_cf|0
19 | lighteval|xnli2.0_rus_cf|0
20 | lighteval|xstory_cloze_rus_cf|0
21 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: "[FT] "
5 | labels: feature request
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Issue encountered
11 | Is your feature request related to a problem? Please provide a clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | ## Solution/Feature
14 | A clear and concise description of what you want to happen.
15 |
16 | ## Possible alternatives
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/ru.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|mlmm_arc_rus_mcf:challenge|5
3 | lighteval|rummlu_rus_mcf|5
4 | lighteval|mera_openbookqa_rus_mcf|5
5 |
6 | # Reading Comprehension (RC)
7 | lighteval|belebele_rus_Cyrl_mcf|5
8 | lighteval|tydiqa_rus|5
9 | lighteval|sber_squad_rus|5
10 | lighteval|xquad_rus|5
11 |
12 | # Reasoning (RES)
13 | lighteval|parus_rus_mcf|5
14 | lighteval|xcodah_rus_mcf|5
15 | lighteval|xcsqa_rus_mcf|5
16 |
17 | # Natural Language Understanding (NLU)
18 | lighteval|mlmm_hellaswag_rus_mcf|5
19 | lighteval|xnli2.0_rus_mcf|5
20 | lighteval|xstory_cloze_rus_mcf|5
21 |
--------------------------------------------------------------------------------
/.github/release.yml:
--------------------------------------------------------------------------------
1 | changelog:
2 | exclude:
3 | labels:
4 | - ignore-for-release
5 | categories:
6 | - title: New Features 🎉
7 | labels:
8 | - feature
9 | - title: Enhancement ⚙️
10 | labels:
11 | - enhancement
12 | - title: Documentation 📚
13 | labels:
14 | - documentation
15 | - title: New Tasks
16 | labels:
17 | - new-task
18 | - title: Task and Metrics changes 🛠️
19 | labels:
20 | - task-update
21 | - title: Bug Fixes 🐛
22 | labels:
23 | - bug
24 | - title: Other Changes
25 | labels:
26 | - "*"
27 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/zh.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|agieval_zho_cf|0
3 | lighteval|ceval_zho_cf|0
4 | lighteval|cmmlu_zho_cf|0
5 | lighteval|m3exams_zho_cf|0
6 |
7 | # Reading Comprehension (RC)
8 | lighteval|belebele_zho_Hans_cf|0
9 | lighteval|c3_zho_cf|0
10 | lighteval|cmrc2018_zho|0
11 | lighteval|chinese_squad_zho|0
12 |
13 | # Reasoning (RES)
14 | lighteval|xcodah_zho_cf|0
15 | lighteval|xcopa_zho_cf|0
16 | lighteval|xcsqa_zho_cf|0
17 |
18 | # Natural Language Understanding (NLU)
19 | lighteval|mlmm_hellaswag_zho_cf|0
20 | lighteval|ocnli_zho_cf|0
21 | lighteval|xwinograd_zho_cf|0
22 | lighteval|xstory_cloze_zho_cf|0
23 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/zh.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|agieval_zho_mcf|5
3 | lighteval|ceval_zho_mcf|5
4 | lighteval|cmmlu_zho_mcf|5
5 | lighteval|m3exams_zho_mcf|5
6 |
7 | # Reading Comprehension (RC)
8 | lighteval|belebele_zho_Hans_mcf|5
9 | lighteval|c3_zho_mcf|5
10 | lighteval|cmrc2018_zho|5
11 | lighteval|chinese_squad_zho|5
12 |
13 | # Reasoning (RES)
14 | lighteval|xcodah_zho_mcf|5
15 | lighteval|xcopa_zho_mcf|5
16 | lighteval|xcsqa_zho_mcf|5
17 |
18 | # Natural Language Understanding (NLU)
19 | lighteval|mlmm_hellaswag_zho_mcf|5
20 | lighteval|ocnli_zho_mcf|5
21 | lighteval|xwinograd_zho_mcf|5
22 | lighteval|xstory_cloze_zho_mcf|5
23 |
--------------------------------------------------------------------------------
/docs/source/package_reference/tasks.mdx:
--------------------------------------------------------------------------------
1 | # Tasks
2 |
3 | ## LightevalTask
4 | ### LightevalTaskConfig
5 | [[autodoc]] tasks.lighteval_task.LightevalTaskConfig
6 | ### LightevalTask
7 | [[autodoc]] tasks.lighteval_task.LightevalTask
8 |
9 | ## PromptManager
10 | [[autodoc]] tasks.prompt_manager.PromptManager
11 |
12 | ## Registry
13 | [[autodoc]] tasks.registry.Registry
14 |
15 | ## Doc
16 | [[autodoc]] tasks.requests.Doc
17 |
18 | ## Datasets
19 | [[autodoc]] data.DynamicBatchDataset
20 | [[autodoc]] data.LoglikelihoodDataset
21 | [[autodoc]] data.GenerativeTaskDataset
22 | [[autodoc]] data.GenerativeTaskDatasetNanotron
23 | [[autodoc]] data.GenDistributedSampler
24 |
--------------------------------------------------------------------------------
/examples/nanotron/lighteval_config_override_template.yaml:
--------------------------------------------------------------------------------
1 | # Automatic batch size selection does not work yet, so we set a default manually
2 | batch_size: 8
3 | generation: null
4 | logging:
5 | output_dir: "outputs"
6 | save_details: false
7 | push_to_hub: false
8 | public_run: false
9 | results_org: null
10 | tensorboard_metric_prefix: "eval"
11 | parallelism:
12 | dp: 1
13 | pp: 1
14 | pp_engine: 1f1b
15 | tp: 1
16 | tp_linear_async_communication: false
17 | tp_mode: ALL_REDUCE
18 | tasks:
19 | dataset_loading_processes: 8
20 | max_samples: 10
21 | multichoice_continuations_start_space: null
22 | num_fewshot_seeds: null
23 | tasks: lighteval|gsm8k|5
24 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/ar.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|exams_ara_cf|0
3 | lighteval|mmlu_ara_cf|0
4 | lighteval|alghafa_arc_ara_cf:easy|0
5 | lighteval|alghafa_sciqa_ara_cf|0
6 |
7 | # Reading Comprehension (RC)
8 | lighteval|belebele_arb_Arab_cf|0
9 | lighteval|soqal_ara_cf|0
10 | lighteval|mlqa_ara|0
11 | lighteval|tydiqa_ara|0
12 | lighteval|alghafa_race_ara_cf|0
13 | lighteval|arcd_ara|0
14 |
15 | # Reasoning (RES)
16 | lighteval|xcodah_ara_cf|0
17 | lighteval|alghafa_piqa_ara_cf|0
18 | lighteval|xcsqa_ara_cf|0
19 |
20 | # Natural Language Understanding (NLU)
21 | lighteval|xnli2.0_ara_cf|0
22 | lighteval|mlmm_hellaswag_ara_cf|0
23 | lighteval|xstory_cloze_ara_cf|0
24 |
--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/ar.txt:
--------------------------------------------------------------------------------
1 | # General Knowledge (GK)
2 | lighteval|exams_ara_mcf|5
3 | lighteval|mmlu_ara_mcf|5
4 | lighteval|alghafa_arc_ara_mcf:easy|5
5 | lighteval|alghafa_sciqa_ara_mcf|5
6 |
7 | # Reading Comprehension (RC)
8 | lighteval|belebele_arb_Arab_mcf|5
9 | lighteval|soqal_ara_mcf|5
10 | lighteval|mlqa_ara|5
11 | lighteval|tydiqa_ara|5
12 | lighteval|alghafa_race_ara_mcf|5
13 | lighteval|arcd_ara|5
14 |
15 | # Reasoning (RES)
16 | lighteval|xcodah_ara_mcf|5
17 | lighteval|alghafa_piqa_ara_mcf|5
18 | lighteval|xcsqa_ara_mcf|5
19 |
20 | # Natural Language Understanding (NLU)
21 | lighteval|xnli2.0_ara_mcf|5
22 | lighteval|mlmm_hellaswag_ara_mcf|5
23 | lighteval|xstory_cloze_ara_mcf|5
24 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve lighteval!
4 | title: "[BUG] "
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Describe the bug
11 | A clear and concise description of what the bug is.
12 |
13 | ## To Reproduce
14 | Please provide all the steps needed to reproduce the behavior, or a minimal working example. Issues missing this section will be ignored.
15 |
16 | ## Expected behavior
17 | A clear and concise description of what you expected to happen.
18 |
19 | ## Version info
20 | Please provide your operating system, lighteval version or commit if you installed from main, and pip/conda environment if your problem concerns dependencies.
21 |
--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | import pytest
6 |
7 |
8 | def pytest_addoption(parser):
9 | parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")
10 |
11 |
12 | def pytest_configure(config):
13 | config.addinivalue_line("markers", "slow: mark test as slow to run")
14 |
15 |
16 | def pytest_collection_modifyitems(config, items):
17 | if config.getoption("--runslow"):
18 | # --runslow given in cli: do not skip slow tests
19 | return
20 | skip_slow = pytest.mark.skip(reason="need --runslow option to run")
21 | for item in items:
22 | if "slow" in item.keywords:
23 | item.add_marker(skip_slow)
24 |
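25 | # Usage sketch: tests marked "slow" are skipped by default; enable them explicitly, e.g.
26 | #   pytest --runslow tests/slow_tests/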
--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt:
--------------------------------------------------------------------------------
1 | # MMLU (High School Level Tasks)
2 | community|serbian_evals:mmlu_srednja_skola_biologija|0
3 | community|serbian_evals:mmlu_srednja_skola_hemija|0
4 | community|serbian_evals:mmlu_srednja_skola_racunari|0
5 | community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0
6 | community|serbian_evals:mmlu_srednja_skola_geografija|0
7 | community|serbian_evals:mmlu_srednja_skola_matematika|0
8 | community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0
9 | community|serbian_evals:mmlu_srednja_skola_fizika|0
10 | community|serbian_evals:mmlu_srednja_skola_psihologija|0
11 | community|serbian_evals:mmlu_srednja_skola_statistika|0
12 | community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0
13 |
--------------------------------------------------------------------------------
/.github/workflows/quality.yaml:
--------------------------------------------------------------------------------
1 | name: Quality
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - v*-release
8 | pull_request:
9 | branches:
10 | - main
11 |
12 | jobs:
13 |
14 | check_code_quality:
15 | name: Check code quality
16 | runs-on: ubuntu-latest
17 | steps:
18 | - name: Checkout code
19 | uses: actions/checkout@v2
20 | - name: Setup Python environment
21 | uses: actions/setup-python@v2
22 | with:
23 | python-version: '3.10'
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | python -m pip install ".[quality]"
28 | - name: Code quality
29 | run: |
30 | make quality
31 |
--------------------------------------------------------------------------------
/docs/source/available-tasks.mdx:
--------------------------------------------------------------------------------
1 | # Available tasks
2 |
3 | Browse and inspect tasks available in LightEval.
4 |
10 |
11 |
12 |
13 | List all tasks:
14 |
15 | ```bash
16 | lighteval tasks list
17 | ```
18 |
19 | Extract task details:
20 |
21 | ```bash
22 | lighteval tasks dump
23 | ```
24 |
25 | Store the task details in a JSON file:
26 |
27 | ```bash
28 | lighteval tasks dump > tasks.json
29 | ```
30 |
31 | ### Inspect specific tasks
32 |
33 | Inspect a task to view its config, metrics, and requirements:
34 |
35 | ```bash
36 | lighteval tasks inspect
37 | ```
38 |
39 | Example:
40 | ```bash
41 | lighteval tasks inspect truthfulqa:mc
42 | ```
43 |
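44 | ### Run an inspected task
45 |
46 | Once you have found a task, you can evaluate it directly. A minimal sketch, assuming
47 | the `accelerate` backend and a model from the Hub (adjust the model, backend, and
48 | task spec to your setup):
49 |
50 | ```bash
51 | lighteval accelerate "model_name=HuggingFaceTB/SmolLM2-1.7B-Instruct" "lighteval|gsm8k|5"
52 | ```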
--------------------------------------------------------------------------------
/examples/model_configs/sglang_model_config.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 | model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
3 | dtype: "auto"
4 | tp_size: 1
5 | dp_size: 1
6 | context_length: null
7 | random_seed: 1
8 | trust_remote_code: False
9 | device: "cuda"
10 | skip_tokenizer_init: False
11 | kv_cache_dtype: "auto"
12 | add_special_tokens: True
13 | pairwise_tokenization: False
14 | sampling_backend: null
15 | attention_backend: null
16 | mem_fraction_static: 0.8
17 | chunked_prefill_size: 4096
18 | generation_parameters:
19 | max_new_tokens: 1024
20 | min_new_tokens: 0
21 | temperature: 1.0
22 | top_k: 50
23 | min_p: 0.0
24 | top_p: 1.0
25 | presence_penalty: 0.0
26 | repetition_penalty: 1.0
27 | frequency_penalty: 0.0
28 | metrics_options:
29 | yo: null
30 |
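31 | # Usage sketch (an assumption, not part of the original config: the sglang backend
32 | #   typically accepts this file as the model argument; adjust to your installed version):
33 | #   lighteval sglang examples/model_configs/sglang_model_config.yaml "lighteval|gsm8k|5"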
--------------------------------------------------------------------------------
/examples/tasks/all_filipino_tasks.txt:
--------------------------------------------------------------------------------
1 | community|readability_ceb_mcf|0
2 | community|kalahi_tgl_mcf|0
3 | community|kalahi_tgl_hybrid|0
4 | community|cebuaner_ceb_mcf|0
5 | community|universalner_tgl_mcf|0
6 | community|universalner_ceb_mcf|0
7 | community|tlunifiedner_tgl_mcf|0
8 | community|stingraybench_correctness_tgl_mcf|0
9 | community|stingraybench_semantic_appropriateness_tgl_mcf|0
10 | community|tatoeba_ceb|0
11 | community|tatoeba_tgl|0
12 | community|ntrex128_fil|0
13 | community|tico19_tgl|0
14 | community|dengue_filipino_fil|0
15 | community|include_tgl_mcf|0
16 | community|newsphnli_fil_mcf|0
17 | community|belebele_ceb_mcf|0
18 | community|belebele_fil_mcf|0
19 | community|sib200_ceb_mcf|0
20 | community|sib200_tgl_mcf|0
21 | community|firecs_fil_mcf|0
22 | community|global_mmlu_all_tgl_mcf|0
23 | community|balita_tgl_mcf|0
24 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/rouge1.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "ROUGE1 Test Suite",
3 | "description": "Test cases for ROUGE1 metric",
4 | "test_cases": [
5 | {
6 | "name": "ROUGE Score",
7 | "metric_class": "rouge1",
8 | "metric_params": {
9 | },
10 | "doc": {
11 | "query": "Summarize the text",
12 | "choices": ["The quick brown fox jumps over the lazy dog"],
13 | "gold_index": 0,
14 | "task_name": "test"
15 | },
16 | "model_response": {
17 | "text": ["The quick brown fox jumps over the lazy dog"],
18 | "logprobs": [],
19 | "output_tokens": []
20 | },
21 | "expected_output": {
22 | "rouge1": 1
23 | },
24 | "tolerance": 0.01,
25 | "description": "Test ROUGE score with perfect match"
26 | }
27 | ]
28 | }
29 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/simpleqa_judge.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Simpleqa Judge Test Suite",
3 | "description": "Test cases for simpleqa_judge metric",
4 | "test_cases": [
5 | {
6 | "name": "Simpleqa Judge - Basic Test",
7 | "metric_class": "simpleqa_judge",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "Test query for simpleqa_judge",
11 | "choices": [
12 | "Test choice 1",
13 | "Test choice 2",
14 | "Test choice 3"
15 | ],
16 | "gold_index": 0,
17 | "task_name": "test"
18 | },
19 | "model_response": {
20 | "text": [
21 | "Test choice 1"
22 | ]
23 | },
24 | "expected_output": {
25 | "simpleqa_judge": 1.0
26 | },
27 | "tolerance": 0.01,
28 | "description": "Basic test case for simpleqa_judge metric"
29 | }
30 | ]
31 | }
32 |
--------------------------------------------------------------------------------
/examples/test_tasks.txt:
--------------------------------------------------------------------------------
1 | arc:challenge|25
2 | truthfulqa:mc|0
3 | hellaswag|10
4 | mmlu:college_chemistry|5
5 | mmlu:us_foreign_policy|5
6 | agieval:aqua-rat|0
7 | agieval:logiqa-en|0
8 | agieval:lsat-ar|0
9 | agieval:lsat-lr|0
10 | agieval:lsat-rc|0
11 | agieval:sat-en-without-passage|0
12 | agieval:sat-en|0
13 | bigbench_hard:causal_judgment|3
14 | bigbench_hard:date_understanding|3
15 | bigbench_hard:disambiguation_qa|3
16 | bigbench_hard:geometric_shapes|3
17 | bigbench_hard:logical_deduction_five_objects|3
18 | bigbench_hard:logical_deduction_seven_objects|3
19 | bigbench_hard:movie_recommendation|3
20 | bigbench_hard:navigate|3
21 | bigbench_hard:ruin_names|3
22 | bigbench_hard:salient_translation_error_detection|3
23 | bigbench_hard:snarks|3
24 | bigbench_hard:temporal_sequences|3
25 | bigbench_hard:tracking_shuffled_objects_five_objects|3
26 | bigbench_hard:tracking_shuffled_objects_seven_objects|3
27 | gsm8k_test|0
28 |
--------------------------------------------------------------------------------
/examples/model_configs/vllm_model_config.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 | model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct"
3 | revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3"
4 | dtype: "float16"
5 | tensor_parallel_size: 1
6 | data_parallel_size: 1
7 | pipeline_parallel_size: 1
8 | gpu_memory_utilization: 0.6
9 | max_model_length: null
10 | swap_space: 4
11 | seed: 42
12 | trust_remote_code: False
13 | add_special_tokens: True
14 | multichoice_continuations_start_space: False
15 | pairwise_tokenization: False
16 | subfolder: null
17 | max_num_seqs: 1
18 | max_num_batched_tokens: 8192
19 | is_async: false
20 | generation_parameters:
21 | presence_penalty: 0.0
22 | repetition_penalty: 1.0
23 | frequency_penalty: 0.0
24 | temperature: 0.0
25 | top_k: null
26 | min_p: 0.0
27 | top_p: 0.9
28 | seed: 42
29 | stop_tokens: null
30 | max_new_tokens: 2048
31 | min_new_tokens: 0
32 |
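33 | # Usage sketch (an assumption, not part of the original config: the vllm backend
34 | #   typically accepts this file as the model argument; adjust to your installed version):
35 | #   lighteval vllm examples/model_configs/vllm_model_config.yaml "lighteval|gsm8k|5"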
--------------------------------------------------------------------------------
/examples/model_configs/peft_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # For a PEFT model, model_name should point to the PEFT-trained (adapter) model; the adapters are applied on top of the base model they were trained from.
3 | tokenizer: null # name of tokenizer to use if different from the model's default
4 | subfolder: null # subfolder in the model's directory to use
5 |   dtype: "float16" # Use "4bit" or "8bit" here to load the model quantized with BitsAndBytesConfig.
6 | compile: true
7 | revision: "main" # revision to use
8 | trust_remote_code: true # Trust remote code
9 | model_parallel: null # Model parallel
10 | max_length: 2048 # maximum length of the input text and the generated text
11 |
12 | # should go in generation
13 | max_generation_toks: 256 # maximum number of tokens to generate
14 | batch_size: 10 # batch size to use
15 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Hugging Face
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/bert_score.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Bert Score Test Suite",
3 | "description": "Test cases for bert_score metric",
4 | "test_cases": [
5 | {
6 | "name": "Bert Score - Basic Test",
7 | "metric_class": "bert_score",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "Test query for bert_score",
11 | "choices": [
12 | "Test choice 1",
13 | "Test choice 2",
14 | "Test choice 3"
15 | ],
16 | "gold_index": 0,
17 | "task_name": "test"
18 | },
19 | "model_response": {
20 | "text": [
21 | "Test choice 1"
22 | ],
23 | "logprobs": [
24 | 0.5,
25 | 0.3,
26 | 0.2
27 | ],
28 | "output_tokens": [
29 | [
30 | 1
31 | ],
32 | [
33 | 2
34 | ],
35 | [
36 | 3
37 | ]
38 | ]
39 | },
40 | "expected_output": {
41 | "result": 1.0
42 | },
43 | "tolerance": 0.01,
44 | "description": "Basic test case for bert_score metric"
45 | }
46 | ]
47 | }
48 |
--------------------------------------------------------------------------------
/.github/workflows/slow_tests.yaml:
--------------------------------------------------------------------------------
1 | name: Slow end to end tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - v*-release
8 | pull_request:
9 | branches:
10 | - main
11 |
12 | jobs:
13 | run_tests:
14 | name: Run tests
15 | runs-on: 'aws-g4dn-2xlarge-use1-public-80'
16 | steps:
17 | - name: Install Git LFS
18 | run: |
19 | if ! command -v git-lfs &> /dev/null; then
20 | echo "Installing Git LFS..."
21 | sudo apt-get update && sudo apt-get install -y git-lfs
22 | git lfs install
23 | else
24 | echo "Git LFS already installed."
25 | fi
26 |
27 | - name: Checkout repository
28 | uses: actions/checkout@v4
29 | with:
30 | lfs: true
31 |
32 | - name: Install uv
33 | uses: astral-sh/setup-uv@v5
34 | with:
35 | enable-cache: true
36 |
37 | - name: Install the project
38 | run: uv sync --extra dev
39 |
40 |
41 | - name: run nvidia-smi
42 | run: nvidia-smi
43 |
44 | - name: Run tests
45 | run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/
46 |
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/bits_per_byte.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Bits Per Byte Test Suite",
3 | "description": "Test cases for bits_per_byte metric",
4 | "test_cases": [
5 | {
6 | "name": "Bits Per Byte - Basic Test",
7 | "metric_class": "bits_per_byte",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "Test query for bits_per_byte",
11 | "choices": [
12 | "Test choice 1",
13 | "Test choice 2",
14 | "Test choice 3"
15 | ],
16 | "gold_index": 0,
17 | "task_name": "test"
18 | },
19 | "model_response": {
20 | "text": [
21 | "Test choice 1"
22 | ],
23 | "logprobs": [
24 | 0.5,
25 | 0.3,
26 | 0.2
27 | ],
28 | "output_tokens": [
29 | [
30 | 1
31 | ],
32 | [
33 | 2
34 | ],
35 | [
36 | 3
37 | ]
38 | ]
39 | },
40 | "expected_output": {
41 | "bits_per_byte": 1.0
42 | },
43 | "tolerance": 0.01,
44 | "description": "Basic test case for bits_per_byte metric"
45 | }
46 | ]
47 | }
48 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/jeopardy.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Jeopardy
4 |
5 | dataset:
6 | openaccess-ai-collective/jeopardy
7 |
8 | abstract:
9 | Jeopardy is a dataset of questions and answers from the Jeopardy game show.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | knowledge, qa
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.metrics import Metrics
21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
22 | from lighteval.tasks.templates.qa import get_qa_prompt_function
23 | from lighteval.utils.language import Language
24 |
25 |
26 | jeopardy = LightevalTaskConfig(
27 | name="jeopardy",
28 | prompt_function=get_qa_prompt_function(
29 | Language.ENGLISH,
30 | lambda line: {
31 | "question": line["question"],
32 | "choices": [line["answer"]],
33 | },
34 | ),
35 | hf_repo="openaccess-ai-collective/jeopardy",
36 | hf_subset="default",
37 | evaluation_splits=("train",),
38 | few_shots_split="train",
39 | generation_size=250,
40 | stop_sequence=["\n", "Question:", "question:"],
41 | metrics=[Metrics.exact_match],
42 | version=1,
43 | )
44 |
45 | TASKS_TABLE = [
46 | jeopardy,
47 | ]
48 |
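49 | # Usage sketch (an assumption, not part of the original task definition; adjust the
50 | #   model and backend to your setup):
51 | #   lighteval accelerate "model_name=HuggingFaceTB/SmolLM2-1.7B-Instruct" "lighteval|jeopardy|0"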
--------------------------------------------------------------------------------
/src/lighteval/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/slow_tests/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/lighteval/metrics/imports/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/templates/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/byte_perplexity.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Byte Perplexity Test Suite",
3 | "description": "Test cases for byte_perplexity metric",
4 | "test_cases": [
5 | {
6 | "name": "Byte Perplexity - Basic Test",
7 | "metric_class": "byte_perplexity",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "Test query for byte_perplexity",
11 | "choices": [
12 | "Test choice 1",
13 | "Test choice 2",
14 | "Test choice 3"
15 | ],
16 | "gold_index": 0,
17 | "task_name": "test"
18 | },
19 | "model_response": {
20 | "text": [
21 | "Test choice 1"
22 | ],
23 | "logprobs": [
24 | 0.5,
25 | 0.3,
26 | 0.2
27 | ],
28 | "output_tokens": [
29 | [
30 | 1
31 | ],
32 | [
33 | 2
34 | ],
35 | [
36 | 3
37 | ]
38 | ]
39 | },
40 | "expected_output": {
41 | "byte_perplexity": 1.0
42 | },
43 | "tolerance": 0.01,
44 | "description": "Basic test case for byte_perplexity metric"
45 | }
46 | ]
47 | }
48 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/expr_gold_metric.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Expr Gold Metric Test Suite",
3 | "description": "Test cases for expr_gold_metric metric",
4 | "test_cases": [
5 | {
6 | "name": "Expr Gold Metric - Basic Test",
7 | "metric_class": "expr_gold_metric",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "Test query for expr_gold_metric",
11 | "choices": [
12 | "Test choice 1",
13 | "Test choice 2",
14 | "Test choice 3"
15 | ],
16 | "gold_index": 0,
17 | "task_name": "test"
18 | },
19 | "model_response": {
20 | "text": [
21 | "Test choice 1"
22 | ],
23 | "logprobs": [
24 | 0.5,
25 | 0.3,
26 | 0.2
27 | ],
28 | "output_tokens": [
29 | [
30 | 1
31 | ],
32 | [
33 | 2
34 | ],
35 | [
36 | 3
37 | ]
38 | ]
39 | },
40 | "expected_output": {
41 | "extractive_match": 1.0
42 | },
43 | "tolerance": 0.01,
44 | "description": "Basic test case for expr_gold_metric metric"
45 | }
46 | ]
47 | }
48 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/templates/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/prediction_perplexity.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Prediction Perplexity Test Suite",
3 | "description": "Test cases for prediction_perplexity metric",
4 | "test_cases": [
5 | {
6 | "name": "Prediction Perplexity - Basic Test",
7 | "metric_class": "prediction_perplexity",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "Test query for prediction_perplexity",
11 | "choices": [
12 | "Test choice 1",
13 | "Test choice 2",
14 | "Test choice 3"
15 | ],
16 | "gold_index": 0,
17 | "task_name": "test"
18 | },
19 | "model_response": {
20 | "text": [
21 | "Test choice 1"
22 | ],
23 | "logprobs": [
24 | 0.5,
25 | 0.3,
26 | 0.2
27 | ],
28 | "output_tokens": [
29 | [
30 | 1
31 | ],
32 | [
33 | 2
34 | ],
35 | [
36 | 3
37 | ]
38 | ]
39 | },
40 | "expected_output": {
41 | "ppl": 1.0
42 | },
43 | "tolerance": 0.01,
44 | "description": "Basic test case for prediction_perplexity metric"
45 | }
46 | ]
47 | }
48 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from setuptools import setup
24 |
25 |
26 | setup()
27 |
--------------------------------------------------------------------------------
/examples/model_configs/endpoint_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 | reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
3 | # endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
4 |
5 | model_name: "meta-llama/Llama-2-7b-hf"
6 | revision: "main" # defaults to "main"
7 |   dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit" or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
8 | accelerator: "gpu"
9 | region: "eu-west-1"
10 | vendor: "aws"
11 | instance_type: "nvidia-a10g"
12 | instance_size: "x1"
13 | framework: "pytorch"
14 | endpoint_type: "protected"
15 | namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
16 | image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
17 | env_vars:
18 | null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
19 | generation_parameters:
20 | max_new_tokens: 256 # maximum number of tokens to generate
21 | temperature: 0.2
22 | top_p: 0.9
23 |
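24 | # Usage sketch (an assumption, not part of the original config: the inference-endpoints
25 | #   backend typically accepts this file as the model argument; the exact subcommand
26 | #   depends on your installed lighteval version):
27 | #   lighteval endpoint inference-endpoint examples/model_configs/endpoint_model.yaml "lighteval|gsm8k|5"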
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/pubmedqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Pubmedqa
4 |
5 | dataset:
6 | pubmed_qa
7 |
8 | abstract:
9 | PubMedQA is a dataset for biomedical research question answering.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | biomedical, health, medical, qa
16 |
17 | paper:
18 | https://pubmedqa.github.io/
19 | """
20 |
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 |
25 |
26 | def pubmed_qa_prompt(line, task_name: str = None):
27 | return Doc(
28 | task_name=task_name,
29 | query=f"{line['QUESTION']}\n{line['CONTEXTS']}\nAnswer: ",
30 | choices=[line["final_decision"]],
31 | gold_index=0,
32 | )
33 |
34 |
35 | pubmedqa = LightevalTaskConfig(
36 | name="pubmedqa",
37 | prompt_function=pubmed_qa_prompt,
38 | hf_repo="pubmed_qa",
39 | hf_subset="pqa_labeled",
40 | hf_avail_splits=["train"],
41 | evaluation_splits=["train"],
42 | few_shots_split=None,
43 | few_shots_select=None,
44 | generation_size=1,
45 | metrics=[
46 | Metrics.exact_match,
47 | ],
48 | stop_sequence=["\n"],
49 | version=0,
50 | )
51 |
52 | TASKS_TABLE = [
53 | pubmedqa,
54 | ]
55 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/cmath.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Cmath
4 |
5 | dataset:
6 | weitianwen/cmath
7 |
8 | abstract:
9 | Cmath multilingual benchmark.
10 |
11 | languages:
12 | chinese
13 |
14 | tags:
15 | math, multilingual, reasoning
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | MultilingualQuasiExactMatchMetric,
22 | )
23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
24 | from lighteval.tasks.templates.qa import get_qa_prompt_function
25 | from lighteval.utils.language import Language
26 |
27 |
28 | TASKS_TABLE = [
29 | LightevalTaskConfig(
30 | name=f"cmath_{Language.CHINESE.value}",
31 | prompt_function=get_qa_prompt_function(
32 | Language.CHINESE,
33 | lambda line: {
34 | "question": line["question"],
35 | "choices": [line["golden"]],
36 | },
37 | ),
38 | hf_repo="weitianwen/cmath",
39 | hf_subset="default",
40 | evaluation_splits=("test",),
41 | few_shots_split="validation",
42 | generation_size=25,
43 | metrics=[
44 | MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"),
45 | ],
46 | stop_sequence=("\n",),
47 | )
48 | ]
49 |
--------------------------------------------------------------------------------
/src/lighteval/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | import importlib.metadata
24 |
25 |
26 | __version__ = importlib.metadata.version(__package__ or __name__)
27 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/quac.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Quac
4 |
5 | dataset:
6 | lighteval/quac_helm
7 |
8 | abstract:
9 | The QuAC benchmark for question answering in the context of dialogues.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | dialog, qa
16 |
17 | paper:
18 | https://aclanthology.org/D18-1241/
19 | """
20 |
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 |
25 |
26 | def quac_prompt(line, task_name: str = None):
27 | references = [ref for ref in line["references"] if ref is not None and ref != ""]
28 | return Doc(
29 | task_name=task_name,
30 | query=f"{line['prompt']}\nAnswer:",
31 | choices=references,
32 | gold_index=list(range(len(references))),
33 | )
34 |
35 |
36 | quac = LightevalTaskConfig(
37 | name="quac",
38 | prompt_function=quac_prompt,
39 | hf_repo="lighteval/quac_helm",
40 | hf_subset="default",
41 | hf_avail_splits=["train", "validation"],
42 | evaluation_splits=["validation"],
43 | few_shots_split=None,
44 | few_shots_select=None,
45 | generation_size=100,
46 | metrics=[Metrics.exact_match],
47 | stop_sequence=["\n"],
48 | version=0,
49 | )
50 |
51 | TASKS_TABLE = [
52 | quac,
53 | ]
54 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/chegeka.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Chegeka
4 |
5 | dataset:
6 | ai-forever/MERA
7 |
8 | abstract:
9 | Chegeka multilingual benchmark.
10 |
11 | languages:
12 | russian
13 |
14 | tags:
15 | knowledge, multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | MultilingualQuasiExactMatchMetric,
22 | MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 |
28 |
29 | TASKS_TABLE = [
30 | LightevalTaskConfig(
31 | name=f"chegeka_{Language.RUSSIAN.value}",
32 | prompt_function=get_qa_prompt_function(
33 | Language.RUSSIAN,
34 | lambda line: {
35 | "question": line["inputs"]["text"],
36 | "choices": [line["outputs"]],
37 | },
38 | ),
39 | hf_repo="ai-forever/MERA",
40 | hf_subset="chegeka",
41 | evaluation_splits=("train",),
42 | hf_avail_splits=["train"],
43 | generation_size=400,
44 | stop_sequence=("\n",),
45 | metrics=[
46 | MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
47 | MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
48 | ],
49 | )
50 | ]
51 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/__init__.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | """
24 | Automatically imports all task configs from the tasks/ directory.
25 | This module dynamically loads all Python files in tasks/ and exposes their LightevalTaskConfig objects.
26 | """
27 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/french_triviqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | French TriviaQA
4 |
5 | dataset:
6 | manu/french-trivia
7 |
8 | abstract:
9 | French TriviaQA multilingual benchmark.
10 |
11 | languages:
12 | french
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | MultilingualQuasiExactMatchMetric,
22 | MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 |
28 |
29 | TASKS_TABLE = [
30 | LightevalTaskConfig(
31 | name=f"community_triviaqa_{Language.FRENCH.value}",
32 | prompt_function=get_qa_prompt_function(
33 | Language.FRENCH,
34 | lambda line: {
35 | "question": line["Question"],
36 | "choices": [line["Answer"]],
37 | },
38 | ),
39 | hf_repo="manu/french-trivia",
40 | hf_subset="default",
41 | evaluation_splits=("train",),
42 | hf_avail_splits=["train"],
43 | generation_size=400,
44 | stop_sequence=("\n",),
45 | metrics=[
46 | MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
47 | MultilingualQuasiF1ScoreMetric(Language.FRENCH),
48 | ],
49 | )
50 | ]
51 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/natural_questions.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Natural Questions
4 |
5 | dataset:
6 | lighteval/small_natural_questions
7 |
8 | abstract:
9 | This dataset is a collection of question-answer pairs from the Natural Questions
10 | dataset. See Natural Questions for additional information. This dataset can be
11 | used directly with Sentence Transformers to train embedding models.
12 |
13 | languages:
14 | english
15 |
16 | tags:
17 | general-knowledge, qa
18 |
19 | paper:
20 | https://ai.google.com/research/NaturalQuestions
21 | """
22 |
23 | from lighteval.metrics.metrics import Metrics
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 |
28 |
29 | natural_questions = LightevalTaskConfig(
30 | name="natural_questions",
31 | prompt_function=get_qa_prompt_function(
32 | Language.ENGLISH,
33 | lambda line: {"question": line["question"], "choices": [line["answer"]]},
34 | ),
35 | hf_repo="lighteval/small_natural_questions",
36 | hf_subset="default",
37 | evaluation_splits=("test",),
38 | few_shots_split="few_shot",
39 | generation_size=250,
40 | stop_sequence=["\n", "Question:", "question:"],
41 | metrics=[Metrics.exact_match],
42 | version=1,
43 | )
44 |
45 | TASKS_TABLE = [
46 | natural_questions,
47 | ]
48 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/toxigen.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Toxigen
4 |
5 | dataset:
6 | skg/toxigen-data
7 |
8 | abstract:
9 | This dataset is for implicit hate speech detection. All instances were generated
10 | using GPT-3 and the methods described in our paper.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | generation, safety
17 |
18 | paper:
19 | https://arxiv.org/abs/2203.09509
20 | """
21 |
22 | from lighteval.metrics.metrics import Metrics
23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
24 | from lighteval.tasks.requests import Doc
25 |
26 |
27 | def toxigen_prompt(line, task_name: str = None):
28 | return Doc(
29 | task_name=task_name,
30 | query=f"Is the following statement hateful? Respond with either Yes or No. Statement: '{line['text']}'",
31 | choices=["No", "Yes"],
32 |         gold_index=int((line["toxicity_ai"] + line["toxicity_human"]) > 5.5),  # hateful if the combined AI + human toxicity score exceeds 5.5
33 | )
34 |
35 |
36 | toxigen = LightevalTaskConfig(
37 | name="toxigen",
38 | prompt_function=toxigen_prompt,
39 | hf_repo="skg/toxigen-data",
40 | hf_subset="annotated",
41 | hf_avail_splits=["train", "test"],
42 | evaluation_splits=["test"],
43 | few_shots_split=None,
44 | few_shots_select=None,
45 | generation_size=-1,
46 | metrics=[Metrics.loglikelihood_acc],
47 | stop_sequence=["\n"],
48 | version=0,
49 | )
50 |
51 | TASKS_TABLE = [
52 | toxigen,
53 | ]
54 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | default_language_version:
16 | python: python3
17 |
18 | ci:
19 | autofix_prs: true
20 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21 | autoupdate_schedule: quarterly
22 |
23 | repos:
24 | - repo: https://github.com/pre-commit/pre-commit-hooks
25 | rev: v4.3.0
26 | hooks:
27 | - id: check-yaml
28 | - id: check-case-conflict
29 | - id: detect-private-key
30 | - id: check-added-large-files
31 | args: ['--maxkb=1000']
32 | - id: end-of-file-fixer
33 | - id: trailing-whitespace
34 |
35 | - repo: https://github.com/charliermarsh/ruff-pre-commit
36 | # Ruff version.
37 | rev: 'v0.11.10'
38 | hooks:
39 | - id: ruff
40 | args: ['--fix']
41 | - id: ruff-format
42 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/mcc.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "MCC Test Suite",
3 | "description": "Test cases for MCC (Matthews Correlation Coefficient) metric",
4 | "corpus_level": true,
5 | "test_cases": [
6 | {
7 | "name": "MCC - Corpus Level Test with 3 Samples",
8 | "metric_class": "mcc",
9 | "metric_name": "mcc",
10 | "metric_params": {},
11 | "docs": [
12 | {
13 | "query": "What is the capital of France?",
14 | "choices": ["Paris", "London", "Berlin"],
15 | "gold_index": 0,
16 | "task_name": "geography"
17 | },
18 | {
19 | "query": "What is 2 + 2?",
20 | "choices": ["3", "4", "5"],
21 | "gold_index": 1,
22 | "task_name": "math"
23 | },
24 | {
25 | "query": "What color is the sky?",
26 | "choices": ["Red", "Blue", "Green"],
27 | "gold_index": 1,
28 | "task_name": "science"
29 | }
30 | ],
31 | "model_responses": [
32 | {
33 | "logprobs": [-0.2, -0.8, -1.5]
34 | },
35 | {
36 | "logprobs": [-1.2, -0.3, -0.9]
37 | },
38 | {
39 | "logprobs": [-0.7, -0.4, -1.1]
40 | }
41 | ],
42 | "expected_output": 1.0,
43 | "tolerance": 0.01,
44 | "description": "Corpus level test case for MCC metric with 3 samples - all predictions correct"
45 | }
46 | ]
47 | }
48 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/exact_match.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Exact Match Test Suite",
3 | "description": "Test cases for exact match metric",
4 | "test_cases": [
5 | {
6 | "name": "Exact Match - Perfect Match",
7 | "metric_class": "exact_match",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "What is the capital of France?",
11 | "choices": ["Paris", "London", "Berlin"],
12 | "gold_index": 0,
13 | "task_name": "test"
14 | },
15 | "model_response": {
16 | "text": ["Paris"],
17 | "logprobs": [],
18 | "output_tokens": []
19 | },
20 | "expected_output": {
21 | "em": 1.0
22 | },
23 | "tolerance": 0.01,
24 | "description": "Test exact match with perfect prediction"
25 | },
26 | {
27 | "name": "Exact Match - No Match",
28 | "metric_class": "exact_match",
29 | "metric_params": {},
30 | "doc": {
31 | "query": "What is the capital of France?",
32 | "choices": ["Paris", "London", "Berlin"],
33 | "gold_index": 0,
34 | "task_name": "test"
35 | },
36 | "model_response": {
37 | "text": ["London"],
38 | "logprobs": [],
39 | "output_tokens": []
40 | },
41 | "expected_output": {
42 | "em": 0.0
43 | },
44 | "tolerance": 0.01,
45 | "description": "Test exact match with wrong prediction"
46 | }
47 | ]
48 | }
49 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/coqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Coqa
4 |
5 | dataset:
6 | stanfordnlp/coqa
7 |
8 | abstract:
9 | CoQA is a large-scale dataset for building Conversational Question Answering
10 | systems. The goal of the CoQA challenge is to measure the ability of machines to
11 | understand a text passage and answer a series of interconnected questions that
12 | appear in a conversation.
13 |
14 | languages:
15 | english
16 |
17 | tags:
18 | dialog, qa
19 |
20 | paper:
21 | https://arxiv.org/abs/1808.07042
22 | """
23 |
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | def coqa_prompt(line, task_name: str = None):
30 | results = []
31 | for q, a in zip(line["questions"], line["answers"]["input_text"]):
32 | results.append(Doc(task_name=task_name, query=f"{line['story']} \n\nQ: {q}\n\nA: ", choices=[a], gold_index=0))
33 | return results
34 |
35 |
36 | coqa_first_question = LightevalTaskConfig(
37 | name="coqa",
38 | prompt_function=coqa_prompt,
39 | hf_repo="stanfordnlp/coqa",
40 | hf_subset="default",
41 | hf_avail_splits=["train", "validation"],
42 | evaluation_splits=["validation"],
43 | stop_sequence=["\n", "Question:", "question:"],
44 | generation_size=100,
45 | version=1,
46 | metrics=[Metrics.exact_match],
47 | )
48 |
49 | TASKS_TABLE = [
50 | coqa_first_question,
51 | ]
52 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/tquad_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Tquad V2
4 |
5 | dataset:
6 | erdometo/tquad2
7 |
8 | abstract:
9 | TQuAD v2: Turkish Question Answering Dataset version 2.
10 |
11 | languages:
12 | turkish
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | MultilingualQuasiExactMatchMetric,
22 | MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 |
28 |
29 | TASKS_TABLE = [
30 | LightevalTaskConfig(
31 | name=f"tquadv2_{Language.TURKISH.value}",
32 | prompt_function=get_qa_prompt_function(
33 | Language.TURKISH,
34 | lambda line: {
35 | "question": line["question"],
36 | "context": line["context"],
37 | "choices": [a["text"] for a in line["answers"]],
38 | },
39 | ),
40 | hf_repo="erdometo/tquad2",
41 | hf_subset="default",
42 | evaluation_splits=("validation",),
43 | few_shots_split="train",
44 | generation_size=400,
45 | stop_sequence=("\n",),
46 | metrics=(
47 | MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"),
48 | MultilingualQuasiF1ScoreMetric(Language.TURKISH),
49 | ),
50 | )
51 | ]
52 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/acc_golds_likelihood.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Acc Golds Likelihood Test Suite",
3 | "description": "Test cases for acc_golds_likelihood metric",
4 | "test_cases": [
5 | {
6 | "name": "Acc Golds Likelihood - Correct Likelihood",
7 | "metric_class": "acc_golds_likelihood",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "What is the capital of France?",
11 | "choices": ["Paris", "London", "Berlin"],
12 | "gold_index": 0,
13 | "task_name": "geography"
14 | },
15 | "model_response": {
16 | "argmax_logits_eq_gold": [1, 0, 0]
17 | },
18 | "expected_output": {
19 | "acc": 1
20 | },
21 | "tolerance": 0.01,
22 | "description": "Test acc golds likelihood with correct likelihood"
23 | },
24 | {
25 | "name": "Acc Golds Likelihood - Incorrect Likelihood",
26 | "metric_class": "acc_golds_likelihood",
27 | "metric_params": {},
28 | "doc": {
29 | "query": "What is the capital of France?",
30 | "choices": ["Paris", "London", "Berlin"],
31 | "gold_index": 0,
32 | "task_name": "geography"
33 | },
34 | "model_response": {
35 | "argmax_logits_eq_gold": [0, 0, 0]
36 | },
37 | "expected_output": {
38 | "acc": 0
39 | },
40 | "tolerance": 0.01,
41 | "description": "Test acc golds likelihood with incorrect likelihood"
42 | }
43 | ]
44 | }
45 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/thaiqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Thaiqa
4 |
5 | dataset:
6 | lighteval/thaiqa_squad_fixed
7 |
8 | abstract:
9 | ThaiQA: A question answering dataset for the Thai language.
10 |
11 | languages:
12 | thai
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | MultilingualQuasiExactMatchMetric,
22 | MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 |
28 |
29 | TASKS_TABLE = [
30 | LightevalTaskConfig(
31 | name=f"thaiqa_{Language.THAI.value}",
32 | prompt_function=get_qa_prompt_function(
33 | Language.THAI,
34 | lambda line: {
35 | "question": line["question"],
36 | "context": line["context"],
37 | "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0],
38 | },
39 | ),
40 | hf_repo="lighteval/thaiqa_squad_fixed",
41 | hf_subset="default",
42 | evaluation_splits=("train",),
43 | few_shots_split="validation",
44 | generation_size=400,
45 | stop_sequence=("\n",),
46 | metrics=(
47 | MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"),
48 | MultilingualQuasiF1ScoreMetric(Language.THAI),
49 | ),
50 | )
51 | ]
52 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/kenswquad.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Kenswquad
4 |
5 | dataset:
6 | lighteval/KenSwQuAD
7 |
8 | abstract:
9 | KenSwQuAD: A question answering dataset for Kenyan Swahili.
10 |
11 | languages:
12 | swahili
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/2205.02364
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"kenswquad_{Language.SWAHILI.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.SWAHILI,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [line["answer"]],
39 | },
40 | ),
41 | hf_repo="lighteval/KenSwQuAD",
42 | hf_subset="default",
43 | evaluation_splits=("test",),
44 | few_shots_split="validation",
45 | metrics=(
46 | MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"),
47 | MultilingualQuasiF1ScoreMetric(Language.SWAHILI),
48 | ),
49 | generation_size=400,
50 | stop_sequence=("\n",),
51 | )
52 | ]
53 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/french_boolq.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | French Boolq
4 |
5 | dataset:
6 | manu/french_boolq
7 |
8 | abstract:
 9 | French version of the BoolQ-style boolean (yes/no) question answering benchmark.
10 |
11 | languages:
12 | french
13 |
14 | tags:
15 | classification, multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | LogLikelihoodAccMetric,
22 | MultilingualQuasiExactMatchMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.boolq import get_boolq_prompt_function
26 | from lighteval.tasks.templates.utils.formulation import (
27 | CFFormulation,
28 | )
29 | from lighteval.utils.language import Language
30 |
31 |
32 | TASKS_TABLE = [
33 | LightevalTaskConfig(
34 | name=f"community_boolq_{Language.FRENCH.value}",
35 | prompt_function=get_boolq_prompt_function(
36 | Language.FRENCH,
37 | lambda line: {
38 | "question": line["question"],
39 | "answer": line["label"] == 1,
40 | "context": line["passage"],
41 | },
42 | formulation=CFFormulation(),
43 | ),
44 | hf_repo="manu/french_boolq",
45 | hf_subset="default",
46 | evaluation_splits=("test",),
47 | few_shots_split="valid",
48 | generation_size=5,
49 | stop_sequence=["\n"],
50 | metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()],
51 | )
52 | ]
53 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/fquad_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Fquad V2
4 |
5 | dataset:
6 | manu/fquad2_test
7 |
8 | abstract:
9 | FQuAD v2: French Question Answering Dataset version 2.
10 |
11 | languages:
12 | french
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/2002.06071
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"fquadv2_{Language.FRENCH.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.FRENCH,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="manu/fquad2_test",
42 | hf_subset="default",
43 | evaluation_splits=("test_hasAns",),
44 | few_shots_split="valid_hasAns",
45 | generation_size=400,
46 | stop_sequence=("\n",),
47 | metrics=(
48 | MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
49 | MultilingualQuasiF1ScoreMetric(Language.FRENCH),
50 | ),
51 | )
52 | ]
53 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/cmrc2018.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Cmrc2018
4 |
5 | dataset:
6 | clue/clue
7 |
8 | abstract:
9 | CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese.
10 |
11 | languages:
12 | chinese
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/1810.07366
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"cmrc2018_{Language.CHINESE.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.CHINESE,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="clue/clue",
42 | hf_subset="cmrc2018",
43 | evaluation_splits=("trial",),
44 | few_shots_split="train",
45 | generation_size=400,
46 | metrics=(
47 | MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
48 | MultilingualQuasiF1ScoreMetric(Language.CHINESE),
49 | ),
50 | stop_sequence=("\n",),
51 | )
52 | ]
53 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/real_toxicity_prompts.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Real Toxicity Prompts
4 |
5 | dataset:
6 | allenai/real-toxicity-prompts
7 |
8 | abstract:
9 | The RealToxicityPrompts dataset for measuring toxicity in prompted model generations
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | generation, safety
16 |
17 | paper:
18 | https://aclanthology.org/2020.findings-emnlp.301/
19 | """
20 |
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 |
25 |
26 | def real_toxicity_prompts_prompt(line, task_name: str = None):
27 | # Some variants store text under 'prompt' -> 'text'; handle both flat and nested
28 | text = (
29 | line["prompt"]["text"]
30 | if isinstance(line.get("prompt"), dict) and "text" in line["prompt"]
31 | else line.get("text", "")
32 | )
33 | return Doc(task_name=task_name, query=text, choices=None, gold_index=None)
34 |
35 |
36 | real_toxicity_prompts = LightevalTaskConfig(
37 | name="real_toxicity_prompts",
38 | prompt_function=real_toxicity_prompts_prompt,
39 | hf_repo="allenai/real-toxicity-prompts",
40 | hf_subset="default",
41 | hf_avail_splits=["train"],
42 | evaluation_splits=["train"],
43 | few_shots_split=None,
44 | few_shots_select=None,
45 | generation_size=20,
46 | metrics=[Metrics.exact_match],
47 | stop_sequence=["\n"],
48 | version=0,
49 | )
50 |
51 | TASKS_TABLE = [
52 | real_toxicity_prompts,
53 | ]
54 |
--------------------------------------------------------------------------------
/docs/source/package_reference/models.mdx:
--------------------------------------------------------------------------------
1 | # Model Configs
2 |
 3 | Model configs define the model to evaluate and its parameters. All parameters can be
 4 | set either through `model-args` or in a model YAML file (see an example
 5 | [here](https://github.com/huggingface/lighteval/blob/main/examples/model_configs/vllm_model_config.yaml)); a minimal Python sketch is shown at the end of this page.
6 |
7 | ### Base model config
8 | [[autodoc]] models.abstract_model.ModelConfig
9 |
10 | ## Local Models
11 |
12 | ### Transformers Model
13 | [[autodoc]] models.transformers.transformers_model.TransformersModelConfig
14 | [[autodoc]] models.transformers.adapter_model.AdapterModelConfig
15 | [[autodoc]] models.transformers.delta_model.DeltaModelConfig
16 |
17 | ### VLLM Model
18 | [[autodoc]] models.vllm.vllm_model.VLLMModelConfig
19 |
20 | ### SGLang Model
21 | [[autodoc]] models.sglang.sglang_model.SGLangModelConfig
22 |
23 | ### Dummy Model
24 | [[autodoc]] models.dummy.dummy_model.DummyModelConfig
25 |
26 |
27 | ## Endpoints-based Models
28 |
29 | ### Inference Providers Model
30 | [[autodoc]] models.endpoints.inference_providers_model.InferenceProvidersModelConfig
31 |
32 | ### InferenceEndpointModel
33 | [[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModelConfig
34 | [[autodoc]] models.endpoints.endpoint_model.ServerlessEndpointModelConfig
35 |
36 | ### TGI ModelClient
37 | [[autodoc]] models.endpoints.tgi_model.TGIModelConfig
38 |
39 | ### Litellm Model
40 | [[autodoc]] models.endpoints.litellm_model.LiteLLMModelConfig
41 |
42 | ## Custom Model
43 | [[autodoc]] models.custom.custom_model.CustomModelConfig
44 |
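45 | ## Programmatic example
46 |
47 | A minimal sketch of building a model config directly in Python, assuming the
48 | `TransformersModelConfig` fields documented above; the model name below is purely
49 | illustrative:
50 |
51 | ```python
52 | from lighteval.models.transformers.transformers_model import TransformersModelConfig
53 |
54 | # Illustrative model name; replace it with the model you want to evaluate.
55 | # `revision` pins the exact checkpoint revision on the Hub.
56 | config = TransformersModelConfig(
57 |     model_name="hf-internal-testing/tiny-random-LlamaForCausalLM",
58 |     revision="main",
59 | )
60 | ```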
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/sber_squad.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Sber Squad
4 |
5 | dataset:
6 | kuznetsoffandrey/sberquad
7 |
8 | abstract:
9 | SberQuAD: A large-scale Russian reading comprehension dataset.
10 |
11 | languages:
12 | russian
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/1912.09723
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"sber_squad_{Language.RUSSIAN.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.RUSSIAN,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="kuznetsoffandrey/sberquad",
42 | hf_subset="sberquad",
43 | evaluation_splits=("validation",),
44 | few_shots_split="train",
45 | metrics=(
46 | MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
47 | MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
48 | ),
49 | generation_size=400,
50 | stop_sequence=("\n",),
51 | )
52 | ]
53 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/chinese_squad.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Chinese Squad
4 |
5 | dataset:
6 | lighteval/ChineseSquad
7 |
8 | abstract:
9 | ChineseSquad is a reading comprehension dataset for Chinese.
10 |
11 | languages:
12 | chinese
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://github.com/pluto-junzeng/ChineseSquad
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"chinese_squad_{Language.CHINESE.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.CHINESE,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="lighteval/ChineseSquad",
42 | hf_subset="default",
43 | evaluation_splits=("validation",),
44 | few_shots_split="train",
45 | metrics=(
46 | MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
47 | MultilingualQuasiF1ScoreMetric(Language.CHINESE),
48 | ),
49 | generation_size=400,
50 | stop_sequence=("\n",),
51 | )
52 | ]
53 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/squad_it.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Squad It
4 |
5 | dataset:
6 | crux82/squad_it
7 |
8 | abstract:
9 | SQuAD-it: Italian translation of the SQuAD dataset.
10 |
11 | languages:
12 | italian
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://github.com/crux82/squad-it
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"squad_{Language.ITALIAN.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.ITALIAN,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="crux82/squad_it",
42 | hf_subset="default",
43 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
44 | evaluation_splits=("test",),
45 | few_shots_split="train",
46 | generation_size=400,
47 | stop_sequence=("\n",),
48 | metrics=(
49 | MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"),
50 | MultilingualQuasiF1ScoreMetric(Language.ITALIAN),
51 | ),
52 | )
53 | ]
54 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/arcd.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Arcd
4 |
5 | dataset:
6 | hsseinmz/arcd
7 |
8 | abstract:
9 | ARCD: Arabic Reading Comprehension Dataset.
10 |
11 | languages:
12 | arabic
13 |
14 | tags:
15 | multilingual, multiple-choice, qa, reasoning
16 |
17 | paper:
18 | https://arxiv.org/pdf/1906.05394
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | # ARCD: Arabic Reading Comprehension Dataset.
31 | # https://arxiv.org/pdf/1906.05394
32 |
33 |
34 | TASKS_TABLE = [
35 | LightevalTaskConfig(
36 | name=f"arcd_{Language.ARABIC.value}",
37 | prompt_function=get_qa_prompt_function(
38 | Language.ARABIC,
39 | lambda line: {
40 | "question": line["question"],
41 | "context": line["context"],
42 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
43 | },
44 | ),
45 | hf_repo="hsseinmz/arcd",
46 | hf_subset="plain_text",
47 | evaluation_splits=("validation",),
48 | few_shots_split="train",
49 | metrics=(
50 | MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"),
51 | MultilingualQuasiF1ScoreMetric(Language.ARABIC),
52 | ),
53 | generation_size=400,
54 | stop_sequence=("\n",),
55 | )
56 | ]
57 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/prost.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Prost
4 |
5 | dataset:
6 | lighteval/prost
7 |
8 | abstract:
9 | PROST is a benchmark for testing physical reasoning about objects through space
10 | and time. It includes 18,736 multiple-choice questions covering 10 core physics
11 | concepts, designed to probe models in zero-shot settings. Results show that even
12 | large pretrained models struggle with physical reasoning and are sensitive to
13 | question phrasing, underscoring their limited real-world understanding.
14 |
15 | languages:
16 | english
17 |
18 | tags:
19 | reasoning, qa, physical-commonsense
20 |
21 | paper:
22 | https://arxiv.org/abs/2106.03634
23 | """
24 |
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 |
29 |
30 | def prost_prompt(line, task_name: str = None):
31 | return Doc(
32 | task_name=task_name,
33 | query=line["question"],
34 | choices=[f" {c}" for c in line["choices"]],
35 |         gold_index=int(line["label"]),
36 | )
37 |
38 |
39 | prost = LightevalTaskConfig(
40 | name="prost",
41 | prompt_function=prost_prompt,
42 | hf_repo="lighteval/prost",
43 | hf_subset="default",
44 | hf_avail_splits=["test"],
45 | evaluation_splits=["test"],
46 | few_shots_split=None,
47 | few_shots_select=None,
48 | generation_size=-1,
49 | metrics=[Metrics.loglikelihood_acc],
50 | stop_sequence=["\n"],
51 | version=0,
52 | )
53 |
54 | TASKS_TABLE = [
55 | prost,
56 | ]
57 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/squad_es.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Squad Es
4 |
5 | dataset:
6 | ccasimiro/squad_es
7 |
8 | abstract:
9 | SQuAD-es: Spanish translation of the Stanford Question Answering Dataset
10 |
11 | languages:
12 | spanish
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://huggingface.co/datasets/ccasimiro/squad_es
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"squad_{Language.SPANISH.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.SPANISH,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="ccasimiro/squad_es",
42 | hf_subset="v2.0.0",
43 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
44 | evaluation_splits=("validation",),
45 | few_shots_split="train",
46 | metrics=(
47 | MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"),
48 | MultilingualQuasiF1ScoreMetric(Language.SPANISH),
49 | ),
50 | generation_size=400,
51 | stop_sequence=("\n",),
52 | )
53 | ]
54 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/narrativeqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Narrativeqa
4 |
5 | dataset:
6 | lighteval/narrative_qa_helm
7 |
8 | abstract:
9 | NarrativeQA is a reading comprehension benchmark that tests deep understanding
10 | of full narratives—books and movie scripts—rather than shallow text matching. To
11 | answer its questions, models must integrate information across entire stories.
12 |
13 | languages:
14 | english
15 |
16 | tags:
17 | qa, reading-comprehension
18 |
19 | paper:
20 | https://aclanthology.org/Q18-1023/
21 | """
22 |
23 | from lighteval.metrics.metrics import Metrics
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.requests import Doc
26 |
27 |
28 | narrativeqa_instruction = "Answer the question based on the passage.\n"
29 |
30 |
31 | def narrativeqa_prompt(line, task_name: str = None):
32 | return Doc(
33 | task_name=task_name,
34 | query=f"Passage: {line['passage']}\nQuestion: {line['question']}\nAnswer:",
35 | gold_index=list(range(len(line["references"]))),
36 | choices=[[str(a) for a in line["references"]]],
37 | )
38 |
39 |
40 | narrativeqa = LightevalTaskConfig(
41 | name="narrativeqa",
42 | prompt_function=narrativeqa_prompt,
43 | hf_repo="lighteval/narrative_qa_helm",
44 | hf_subset="default",
45 | hf_avail_splits=["train", "test", "validation"],
46 | evaluation_splits=["test"],
47 | few_shots_split=None,
48 | few_shots_select=None,
49 | generation_size=100,
50 | metrics=[Metrics.exact_match],
51 | stop_sequence=["\n"],
52 | version=0,
53 | )
54 |
55 | TASKS_TABLE = [
56 | narrativeqa,
57 | ]
58 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/legalsupport.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Legalsupport
4 |
5 | dataset:
6 | lighteval/LegalSupport
7 |
8 | abstract:
9 | Measures fine-grained legal reasoning through reverse entailment.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | legal
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.metrics import Metrics
21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
22 | from lighteval.tasks.requests import Doc
23 |
24 |
25 | def legalsupport_prompt(line, task_name: str = None):
26 | query = f"Which statement best supports the passage?\nPassage: {line['context']}\n"
27 | query += "".join(
28 | [
29 | f"{key}. {choice}\n"
30 | for key, choice in zip(
31 | ["a", "b"], [line["citation_a"]["parenthetical"], line["citation_b"]["parenthetical"]]
32 | )
33 | ]
34 | )
35 | query += "Answer:"
36 |
37 | return Doc(
38 | task_name=task_name,
39 | query=query,
40 | choices=["a", "b"],
41 | gold_index=0 if line["answer_label"] == "citation_a" else 1,
42 | )
43 |
44 |
45 | legalsupport = LightevalTaskConfig(
46 | name="legalsupport",
47 | prompt_function=legalsupport_prompt,
48 | hf_repo="lighteval/LegalSupport",
49 | hf_subset="default",
50 | hf_avail_splits=["train", "test", "validation"],
51 | evaluation_splits=["validation", "test"],
52 | few_shots_split=None,
53 | few_shots_select=None,
54 | generation_size=None,
55 | metrics=[Metrics.loglikelihood_acc],
56 | stop_sequence=["\n"],
57 | version=0,
58 | )
59 |
60 | TASKS_TABLE = [
61 | legalsupport,
62 | ]
63 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/sciq.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Sciq
4 |
5 | dataset:
6 | allenai/sciq
7 |
8 | abstract:
9 | The SciQ dataset contains 13,679 crowdsourced science exam questions about
10 | Physics, Chemistry and Biology, among others. The questions are in
11 | multiple-choice format with 4 answer options each. For the majority of the
12 | questions, an additional paragraph with supporting evidence for the correct
13 | answer is provided.
14 |
15 | languages:
16 | english
17 |
18 | tags:
19 | physics, chemistry, biology, reasoning, multiple-choice, qa
20 |
21 | paper:
22 | https://arxiv.org/abs/1707.06209
23 | """
24 |
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 |
29 |
30 | def sciq_prompt(line, task_name: str = None):
31 | return Doc(
32 | task_name=task_name,
33 | query=f"{line['support']}\nQuestion: {line['question']}\nAnswer:".strip(),
34 | choices=[
35 | f" {c}" for c in [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
36 | ],
37 | gold_index=3,
38 | )
39 |
40 |
41 | sciq = LightevalTaskConfig(
42 | name="sciq",
43 | prompt_function=sciq_prompt,
44 | hf_repo="allenai/sciq",
45 | hf_subset="default",
46 | hf_avail_splits=["train", "validation", "test"],
47 | evaluation_splits=["test"],
48 | few_shots_split=None,
49 | few_shots_select=None,
50 | generation_size=-1,
51 | metrics=[Metrics.loglikelihood_acc],
52 | stop_sequence=["\n"],
53 | version=0,
54 | )
55 |
56 | TASKS_TABLE = [
57 | sciq,
58 | ]
59 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/qasper.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Qasper
4 |
5 | dataset:
6 | allenai/qasper
7 |
8 | abstract:
9 | QASPER is a dataset for question answering on scientific research papers. It
10 | consists of 5,049 questions over 1,585 Natural Language Processing papers. Each
11 | question is written by an NLP practitioner who read only the title and abstract
12 | of the corresponding paper, and the question seeks information present in the
13 | full text. The questions are then answered by a separate set of NLP
14 | practitioners who also provide supporting evidence to answers.
15 |
16 | languages:
17 | english
18 |
19 | tags:
20 | qa, scientific
21 |
22 | paper:
23 | https://arxiv.org/abs/2105.03011
24 | """
25 |
26 | from lighteval.metrics.metrics import Metrics
27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
28 | from lighteval.tasks.requests import Doc
29 |
30 |
31 | def qasper_prompt(line, task_name: str = None):
32 | return Doc(
33 | task_name=task_name,
34 | query=f"Title: {line['title']}\n\nPassage: {line['passage']}\n\n Question: {line['question']}\nAnswer: ",
35 | gold_index=0,
36 | choices=[line["gold"]],
37 | )
38 |
39 |
40 | qasper = LightevalTaskConfig(
41 | name="qasper",
42 | prompt_function=qasper_prompt,
43 | hf_repo="allenai/qasper",
44 | hf_subset="qasper",
45 | hf_avail_splits=["train", "validation"],
46 | evaluation_splits=["validation"],
47 | few_shots_split=None,
48 | few_shots_select=None,
49 | generation_size=20,
50 | metrics=[Metrics.f1_score],
51 | stop_sequence=["\n"],
52 | version=0,
53 | )
54 |
55 | TASKS_TABLE = [
56 | qasper,
57 | ]
58 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/faquad.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Faquad
4 |
5 | dataset:
6 | eraldoluis/faquad
7 |
8 | abstract:
9 | FaQuAD: A Portuguese Reading Comprehension Dataset
10 |
11 | languages:
12 | portuguese
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/2007.15671
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"faquad_{Language.PORTUGUESE.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.PORTUGUESE,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="eraldoluis/faquad",
42 | hf_subset="plain_text",
43 | hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546",
44 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
45 | evaluation_splits=("validation",),
46 | few_shots_split="train",
47 | metrics=(
48 | MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"),
49 | MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE),
50 | ),
51 | generation_size=400,
52 | stop_sequence=("\n",),
53 | )
54 | ]
55 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/germanquad.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Germanquad
4 |
5 | dataset:
6 | deepset/germanquad
7 |
8 | abstract:
9 | GermanQuAD: High-quality German QA dataset with 13,722 questions.
10 |
11 | languages:
12 | german
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/2104.12741
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"germanquad_{Language.GERMAN.value}",
33 | prompt_function=get_qa_prompt_function(
34 | Language.GERMAN,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="deepset/germanquad",
42 | hf_subset="plain_text",
43 | hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581",
44 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
45 | evaluation_splits=("test",),
46 | few_shots_split="train",
47 | generation_size=400,
48 | stop_sequence=("\n",),
49 | metrics=(
50 | MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"),
51 | MultilingualQuasiF1ScoreMetric(Language.GERMAN),
52 | ),
53 | )
54 | ]
55 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/webqs.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Webqs
4 |
5 | dataset:
6 | stanfordnlp/web_questions
7 |
8 | abstract:
9 | This dataset consists of 6,642 question/answer pairs. The questions are supposed
10 | to be answerable by Freebase, a large knowledge graph. The questions are mostly
11 | centered around a single named entity. The questions are popular ones asked on
12 | the web.
13 |
14 | languages:
15 | english
16 |
17 | tags:
18 | qa
19 |
20 | paper:
21 | https://aclanthology.org/D13-1160.pdf
22 | """
23 |
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | def webqs_prompt(line, task_name: str = None):
30 |     def _remove_prefixes(aliases):  # after sorting, drop aliases that only extend a shorter alias kept earlier
31 | aliases.sort()
32 | ret = [aliases[0]]
33 | for alias in aliases[1:]:
34 | if not alias.startswith(ret[-1]):
35 | ret.append(alias)
36 | return ret
37 |
38 | return Doc(
39 | task_name=task_name,
40 | query=f"Question: {line['question']}\nAnswer:",
41 | gold_index=0,
42 | choices=[[f" {c}" for c in _remove_prefixes(line["answers"])]],
43 | )
44 |
45 |
46 | webqs = LightevalTaskConfig(
47 | name="webqs",
48 | prompt_function=webqs_prompt,
49 | hf_repo="stanfordnlp/web_questions",
50 | hf_subset="default",
51 | hf_avail_splits=["train", "test"],
52 | evaluation_splits=["test"],
53 | few_shots_split=None,
54 | few_shots_select=None,
55 | generation_size=-1,
56 | metrics=[Metrics.exact_match],
57 | stop_sequence=["\n"],
58 | version=0,
59 | )
60 |
61 | TASKS_TABLE = [
62 | webqs,
63 | ]
64 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/aimo.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | AIMO Progress Prize 1
4 |
5 | dataset:
6 | lighteval/aimo_progress_prize_1
7 |
8 | abstract:
 9 | Task to evaluate LLMs on the training set of the Kaggle AIMO Progress Prize 1 competition.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | math, reasoning
16 |
17 | paper:
18 | """
19 |
20 | from inspect_ai.dataset import Sample
21 | from inspect_ai.solver import generate
22 |
23 | from lighteval.metrics.metrics import Metrics, math_scorer
24 | from lighteval.metrics.normalizations import math_normalizer
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | def aimo_prompt(line, task_name: str = None):
30 | return Doc(
31 | task_name=task_name,
32 | choices=[str(line["answer"])],
33 | gold_index=0,
34 | query=line["problem"],
35 | )
36 |
37 |
38 | def record_to_sample(record):
39 | return Sample(input=record["problem"], target=str(record["answer"]))
40 |
41 |
42 | task = LightevalTaskConfig(
43 | name="aimo_progress_prize_1",
44 | prompt_function=aimo_prompt,
45 | sample_fields=record_to_sample,
46 | solver=[generate(cache=True)],
47 | scorer=math_scorer(),
48 | hf_subset="",
49 | hf_repo="lighteval/aimo_progress_prize_1",
50 | hf_avail_splits=["train"],
51 | evaluation_splits=["train"],
52 | few_shots_split="train",
53 | few_shots_select="sequential",
54 | metrics=[
55 | Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
56 | ],
57 | generation_size=2048,
58 | stop_sequence=None,
59 | )
60 |
61 | # STORE YOUR EVALS
62 | TASKS_TABLE = [task]
63 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/asdiv.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Asdiv
4 |
5 | dataset:
6 | EleutherAI/asdiv
7 |
8 | abstract:
9 | ASDiv is a dataset for arithmetic reasoning that contains 2,000+ questions
10 | covering addition, subtraction, multiplication, and division.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | math, reasoning
17 |
18 | paper:
19 | https://arxiv.org/abs/2410.12853
20 | """
21 |
22 | from inspect_ai.dataset import Sample
23 | from inspect_ai.solver import generate
24 |
25 | from lighteval.metrics.metrics import Metrics, math_scorer
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 |
29 |
30 | def asdiv_prompt(line, task_name: str = None):
31 | return Doc(
32 | task_name=task_name,
33 | query=f"{line['body']}\nQuestion:{line['question']}\nAnswer:",
34 |         choices=[line["answer"].split(" (")[0]],
35 | gold_index=[0],
36 | )
37 |
38 |
39 | def record_to_sample(record):
40 | query = f"{record['body']}\n{record['question']}"
41 | target = record["answer"].split(" (")[0]
42 | return Sample(input=query, target=target)
43 |
44 |
45 | asdiv = LightevalTaskConfig(
46 | name="asdiv",
47 | prompt_function=asdiv_prompt,
48 | hf_repo="EleutherAI/asdiv",
49 | hf_subset="asdiv",
50 | hf_avail_splits=["validation"],
51 | evaluation_splits=["validation"],
52 | few_shots_split=None,
53 | few_shots_select=None,
54 | generation_size=-1,
55 | metrics=[Metrics.exact_match],
56 | stop_sequence=["\n"],
57 | version=0,
58 | sample_fields=record_to_sample,
59 | solver=[generate(cache=True)],
60 | scorer=math_scorer(),
61 | )
62 |
63 | TASKS_TABLE = [asdiv]
64 |
--------------------------------------------------------------------------------
/examples/tasks/bbh.txt:
--------------------------------------------------------------------------------
1 | lighteval|bigbench:causal_judgment|3
2 | lighteval|bigbench:date_understanding|3
3 | lighteval|bigbench:disambiguation_qa|3
4 | lighteval|bigbench:geometric_shapes|3
5 | lighteval|bigbench:logical_deduction_five_objects|3
6 | lighteval|bigbench:logical_deduction_seven_objects|3
7 | lighteval|bigbench:logical_deduction_three_objects|3
8 | lighteval|bigbench:movie_recommendation|3
9 | lighteval|bigbench:navigate|3
10 | lighteval|bigbench:reasoning_about_colored_objects|3
11 | lighteval|bigbench:ruin_names|3
12 | lighteval|bigbench:salient_translation_error_detection|3
13 | lighteval|bigbench:snarks|3
14 | lighteval|bigbench:sports_understanding|3
15 | lighteval|bigbench:temporal_sequences|3
16 | lighteval|bigbench:tracking_shuffled_objects_five_objects|3
17 | lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
18 | lighteval|bigbench:tracking_shuffled_objects_three_objects|3
19 | harness|bigbench:causal_judgment|3
20 | harness|bigbench:date_understanding|3
21 | harness|bigbench:disambiguation_qa|3
22 | harness|bigbench:geometric_shapes|3
23 | harness|bigbench:logical_deduction_five_objects|3
24 | harness|bigbench:logical_deduction_seven_objects|3
25 | harness|bigbench:logical_deduction_three_objects|3
26 | harness|bigbench:movie_recommendation|3
27 | harness|bigbench:navigate|3
28 | harness|bigbench:reasoning_about_colored_objects|3
29 | harness|bigbench:ruin_names|3
30 | harness|bigbench:salient_translation_error_detection|3
31 | harness|bigbench:snarks|3
32 | harness|bigbench:sports_understanding|3
33 | harness|bigbench:temporal_sequences|3
34 | harness|bigbench:tracking_shuffled_objects_five_objects|3
35 | harness|bigbench:tracking_shuffled_objects_seven_objects|3
36 | harness|bigbench:tracking_shuffled_objects_three_objects|3
37 |
--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
1 | name: Tests
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 | - v*-release
8 | pull_request:
9 | branches:
10 | - main
11 |
12 | jobs:
13 | run_tests:
14 | name: Run tests
15 | runs-on: ubuntu-latest
16 | steps:
17 | - name: Checkout repository
18 | uses: actions/checkout@v4
19 | with:
20 | lfs: true
21 |
22 | - name: Cache Hugging Face models
23 | uses: actions/cache@v4
24 | with:
25 | path: cache/models
26 | key: hf-models-${{ runner.os }}-${{ github.ref }}
27 | restore-keys: hf-models-${{ runner.os }}-
28 |
29 | - name: Cache Hugging Face datasets
30 | uses: actions/cache@v4
31 | with:
32 | path: cache/datasets
33 | key: hf-datasets-${{ runner.os }}-${{ github.ref }}
34 | restore-keys: hf-datasets-${{ runner.os }}-
35 |
36 | - name: Cache uv virtual environment
37 | uses: actions/cache@v4
38 | with:
39 | path: .venv
40 | key: uv-env-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
41 | restore-keys: uv-env-${{ runner.os }}-
42 |
43 | - name: Install uv
44 | uses: astral-sh/setup-uv@v5
45 | with:
46 | enable-cache: true
47 |
48 | - name: Install the project
49 | run: uv sync --extra dev
50 |
51 | - name: Ensure cache directories exist
52 | run: mkdir -p cache/models cache/datasets
53 |
54 | - name: Run tests
55 | env:
56 | HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
57 | HF_HOME: "cache/models"
58 | HF_DATASETS_CACHE: "cache/datasets"
59 | run: uv run pytest -x --disable-pytest-warnings
60 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/twitterAAE.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Twitteraae
4 |
5 | dataset:
6 | lighteval/twitterAAE
7 |
8 | abstract:
 9 | Language modeling benchmark built from the TwitterAAE corpus, which studies demographic dialectal variation (African-American English) in social media.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | language-modeling
16 |
17 | paper:
18 | https://aclanthology.org/D16-1120/
19 | """
20 |
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 |
25 |
26 | def twitter_aae_prompt(line, task_name: str = None):
27 | return Doc(task_name=task_name, query=line["tweet"], choices=None, gold_index=None)
28 |
29 |
30 | twitterAAE_aa = LightevalTaskConfig(
31 | name="twitterAAE:aa",
32 | prompt_function=twitter_aae_prompt,
33 | hf_repo="lighteval/twitterAAE",
34 | hf_subset="aa",
35 | hf_avail_splits=["test"],
36 | evaluation_splits=["test"],
37 | few_shots_split=None,
38 | few_shots_select=None,
39 | generation_size=-1,
40 | metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
41 | stop_sequence=["\n"],
42 | version=0,
43 | )
44 |
45 |
46 | twitterAAE_white = LightevalTaskConfig(
47 | name="twitterAAE:white",
48 | prompt_function=twitter_aae_prompt,
49 | hf_repo="lighteval/twitterAAE",
50 | hf_subset="white",
51 | hf_avail_splits=["test"],
52 | evaluation_splits=["test"],
53 | few_shots_split=None,
54 | few_shots_select=None,
55 | generation_size=-1,
56 | metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
57 | stop_sequence=["\n"],
58 | version=0,
59 | )
60 |
61 | TASKS_TABLE = [
62 | twitterAAE_aa,
63 | twitterAAE_white,
64 | ]
65 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/logiqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Logiqa
4 |
5 | dataset:
6 | lighteval/logiqa_harness
7 |
8 | abstract:
9 | LogiQA is a machine reading comprehension dataset focused on testing logical
10 | reasoning abilities. It contains 8,678 expert-written multiple-choice questions
11 | covering various types of deductive reasoning. While humans perform strongly,
12 | state-of-the-art models lag far behind, making LogiQA a benchmark for advancing
13 | logical reasoning in NLP systems.
14 |
15 | languages:
16 | english
17 |
18 | tags:
19 | qa
20 |
21 | paper:
22 | https://arxiv.org/abs/2007.08124
23 | """
24 |
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 |
29 |
30 | def logiqa_prompt(line, task_name: str = None):
31 | query = f"Passage: {line['context']}\nQuestion: {line['question']}\nChoices:\n"
32 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(["A", "B", "C", "D"], line["options"])])
33 | query += "Answer:"
34 |
35 | return Doc(
36 | task_name=task_name,
37 | query=query,
38 | choices=[f" {c}" for c in line["options"]],
39 | gold_index=["a", "b", "c", "d"].index(line["label"]),
40 | )
41 |
42 |
43 | logiqa = LightevalTaskConfig(
44 | name="logiqa",
45 | prompt_function=logiqa_prompt,
46 | hf_repo="lighteval/logiqa_harness",
47 | hf_subset="logiqa",
48 | hf_avail_splits=["train", "validation", "test"],
49 | evaluation_splits=["test"],
50 | few_shots_split=None,
51 | few_shots_select=None,
52 | generation_size=-1,
53 | metrics=[Metrics.loglikelihood_acc],
54 | stop_sequence=["\n"],
55 | version=0,
56 | )
57 |
58 | TASKS_TABLE = [
59 | logiqa,
60 | ]
61 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Hindi Boolq
4 |
5 | dataset:
6 | ai4bharat/boolq-hi
7 |
8 | abstract:
 9 | BoolQ-style boolean (yes/no) question answering benchmark translated into Indic languages.
10 |
11 | languages:
12 | gujarati, hindi, malayalam, marathi, tamil
13 |
14 | tags:
15 | classification, multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from langcodes import standardize_tag
21 |
22 | from lighteval.metrics.dynamic_metrics import (
23 | LogLikelihoodAccMetric,
24 | MultilingualQuasiExactMatchMetric,
25 | )
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.templates.boolq import get_boolq_prompt_function
28 | from lighteval.tasks.templates.utils.formulation import (
29 | CFFormulation,
30 | )
31 | from lighteval.utils.language import Language
32 |
33 |
34 | TASKS_TABLE = [
35 | LightevalTaskConfig(
36 | name=f"community_boolq_{language.value}",
37 | prompt_function=get_boolq_prompt_function(
38 | language,
39 | lambda line: {
40 | "question": line["question"],
41 | "answer": line["answer"],
42 | "context": line["passage"],
43 | },
44 | formulation=CFFormulation(),
45 | ),
46 | hf_repo="ai4bharat/boolq-hi",
47 | hf_subset=standardize_tag(language.value),
48 | evaluation_splits=("validation",),
49 | few_shots_split="train",
50 | generation_size=5,
51 | stop_sequence=["\n"],
52 | metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()],
53 | )
54 | for language in [
55 | Language.HINDI,
56 | Language.GUJARATI,
57 | Language.MALAYALAM,
58 | Language.MARATHI,
59 | Language.TAMIL,
60 | ]
61 | ]
62 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/mintaka.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Mintaka
4 |
5 | dataset:
6 | AmazonScience/mintaka
7 |
8 | abstract:
 9 | Mintaka is a complex, natural, and multilingual question answering dataset, with questions written in English, translated into eight additional languages, and grounded in Wikidata.
10 |
11 | languages:
12 | arabic, english, french, german, hindi, italian, japanese, portuguese, spanish
13 |
14 | tags:
15 | knowledge, multilingual, qa
16 |
17 | paper:
18 | """
19 |
20 | from langcodes import standardize_tag
21 |
22 | from lighteval.metrics.dynamic_metrics import (
23 | MultilingualQuasiExactMatchMetric,
24 | MultilingualQuasiF1ScoreMetric,
25 | )
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.templates.qa import get_qa_prompt_function
28 | from lighteval.utils.language import Language
29 |
30 |
31 | TASKS_TABLE = [
32 | LightevalTaskConfig(
33 | name=f"mintaka_{lang.value}",
34 | prompt_function=get_qa_prompt_function(
35 | lang,
36 | lambda line: {
37 | "question": line["question"],
38 | "choices": [line["answerText"]],
39 | },
40 | ),
41 | hf_repo="AmazonScience/mintaka",
42 | hf_subset=standardize_tag(lang.value),
43 | evaluation_splits=("test",),
44 | few_shots_split="train",
45 | generation_size=400,
46 | stop_sequence=("\n",),
47 | metrics=[
48 | MultilingualQuasiExactMatchMetric(lang, "prefix"),
49 | MultilingualQuasiF1ScoreMetric(lang),
50 | ],
51 | )
52 | for lang in [
53 | Language.ARABIC,
54 | Language.GERMAN,
55 | Language.ENGLISH,
56 | Language.SPANISH,
57 | Language.FRENCH,
58 | Language.HINDI,
59 | Language.ITALIAN,
60 | Language.JAPANESE,
61 | Language.PORTUGUESE,
62 | ]
63 | ]
64 |
--------------------------------------------------------------------------------
/tests/unit/models/test_base_model.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from lighteval.models.model_loader import load_model
24 | from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig
25 |
26 |
27 | def test_empty_requests():
28 | model_config = TransformersModelConfig(
29 | model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", model_parallel=False, revision="main"
30 | )
31 | model: TransformersModel = load_model(config=model_config)
32 |
33 | assert model.loglikelihood([]) == []
34 | assert model.loglikelihood_rolling([]) == []
35 | assert model.greedy_until([]) == []
36 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/winogrande.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Winogrande
4 |
5 | dataset:
6 | allenai/winogrande
7 |
8 | abstract:
9 | WinoGrande is a new collection of 44k problems, inspired by Winograd Schema
10 | Challenge (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the
11 | scale and robustness against the dataset-specific bias. Formulated as a
12 | fill-in-a-blank task with binary options, the goal is to choose the right option
13 | for a given sentence which requires commonsense reasoning.
14 |
15 | languages:
16 | english
17 |
18 | tags:
19 | commonsense, multiple-choice
20 |
21 | paper:
22 | https://arxiv.org/abs/1907.10641
23 | """
24 |
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 |
29 |
30 | def winogrande_prompt(line, task_name: str = None):
31 | query, end_of_target = line["sentence"].split("_")
32 | end_of_target = end_of_target.strip()
33 | return Doc(
34 | task_name=task_name,
35 | query=query,
36 | choices=[f"{line['option1']} {end_of_target}", f"{line['option2']} {end_of_target}"],
37 | gold_index=int(line["answer"]) - 1 if line["answer"] != "" else -1,
38 | )
39 |
40 |
41 | winogrande = LightevalTaskConfig(
42 | name="winogrande",
43 | prompt_function=winogrande_prompt,
44 | hf_repo="allenai/winogrande",
45 | hf_subset="winogrande_xl",
46 | hf_avail_splits=["train", "test", "validation"],
47 | evaluation_splits=["validation"],
48 | few_shots_split=None,
49 | few_shots_select="random_sampling",
50 | generation_size=-1,
51 | metrics=[Metrics.loglikelihood_acc],
52 | stop_sequence=["\n"],
53 | version=0,
54 | )
55 |
56 | TASKS_TABLE = [
57 | winogrande,
58 | ]
59 |
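
As a quick illustration of the prompt construction above, here is a minimal sketch of what winogrande_prompt returns for one row (the field values below are made up for illustration, not taken from the dataset):

# Hypothetical row; only the schema matches what winogrande_prompt expects.
line = {
    "sentence": "The trophy does not fit in the suitcase because _ is too big.",
    "option1": "the trophy",
    "option2": "the suitcase",
    "answer": "1",
}
doc = winogrande_prompt(line, task_name="winogrande")
# doc.query      -> "The trophy does not fit in the suitcase because "
# doc.choices    -> ["the trophy is too big.", "the suitcase is too big."]
# doc.gold_index -> 0  (the dataset's "answer" field is 1-indexed)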
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/swag.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Swag
4 |
5 | dataset:
6 | allenai/swag
7 |
8 | abstract:
9 | The dataset consists of 113k multiple choice questions about grounded situations
10 | (73k training, 20k validation, 20k test). Each question is a video caption from
11 | LSMDC or ActivityNet Captions, with four answer choices about what might happen
12 | next in the scene. The correct answer is the (real) video caption for the next
13 | event in the video; the three incorrect answers are adversarially generated and
14 | human verified, so as to fool machines but not humans. SWAG aims to be a
15 | benchmark for evaluating grounded commonsense NLI and for learning
16 | representations.
17 |
18 | languages:
19 | english
20 |
21 | tags:
22 | narrative, reasoning
23 |
24 | paper:
25 | https://arxiv.org/abs/1808.05326
26 | """
27 |
28 | from lighteval.metrics.metrics import Metrics
29 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
30 | from lighteval.tasks.requests import Doc
31 |
32 |
33 | def swag_prompt(line, task_name: str = None):
34 | choices = [line["ending0"], line["ending1"], line["ending2"], line["ending3"]]
35 | return Doc(
36 | task_name=task_name,
37 | query=line["startphrase"],
38 | choices=choices,
39 | gold_index=int(line["label"]),
40 | )
41 |
42 |
43 | swag = LightevalTaskConfig(
44 | name="swag",
45 | prompt_function=swag_prompt,
46 | hf_repo="allenai/swag",
47 | hf_subset="regular",
48 | hf_avail_splits=["train", "validation"],
49 | evaluation_splits=["validation"],
50 | few_shots_split=None,
51 | few_shots_select=None,
52 | generation_size=-1,
53 | metrics=[Metrics.loglikelihood_acc],
54 | stop_sequence=["\n"],
55 | version=0,
56 | )
57 |
58 | TASKS_TABLE = [
59 | swag,
60 | ]
61 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/avg_at_k_math.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Avg At K Math Test Suite",
3 | "description": "Test cases for avg_at_k_math metric",
4 | "test_cases": [
5 | {
6 | "name": "Avg at K Math - Correct Math",
7 | "metric_class": "avg_at_k_math",
8 | "metric_params": {"k": 1},
9 | "doc": {
10 | "query": "What is 2 + 2?",
11 | "choices": ["4"],
12 | "gold_index": 0,
13 | "task_name": "math"
14 | },
15 | "model_response": {
16 | "text": ["4"]
17 | },
18 | "expected_output": {
19 | "avg@k:k=1": 1.0
20 | },
21 | "tolerance": 0.01,
22 | "description": "Test avg at k math with correct math answer"
23 | },
24 | {
25 | "name": "Avg at K Math - Wrong Math",
26 | "metric_class": "avg_at_k_math",
27 | "metric_params": {"k": 1},
28 | "doc": {
29 | "query": "What is 2 + 2?",
30 | "choices": ["4"],
31 | "gold_index": 0,
32 | "task_name": "math"
33 | },
34 | "model_response": {
35 | "text": ["5"]
36 | },
37 | "expected_output": {
38 | "avg@k:k=1": 0.0
39 | },
40 | "tolerance": 0.01,
41 | "description": "Test avg at k math with wrong math answer"
42 | },
43 | {
44 | "name": "Avg at K Math - Multiple Attempts",
45 | "metric_class": "avg_at_k_math",
46 | "metric_params": {"k": 2},
47 | "doc": {
48 | "query": "What is 3 * 4?",
49 | "choices": ["12"],
50 | "gold_index": 0,
51 | "task_name": "math"
52 | },
53 | "model_response": {
54 | "text": ["12", "15"]
55 | },
56 | "expected_output": {
57 | "avg@k:k=2": 0.5
58 | },
59 | "tolerance": 0.01,
60 | "description": "Test avg at k math with multiple attempts"
61 | }
62 | ]
63 | }
64 |
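
The expected values above follow from avg@k being the mean per-sample correctness over the k generations. A minimal sketch of the arithmetic for the "Multiple Attempts" case (the actual metric additionally handles answer extraction and normalization):

# k = 2 generations, gold answer "12": "12" is correct, "15" is not.
per_sample = [1.0, 0.0]
avg_at_2 = sum(per_sample) / len(per_sample)
assert avg_at_2 == 0.5  # matches the expected "avg@k:k=2": 0.5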
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/mgsm.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Mgsm
4 |
5 | dataset:
6 | juletxara/mgsm
7 |
8 | abstract:
9 | MGSM (Multilingual Grade School Math): grade-school math word problems from GSM8K, manually translated into multiple typologically diverse languages.
10 |
11 | languages:
12 | bengali, chinese, english, french, german, japanese, russian, spanish, swahili,
13 | telugu, thai
14 |
15 | tags:
16 | math, multilingual, reasoning
17 |
18 | paper:
19 | """
20 |
21 | from langcodes import standardize_tag
22 |
23 | from lighteval.metrics.dynamic_metrics import (
24 | MultilingualQuasiExactMatchMetric,
25 | )
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.templates.qa import get_qa_prompt_function
28 | from lighteval.utils.language import Language
29 |
30 |
31 | TASKS_TABLE = [
32 | LightevalTaskConfig(
33 | name=f"mgsm_{language.value}",
34 | prompt_function=get_qa_prompt_function(
35 | language,
36 | lambda line: {
37 | "question": line["question"],
38 |                 # The chain-of-thought explanation is available but unused:
39 |                 # line["answer"]
40 | "choices": [str(line["answer_number"])],
41 | },
42 | ),
43 | hf_repo="juletxara/mgsm",
44 | hf_subset=standardize_tag(language.value),
45 | evaluation_splits=("test",),
46 | few_shots_split="train",
47 | generation_size=25,
48 | metrics=[
49 | MultilingualQuasiExactMatchMetric(language, "full"),
50 | ],
51 | stop_sequence=("\n",),
52 | )
53 | for language in [
54 | Language.ENGLISH,
55 | Language.SPANISH,
56 | Language.FRENCH,
57 | Language.GERMAN,
58 | Language.RUSSIAN,
59 | Language.CHINESE,
60 | Language.JAPANESE,
61 | Language.THAI,
62 | Language.SWAHILI,
63 | Language.BENGALI,
64 | Language.TELUGU,
65 | ]
66 | ]
67 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/med_dialog.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Med Dialog
4 |
5 | dataset:
6 | lighteval/med_dialog
7 |
8 | abstract:
9 | A collection of patient-doctor medical dialogue datasets (HealthCareMagic and iCliniq).
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | dialog, health, medical
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.metrics import Metrics
21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
22 | from lighteval.tasks.requests import Doc
23 |
24 |
25 | def med_dialog_prompt(line, task_name: str = None):
26 | return Doc(
27 | task_name=task_name,
28 | query=f"###\nArticle:{line['src']}\n\nSummarize the above article in 1 sentence.\n",
29 | gold_index=0,
30 | choices=[line["tgt"]],
31 | )
32 |
33 |
34 | med_dialog_healthcaremagic = LightevalTaskConfig(
35 | name="med_dialog:healthcaremagic",
36 | prompt_function=med_dialog_prompt,
37 | hf_repo="lighteval/med_dialog",
38 | hf_subset="healthcaremagic",
39 | hf_avail_splits=["train", "test", "validation"],
40 | evaluation_splits=["validation", "test"],
41 | few_shots_split=None,
42 | few_shots_select=None,
43 | generation_size=128,
44 | metrics=[
45 | Metrics.exact_match,
46 | ],
47 | stop_sequence=["\n"],
48 | version=0,
49 | )
50 |
51 |
52 | med_dialog_icliniq = LightevalTaskConfig(
53 | name="med_dialog:icliniq",
54 | prompt_function=med_dialog_prompt,
55 | hf_repo="lighteval/med_dialog",
56 | hf_subset="icliniq",
57 | hf_avail_splits=["train", "test", "validation"],
58 | evaluation_splits=["validation", "test"],
59 | few_shots_split=None,
60 | few_shots_select=None,
61 | generation_size=128,
62 | metrics=[
63 | Metrics.exact_match,
64 | ],
65 | stop_sequence=["\n"],
66 | version=0,
67 | )
68 |
69 | TASKS_TABLE = [
70 | med_dialog_healthcaremagic,
71 | med_dialog_icliniq,
72 | ]
73 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/pass_at_k_math.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Pass At K Math Test Suite",
3 | "description": "Test cases for pass_at_k_math metric",
4 | "test_cases": [
5 | {
6 | "name": "Pass at K Math - Correct Math",
7 | "metric_class": "pass_at_k_math",
8 | "metric_params": {"k": 1, "n": 2},
9 | "doc": {
10 | "query": "What is 2 + 2?",
11 | "choices": ["4"],
12 | "gold_index": 0,
13 | "task_name": "math"
14 | },
15 | "model_response": {
16 | "text": ["4", "5"]
17 | },
18 | "expected_output": {
19 | "pass@k:k=1&n=2": 0.5
20 | },
21 | "tolerance": 0.01,
22 | "description": "Test pass at k math with correct math answer"
23 | },
24 | {
25 | "name": "Pass at K Math - Wrong Math",
26 | "metric_class": "pass_at_k_math",
27 | "metric_params": {"k": 1, "n": 2},
28 | "doc": {
29 | "query": "What is 2 + 2?",
30 | "choices": ["4"],
31 | "gold_index": 0,
32 | "task_name": "math"
33 | },
34 | "model_response": {
35 | "text": ["5", "6"]
36 | },
37 | "expected_output": {
38 | "pass@k:k=1&n=2": 0.0
39 | },
40 | "tolerance": 0.01,
41 | "description": "Test pass at k math with wrong math answer"
42 | },
43 | {
44 | "name": "Pass at K Math - Multiple Attempts",
45 | "metric_class": "pass_at_k_math",
46 | "metric_params": {"k": 2, "n": 3},
47 | "doc": {
48 | "query": "What is 3 * 4?",
49 | "choices": ["12"],
50 | "gold_index": 0,
51 | "task_name": "math"
52 | },
53 | "model_response": {
54 | "text": ["10", "12", "15"]
55 | },
56 | "expected_output": {
57 | "pass@k:k=2&n=3": 0.66
58 | },
59 | "tolerance": 0.01,
60 | "description": "Test pass at k math with multiple attempts"
61 | }
62 | ]
63 | }
64 |
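
The 0.66 in the last case is consistent with the standard unbiased pass@k estimator, pass@k = 1 - C(n-c, k) / C(n, k), evaluated with n=3 generations, c=1 correct answer, and k=2 (assuming the metric uses this estimator; the 0.01 tolerance absorbs the rounding):

from math import comb

n, c, k = 3, 1, 2  # three generations, only "12" is correct, pass@2
pass_at_k = 1 - comb(n - c, k) / comb(n, k)
print(round(pass_at_k, 4))  # 0.6667, within 0.01 of the expected 0.66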
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/soqal.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Soqal
4 |
5 | dataset:
6 | OALL/AlGhafa-Arabic-LLM-Benchmark-Native
7 |
8 | abstract:
9 | SOQAL: A large-scale Arabic reading comprehension dataset.
10 |
11 | languages:
12 | arabic
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/1906.05394
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | LogLikelihoodAccMetric,
23 | )
24 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.multilingual.adapters import (
27 | alghafa_adapter,
28 | )
29 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
30 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
31 | from lighteval.tasks.templates.utils.formulation import (
32 | CFFormulation,
33 | HybridFormulation,
34 | MCFFormulation,
35 | )
36 | from lighteval.utils.language import Language
37 |
38 |
39 | TASKS_TABLE = [
40 | LightevalTaskConfig(
41 | name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}",
42 | hf_subset="multiple_choice_grounded_statement_soqal_task",
43 | prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
44 | evaluation_splits=["test"],
45 | few_shots_split="validation",
46 | hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
47 | metrics=get_metrics_for_formulation(
48 | formulation,
49 | [
50 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
51 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
52 | ],
53 | ),
54 | )
55 | for formulation in [
56 | MCFFormulation(),
57 | CFFormulation(),
58 | HybridFormulation(),
59 | ]
60 | ]
61 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/piqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Piqa
4 |
5 | dataset:
6 | ybisk/piqa
7 |
8 | abstract:
9 | PIQA is a benchmark for physical commonsense reasoning: given a goal, the model
10 | must choose the more sensible of two candidate solutions.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | commonsense, multiple-choice, qa
17 |
18 | paper:
19 | https://arxiv.org/abs/1911.11641
20 | """
21 |
22 | from string import ascii_uppercase
23 |
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | def piqa_prompt(line, task_name: str = None):
30 | letters = list(ascii_uppercase)[:2]
31 | query = "The following are multiple choice questions (with answers) about common sense.\n"
32 | query += f"Question: {line['goal']}\n"
33 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(letters, [line["sol1"], line["sol2"]])])
34 | query += "Answer: "
35 |
36 | gold_ix = int(line["label"])
37 | is_few_shots = line.get("__few_shots", False)
38 | return Doc(
39 | task_name=task_name,
40 | query=query,
41 | choices=letters if not is_few_shots else [line["sol1"], line["sol2"]],
42 | gold_index=gold_ix,
43 | instruction="The following are multiple choice questions (with answers) about common sense.\n",
44 | )
45 |
46 |
47 | piqa = LightevalTaskConfig(
48 | name="piqa",
49 | prompt_function=piqa_prompt,
50 | hf_repo="ybisk/piqa",
51 | hf_subset="plain_text",
52 | hf_avail_splits=["train", "test", "validation"],
53 | evaluation_splits=["validation", "test"],
54 | few_shots_split=None,
55 | few_shots_select=None,
56 | generation_size=1,
57 | metrics=[
58 | Metrics.exact_match,
59 | ],
60 | stop_sequence=["\n"],
61 | version=0,
62 | )
63 |
64 | TASKS_TABLE = [
65 | piqa,
66 | ]
67 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/thai_exams.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Thai Exams
4 |
5 | dataset:
6 | scb10x/thai_exam
7 |
8 | abstract:
9 | Multiple-choice questions drawn from Thai national examinations (A-Level, IC, O-NET, TGAT, TPAT1).
10 |
11 | languages:
12 | thai
13 |
14 | tags:
15 | knowledge, multilingual, multiple-choice
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.adapters import (
26 | thai_exams_adapter,
27 | )
28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
30 | from lighteval.tasks.templates.utils.formulation import (
31 | CFFormulation,
32 | HybridFormulation,
33 | MCFFormulation,
34 | )
35 | from lighteval.utils.language import Language
36 |
37 |
38 | THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"]
39 |
40 |
41 | TASKS_TABLE = [
42 | LightevalTaskConfig(
43 | name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}",
44 | prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation),
45 | hf_repo="scb10x/thai_exam",
46 | hf_subset=subset,
47 | evaluation_splits=("test",),
48 | few_shots_split="train",
49 | metrics=get_metrics_for_formulation(
50 | formulation,
51 | [
52 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
53 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
54 | ],
55 | ),
56 | )
57 | for subset in THAI_EXAMS_SUBSETS
58 | for formulation in [
59 | MCFFormulation(),
60 | CFFormulation(),
61 | HybridFormulation(),
62 | ]
63 | ]
64 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/hellaswag.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Hellaswag
4 |
5 | dataset:
6 | Rowan/hellaswag
7 |
8 | abstract:
9 | HellaSwag is a commonsense inference benchmark designed to challenge language
10 | models with adversarially filtered multiple-choice questions.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | multiple-choice, narrative, reasoning
17 |
18 | paper:
19 | https://arxiv.org/abs/1905.07830
20 | """
21 |
22 | from string import ascii_uppercase
23 |
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | def hellaswag_prompt(line, task_name: str = None):
30 | query = "The following are multiple choice questions (with answers) about common sense.\n\n"
31 | query += f"Question: {line['activity_label']}: {line['ctx_a']} {line['ctx_b'].capitalize()}\n"
32 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, line["endings"])])
33 | query += "Answer:"
34 |
35 | gold_ix = int(line["label"]) if line["label"] != "" else -1
36 | return Doc(
37 | task_name=task_name,
38 | query=query,
39 | choices=[" " + i for i in ascii_uppercase[: len(line["endings"])]],
40 | gold_index=gold_ix,
41 | instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
42 | )
43 |
44 |
45 | hellaswag = LightevalTaskConfig(
46 | name="hellaswag",
47 | prompt_function=hellaswag_prompt,
48 | hf_repo="Rowan/hellaswag",
49 | hf_subset="default",
50 | hf_avail_splits=["train", "test", "validation"],
51 | evaluation_splits=["validation"],
52 | few_shots_split=None,
53 | few_shots_select=None,
54 | generation_size=1,
55 | metrics=[
56 | Metrics.exact_match,
57 | ],
58 | stop_sequence=["\n"],
59 | version=0,
60 | )
61 |
62 | TASKS_TABLE = [
63 | hellaswag,
64 | ]
65 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/storycloze.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Storycloze
4 |
5 | dataset:
6 | MoE-UNC/story_cloze
7 |
8 | abstract:
9 | A Corpus and Cloze Evaluation for Deeper Understanding of
10 | Commonsense Stories
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | narrative, reasoning
17 |
18 | paper:
19 | https://arxiv.org/abs/1604.01696
20 | """
21 |
22 | from lighteval.metrics.metrics import Metrics
23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
24 | from lighteval.tasks.requests import Doc
25 |
26 |
27 | def storycloze_prompt(line, task_name: str = None):
28 | context = "\n".join(
29 | [line["input_sentence_1"], line["input_sentence_2"], line["input_sentence_3"], line["input_sentence_4"]]
30 | )
31 | choices = [line["sentence_quiz1"], line["sentence_quiz2"]]
32 | gold = int(line["answer_right_ending"]) - 1
33 | return Doc(task_name=task_name, query=context + "\n", choices=choices, gold_index=gold)
34 |
35 |
36 | storycloze_2016 = LightevalTaskConfig(
37 | name="storycloze:2016",
38 | prompt_function=storycloze_prompt,
39 | hf_repo="MoE-UNC/story_cloze",
40 | hf_subset="2016",
41 | hf_avail_splits=["validation"],
42 | evaluation_splits=["validation"],
43 | few_shots_split=None,
44 | few_shots_select=None,
45 | generation_size=-1,
46 | metrics=[Metrics.exact_match],
47 | stop_sequence=["\n"],
48 | version=0,
49 | )
50 |
51 |
52 | storycloze_2018 = LightevalTaskConfig(
53 | name="storycloze:2018",
54 | prompt_function=storycloze_prompt,
55 | hf_repo="MoE-UNC/story_cloze",
56 | hf_subset="2018",
57 | hf_avail_splits=["validation"],
58 | evaluation_splits=["validation"],
59 | few_shots_split=None,
60 | few_shots_select=None,
61 | generation_size=-1,
62 | metrics=[Metrics.exact_match],
63 | stop_sequence=["\n"],
64 | version=0,
65 | )
66 |
67 | TASKS_TABLE = [
68 | storycloze_2016,
69 | storycloze_2018,
70 | ]
71 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/squad_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Squad V2
4 |
5 | dataset:
6 | rajpurkar/squad_v2
7 |
8 | abstract:
9 | Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
10 | consisting of questions posed by crowdworkers on a set of Wikipedia articles,
11 | where the answer to every question is a segment of text, or span, from the
12 | corresponding reading passage, or the question might be unanswerable.
13 | SQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
14 | unanswerable questions written adversarially by crowdworkers to look similar to
15 | answerable ones. To do well on SQuAD2.0, systems must not only answer questions
16 | when possible, but also determine when no answer is supported by the paragraph
17 | and abstain from answering.
18 |
19 | languages:
20 | english
21 |
22 | tags:
23 | qa
24 |
25 | paper:
26 | https://arxiv.org/abs/1806.03822
27 | """
28 |
29 | from lighteval.metrics.metrics import Metrics
30 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
31 | from lighteval.tasks.templates.qa import get_qa_prompt_function
32 | from lighteval.utils.language import Language
33 |
34 |
35 | squad_v2 = LightevalTaskConfig(
36 | name="squad_v2",
37 | prompt_function=get_qa_prompt_function(
38 | Language.ENGLISH,
39 | lambda line: {
40 | "question": line["question"],
41 | "context": line["context"],
42 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
43 | },
44 | ),
45 | hf_repo="rajpurkar/squad_v2",
46 | hf_subset="squad_v2",
47 | hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0),
48 | evaluation_splits=("validation",),
49 | few_shots_split="train",
50 | stop_sequence=["\n", "Question:", "question:"],
51 | generation_size=200,
52 | metrics=[Metrics.exact_match],
53 | version=1,
54 | )
55 |
56 | TASKS_TABLE = [
57 | squad_v2,
58 | ]
59 |
--------------------------------------------------------------------------------
/tests/unit/models/test_abstract_model.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 | from transformers import AutoTokenizer
24 |
25 | from lighteval.models.dummy.dummy_model import DummyModel, DummyModelConfig
26 |
27 |
28 | def test_tok_encode_pair():
29 | model = DummyModel(config=DummyModelConfig(seed=42))
30 | model._tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
31 | context = "答案:"
32 | continuation = ["1"]
33 | non_pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=False)
34 | pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=True)
35 | # Non-pairwise merged ":1" to one token
36 | assert non_pairwise_tokens == ([[6, 47873]], [[34871]])
37 | # Pairwise separated ":" and "1"
38 | assert pairwise_tokens == ([[6, 47873, 13]], [[82]])
39 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Hellaswag Tel
4 |
5 | dataset:
6 | LightFury9/hellaswag-telugu
7 |
8 | abstract:
9 | HellaSwag commonsense sentence-completion benchmark translated into Telugu.
10 |
11 | languages:
12 | telugu
13 |
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
26 | from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
27 | from lighteval.tasks.templates.utils.formulation import (
28 | CFFormulation,
29 | HybridFormulation,
30 | MCFFormulation,
31 | )
32 | from lighteval.utils.language import Language
33 |
34 |
35 | TASKS_TABLE = [
36 | LightevalTaskConfig(
37 | name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}",
38 | prompt_function=get_hellaswag_prompt_function(
39 | language=Language.TELUGU,
40 | adapter=lambda line: {
41 | "ctx_a": line["ctx_a"],
42 | "continuations": line["endings"],
43 | "gold_idx": int(line["label"]),
44 | },
45 | formulation=formulation,
46 | ),
47 | hf_repo="LightFury9/hellaswag-telugu",
48 | hf_subset="default",
49 | evaluation_splits=("valid",),
50 | few_shots_split="train",
51 | metrics=get_metrics_for_formulation(
52 | formulation,
53 | [
54 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
55 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
56 | ],
57 | ),
58 | )
59 | for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
60 | ]
61 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/arabic_arc.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Arabic Arc
4 |
5 | dataset:
6 | OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
7 |
8 | abstract:
9 | ARC (AI2 Reasoning Challenge) science questions translated into Arabic, taken from the AlGhafa translated benchmark suite.
10 |
11 | languages:
12 | arabic
13 |
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.adapters import (
26 | alghafa_adapter,
27 | )
28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
30 | from lighteval.tasks.templates.utils.formulation import (
31 | CFFormulation,
32 | HybridFormulation,
33 | MCFFormulation,
34 | )
35 | from lighteval.utils.language import Language
36 |
37 |
38 | TASKS_TABLE = [
39 | LightevalTaskConfig(
40 | name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy",
41 | prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
42 | hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
43 | hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
44 | hf_subset="arc_easy_ar",
45 | evaluation_splits=["test"],
46 | few_shots_split="validation",
47 | few_shots_select="sequential",
48 | metrics=get_metrics_for_formulation(
49 | formulation,
50 | [
51 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
52 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
53 | ],
54 | ),
55 | )
56 | for formulation in [
57 | MCFFormulation(),
58 | CFFormulation(),
59 | HybridFormulation(),
60 | ]
61 | ]
62 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/mathqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Mathqa
4 |
5 | dataset:
6 | allenai/math_qa
7 |
8 | abstract:
9 | A large-scale dataset of math word problems, gathered by using a new
10 | representation language to annotate the AQuA-RAT dataset with fully-specified
11 | operational programs. AQuA-RAT provides the questions, options, rationales,
12 | and the correct options.
13 |
14 | languages:
15 | english
16 |
17 | tags:
18 | math, qa, reasoning
19 |
20 | paper:
21 | https://arxiv.org/abs/1905.13319
22 | """
23 |
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | def mathqa_prompt(line, task_name: str = None):
30 | query = f"Problem: {line['Problem']}\n"
31 | query += "Options:\n"
32 | query += "".join(
33 | [
34 | f"{key}) {choice}\n"
35 | for key, choice in zip(
36 | ["a", "b", "c", "d", "e"],
37 | [line["option_a"], line["option_b"], line["option_c"], line["option_d"], line["option_e"]],
38 | )
39 | ]
40 | )
41 | query += "Answer:"
42 | return Doc(
43 | task_name=task_name,
44 | query=query,
45 | choices=[
46 | f" {c}" for c in [line["option_a"], line["option_b"], line["option_c"], line["option_d"], line["option_e"]]
47 | ],
48 | gold_index=["a", "b", "c", "d", "e"].index(line["correct"]),
49 | )
50 |
51 |
52 | mathqa = LightevalTaskConfig(
53 | name="mathqa",
54 | prompt_function=mathqa_prompt,
55 | hf_repo="allenai/math_qa",
56 | hf_subset="default",
57 | hf_avail_splits=["train", "validation", "test"],
58 | evaluation_splits=["test"],
59 | few_shots_split=None,
60 | few_shots_select=None,
61 | generation_size=-1,
62 | metrics=[Metrics.loglikelihood_acc],
63 | stop_sequence=["\n"],
64 | version=0,
65 | )
66 |
67 | TASKS_TABLE = [
68 | mathqa,
69 | ]
70 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/triviaqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Triviaqa
4 |
5 | dataset:
6 | mandarjoshi/trivia_qa
7 |
8 | abstract:
9 | TriviaQA is a reading comprehension dataset containing over 650K
10 | question-answer-evidence triples. TriviaQA includes 95K question-answer pairs
11 | authored by trivia enthusiasts and independently gathered evidence documents,
12 | six per question on average, that provide high quality distant supervision for
13 | answering the questions.
14 |
15 | languages:
16 | english
17 |
18 | tags:
19 | qa
20 |
21 | paper:
22 | https://arxiv.org/abs/1705.03551
23 | """
24 |
25 | import string
26 |
27 | from lighteval.metrics.metrics import Metrics
28 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
29 | from lighteval.tasks.requests import Doc
30 |
31 |
32 | def triviaqa_prompt(line, task_name: str = None):
33 | def _remove_prefixes(aliases):
34 | aliases.sort()
35 | ret = [aliases[0]]
36 | for alias in aliases[1:]:
37 | if not alias.startswith(ret[-1]):
38 | ret.append(alias)
39 | return ret
40 |
41 | list_of_candidates = [
42 | alias.lower().translate(str.maketrans("", "", string.punctuation))
43 | for alias in _remove_prefixes(line["answer"]["aliases"])
44 | ]
45 |
46 | return Doc(
47 | task_name=task_name,
48 | query=f"Question: {line['question']}\nAnswer:",
49 | gold_index=0,
50 | choices=[list_of_candidates],
51 | )
52 |
53 |
54 | triviaqa = LightevalTaskConfig(
55 | name="triviaqa",
56 | prompt_function=triviaqa_prompt,
57 | hf_repo="mandarjoshi/trivia_qa",
58 | hf_subset="rc.nocontext",
59 | hf_avail_splits=["train", "test", "validation"],
60 | evaluation_splits=["validation"],
61 | few_shots_split=None,
62 | few_shots_select=None,
63 | generation_size=20,
64 | metrics=[Metrics.exact_match],
65 | stop_sequence=["\n", ".", ","],
66 | version=0,
67 | )
68 |
69 | TASKS_TABLE = [
70 | triviaqa,
71 | ]
72 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/utils/task_utils.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
25 | from lighteval.metrics.utils.metric_utils import Metric
26 | from lighteval.tasks.templates.utils.formulation import Formulation, MCFFormulation
27 |
28 |
29 | def normalize_subset(subset: str) -> str:
30 | return subset.replace(" ", "_").replace("(", "").replace(")", "").lower()
31 |
32 |
33 | def get_metrics_for_formulation(formulation: Formulation, metrics: list[Metric]) -> list[Metric]:
34 | """Choose the appropriate metrics for the given formulation otherwise fallback to the original metrics."""
35 | match formulation:
36 |         # Letter-based MCF prompts compare the log-likelihoods of the answer letters directly, so no length normalization is needed.
37 | case MCFFormulation(choice_prefix="Letters"):
38 | return [LogLikelihoodAccMetric(normalization=None)]
39 | case _:
40 | return metrics
41 |
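
A couple of usage sketches for the helpers above (imports beyond this module are spelled out; the subset name is hypothetical):

from lighteval.tasks.templates.utils.formulation import CFFormulation

# normalize_subset replaces spaces, strips parentheses, and lowercases.
assert normalize_subset("Abstract Algebra (College)") == "abstract_algebra_college"

# Anything other than a letter-prefixed MCF formulation falls through to the
# metrics that were passed in.
metrics = [LogLikelihoodAccMetric()]
assert get_metrics_for_formulation(CFFormulation(), metrics) is metrics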
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/simpleqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Simpleqa
4 |
5 | dataset:
6 | lighteval/SimpleQA
7 |
8 | abstract:
9 | SimpleQA is a factuality benchmark that measures the ability of language
10 | models to answer short, fact-seeking questions.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | factuality, general-knowledge, qa
17 |
18 | paper:
19 | https://openai.com/index/introducing-simpleqa/
20 |
21 | starred:
22 | true
23 | """
24 |
25 | from inspect_ai.dataset import Sample
26 | from inspect_ai.scorer import model_graded_fact
27 | from inspect_ai.solver import generate
28 |
29 | from lighteval.metrics.metrics import Metrics
30 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
31 | from lighteval.tasks.requests import Doc
32 |
33 |
34 | def simpleqa_prompt(line, task_name: str = None):
35 | query = f"Question: {line['question']}\n"
36 | query += "".join(
37 | [f"\n{key}. {choice}" for key, choice in zip(["A", "B", "C", "D", "E", "F"], line["choices"]["text"])]
38 | )
39 | query += "\nAnswer:"
40 | return Doc(
41 | task_name=task_name,
42 | query=query,
43 | choices=line["choices"]["text"],
44 | gold_index=line["choices"]["label"].index(line["answerKey"]),
45 | )
46 |
47 |
48 | def record_to_sample(record):
49 | query = record["problem"]
50 | target = record["answer"]
51 | return Sample(input=query, target=target)
52 |
53 |
54 | simpleqa = LightevalTaskConfig(
55 | name="simpleqa",
56 | prompt_function=simpleqa_prompt,
57 | hf_repo="lighteval/SimpleQA",
58 | hf_subset="default",
59 | hf_avail_splits=["test"],
60 | evaluation_splits=["test"],
61 | few_shots_split="few_shot",
62 | few_shots_select=None,
63 | generation_size=2048,
64 | metrics=[Metrics.exact_match],
65 | stop_sequence=["\n"],
66 | version=0,
67 | sample_fields=record_to_sample,
68 | solver=[generate(cache=True)],
69 | scorer=model_graded_fact(),
70 | )
71 |
72 | TASKS_TABLE = [
73 | simpleqa,
74 | ]
75 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/avg_at_k.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Avg At K Test Suite",
3 | "description": "Test cases for avg_at_k metric",
4 | "test_cases": [
5 | {
6 | "name": "Avg at K - Correct in Top K",
7 | "metric_class": "avg_at_k",
8 | "metric_params": {"k": 2},
9 | "doc": {
10 | "query": "What is the capital of France?",
11 | "choices": ["London", "Paris", "Berlin"],
12 | "gold_index": 1,
13 | "task_name": "geography"
14 | },
15 | "model_response": {
16 | "text": ["Paris", "London", "Berlin"]
17 | },
18 | "expected_output": {
19 | "avg@k:k=2": 0.5
20 | },
21 | "tolerance": 0.01,
22 | "description": "Test avg at k with correct answer in top k"
23 | },
24 | {
25 | "name": "Avg at K - Not in Top K",
26 | "metric_class": "avg_at_k",
27 | "metric_params": {"k": 1},
28 | "doc": {
29 | "query": "What is the capital of France?",
30 | "choices": ["London", "Paris", "Berlin"],
31 | "gold_index": 1,
32 | "task_name": "geography"
33 | },
34 | "model_response": {
35 | "text": ["London", "Berlin", "Paris"]
36 | },
37 | "expected_output": {
38 | "avg@k:k=1": 0.0
39 | },
40 | "tolerance": 0.01,
41 | "description": "Test avg at k with correct answer not in top k"
42 | },
43 | {
44 | "name": "Avg at K - Multiple Correct",
45 | "metric_class": "avg_at_k",
46 | "metric_params": {"k": 3},
47 | "doc": {
48 | "query": "Which are European capitals?",
49 | "choices": ["London", "Paris", "Tokyo", "Berlin"],
50 | "gold_index": [0, 1, 3],
51 | "task_name": "geography"
52 | },
53 | "model_response": {
54 | "text": ["Paris", "London", "Berlin", "Tokyo"]
55 | },
56 | "expected_output": {
57 | "avg@k:k=3": 0.33
58 | },
59 | "tolerance": 0.01,
60 | "description": "Test avg at k with multiple correct answers"
61 | }
62 | ]
63 | }
64 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/entity_data_imputation.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Entity Data Imputation
4 |
5 | dataset:
6 | lighteval/Buy, lighteval/Restaurant
7 |
8 | abstract:
9 | Scenario that tests the ability to impute missing entities in a data table.
10 |
11 | languages:
12 | english
13 |
14 | tags:
15 | reasoning
16 |
17 | paper:
18 | https://ieeexplore.ieee.org/document/9458712
19 | """
20 |
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 |
25 |
26 | def entity_data_imputation_prompt(line, task_name: str = None):
27 | return Doc(
28 | task_name=task_name,
29 | query=f"What is the missing value?\n{line['text']}\nAnswer:",
30 | choices=[line["gold"]],
31 | gold_index=0,
32 | instruction="What is the missing value?\n",
33 | )
34 |
35 |
36 | entity_data_imputation_Buy = LightevalTaskConfig(
37 | name="entity_data_imputation:Buy",
38 | prompt_function=entity_data_imputation_prompt,
39 | hf_repo="lighteval/Buy",
40 | hf_subset="default",
41 | hf_avail_splits=["train", "test", "valid"],
42 | evaluation_splits=["valid", "test"],
43 | few_shots_split=None,
44 | few_shots_select=None,
45 | generation_size=5,
46 | metrics=[
47 | Metrics.exact_match,
48 | ],
49 | stop_sequence=["\n"],
50 | version=0,
51 | )
52 |
53 |
54 | entity_data_imputation_Restaurant = LightevalTaskConfig(
55 | name="entity_data_imputation:Restaurant",
56 | prompt_function=entity_data_imputation_prompt,
57 | hf_repo="lighteval/Restaurant",
58 | hf_subset="default",
59 | hf_avail_splits=["train"],
60 | evaluation_splits=["train"],
61 | few_shots_split=None,
62 | few_shots_select=None,
63 | generation_size=5,
64 | metrics=[
65 | Metrics.exact_match,
66 | ],
67 | stop_sequence=["\n"],
68 | version=0,
69 | )
70 |
71 | TASKS_TABLE = [
72 | entity_data_imputation_Buy,
73 | entity_data_imputation_Restaurant,
74 | ]
75 |
--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/drop.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "Drop Test Suite",
3 | "description": "Test cases for drop metric",
4 | "test_cases": [
5 | {
6 | "name": "DROP - Correct Answer",
7 | "metric_class": "drop",
8 | "metric_params": {},
9 | "doc": {
10 | "query": "What is 2 + 2?",
11 | "specific": {
12 | "golds_no_preprocessing": ["4"]
13 | },
14 | "choices": ["4"],
15 | "gold_index": 0,
16 | "task_name": "math"
17 | },
18 | "model_response": {
19 | "text": ["4"]
20 | },
21 | "expected_output": {
22 | "em": 1.0,
23 | "f1": 1.0
24 | },
25 | "tolerance": 0.01,
26 | "description": "Test DROP with correct answer"
27 | },
28 | {
29 | "name": "DROP - Wrong Answer",
30 | "metric_class": "drop",
31 | "metric_params": {},
32 | "doc": {
33 | "query": "What is 2 + 2?",
34 | "specific": {
35 | "golds_no_preprocessing": ["4"]
36 | },
37 | "choices": ["4"],
38 | "gold_index": 0,
39 | "task_name": "math"
40 | },
41 | "model_response": {
42 | "text": ["5"]
43 | },
44 | "expected_output": {
45 | "em": 0.0,
46 | "f1": 0.0
47 | },
48 | "tolerance": 0.01,
49 | "description": "Test DROP with wrong answer"
50 | },
51 | {
52 | "name": "DROP - Partial Match",
53 | "metric_class": "drop",
54 | "metric_params": {},
55 | "doc": {
56 | "query": "What is the sum of 2 and 2?",
57 | "specific": {
58 | "golds_no_preprocessing": ["4", "four"]
59 | },
60 | "choices": ["4", "four"],
61 | "gold_index": 0,
62 | "task_name": "math"
63 | },
64 | "model_response": {
65 | "text": ["4"]
66 | },
67 | "expected_output": {
68 | "em": 1.0,
69 | "f1": 1.0
70 | },
71 | "tolerance": 0.01,
72 | "description": "Test DROP with partial match"
73 | }
74 | ]
75 | }
76 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Hellaswag Hin
4 |
5 | dataset:
6 | ai4bharat/hellaswag-hi
7 |
8 | abstract:
9 | HellaSwag commonsense sentence-completion benchmark translated into Hindi.
10 |
11 | languages:
12 | hindi
13 |
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 |
17 | paper:
18 | """
19 |
20 | from lighteval.metrics.dynamic_metrics import (
21 | LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
26 | from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
27 | from lighteval.tasks.templates.utils.formulation import (
28 | CFFormulation,
29 | HybridFormulation,
30 | MCFFormulation,
31 | )
32 | from lighteval.utils.language import Language
33 |
34 |
35 | TASKS_TABLE = [
36 | LightevalTaskConfig(
37 | name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}",
38 | prompt_function=get_hellaswag_prompt_function(
39 | language=Language.HINDI,
40 | adapter=lambda line: {
41 | "ctx_a": line["ctx_a"],
42 | "continuations": line["endings"],
43 | "gold_idx": int(line["label"]),
44 | },
45 | formulation=formulation,
46 | ),
47 | hf_repo="ai4bharat/hellaswag-hi",
48 | hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]),
49 | hf_subset="hi",
50 | evaluation_splits=("validation",),
51 | few_shots_split="validation",
52 | metrics=get_metrics_for_formulation(
53 | formulation,
54 | [
55 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
56 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
57 | ],
58 | ),
59 | )
60 | for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
61 | ]
62 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/covid_dialogue.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Covid Dialogue
4 |
5 | dataset:
6 | lighteval/covid_dialogue
7 |
8 | abstract:
9 | The COVID-19 Dialogue dataset is a collection of 500+ dialogues between
10 | doctors and patients during the COVID-19 pandemic.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | dialog, medical
17 |
18 | paper:
19 | https://arxiv.org/abs/2004.06561
20 | """
21 |
22 | from inspect_ai.dataset import Sample
23 | from inspect_ai.scorer import model_graded_fact
24 | from inspect_ai.solver import generate, system_message
25 |
26 | from lighteval.metrics.metrics import Metrics
27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
28 | from lighteval.tasks.requests import Doc
29 |
30 |
31 | PROMPT = "Generate a response given a patient's questions and concerns."
32 |
33 |
34 | def covid_dialogue_prompt(line, task_name: str = None):
35 | return Doc(
36 | task_name=task_name,
37 | query=f"Generate a response given a patient's questions and concerns.\nPatient: {line['query']}\nDoctor: ",
38 | choices=[line["answer"]],
39 | gold_index=0,
40 | instruction="Generate a response given a patient's questions and concerns.\n",
41 | )
42 |
43 |
44 | def record_to_sample(record):
45 | query = record["query"]
46 | target = record["answer"]
47 | return Sample(input=query, target=target)
48 |
49 |
50 | covid_dialogue = LightevalTaskConfig(
51 | name="covid_dialogue",
52 | prompt_function=covid_dialogue_prompt,
53 | hf_repo="lighteval/covid_dialogue",
54 | hf_subset="default",
55 | hf_avail_splits=["train", "test", "validation"],
56 | evaluation_splits=["validation", "test"],
57 | few_shots_split=None,
58 | few_shots_select=None,
59 | generation_size=128,
60 | metrics=[Metrics.exact_match],
61 | stop_sequence=["\n"],
62 | version=0,
63 | sample_fields=record_to_sample,
64 | solver=[system_message(PROMPT), generate(cache=True)],
65 | scorer=model_graded_fact(),
66 | )
67 |
68 | TASKS_TABLE = [
69 | covid_dialogue,
70 | ]
71 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/tydiqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Tydiqa
4 |
5 | dataset:
6 | google-research-datasets/tydiqa
7 |
8 | abstract:
9 | TyDi QA: a benchmark for information-seeking question answering in typologically diverse languages.
10 |
11 | languages:
12 | arabic, bengali, english, finnish, indonesian, japanese, korean, russian, swahili, telugu, thai
13 |
14 | tags:
15 | multilingual, qa
16 |
17 | paper:
18 | https://arxiv.org/abs/2003.05002
19 | """
20 |
21 | from lighteval.metrics.dynamic_metrics import (
22 | MultilingualQuasiExactMatchMetric,
23 | MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"tydiqa_{language.value}",
33 | prompt_function=get_qa_prompt_function(
34 | language,
35 | lambda line: {
36 | "question": line["question"],
37 | "context": line["context"],
38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 | },
40 | ),
41 | hf_repo="google-research-datasets/tydiqa",
42 | hf_subset="secondary_task",
43 | evaluation_splits=("validation",),
44 | few_shots_split="train",
45 | generation_size=400,
46 | stop_sequence=("\n",),
47 | metrics=(
48 | MultilingualQuasiExactMatchMetric(language, "prefix"),
49 | MultilingualQuasiF1ScoreMetric(language),
50 | ),
51 | )
52 | for language in [
53 | Language.ENGLISH,
54 | Language.ARABIC,
55 | Language.BENGALI,
56 | Language.FINNISH,
57 | Language.INDONESIAN,
58 | Language.JAPANESE,
59 | Language.KOREAN,
60 | Language.SWAHILI,
61 | Language.RUSSIAN,
62 | Language.TELUGU,
63 | Language.THAI,
64 | ]
65 | ]
66 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/templates/utils/adapter_utils.py:
--------------------------------------------------------------------------------
1 | # MIT License
2 |
3 | # Copyright (c) 2024 The HuggingFace Team
4 |
5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
6 | # of this software and associated documentation files (the "Software"), to deal
7 | # in the Software without restriction, including without limitation the rights
8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 |
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 |
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 |
23 |
24 | from typing import Any, Callable, Mapping, TypeVar
25 |
26 |
27 | AdapterReturnTypeVar = TypeVar("AdapterReturnTypeVar")
28 |
29 |
30 | def create_adapter_from_dict(
31 | adapter: Mapping[str, Any] | Callable[[dict], AdapterReturnTypeVar],
32 | ) -> Callable[[dict], AdapterReturnTypeVar]:
33 | """Creates adapter function for the template input from a dict.
34 |
35 | Args:
36 | adapter: Dict of the form {key: value} where value is key in the input dict to get.
37 |
38 | Returns:
39 | Callable[[dict], AdapterReturnTypeVar]: A function that adapts dictionary input to the expected format
40 | """
41 | if not isinstance(adapter, Mapping):
42 | return adapter
43 |
44 | def adapter_fn(line: dict):
45 | return {key: line[value] for key, value in adapter.items()}
46 |
47 | return adapter_fn # type: ignore
48 |
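
A short usage sketch for create_adapter_from_dict, using the function defined above; the column names are hypothetical and only illustrate the key-mapping behaviour:

# Mapping form: template keys on the left, dataset column names on the right.
adapter = create_adapter_from_dict({"question": "query_text", "context": "passage"})
row = {"query_text": "What is 2 + 2?", "passage": "Basic arithmetic.", "id": 7}
assert adapter(row) == {"question": "What is 2 + 2?", "context": "Basic arithmetic."}

# Callables are returned unchanged, so existing adapter functions pass straight through.
passthrough = create_adapter_from_dict(lambda line: {"question": line["query_text"]})
assert passthrough(row) == {"question": "What is 2 + 2?"}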
--------------------------------------------------------------------------------
/docs/source/_toctree.yml:
--------------------------------------------------------------------------------
1 | - sections:
2 | - local: index
3 | title: 🤗 Lighteval
4 | - local: installation
5 | title: Installation
6 | - local: quicktour
7 | title: Quicktour
8 | title: Getting started
9 | - sections:
10 | - local: inspect-ai
11 | title: Examples using Inspect-AI
12 | - local: saving-and-reading-results
13 | title: Save and read results
14 | - local: caching
15 | title: Caching
16 | - local: using-the-python-api
17 | title: Use the Python API
18 | - local: adding-a-custom-task
19 | title: Add a custom task
20 | - local: adding-a-new-metric
21 | title: Add a custom metric
22 | - local: evaluating-a-custom-model
23 | title: Evaluate a custom model
24 | - local: use-inference-providers-as-backend
25 | title: Use HF's inference providers as backend
26 | - local: use-litellm-as-backend
27 | title: Use litellm as backend
28 | - local: use-vllm-as-backend
29 | title: Use vllm as backend
30 | - local: use-sglang-as-backend
31 | title: Use SGLang as backend
32 | - local: use-huggingface-inference-endpoints-or-tgi-as-backend
33 | title: Use Hugging Face inference endpoints or TGI as backend
34 | - local: contributing-to-multilingual-evaluations
35 | title: Contributing to multilingual evaluations
36 | title: Guides
37 | - sections:
38 | - local: metric-list
39 | title: Available Metrics
40 | - local: available-tasks
41 | title: Available Tasks
42 | title: API
43 | - sections:
44 | - sections:
45 | - local: package_reference/evaluation_tracker
46 | title: EvaluationTracker
47 | - local: package_reference/models
48 | title: Model Configs
49 | - local: package_reference/pipeline
50 | title: Pipeline
51 | title: Main classes
52 | - local: package_reference/metrics
53 | title: Metrics
54 | - local: package_reference/tasks
55 | title: Tasks
56 | - local: package_reference/logging
57 | title: Logging
58 | - local: package_reference/models_outputs
59 | title: ModelResponse
60 | - local: package_reference/doc
61 | title: Doc
62 | title: Reference
63 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 | Afri Mgsm
4 |
5 | dataset:
6 | masakhane/afrimgsm
7 |
8 | abstract:
9 | African MGSM: MGSM for African Languages
10 |
11 | languages:
12 | amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona,
13 | sotho, swahili, twi, wolof, xhosa, yoruba, zulu
14 |
15 | tags:
16 | math, multilingual, reasoning
17 |
18 | paper:
19 | https://arxiv.org/abs/2406.03368
20 | """
21 |
22 | from lighteval.metrics.dynamic_metrics import (
23 | MultilingualQuasiExactMatchMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 |
29 |
30 | TASKS_TABLE = [
31 | LightevalTaskConfig(
32 | name=f"afri_mgsm_{language.value}",
33 | prompt_function=get_qa_prompt_function(
34 | language,
35 | lambda line: {
36 | "question": line["question"],
37 |             # The chain-of-thought solution (line["answer"]) is available
38 |             # but unused here; only the numeric answer is scored.
39 | "choices": [str(line["answer_number"])],
40 | },
41 | ),
42 | hf_repo="masakhane/afrimgsm",
43 | hf_subset=language.value,
44 | evaluation_splits=("test",),
45 | few_shots_split="train",
46 | generation_size=25,
47 | metrics=[
48 | MultilingualQuasiExactMatchMetric(language, "full"),
49 | ],
50 | stop_sequence=("\n",),
51 | )
52 | for language in [
53 | Language.AMHARIC,
54 | # Language.EWE,
55 | Language.FRENCH,
56 | # Language.HAUSA,
57 | # Language.IGBO,
58 | # Language.KINYARWANDA,
59 | # Language.LINGALA,
60 | # Language.LUGANDA,
61 | # Language.OROMO,
62 | # Language.SHONA,
63 | # Language.SOTHO,
64 | Language.SWAHILI,
65 | # Language.TWI,
66 | # Language.WOLOF,
67 | # Language.XHOSA,
68 | Language.YORUBA,
69 | # Language.ZULU,
70 | ]
71 | ]
72 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/babi_qa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 |     bAbI QA
4 |
5 | dataset:
6 | facebook/babi_qa
7 |
8 | abstract:
9 |     The bAbI benchmark measures text understanding and reasoning by evaluating
10 |     reading comprehension via question answering.
11 |
12 | languages:
13 | english
14 |
15 | tags:
16 | qa, reasoning
17 |
18 | paper:
19 | https://arxiv.org/abs/1502.05698
20 | """
21 |
22 | import json
23 |
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 |
28 |
29 | # TODO: clean dataset and convert to inspect-ai
30 |
31 |
32 | def babi_qa_prompt(line, task_name: str = None):
33 | def process_path(path: str) -> str:
34 | steps = path.split(",")
35 | directions = {"s": "south", "n": "north", "e": "east", "w": "west"}
36 | path = " ".join([directions[step] for step in steps])
37 | return path
38 |
39 | if isinstance(line["story"], dict):
40 | line = line["story"]
41 | else:
42 | line = json.loads(line["story"])
43 |
44 | results = []
45 | story = []
46 |     for entry_type, text, answer in zip(line["type"], line["text"], line["answer"]):
47 |         if entry_type == "supporting fact":
48 | story.append(text)
49 | elif type == "question":
50 | text = text.replace("_", process_path(answer))
51 | query = "\n".join(story) + f"\nQuestion: {text}\nAnswer: "
52 | results.append(Doc(task_name=task_name, query=query, choices=[answer], gold_index=0))
53 | story = []
54 | return results
55 |
56 |
57 | babi_qa = LightevalTaskConfig(
58 | name="babi_qa",
59 | prompt_function=babi_qa_prompt,
60 | hf_repo="facebook/babi_qa",
61 | hf_subset="en-valid-qa1",
62 | hf_avail_splits=["train", "test", "validation"],
63 | evaluation_splits=["validation", "test"],
64 | few_shots_split=None,
65 | few_shots_select=None,
66 | generation_size=-1,
67 | metrics=[Metrics.exact_match],
68 | stop_sequence=["\n"],
69 | version=0,
70 | )
71 |
72 | TASKS_TABLE = [babi_qa]
73 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/openbookqa.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 |     OpenBookQA
4 |
5 | dataset:
6 | allenai/openbookqa
7 |
8 | abstract:
9 | OpenBookQA is a question-answering dataset modeled after open-book exams for
10 | assessing human understanding of a subject. It contains multiple-choice
11 | questions that require combining facts from a given open book with broad common
12 | knowledge. The task tests language models' ability to leverage provided
13 | information and apply common sense reasoning.
14 |
15 | languages:
16 | english
17 |
18 | tags:
19 | multiple-choice, qa
20 |
21 | paper:
22 | https://arxiv.org/abs/1809.02789
23 | """
24 |
25 | from string import ascii_uppercase
26 |
27 | from lighteval.metrics.metrics import Metrics
28 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
29 | from lighteval.tasks.requests import Doc
30 |
31 |
32 | def openbookqa_prompt(line, task_name: str = None):
33 | query = "The following are multiple choice questions (with answers) about common sense.\n"
34 | query += f"Question: {line['question_stem']}\n"
35 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, line["choices"]["text"])])
36 | query += "Answer: "
37 |
38 | gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip())
39 | return Doc(
40 | task_name=task_name,
41 | query=query,
42 | choices=list(ascii_uppercase[: len(line["choices"]["text"])]),
43 | gold_index=gold_ix,
44 | instruction="The following are multiple choice questions (with answers) about common sense.\n",
45 | )
46 |
47 |
48 | openbookqa = LightevalTaskConfig(
49 | name="openbookqa",
50 | prompt_function=openbookqa_prompt,
51 | hf_repo="allenai/openbookqa",
52 | hf_subset="main",
53 | hf_avail_splits=["train", "test", "validation"],
54 | evaluation_splits=["validation", "test"],
55 | few_shots_split=None,
56 | few_shots_select=None,
57 | generation_size=1,
58 | metrics=[
59 | Metrics.exact_match,
60 | ],
61 | stop_sequence=["\n"],
62 | version=0,
63 | )
64 |
65 | TASKS_TABLE = [
66 | openbookqa,
67 | ]
68 |
--------------------------------------------------------------------------------
/docs/source/caching.mdx:
--------------------------------------------------------------------------------
1 | # Caching System
2 |
3 | Lighteval includes a caching system that can significantly speed up evaluations by storing and reusing model predictions.
4 | This is especially useful when running the same evaluation multiple times, or comparing different evaluation metrics on the same model outputs.
5 |
6 | ## How It Works
7 |
8 | For now, the caching system only caches model predictions (tokenized-input caching will be added later).
9 | It stores model response objects (generations, logits, probabilities) for each evaluation sample.
10 |
11 | ### Cache Structure
12 |
13 | Cached data is stored on disk using Hugging Face datasets in the following structure:
14 |
15 | ```
16 | .cache/
17 | └── huggingface/
18 | └── lighteval/
19 | └── predictions/
20 | └── {model_name}/
21 | └── {model_hash}/
22 | └── {task_name}.parquet
23 | ```
24 |
25 | Where:
26 | - `model_name`: The model name (path on the hub or local path)
27 | - `model_hash`: Hash of the model configuration to ensure cache invalidation when parameters change
28 | - `task_name`: Name of the evaluation task
29 |
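For example, evaluating a hub model `my-org/my-model` on a task named `gsm8k` might produce a file like the one below (model name, task name, and hash are illustrative placeholders):

```
~/.cache/huggingface/lighteval/predictions/my-org/my-model/53453db2dee79e5b/gsm8k.parquet
```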
30 | ### Cache Recreation
31 |
32 | A new cache is automatically created when:
33 | - Model configuration changes (different parameters, quantization, etc.)
34 | - Model weights change (different revision, checkpoint, etc.)
35 | - Generation parameters change (temperature, max_tokens, etc.)
36 |
37 | This ensures that cached results are always consistent with your current model setup.
38 |
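Conceptually, the model hash behaves like a digest of the serialized model configuration, so changing any parameter lands you in a fresh cache directory. The snippet below is a simplified illustration of that idea, not Lighteval's actual implementation:

```python
import hashlib
import json


def config_hash(model_config: dict) -> str:
    # Serialize the configuration deterministically, then hash it: changing the
    # revision, dtype, temperature, etc. produces a different digest and hence
    # a different cache directory.
    payload = json.dumps(model_config, sort_keys=True).encode("utf-8")
    return hashlib.sha256(payload).hexdigest()[:16]


config_hash({"model_name": "my-org/my-model", "revision": "main", "temperature": 0.0})
```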
39 | ## Using Caching
40 |
41 | ### Automatic Caching
42 |
43 | All built-in model classes in Lighteval automatically support caching. No additional configuration is needed.
44 | For custom models, you need to attach a cache to the model class and add the caching decorator to the prediction functions, as sketched below.
45 |
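The sketch below shows the general pattern only (a cache object attached to the model, a decorator that short-circuits prediction calls on a cache hit). The class and method names are made up for illustration; refer to Lighteval's built-in model classes for the actual cache utilities and decorator to use:

```python
import functools


class PredictionCache:
    """In-memory stand-in for the on-disk prediction cache."""

    def __init__(self):
        self._store = {}

    def get(self, task_name, sample_id):
        return self._store.get((task_name, sample_id))

    def set(self, task_name, sample_id, response):
        self._store[(task_name, sample_id)] = response


def cached(fn):
    # Return the cached response when available, otherwise run the prediction
    # and store its result for next time.
    @functools.wraps(fn)
    def wrapper(self, task_name, sample_id, *args, **kwargs):
        hit = self.cache.get(task_name, sample_id)
        if hit is not None:
            return hit
        response = fn(self, task_name, sample_id, *args, **kwargs)
        self.cache.set(task_name, sample_id, response)
        return response

    return wrapper


class MyCustomModel:
    def __init__(self):
        self.cache = PredictionCache()  # cache attached to the model class

    @cached  # decorator on every prediction function
    def greedy_until(self, task_name, sample_id, prompt):
        return f"generation for: {prompt}"
```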
46 | ## Cache Management
47 |
48 | ### Clearing Cache
49 |
50 | To clear the cache for a specific model, delete the corresponding directory:
51 |
52 | ```bash
53 | rm -rf ~/.cache/huggingface/lighteval/predictions/{model_name}/{model_hash}/
54 | ```
55 |
56 | To clear all caches:
57 |
58 | ```bash
59 | rm -rf ~/.cache/huggingface/lighteval/predictions
60 | ```
61 |
--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/openbook_es.py:
--------------------------------------------------------------------------------
1 | """
2 | name:
3 |     OpenBookQA Spanish
4 |
5 | dataset:
6 | BSC-LT/openbookqa-es
7 |
8 | abstract:
9 |     Spanish version of OpenBookQA from the BSC Language Technology group
10 |
11 | languages:
12 | spanish
13 |
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 |
17 | paper:
18 | https://huggingface.co/datasets/BSC-LT/openbookqa-es
19 | """
20 |
21 | from string import ascii_uppercase
22 |
23 | from lighteval.metrics.dynamic_metrics import (
24 | LogLikelihoodAccMetric,
25 | )
26 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
30 | from lighteval.tasks.templates.utils.formulation import (
31 | CFFormulation,
32 | HybridFormulation,
33 | MCFFormulation,
34 | )
35 | from lighteval.utils.language import Language
36 |
37 |
38 | TASKS_TABLE = [
39 | LightevalTaskConfig(
40 | name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}",
41 | prompt_function=get_mcq_prompt_function(
42 | Language.SPANISH,
43 | lambda line: {
44 | "question": line["question_stem"],
45 | "choices": line["choices"]["text"],
46 | "gold_idx": ascii_uppercase.index(line["answerKey"]),
47 | },
48 | formulation=formulation,
49 | ),
50 | hf_repo="BSC-LT/openbookqa-es",
51 | hf_subset="default",
52 | evaluation_splits=("test",),
53 | few_shots_split="validation",
54 | metrics=get_metrics_for_formulation(
55 | formulation,
56 | [
57 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
58 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
59 | ],
60 | ),
61 | )
62 | for formulation in [
63 | MCFFormulation(),
64 | CFFormulation(),
65 | HybridFormulation(),
66 | ]
67 | ]
68 |
--------------------------------------------------------------------------------