├── src └── lighteval │ ├── py.typed │ ├── tasks │ ├── tasks │ │ ├── jeopardy.py │ │ ├── pubmedqa.py │ │ ├── quac.py │ │ ├── natural_questions.py │ │ ├── toxigen.py │ │ ├── coqa.py │ │ ├── real_toxicity_prompts.py │ │ ├── prost.py │ │ ├── narrativeqa.py │ │ ├── legalsupport.py │ │ ├── sciq.py │ │ ├── qasper.py │ │ ├── webqs.py │ │ ├── aimo.py │ │ ├── asdiv.py │ │ ├── twitterAAE.py │ │ ├── logiqa.py │ │ ├── winogrande.py │ │ ├── swag.py │ │ ├── med_dialog.py │ │ ├── piqa.py │ │ ├── hellaswag.py │ │ ├── storycloze.py │ │ ├── squad_v2.py │ │ ├── mathqa.py │ │ ├── triviaqa.py │ │ ├── simpleqa.py │ │ ├── entity_data_imputation.py │ │ ├── covid_dialogue.py │ │ ├── babi_qa.py │ │ └── openbookqa.py │ ├── templates │ │ ├── __init__.py │ │ └── utils │ │ │ ├── __init__.py │ │ │ └── adapter_utils.py │ ├── multilingual │ │ ├── __init__.py │ │ ├── utils │ │ │ ├── __init__.py │ │ │ └── task_utils.py │ │ └── tasks │ │ │ ├── cmath.py │ │ │ ├── chegeka.py │ │ │ ├── french_triviqa.py │ │ │ ├── tquad_v2.py │ │ │ ├── thaiqa.py │ │ │ ├── kenswquad.py │ │ │ ├── french_boolq.py │ │ │ ├── fquad_v2.py │ │ │ ├── cmrc2018.py │ │ │ ├── sber_squad.py │ │ │ ├── chinese_squad.py │ │ │ ├── squad_it.py │ │ │ ├── arcd.py │ │ │ ├── squad_es.py │ │ │ ├── faquad.py │ │ │ ├── germanquad.py │ │ │ ├── hindi_boolq.py │ │ │ ├── mintaka.py │ │ │ ├── mgsm.py │ │ │ ├── soqal.py │ │ │ ├── thai_exams.py │ │ │ ├── hellaswag_tel.py │ │ │ ├── arabic_arc.py │ │ │ ├── hellaswag_hin.py │ │ │ ├── tydiqa.py │ │ │ ├── afri_mgsm.py │ │ │ └── openbook_es.py │ └── __init__.py │ ├── utils │ └── __init__.py │ ├── metrics │ └── imports │ │ └── __init__.py │ └── __init__.py ├── docs └── source │ ├── package_reference │ ├── doc.mdx │ ├── evaluation_tracker.mdx │ ├── models_outputs.mdx │ ├── pipeline.mdx │ ├── logging.mdx │ ├── tasks.mdx │ └── models.mdx │ ├── available-tasks.mdx │ ├── _toctree.yml │ └── caching.mdx ├── MANIFEST.in ├── examples ├── tasks │ ├── serbian_task_group │ │ ├── sr_all_inclusive.txt │ │ ├── sr_custom_task.txt │ │ ├── sr_qa_knowledge.txt │ │ ├── sr_arc.txt │ │ ├── sr_mmlu_business_professional.txt │ │ ├── sr_commonsense_reasoning.txt │ │ ├── sr_mmlu_social_sciences.txt │ │ ├── sr_mmlu_ethics_philosophy.txt │ │ ├── sr_misc.txt │ │ ├── sr_mmlu_math_logic.txt │ │ ├── sr_mmlu_college_level.txt │ │ └── sr_mmlu_high_school_level.txt │ ├── all_german_rag_evals.txt │ ├── fine_tasks │ │ ├── cf │ │ │ ├── th.txt │ │ │ ├── te.txt │ │ │ ├── fr.txt │ │ │ ├── tr.txt │ │ │ ├── sw.txt │ │ │ ├── hi.txt │ │ │ ├── ru.txt │ │ │ ├── zh.txt │ │ │ └── ar.txt │ │ └── mcf │ │ │ ├── th.txt │ │ │ ├── te.txt │ │ │ ├── fr.txt │ │ │ ├── tr.txt │ │ │ ├── sw.txt │ │ │ ├── hi.txt │ │ │ ├── ru.txt │ │ │ ├── zh.txt │ │ │ └── ar.txt │ ├── all_filipino_tasks.txt │ └── bbh.txt ├── model_configs │ ├── inference_providers.yaml │ ├── transformers_vlm_model.yaml │ ├── tgi_model.yaml │ ├── litellm_model.yaml │ ├── quantized_model.yaml │ ├── transformers_model.yaml │ ├── sglang_model_config.yaml │ ├── vllm_model_config.yaml │ ├── peft_model.yaml │ └── endpoint_model.yaml ├── nanotron │ └── lighteval_config_override_template.yaml └── test_tasks.txt ├── .gitattributes ├── Makefile ├── tests ├── reference_scores │ ├── harness_metrics.json │ ├── harness_prompts.json │ ├── Qwen2.5-VL-3B-Instruct-results-vlm.json │ ├── Qwen2.5-VL-7B-Instruct-results-vlm.json │ ├── SmolLM2-1.7B-Instruct-results-vllm.json │ └── SmolLM2-1.7B-Instruct-results-accelerate.json ├── reference_details │ ├── SmolLM2-1.7B-Instruct-vllm │ │ ├── details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_hellaswag|10_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet │ │ ├── details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet │ │ ├── details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet │ │ ├── details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet │ │ ├── details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet │ │ └── details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet │ ├── SmolLM2-1.7B-Instruct-transformers │ │ ├── details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_hellaswag|10_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet │ │ ├── details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet │ │ ├── details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet │ │ ├── details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet │ │ └── details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet │ └── Qwen2.5-VL-3B-Instruct-vlm │ │ └── details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet ├── unit │ ├── metrics │ │ ├── pytest.ini │ │ └── test_cases │ │ │ ├── rouge1.json │ │ │ ├── simpleqa_judge.json │ │ │ ├── bert_score.json │ │ │ ├── bits_per_byte.json │ │ │ ├── byte_perplexity.json │ │ │ ├── expr_gold_metric.json │ │ │ ├── prediction_perplexity.json │ │ │ ├── mcc.json │ │ │ ├── exact_match.json │ │ │ ├── acc_golds_likelihood.json │ │ │ ├── avg_at_k_math.json │ │ │ ├── pass_at_k_math.json │ │ │ ├── avg_at_k.json │ │ │ └── drop.json │ └── models │ │ ├── test_base_model.py │ │ └── test_abstract_model.py ├── conftest.py ├── __init__.py └── slow_tests │ └── __init__.py ├── .github ├── workflows │ ├── pr_style_bot.yaml │ ├── trufflehog.yml │ ├── doc-build.yml │ ├── doc-pr-build.yml │ ├── doc-pr-upload.yml │ ├── quality.yaml │ ├── slow_tests.yaml │ └── tests.yaml ├── ISSUE_TEMPLATE │ ├── evaluation-task-request.md │ ├── feature-request.md │ └── bug_report.md └── release.yml ├── LICENSE ├── setup.py └── .pre-commit-config.yaml /src/lighteval/py.typed: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /docs/source/package_reference/doc.mdx: -------------------------------------------------------------------------------- 1 | # Doc 2 | 3 | [[autodoc]] tasks.requests.Doc 4 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/lighteval/tasks/tasks_table.jsonl 2 | include src/lighteval/metrics/*.jsonl 3 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_all_inclusive.txt: -------------------------------------------------------------------------------- 1 | # MMLU (All-inclusive Task Entry) 2 | community|serbian_evals:mmlu|0 3 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_custom_task.txt: -------------------------------------------------------------------------------- 1 | # Serbian Evaluations - Custom/Other Task 2 | community|serbian_evals:oz_eval|0 3 | -------------------------------------------------------------------------------- /docs/source/package_reference/evaluation_tracker.mdx: -------------------------------------------------------------------------------- 1 | # EvaluationTracker 2 | 3 | [[autodoc]] logging.evaluation_tracker.EvaluationTracker 4 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_qa_knowledge.txt: -------------------------------------------------------------------------------- 1 | # Question Answering and Knowledge 2 | community|serbian_evals:boolq|0 3 | community|serbian_evals:openbook|0 4 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | *.json filter=lfs diff=lfs merge=lfs -text 2 | tests/unit/metrics/test_cases/*.json -filter -diff -merge text 3 | *.parquet filter=lfs diff=lfs merge=lfs -text 4 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: style format 2 | 3 | 4 | style: 5 | ruff format . 6 | ruff check --fix . 7 | 8 | 9 | quality: 10 | ruff format --check . 11 | ruff check . 12 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_arc.txt: -------------------------------------------------------------------------------- 1 | # Serbian Evaluations - ARC (AI2 Reasoning Challenge) 2 | community|serbian_evals:arc_easy|0 3 | community|serbian_evals:arc_challenge|0 4 | -------------------------------------------------------------------------------- /tests/reference_scores/harness_metrics.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c2080305011a7ac8b0895ec1fbb26b45af4e3dced6272abf67156ebf57656f88 3 | size 48360080 4 | -------------------------------------------------------------------------------- /tests/reference_scores/harness_prompts.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:059a48631d4243cda36d067db50350639c12b0a88fb209f76bbcd0c3ff266ffb 3 | size 20244711 4 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt: -------------------------------------------------------------------------------- 1 | # MMLU (Business Professional) 2 | community|serbian_evals:mmlu_marketing|0 3 | community|serbian_evals:mmlu_manadzment|0 4 | -------------------------------------------------------------------------------- /tests/reference_scores/Qwen2.5-VL-3B-Instruct-results-vlm.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5b38f65703ddd426111ba45e6f6b8b82ee2049c7e754e977a5c6269aa2d94ade 3 | size 3968 4 | -------------------------------------------------------------------------------- /tests/reference_scores/Qwen2.5-VL-7B-Instruct-results-vlm.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d07d8341188999f359a530e1dae4cd8ec3936d4046232a68b90a56c9f2994b3c 3 | size 3083 4 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt: -------------------------------------------------------------------------------- 1 | # Commonsense Reasoning 2 | community|serbian_evals:hellaswag|0 3 | community|serbian_evals:piqa|0 4 | community|serbian_evals:winogrande|0 5 | -------------------------------------------------------------------------------- /tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4dcc899c5963df3e98cc9d144f3c582edda227d8d9e2c24fabc1f794a4fab524 3 | size 47986 4 | -------------------------------------------------------------------------------- /tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:55b420e5ff6061d2d2d66c3f9ce8cab541820766b1bd7becc0d7b290b99144b6 3 | size 47858 4 | -------------------------------------------------------------------------------- /docs/source/package_reference/models_outputs.mdx: -------------------------------------------------------------------------------- 1 | # Model's Output 2 | 3 | All models will generate an ouput per Doc supplied to the `generation` or `loglikelihood` fuctions. 4 | 5 | [[autodoc]] lighteval.models.model_output.ModelResponse 6 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:df34c40c43eeea4355e86ec505b053db421189b2082c670409e66d93defdd0d1 3 | size 39054 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag|10_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:973fa1740490bf212831075ac9842dd88a31db7aa422e240c01eafb840979207 3 | size 88599 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:29588411e9390fe550e3ca353e0d7c89e381d25673ced35399f5896e0c613216 3 | size 50719 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:450f5a17118613189a749f0fd9f7807265b43733482367e969b04ae7971a749c 3 | size 55774 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:bd4eed73faaf58a18a302a1d2f0c8b8b8e1fbd482a5fd4ca77c375f1e3082f0e 3 | size 109931 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7bab826310f526d7aaa9c5e15ff50314524d54847de37699b762adec3c57fb78 3 | size 144793 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ec46468169068183da1c57ace7064fcfa8664e4acad3a76f2d37e260468b67ee 3 | size 32367 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:419f252eddb158e185b515b39ca9e1784f7b4122a620a2a67034178bb1ea6abb 3 | size 35694 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag|10_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:2c74caa34cd1c6b227a1d66dcda7a0c61986435f925f17cf81e676f8c542d146 3 | size 67250 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6337b98efe5a6d10a02d4c13d78bcff056797c65999dbfb8ef5ab128f88fe4cf 3 | size 26482 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1a998148dbb28f6826861479e8d9fc7bf7f73b0ab6921dc9a6da811e70eddba4 3 | size 45688 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5fab7e1da1cdd0e8f66831b57bcda28b6350c7cf16c9905417746278a6f30f31 3 | size 148786 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:00a0ea645c10c6a8e0d55408f20bf59f95c3cd135afaebc5df0d1fbb89c3b93d 3 | size 37857 4 | -------------------------------------------------------------------------------- /tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5eda1dbcf8c9604005ce8c27239a57e5f41f852dbd3da13656d94b01b325f16c 3 | size 11538690 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ef968329bee498b3387ec8df3677ca9bbac72e90599efbe7f78db23f4227b2f6 3 | size 21785 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5bb00dd8c872d95dd5b2999d788ece8c34d43b5b5ea4ef8f0859ba535d7b8cbf 3 | size 34021 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:161eb467544ca6273231945067a8d70aedb0e4e6c3eba4e8b40532cd1c37e6b0 3 | size 30662 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:da3a3dcc9ae24c6f3bc80f0ce72b64d09a5ab19d7803b168a8f1ad540f7f66c1 3 | size 39332 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fb3787ff3e796b49199e8d438a30ce768438b6d4fc5df5941e393bfd1fdf2ae6 3 | size 74124 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8a20696f4036b3a6e2b41b309f405d9a5445c4573150fc7e7bec15f28fd77bdf 3 | size 72441 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:803a445f6d563c2d2097df7c7772cd81b3df0bacc9e702caaab5f0dde7fe5b25 3 | size 87624 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:d0886aca82f5687e1fd742c0c5b9fe046fb20622d9e67818d21d7300de27e746 3 | size 26034 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e62a25828dddf557091b9dcbc065f2c9e36fdf0c8d365dd1976584fc3f516eae 3 | size 34538 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:480bb8570ec01a173ebe85649989dc9a8ab64a4a2de2152d82bc344787bfffee 3 | size 34511 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:fa71775ee3b585b1612f2bbbd8327ba4f564c9eddcdce63533fd6d11c67c5d95 3 | size 50977 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:c5cae6c52cee4a49e9cf28920ec303235a15f2b9f075b4f2deff65bd220aea77 3 | size 52510 4 | -------------------------------------------------------------------------------- /examples/tasks/all_german_rag_evals.txt: -------------------------------------------------------------------------------- 1 | community|german_rag_eval:choose_question_by_context|0 2 | community|german_rag_eval:choose_context_by_question|0 3 | community|german_rag_eval:question_answer_match|0 4 | community|german_rag_eval:context_question_match|0 5 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:6baa807a3c85b3ca43708f5810f3898dd517db806ce4106124d4913b0fcea8b0 3 | size 28834 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:13659b3b6bcd5c2744fc3b33d8752589a1f6c52b2ed8ee17c6a3a4f28cd46908 3 | size 29149 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:25ea151a418829d528da4f762102052308b1cbb15b00d7190d5d0b9fd033436d 3 | size 37684 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:df6206dba3812d089f03078b97d067eb851eee03fd2aa295cbac2551f82837c0 3 | size 38453 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:61c1a8942675c523f499e19b01432fb0ce7b0cb8bbd910fe0a16b2b60bb7e80c 3 | size 32704 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4b020f10c882cb482d6e2ac597e14d662a5d38f873f5f7eada8c839c5e13b476 3 | size 72052 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:5ecdb28b097666cd4ccb0a208708a2076779e06145719ebd24e75060a272bdcf 3 | size 49571 4 | -------------------------------------------------------------------------------- /examples/model_configs/inference_providers.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "meta-llama/Llama-3.1-8B-Instruct" 3 | provider: "nebius" 4 | timeout: null 5 | proxies: null 6 | parallel_calls_count: 20 7 | generation_parameters: 8 | temperature: 0.4 9 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:7fe3c64179df68407c6fe6c6ad75b3f4a83a1edd286ca81da3fe96bcb5b21e9b 3 | size 27971 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:a8a1f1ef9ba8e3a58d2538f2f2e016769155f2b6c18da49454849c8b276cd398 3 | size 39423 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:13fa9f4b64152c7140112b607f6dfddb5f9f755646bbef0b9cc5516a9c0e6de4 3 | size 38263 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:88e7d66c2396ab8a3f56ae9f4a342037a0f13f4ed83982312fdc7424eb74f60b 3 | size 36502 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:226b2d3fc783dcfecf3c634558746b1314f9f80a32a259c9fe174332fb1e3173 3 | size 50277 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:47204bfa1d6843f06ef6c08bb66d85adceab6457295f03303b7cd39bc7e4dd37 3 | size 25864 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:eb844c74f574b377e4b27110dbdf0c28c227a96f4e8d1c0eac52578f4608bc49 3 | size 47558 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:b5ce79da0c3657667830882fa28ce623cb463bf5fb3c5e1367d6a5c13c480973 3 | size 30006 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4d4919aa444d52a1589883329eb3fdbb583b029a6213d4af13aa17c11a835399 3 | size 30932 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:0dffe2c495874fa53e0289b2307161107c54e9d15c9a8aa39016c990f7d62f8f 3 | size 32464 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:23ffe95306670d3a737b30bf34866734dcba717742011a424cc0230377f52363 3 | size 34393 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:64975666dfc61cd3a3a515a88134775c0f90cff1e1b9120a8ab9c8861c68bb99 3 | size 29221 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4582e35192caeae218a50aa76738582d360914fd96cc9a4c3608d3683c44c33a 3 | size 47557 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1a4b6292fb5df093df5ac43fba76b0af5b00337e0d2579a9c2b2f6398007b842 3 | size 56164 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:ed82ceb8a3c05ae2c47b1769b333173e15069eb83710bc5d66918abb4ef4b7e7 3 | size 69137 4 | -------------------------------------------------------------------------------- /docs/source/package_reference/pipeline.mdx: -------------------------------------------------------------------------------- 1 | # Pipeline 2 | 3 | ## Pipeline 4 | 5 | [[autodoc]] pipeline.Pipeline 6 | 7 | ## PipelineParameters 8 | 9 | [[autodoc]] pipeline.PipelineParameters 10 | 11 | ## ParallelismManager 12 | 13 | [[autodoc]] pipeline.ParallelismManager 14 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt: -------------------------------------------------------------------------------- 1 | # MMLU (Social Sciences) 2 | community|serbian_evals:mmlu_globalne_cinjenice|0 3 | community|serbian_evals:mmlu_logicke_zablude|0 4 | community|serbian_evals:mmlu_sociologija|0 5 | community|serbian_evals:mmlu_human_aging|0 6 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1d10ce12e8f76b5ce3113273124e7683e5c5bddde6063cd3cbf25d495cffa6ba 3 | size 34653 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e15bcad77e0453d7e987b4bf5216639b625f9df63341dfce4246dab88b87ca35 3 | size 38176 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:f6dd8f8d104f1a4252685019e5413ce9ecfc4611bb819ff627e77be296afc581 3 | size 52493 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:9651280724d245b37a7c3dde465c5a384de7b12055b9474696d533d58330b240 3 | size 59838 4 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt: -------------------------------------------------------------------------------- 1 | # MMLU (Ethics, Philosophy) 2 | community|serbian_evals:mmlu_moralni_sporovi|0 3 | community|serbian_evals:mmlu_moralne_dileme|0 4 | community|serbian_evals:mmlu_filozofija|0 5 | community|serbian_evals:mmlu_svetska_religija|0 6 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:e34151ea0415cb442b47d334448abf127c8f1747da78a5a9977ff78ed2d831b5 3 | size 49337 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:1d7e589d611391395b2990a29e55bdd856ab440d45cba22fcd190936daf391dd 3 | size 34842 4 | -------------------------------------------------------------------------------- /tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:8e116d939941d57db2e5515114bec4890b56b6a35a5a2e49c809e6361b947337 3 | size 37387 4 | -------------------------------------------------------------------------------- /examples/model_configs/transformers_vlm_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "Qwen/Qwen2.5-VL-3B-Instruct" 3 | revision: "main" 4 | dtype: "float16" 5 | compile: false 6 | model_parallel: false 7 | batch_size: 1 8 | use_fast_image_processor: true 9 | generation_parameters: 10 | temperature: 0.0 11 | top_p: 0.9 12 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_misc.txt: -------------------------------------------------------------------------------- 1 | # MMLU (Miscellaneous) 2 | community|serbian_evals:mmlu_anatomija|0 3 | community|serbian_evals:mmlu_astronomija|0 4 | community|serbian_evals:mmlu_poslovna_etika|0 5 | community|serbian_evals:mmlu_kliničko_znanje|0 6 | community|serbian_evals:mmlu_razno|0 7 | community|serbian_evals:mmlu_elektrotehnika|0 8 | -------------------------------------------------------------------------------- /examples/model_configs/tgi_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | inference_server_address: "http://localhost:8080" # Replace with your actual TGI server address 3 | inference_server_auth: null 4 | model_name: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory 5 | generation_parameters: 6 | temperature: 0.1 7 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/th.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|meta_mmlu_tha_cf|0 3 | lighteval|m3exams_tha_cf|0 4 | 5 | # Reading Comprehension (RC) 6 | lighteval|belebele_tha_Thai_cf|0 7 | lighteval|thaiqa_tha|0 8 | lighteval|xquad_tha|0 9 | 10 | # Natural Language Understanding (NLU) 11 | lighteval|community_hellaswag_tha_cf|0 12 | lighteval|xnli2.0_tha_cf|0 13 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/th.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|meta_mmlu_tha_mcf|5 3 | lighteval|m3exams_tha_mcf|5 4 | 5 | # Reading Comprehension (RC) 6 | lighteval|belebele_tha_Thai_mcf|5 7 | lighteval|thaiqa_tha|5 8 | lighteval|xquad_tha|5 9 | 10 | # Natural Language Understanding (NLU) 11 | lighteval|community_hellaswag_tha_mcf|5 12 | lighteval|xnli2.0_tha_mcf|5 13 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt: -------------------------------------------------------------------------------- 1 | # MMLU (Math, Logic) 2 | community|serbian_evals:mmlu_abstract_algebra|0 3 | community|serbian_evals:mmlu_osnovna_matematika|0 4 | community|serbian_evals:mmlu_formalna_logika|0 5 | community|serbian_evals:mmlu_konceptualna_fizika|0 6 | community|serbian_evals:mmlu_metrika_ekonomije|0 7 | community|serbian_evals:mmlu_masinsko_ucenje|0 8 | -------------------------------------------------------------------------------- /examples/model_configs/litellm_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B" 3 | provider: "openai" 4 | base_url: "https://router.huggingface.co/hf-inference/v1" 5 | generation_parameters: 6 | temperature: 0.5 7 | max_new_tokens: 256 8 | top_p: 0.9 9 | seed: 0 10 | repetition_penalty: 1.0 11 | frequency_penalty: 0.0 12 | -------------------------------------------------------------------------------- /.github/workflows/pr_style_bot.yaml: -------------------------------------------------------------------------------- 1 | name: PR Style Bot 2 | 3 | on: 4 | issue_comment: 5 | types: [created] 6 | 7 | permissions: 8 | pull-requests: write 9 | 10 | jobs: 11 | style: 12 | uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main 13 | with: 14 | python_quality_dependencies: "[quality]" 15 | secrets: 16 | bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }} 17 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/te.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|mlmm_mmlu_tel_cf|0 3 | 4 | # Reading Comprehension (RC) 5 | lighteval|belebele_tel_Telu_cf|0 6 | lighteval|indicqa_tel|0 7 | 8 | # Reasoning (RES) 9 | lighteval|indicxcopa_tel_cf|0 10 | 11 | # Natural Language Understanding (NLU) 12 | lighteval|community_hellaswag_tel_cf|0 13 | lighteval|indicnxnli_tel_cf|0 14 | lighteval|xstory_cloze_tel_cf|0 15 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/te.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|mlmm_mmlu_tel_mcf|5 3 | 4 | # Reading Comprehension (RC) 5 | lighteval|belebele_tel_Telu_mcf|5 6 | lighteval|indicqa_tel|5 7 | 8 | # Reasoning (RES) 9 | lighteval|indicxcopa_tel_mcf|5 10 | 11 | # Natural Language Understanding (NLU) 12 | lighteval|community_hellaswag_tel_mcf|5 13 | lighteval|indicnxnli_tel_mcf|5 14 | lighteval|xstory_cloze_tel_mcf|5 15 | -------------------------------------------------------------------------------- /examples/model_configs/quantized_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "HuggingFaceH4/zephyr-7b-beta" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... 3 | revision: "main" # revision to use 4 | dtype: "4bit" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 5 | compile: true 6 | batch_size: 1 # batch size to use 7 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_mmlu_college_level.txt: -------------------------------------------------------------------------------- 1 | # MMLU (College Level Tasks) 2 | community|serbian_evals:mmlu_fakultet_biologija|0 3 | community|serbian_evals:mmlu_fakultet_hemija|0 4 | community|serbian_evals:mmlu_fakultet_racunari|0 5 | community|serbian_evals:mmlu_fakultet_matematika|0 6 | community|serbian_evals:mmlu_fakultet_medicina|0 7 | community|serbian_evals:mmlu_fakultet_fizika|0 8 | community|serbian_evals:mmlu_sigurnost_racunara|0 9 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/evaluation-task-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Evaluation/task request 3 | about: Suggest a new evaluation you want us to add 4 | title: "[EVAL]" 5 | labels: new task 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Evaluation short description 11 | - Why is this evaluation interesting? 12 | - How used is it in the community? 13 | 14 | ## Evaluation metadata 15 | Provide all available 16 | - Paper url: 17 | - Github url: 18 | - Dataset url: 19 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yml: -------------------------------------------------------------------------------- 1 | on: 2 | push: 3 | 4 | name: Scan Secret Leaks 5 | 6 | permissions: 7 | contents: read 8 | 9 | jobs: 10 | trufflehog: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Checkout code 14 | uses: actions/checkout@v4 15 | with: 16 | fetch-depth: 0 17 | - name: Secret Scanning 18 | uses: trufflesecurity/trufflehog@main 19 | with: 20 | extra_args: --only-verified 21 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/fr.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|meta_mmlu_fra_cf|0 3 | lighteval|mlmm_arc_fra_cf:challenge|0 4 | lighteval|mintaka_fra|0 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_fra_Latn_cf|0 8 | lighteval|fquadv2_fra|0 9 | 10 | # Reasoning (RES) 11 | lighteval|xcodah_fra_cf|0 12 | lighteval|xcsqa_fra_cf|0 13 | 14 | # Natural Language Understanding (NLU) 15 | lighteval|mlmm_hellaswag_fra_cf|0 16 | lighteval|xnli2.0_fra_cf|0 17 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/tr.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|community_arc_tur_cf:easy|0 3 | lighteval|exams_tur_cf|0 4 | lighteval|community_mmlu_tur_cf|0 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_tur_Latn_cf|0 8 | lighteval|tquadv2_tur|0 9 | lighteval|xquad_tur|0 10 | 11 | # Reasoning (RES) 12 | lighteval|xcopa_tur_cf|0 13 | 14 | # Natural Language Understanding (NLU) 15 | lighteval|community_hellaswag_tur_cf|0 16 | lighteval|xnli2.0_tur_cf|0 17 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/fr.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|meta_mmlu_fra_mcf|5 3 | lighteval|mlmm_arc_fra_mcf:challenge|5 4 | lighteval|mintaka_fra|5 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_fra_Latn_mcf|5 8 | lighteval|fquadv2_fra|5 9 | 10 | # Reasoning (RES) 11 | lighteval|xcodah_fra_mcf|5 12 | lighteval|xcsqa_fra_mcf|5 13 | 14 | # Natural Language Understanding (NLU) 15 | lighteval|mlmm_hellaswag_fra_mcf|5 16 | lighteval|xnli2.0_fra_mcf|5 17 | -------------------------------------------------------------------------------- /.github/workflows/doc-build.yml: -------------------------------------------------------------------------------- 1 | name: Build Documentation 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - doc-builder* 8 | - v*-release 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.sha }} 15 | package: lighteval 16 | secrets: 17 | token: ${{ secrets.HUGGINGFACE_PUSH }} 18 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 19 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/tr.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|community_arc_tur_mcf:easy|5 3 | lighteval|exams_tur_mcf|5 4 | lighteval|community_mmlu_tur_mcf|5 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_tur_Latn_mcf|5 8 | lighteval|tquadv2_tur|5 9 | lighteval|xquad_tur|5 10 | 11 | # Reasoning (RES) 12 | lighteval|xcopa_tur_mcf|5 13 | 14 | # Natural Language Understanding (NLU) 15 | lighteval|community_hellaswag_tur_mcf|5 16 | lighteval|xnli2.0_tur_mcf|5 17 | -------------------------------------------------------------------------------- /.github/workflows/doc-pr-build.yml: -------------------------------------------------------------------------------- 1 | name: Build PR Documentation 2 | 3 | on: 4 | pull_request: 5 | 6 | concurrency: 7 | group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} 8 | cancel-in-progress: true 9 | 10 | jobs: 11 | build: 12 | uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main 13 | with: 14 | commit_sha: ${{ github.event.pull_request.head.sha }} 15 | pr_number: ${{ github.event.number }} 16 | package: lighteval 17 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/sw.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|community_arc_swa_cf:easy|0 3 | lighteval|m3exams_swa_cf|0 4 | lighteval|openai_mmlu_swa_cf|0 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_swh_Latn_cf|0 8 | lighteval|kenswquad_swa|0 9 | lighteval|tydiqa_swa|0 10 | 11 | # Reasoning (RES) 12 | lighteval|xcsqa_swa_cf|0 13 | lighteval|xcopa_swa_cf|0 14 | 15 | # Natural Language Understanding (NLU) 16 | lighteval|xnli2.0_swa_cf|0 17 | lighteval|xstory_cloze_swa_cf|0 18 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/sw.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|community_arc_swa_mcf:easy|5 3 | lighteval|m3exams_swa_mcf|5 4 | lighteval|openai_mmlu_swa_mcf|5 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_swh_Latn_mcf|5 8 | lighteval|kenswquad_swa|5 9 | lighteval|tydiqa_swa|5 10 | 11 | # Reasoning (RES) 12 | lighteval|xcsqa_swa_mcf|5 13 | lighteval|xcopa_swa_mcf|5 14 | 15 | # Natural Language Understanding (NLU) 16 | lighteval|xnli2.0_swa_mcf|5 17 | lighteval|xstory_cloze_swa_mcf|5 18 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/hi.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|meta_mmlu_hin_cf|0 3 | lighteval|community_arc_hin_cf:easy|0 4 | 5 | # Reading Comprehension (RC) 6 | lighteval|belebele_hin_Deva_cf|0 7 | lighteval|indicqa_hin|0 8 | 9 | # Reasoning (RES) 10 | lighteval|xcodah_hin_cf|0 11 | lighteval|indicxcopa_hin_cf|0 12 | lighteval|xcsqa_hin_cf|0 13 | 14 | # Natural Language Understanding (NLU) 15 | lighteval|mlmm_hellaswag_hin_cf|0 16 | lighteval|indicnxnli_hin_cf|0 17 | lighteval|xstory_cloze_hin_cf|0 18 | -------------------------------------------------------------------------------- /docs/source/package_reference/logging.mdx: -------------------------------------------------------------------------------- 1 | # Logging 2 | 3 | ## EvaluationTracker 4 | [[autodoc]] logging.evaluation_tracker.EvaluationTracker 5 | 6 | ## GeneralConfigLogger 7 | [[autodoc]] logging.info_loggers.GeneralConfigLogger 8 | ## DetailsLogger 9 | [[autodoc]] logging.info_loggers.DetailsLogger 10 | ## MetricsLogger 11 | [[autodoc]] logging.info_loggers.MetricsLogger 12 | ## VersionsLogger 13 | [[autodoc]] logging.info_loggers.VersionsLogger 14 | ## TaskConfigLogger 15 | [[autodoc]] logging.info_loggers.TaskConfigLogger 16 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/hi.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|meta_mmlu_hin_mcf|5 3 | lighteval|community_arc_hin_mcf:easy|5 4 | 5 | # Reading Comprehension (RC) 6 | lighteval|belebele_hin_Deva_mcf|5 7 | lighteval|indicqa_hin|5 8 | 9 | # Reasoning (RES) 10 | lighteval|xcodah_hin_mcf|5 11 | lighteval|indicxcopa_hin_mcf|5 12 | lighteval|xcsqa_hin_mcf|5 13 | 14 | # Natural Language Understanding (NLU) 15 | lighteval|mlmm_hellaswag_hin_mcf|5 16 | lighteval|indicnxnli_hin_mcf|5 17 | lighteval|xstory_cloze_hin_mcf|5 18 | -------------------------------------------------------------------------------- /examples/model_configs/transformers_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" 3 | revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" 4 | dtype: "float16" 5 | compile: false 6 | model_parallel: false 7 | batch_size: 1 8 | continuous_batching: false 9 | model_loading_kwargs: 10 | attn_implementation: "eager" 11 | #tp_plan: "auto" 12 | generation_parameters: 13 | #num_blocks: 4096 14 | #block_size: 64 15 | #max_new_tokens: 256 16 | temperature: 0.0 17 | top_p: 0.9 18 | -------------------------------------------------------------------------------- /.github/workflows/doc-pr-upload.yml: -------------------------------------------------------------------------------- 1 | name: Upload PR Documentation 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["Build PR Documentation"] 6 | types: 7 | - completed 8 | 9 | jobs: 10 | build: 11 | uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main 12 | with: 13 | package_name: lighteval 14 | secrets: 15 | hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} 16 | comment_bot_app_id: ${{ secrets.COMMENT_BOT_APP_ID }} 17 | comment_bot_secret_pem: ${{ secrets.COMMENT_BOT_SECRET_PEM }} 18 | -------------------------------------------------------------------------------- /tests/unit/metrics/pytest.ini: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | testpaths = . 3 | python_files = test_*.py 4 | python_classes = Test* 5 | python_functions = test_* 6 | addopts = 7 | -v 8 | --tb=short 9 | --strict-markers 10 | --disable-warnings 11 | markers = 12 | slow: marks tests as slow (deselect with '-m "not slow"') 13 | unit: marks tests as unit tests 14 | integration: marks tests as integration tests 15 | automated: marks tests as automated metric tests 16 | filterwarnings = 17 | ignore::DeprecationWarning 18 | ignore::PendingDeprecationWarning 19 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/ru.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|mlmm_arc_rus_cf:challenge|0 3 | lighteval|rummlu_rus_cf|0 4 | lighteval|mera_openbookqa_rus_cf|0 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_rus_Cyrl_cf|0 8 | lighteval|tydiqa_rus|0 9 | lighteval|sber_squad_rus|0 10 | lighteval|xquad_rus|0 11 | 12 | # Reasoning (RES) 13 | lighteval|parus_rus_cf|0 14 | lighteval|xcodah_rus_cf|0 15 | lighteval|xcsqa_rus_cf|0 16 | 17 | # Natural Language Understanding (NLU) 18 | lighteval|mlmm_hellaswag_rus_cf|0 19 | lighteval|xnli2.0_rus_cf|0 20 | lighteval|xstory_cloze_rus_cf|0 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FT] " 5 | labels: feature request 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Issue encountered 11 | Is your feature request related to a problem? Please provide a clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | ## Solution/Feature 14 | A clear and concise description of what you want to happen. 15 | 16 | ## Possible alternatives 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/ru.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|mlmm_arc_rus_mcf:challenge|5 3 | lighteval|rummlu_rus_mcf|5 4 | lighteval|mera_openbookqa_rus_mcf|5 5 | 6 | # Reading Comprehension (RC) 7 | lighteval|belebele_rus_Cyrl_mcf|5 8 | lighteval|tydiqa_rus|5 9 | lighteval|sber_squad_rus|5 10 | lighteval|xquad_rus|5 11 | 12 | # Reasoning (RES) 13 | lighteval|parus_rus_mcf|0 14 | lighteval|xcodah_rus_mcf|5 15 | lighteval|xcsqa_rus_mcf|5 16 | 17 | # Natural Language Understanding (NLU) 18 | lighteval|mlmm_hellaswag_rus_mcf|0 19 | lighteval|xnli2.0_rus_mcf|5 20 | lighteval|xstory_cloze_rus_mcf|5 21 | -------------------------------------------------------------------------------- /.github/release.yml: -------------------------------------------------------------------------------- 1 | changelog: 2 | exclude: 3 | labels: 4 | - ignore-for-release 5 | categories: 6 | - title: New Features 🎉 7 | labels: 8 | - feature 9 | - title: Enhancement ⚙️ 10 | labels: 11 | - enhancement 12 | - title: Documentation 📚 13 | labels: 14 | - documentation 15 | - title: New Tasks 16 | labels: 17 | - new-task 18 | - title: Task and Metrics changes 🛠️ 19 | labels: 20 | - task-update 21 | - title: Bug Fixes 🐛 22 | labels: 23 | - bug 24 | - title: Other Changes 25 | labels: 26 | - "*" 27 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/zh.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|agieval_zho_cf|0 3 | lighteval|ceval_zho_cf|0 4 | lighteval|cmmlu_zho_cf|0 5 | lighteval|m3exams_zho_cf|0 6 | 7 | # Reading Comprehension (RC) 8 | lighteval|belebele_zho_Hans_cf|0 9 | lighteval|c3_zho_cf|0 10 | lighteval|cmrc2018_zho|0 11 | lighteval|chinese_squad_zho|0 12 | 13 | # Reasoning (RES) 14 | lighteval|xcodah_zho_cf|0 15 | lighteval|xcopa_zho_cf|0 16 | lighteval|xcsqa_zho_cf|0 17 | 18 | # Natural Language Understanding (NLU) 19 | lighteval|mlmm_hellaswag_zho_cf|0 20 | lighteval|ocnli_zho_cf|0 21 | lighteval|xwinograd_zho_cf|0 22 | lighteval|xstory_cloze_zho_cf|0 23 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/zh.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|agieval_zho_mcf|5 3 | lighteval|ceval_zho_mcf|5 4 | lighteval|cmmlu_zho_mcf|5 5 | lighteval|m3exams_zho_mcf|5 6 | 7 | # Reading Comprehension (RC) 8 | lighteval|belebele_zho_Hans_mcf|5 9 | lighteval|c3_zho_mcf|5 10 | lighteval|cmrc2018_zho|5 11 | lighteval|chinese_squad_zho|5 12 | 13 | # Reasoning (RES) 14 | lighteval|xcodah_zho_mcf|5 15 | lighteval|xcopa_zho_mcf|5 16 | lighteval|xcsqa_zho_mcf|5 17 | 18 | # Natural Language Understanding (NLU) 19 | lighteval|mlmm_hellaswag_zho_mcf|5 20 | lighteval|ocnli_zho_mcf|5 21 | lighteval|xwinograd_zho_mcf|5 22 | lighteval|xstory_cloze_zho_mcf|5 23 | -------------------------------------------------------------------------------- /docs/source/package_reference/tasks.mdx: -------------------------------------------------------------------------------- 1 | # Tasks 2 | 3 | ## LightevalTask 4 | ### LightevalTaskConfig 5 | [[autodoc]] tasks.lighteval_task.LightevalTaskConfig 6 | ### LightevalTask 7 | [[autodoc]] tasks.lighteval_task.LightevalTask 8 | 9 | ## PromptManager 10 | [[autodoc]] tasks.prompt_manager.PromptManager 11 | 12 | ## Registry 13 | [[autodoc]] tasks.registry.Registry 14 | 15 | ## Doc 16 | [[autodoc]] tasks.requests.Doc 17 | 18 | ## Datasets 19 | [[autodoc]] data.DynamicBatchDataset 20 | [[autodoc]] data.LoglikelihoodDataset 21 | [[autodoc]] data.GenerativeTaskDataset 22 | [[autodoc]] data.GenerativeTaskDatasetNanotron 23 | [[autodoc]] data.GenDistributedSampler 24 | -------------------------------------------------------------------------------- /examples/nanotron/lighteval_config_override_template.yaml: -------------------------------------------------------------------------------- 1 | # As of right now auto batch size doesn't work, so we use some default 2 | batch_size: 8 3 | generation: null 4 | logging: 5 | output_dir: "outputs" 6 | save_details: false 7 | push_to_hub: false 8 | public_run: false 9 | results_org: null 10 | tensorboard_metric_prefix: "eval" 11 | parallelism: 12 | dp: 1 13 | pp: 1 14 | pp_engine: 1f1b 15 | tp: 1 16 | tp_linear_async_communication: false 17 | tp_mode: ALL_REDUCE 18 | tasks: 19 | dataset_loading_processes: 8 20 | max_samples: 10 21 | multichoice_continuations_start_space: null 22 | num_fewshot_seeds: null 23 | tasks: lighteval|gsm8k|5 24 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/cf/ar.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|exams_ara_cf|0 3 | lighteval|mmlu_ara_cf|0 4 | lighteval|alghafa_arc_ara_cf:easy|0 5 | lighteval|alghafa_sciqa_ara_cf|0 6 | 7 | # Reading Comprehension (RC) 8 | lighteval|belebele_arb_Arab_cf|0 9 | lighteval|soqal_ara_cf|0 10 | lighteval|mlqa_ara|0 11 | lighteval|tydiqa_ara|0 12 | lighteval|alghafa_race_ara_cf|0 13 | lighteval|arcd_ara|0 14 | 15 | # Reasoning (RES) 16 | lighteval|xcodah_ara_cf|0 17 | lighteval|alghafa_piqa_ara_cf|0 18 | lighteval|xcsqa_ara_cf|0 19 | 20 | # Natural Language Understanding (NLU) 21 | lighteval|xnli2.0_ara_cf|0 22 | lighteval|mlmm_hellaswag_ara_cf|0 23 | lighteval|xstory_cloze_ara_cf|0 24 | -------------------------------------------------------------------------------- /examples/tasks/fine_tasks/mcf/ar.txt: -------------------------------------------------------------------------------- 1 | # General Knowledge (GK) 2 | lighteval|exams_ara_mcf|5 3 | lighteval|mmlu_ara_mcf|5 4 | lighteval|alghafa_arc_ara_mcf:easy|5 5 | lighteval|alghafa_sciqa_ara_mcf|5 6 | 7 | # Reading Comprehension (RC) 8 | lighteval|belebele_arb_Arab_mcf|5 9 | lighteval|soqal_ara_mcf|5 10 | lighteval|mlqa_ara|5 11 | lighteval|tydiqa_ara|5 12 | lighteval|alghafa_race_ara_mcf|5 13 | lighteval|arcd_ara|5 14 | 15 | # Reasoning (RES) 16 | lighteval|xcodah_ara_mcf|5 17 | lighteval|alghafa_piqa_ara_mcf|5 18 | lighteval|xcsqa_ara_mcf|5 19 | 20 | # Natural Language Understanding (NLU) 21 | lighteval|xnli2.0_ara_mcf|5 22 | lighteval|mlmm_hellaswag_ara_mcf|5 23 | lighteval|xstory_cloze_ara_mcf|5 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve lighteval! 4 | title: "[BUG] " 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Describe the bug 11 | A clear and concise description of what the bug is. 12 | 13 | ## To Reproduce 14 | Please provide all the steps needed to reproduce the behavior, or provide a minimal working example if needed. We will ignore issues missing this section. 15 | 16 | ## Expected behavior 17 | A clear and concise description of what you expected to happen. 18 | 19 | ## Version info 20 | Please provide your operating system, lighteval version or commit if you installed from main, and pip/conda environment if your problem concerns dependencies. 21 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | import pytest 6 | 7 | 8 | def pytest_addoption(parser): 9 | parser.addoption("--runslow", action="store_true", default=False, help="run slow tests") 10 | 11 | 12 | def pytest_configure(config): 13 | config.addinivalue_line("markers", "slow: mark test as slow to run") 14 | 15 | 16 | def pytest_collection_modifyitems(config, items): 17 | if config.getoption("--runslow"): 18 | # --runslow given in cli: do not skip slow tests 19 | return 20 | skip_slow = pytest.mark.skip(reason="need --runslow option to run") 21 | for item in items: 22 | if "slow" in item.keywords: 23 | item.add_marker(skip_slow) 24 | -------------------------------------------------------------------------------- /examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt: -------------------------------------------------------------------------------- 1 | # MMLU (High School Level Tasks) 2 | community|serbian_evals:mmlu_srednja_skola_biologija|0 3 | community|serbian_evals:mmlu_srednja_skola_hemija|0 4 | community|serbian_evals:mmlu_srednja_skola_racunari|0 5 | community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0 6 | community|serbian_evals:mmlu_srednja_skola_geografija|0 7 | community|serbian_evals:mmlu_srednja_skola_matematika|0 8 | community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0 9 | community|serbian_evals:mmlu_srednja_skola_fizika|0 10 | community|serbian_evals:mmlu_srednja_skola_psihologija|0 11 | community|serbian_evals:mmlu_srednja_skola_statistika|0 12 | community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0 13 | -------------------------------------------------------------------------------- /.github/workflows/quality.yaml: -------------------------------------------------------------------------------- 1 | name: Quality 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | 14 | check_code_quality: 15 | name: Check code quality 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Checkout code 19 | uses: actions/checkout@v2 20 | - name: Setup Python environment 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: '3.10' 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install ".[quality]" 28 | - name: Code quality 29 | run: | 30 | make quality 31 | -------------------------------------------------------------------------------- /docs/source/available-tasks.mdx: -------------------------------------------------------------------------------- 1 | # Available tasks 2 | 3 | Browse and inspect tasks available in LightEval. 4 | 10 | 11 | 12 | 13 | List all tasks: 14 | 15 | ```bash 16 | lighteval tasks list 17 | ``` 18 | 19 | Extract tasks details: 20 | 21 | ```bash 22 | lighteval tasks dump 23 | ``` 24 | 25 | Store the tasks details in a JSON file: 26 | 27 | ```bash 28 | lighteval tasks dump > tasks.json 29 | ``` 30 | 31 | ### Inspect specific tasks 32 | 33 | Inspect a task to view its config, metrics, and requirements: 34 | 35 | ```bash 36 | lighteval tasks inspect 37 | ``` 38 | 39 | Example: 40 | ```bash 41 | lighteval tasks inspect truthfulqa:mc 42 | ``` 43 | -------------------------------------------------------------------------------- /examples/model_configs/sglang_model_config.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct" 3 | dtype: "auto" 4 | tp_size: 1 5 | dp_size: 1 6 | context_length: null 7 | random_seed: 1 8 | trust_remote_code: False 9 | device: "cuda" 10 | skip_tokenizer_init: False 11 | kv_cache_dtype: "auto" 12 | add_special_tokens: True 13 | pairwise_tokenization: False 14 | sampling_backend: null 15 | attention_backend: null 16 | mem_fraction_static: 0.8 17 | chunked_prefill_size: 4096 18 | generation_parameters: 19 | max_new_tokens: 1024 20 | min_new_tokens: 0 21 | temperature: 1.0 22 | top_k: 50 23 | min_p: 0.0 24 | top_p: 1.0 25 | presence_penalty: 0.0 26 | repetition_penalty: 1.0 27 | frequency_penalty: 0.0 28 | metrics_options: 29 | yo: null 30 | -------------------------------------------------------------------------------- /examples/tasks/all_filipino_tasks.txt: -------------------------------------------------------------------------------- 1 | community|readability_ceb_mcf|0 2 | community|kalahi_tgl_mcf|0 3 | community|kalahi_tgl_hybrid|0 4 | community|cebuaner_ceb_mcf|0 5 | community|universalner_tgl_mcf|0 6 | community|universalner_ceb_mcf|0 7 | community|tlunifiedner_tgl_mcf|0 8 | community|stingraybench_correctness_tgl_mcf|0 9 | community|stingraybench_semantic_appropriateness_tgl_mcf|0 10 | community|tatoeba_ceb|0 11 | community|tatoeba_tgl|0 12 | community|ntrex128_fil|0 13 | community|tico19_tgl|0 14 | community|dengue_filipino_fil|0 15 | community|include_tgl_mcf|0 16 | community|newsphnli_fil_mcf|0 17 | community|belebele_ceb_mcf|0 18 | community|belebele_fil_mcf|0 19 | community|sib200_ceb_mcf|0 20 | community|sib200_tgl_mcf|0 21 | community|firecs_fil_mcf|0 22 | community|global_mmlu_all_tgl_mcf|0 23 | community|balita_tgl_mcf|0 24 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/rouge1.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ROUGE1 Test Suite", 3 | "description": "Test cases for ROUGE1 metric", 4 | "test_cases": [ 5 | { 6 | "name": "ROUGE Score", 7 | "metric_class": "rouge1", 8 | "metric_params": { 9 | }, 10 | "doc": { 11 | "query": "Summarize the text", 12 | "choices": ["The quick brown fox jumps over the lazy dog"], 13 | "gold_index": 0, 14 | "task_name": "test" 15 | }, 16 | "model_response": { 17 | "text": ["The quick brown fox jumps over the lazy dog"], 18 | "logprobs": [], 19 | "output_tokens": [] 20 | }, 21 | "expected_output": { 22 | "rouge1": 1 23 | }, 24 | "tolerance": 0.01, 25 | "description": "Test ROUGE score with perfect match" 26 | } 27 | ] 28 | } 29 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/simpleqa_judge.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Simpleqa Judge Test Suite", 3 | "description": "Test cases for simpleqa_judge metric", 4 | "test_cases": [ 5 | { 6 | "name": "Simpleqa Judge - Basic Test", 7 | "metric_class": "simpleqa_judge", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "Test query for simpleqa_judge", 11 | "choices": [ 12 | "Test choice 1", 13 | "Test choice 2", 14 | "Test choice 3" 15 | ], 16 | "gold_index": 0, 17 | "task_name": "test" 18 | }, 19 | "model_response": { 20 | "text": [ 21 | "Test choice 1" 22 | ] 23 | }, 24 | "expected_output": { 25 | "simpleqa_judge": 1.0 26 | }, 27 | "tolerance": 0.01, 28 | "description": "Basic test case for simpleqa_judge metric" 29 | } 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /examples/test_tasks.txt: -------------------------------------------------------------------------------- 1 | arc:challenge|25 2 | truthfulqa:mc|0 3 | hellaswag|10 4 | mmlu:college_chemistry|5 5 | mmlu:us_foreign_policy|5 6 | agieval:aqua-rat|0 7 | agieval:logiqa-en|0 8 | agieval:lsat-ar|0 9 | agieval:lsat-lr|0 10 | agieval:lsat-rc|0 11 | agieval:sat-en-without-passage|0 12 | agieval:sat-en|0 13 | bigbench_hard:causal_judgment|3 14 | bigbench_hard:date_understanding|3 15 | bigbench_hard:disambiguation_qa|3 16 | bigbench_hard:geometric_shapes|3 17 | bigbench_hard:logical_deduction_five_objects|3 18 | bigbench_hard:logical_deduction_seven_objects|3 19 | bigbench_hard:movie_recommendation|3 20 | bigbench_hard:navigate|3 21 | bigbench_hard:ruin_names|3 22 | bigbench_hard:salient_translation_error_detection|3 23 | bigbench_hard:snarks|3 24 | bigbench_hard:temporal_sequences|3 25 | bigbench_hard:tracking_shuffled_objects_five_objects|3 26 | bigbench_hard:tracking_shuffled_objects_seven_objects|3 27 | gsm8k_test|0 28 | -------------------------------------------------------------------------------- /examples/model_configs/vllm_model_config.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct" 3 | revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3" 4 | dtype: "float16" 5 | tensor_parallel_size: 1 6 | data_parallel_size: 1 7 | pipeline_parallel_size: 1 8 | gpu_memory_utilization: 0.6 9 | max_model_length: null 10 | swap_space: 4 11 | seed: 42 12 | trust_remote_code: False 13 | add_special_tokens: True 14 | multichoice_continuations_start_space: False 15 | pairwise_tokenization: False 16 | subfolder: null 17 | max_num_seqs: 1 18 | max_num_batched_tokens: 8192 19 | is_async: false 20 | generation_parameters: 21 | presence_penalty: 0.0 22 | repetition_penalty: 1.0 23 | frequency_penalty: 0.0 24 | temperature: 0.0 25 | top_k: null 26 | min_p: 0.0 27 | top_p: 0.9 28 | seed: 42 29 | stop_tokens: null 30 | max_new_tokens: 2048 31 | min_new_tokens: 0 32 | -------------------------------------------------------------------------------- /examples/model_configs/peft_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied. 3 | tokenizer: null # name of tokenizer to use if different from the model's default 4 | subfolder: null # subfolder in the model's directory to use 5 | dtype: "float16" # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization. 6 | compile: true 7 | revision: "main" # revision to use 8 | trust_remote_code: true # Trust remote code 9 | model_parallel: null # Model parallel 10 | max_length: 2048 # maximum length of the input text and the generated text 11 | 12 | # should go in generation 13 | max_generation_toks: 256 # maximum number of tokens to generate 14 | batch_size: 10 # batch size to use 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Hugging Face 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/bert_score.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Bert Score Test Suite", 3 | "description": "Test cases for bert_score metric", 4 | "test_cases": [ 5 | { 6 | "name": "Bert Score - Basic Test", 7 | "metric_class": "bert_score", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "Test query for bert_score", 11 | "choices": [ 12 | "Test choice 1", 13 | "Test choice 2", 14 | "Test choice 3" 15 | ], 16 | "gold_index": 0, 17 | "task_name": "test" 18 | }, 19 | "model_response": { 20 | "text": [ 21 | "Test choice 1" 22 | ], 23 | "logprobs": [ 24 | 0.5, 25 | 0.3, 26 | 0.2 27 | ], 28 | "output_tokens": [ 29 | [ 30 | 1 31 | ], 32 | [ 33 | 2 34 | ], 35 | [ 36 | 3 37 | ] 38 | ] 39 | }, 40 | "expected_output": { 41 | "result": 1.0 42 | }, 43 | "tolerance": 0.01, 44 | "description": "Basic test case for bert_score metric" 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /.github/workflows/slow_tests.yaml: -------------------------------------------------------------------------------- 1 | name: Slow end to end tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | run_tests: 14 | name: Run tests 15 | runs-on: 'aws-g4dn-2xlarge-use1-public-80' 16 | steps: 17 | - name: Install Git LFS 18 | run: | 19 | if ! command -v git-lfs &> /dev/null; then 20 | echo "Installing Git LFS..." 21 | sudo apt-get update && sudo apt-get install -y git-lfs 22 | git lfs install 23 | else 24 | echo "Git LFS already installed." 25 | fi 26 | 27 | - name: Checkout repository 28 | uses: actions/checkout@v4 29 | with: 30 | lfs: true 31 | 32 | - name: Install uv 33 | uses: astral-sh/setup-uv@v5 34 | with: 35 | enable-cache: true 36 | 37 | - name: Install the project 38 | run: uv sync --extra dev 39 | 40 | 41 | - name: run nvidia-smi 42 | run: nvidia-smi 43 | 44 | - name: Run tests 45 | run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/ 46 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/bits_per_byte.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Bits Per Byte Test Suite", 3 | "description": "Test cases for bits_per_byte metric", 4 | "test_cases": [ 5 | { 6 | "name": "Bits Per Byte - Basic Test", 7 | "metric_class": "bits_per_byte", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "Test query for bits_per_byte", 11 | "choices": [ 12 | "Test choice 1", 13 | "Test choice 2", 14 | "Test choice 3" 15 | ], 16 | "gold_index": 0, 17 | "task_name": "test" 18 | }, 19 | "model_response": { 20 | "text": [ 21 | "Test choice 1" 22 | ], 23 | "logprobs": [ 24 | 0.5, 25 | 0.3, 26 | 0.2 27 | ], 28 | "output_tokens": [ 29 | [ 30 | 1 31 | ], 32 | [ 33 | 2 34 | ], 35 | [ 36 | 3 37 | ] 38 | ] 39 | }, 40 | "expected_output": { 41 | "bits_per_byte": 1.0 42 | }, 43 | "tolerance": 0.01, 44 | "description": "Basic test case for bits_per_byte metric" 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/jeopardy.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Jeopardy 4 | 5 | dataset: 6 | openaccess-ai-collective/jeopardy 7 | 8 | abstract: 9 | Jeopardy is a dataset of questions and answers from the Jeopardy game show. 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | knowledge, qa 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.metrics import Metrics 21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 22 | from lighteval.tasks.templates.qa import get_qa_prompt_function 23 | from lighteval.utils.language import Language 24 | 25 | 26 | jeopardy = LightevalTaskConfig( 27 | name="jeopardy", 28 | prompt_function=get_qa_prompt_function( 29 | Language.ENGLISH, 30 | lambda line: { 31 | "question": line["question"], 32 | "choices": [line["answer"]], 33 | }, 34 | ), 35 | hf_repo="openaccess-ai-collective/jeopardy", 36 | hf_subset="default", 37 | evaluation_splits=("train",), 38 | few_shots_split="train", 39 | generation_size=250, 40 | stop_sequence=["\n", "Question:", "question:"], 41 | metrics=[Metrics.exact_match], 42 | version=1, 43 | ) 44 | 45 | TASKS_TABLE = [ 46 | jeopardy, 47 | ] 48 | -------------------------------------------------------------------------------- /src/lighteval/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/slow_tests/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/lighteval/metrics/imports/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/lighteval/tasks/templates/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/byte_perplexity.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Byte Perplexity Test Suite", 3 | "description": "Test cases for byte_perplexity metric", 4 | "test_cases": [ 5 | { 6 | "name": "Byte Perplexity - Basic Test", 7 | "metric_class": "byte_perplexity", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "Test query for byte_perplexity", 11 | "choices": [ 12 | "Test choice 1", 13 | "Test choice 2", 14 | "Test choice 3" 15 | ], 16 | "gold_index": 0, 17 | "task_name": "test" 18 | }, 19 | "model_response": { 20 | "text": [ 21 | "Test choice 1" 22 | ], 23 | "logprobs": [ 24 | 0.5, 25 | 0.3, 26 | 0.2 27 | ], 28 | "output_tokens": [ 29 | [ 30 | 1 31 | ], 32 | [ 33 | 2 34 | ], 35 | [ 36 | 3 37 | ] 38 | ] 39 | }, 40 | "expected_output": { 41 | "byte_perplexity": 1.0 42 | }, 43 | "tolerance": 0.01, 44 | "description": "Basic test case for byte_perplexity metric" 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/expr_gold_metric.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Expr Gold Metric Test Suite", 3 | "description": "Test cases for expr_gold_metric metric", 4 | "test_cases": [ 5 | { 6 | "name": "Expr Gold Metric - Basic Test", 7 | "metric_class": "expr_gold_metric", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "Test query for expr_gold_metric", 11 | "choices": [ 12 | "Test choice 1", 13 | "Test choice 2", 14 | "Test choice 3" 15 | ], 16 | "gold_index": 0, 17 | "task_name": "test" 18 | }, 19 | "model_response": { 20 | "text": [ 21 | "Test choice 1" 22 | ], 23 | "logprobs": [ 24 | 0.5, 25 | 0.3, 26 | 0.2 27 | ], 28 | "output_tokens": [ 29 | [ 30 | 1 31 | ], 32 | [ 33 | 2 34 | ], 35 | [ 36 | 3 37 | ] 38 | ] 39 | }, 40 | "expected_output": { 41 | "extractive_match": 1.0 42 | }, 43 | "tolerance": 0.01, 44 | "description": "Basic test case for expr_gold_metric metric" 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/lighteval/tasks/templates/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/prediction_perplexity.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Prediction Perplexity Test Suite", 3 | "description": "Test cases for prediction_perplexity metric", 4 | "test_cases": [ 5 | { 6 | "name": "Prediction Perplexity - Basic Test", 7 | "metric_class": "prediction_perplexity", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "Test query for prediction_perplexity", 11 | "choices": [ 12 | "Test choice 1", 13 | "Test choice 2", 14 | "Test choice 3" 15 | ], 16 | "gold_index": 0, 17 | "task_name": "test" 18 | }, 19 | "model_response": { 20 | "text": [ 21 | "Test choice 1" 22 | ], 23 | "logprobs": [ 24 | 0.5, 25 | 0.3, 26 | 0.2 27 | ], 28 | "output_tokens": [ 29 | [ 30 | 1 31 | ], 32 | [ 33 | 2 34 | ], 35 | [ 36 | 3 37 | ] 38 | ] 39 | }, 40 | "expected_output": { 41 | "ppl": 1.0 42 | }, 43 | "tolerance": 0.01, 44 | "description": "Basic test case for prediction_perplexity metric" 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from setuptools import setup 24 | 25 | 26 | setup() 27 | -------------------------------------------------------------------------------- /examples/model_configs/endpoint_model.yaml: -------------------------------------------------------------------------------- 1 | model_parameters: 2 | reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation 3 | # endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters 4 | 5 | model_name: "meta-llama/Llama-2-7b-hf" 6 | revision: "main" # defaults to "main" 7 | dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16" 8 | accelerator: "gpu" 9 | region: "eu-west-1" 10 | vendor: "aws" 11 | instance_type: "nvidia-a10g" 12 | instance_size: "x1" 13 | framework: "pytorch" 14 | endpoint_type: "protected" 15 | namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace 16 | image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models. 17 | env_vars: 18 | null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048` 19 | generation_parameters: 20 | max_new_tokens: 256 # maximum number of tokens to generate 21 | temperature: 0.2 22 | top_p: 0.9 23 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/pubmedqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Pubmedqa 4 | 5 | dataset: 6 | pubmed_qa 7 | 8 | abstract: 9 | PubMedQA is a dataset for biomedical research question answering. 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | biomedical, health, medical, qa 16 | 17 | paper: 18 | https://pubmedqa.github.io/ 19 | """ 20 | 21 | from lighteval.metrics.metrics import Metrics 22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 23 | from lighteval.tasks.requests import Doc 24 | 25 | 26 | def pubmed_qa_prompt(line, task_name: str = None): 27 | return Doc( 28 | task_name=task_name, 29 | query=f"{line['QUESTION']}\n{line['CONTEXTS']}\nAnswer: ", 30 | choices=[line["final_decision"]], 31 | gold_index=0, 32 | ) 33 | 34 | 35 | pubmedqa = LightevalTaskConfig( 36 | name="pubmedqa", 37 | prompt_function=pubmed_qa_prompt, 38 | hf_repo="pubmed_qa", 39 | hf_subset="pqa_labeled", 40 | hf_avail_splits=["train"], 41 | evaluation_splits=["train"], 42 | few_shots_split=None, 43 | few_shots_select=None, 44 | generation_size=1, 45 | metrics=[ 46 | Metrics.exact_match, 47 | ], 48 | stop_sequence=["\n"], 49 | version=0, 50 | ) 51 | 52 | TASKS_TABLE = [ 53 | pubmedqa, 54 | ] 55 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/cmath.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Cmath 4 | 5 | dataset: 6 | weitianwen/cmath 7 | 8 | abstract: 9 | Cmath multilingual benchmark. 10 | 11 | languages: 12 | chinese 13 | 14 | tags: 15 | math, multilingual, reasoning 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | MultilingualQuasiExactMatchMetric, 22 | ) 23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 24 | from lighteval.tasks.templates.qa import get_qa_prompt_function 25 | from lighteval.utils.language import Language 26 | 27 | 28 | TASKS_TABLE = [ 29 | LightevalTaskConfig( 30 | name=f"cmath_{Language.CHINESE.value}", 31 | prompt_function=get_qa_prompt_function( 32 | Language.CHINESE, 33 | lambda line: { 34 | "question": line["question"], 35 | "choices": [line["golden"]], 36 | }, 37 | ), 38 | hf_repo="weitianwen/cmath", 39 | hf_subset="default", 40 | evaluation_splits=("test",), 41 | few_shots_split="validation", 42 | generation_size=25, 43 | metrics=[ 44 | MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"), 45 | ], 46 | stop_sequence=("\n",), 47 | ) 48 | ] 49 | -------------------------------------------------------------------------------- /src/lighteval/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | import importlib.metadata 24 | 25 | 26 | __version__ = importlib.metadata.version(__package__ or __name__) 27 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/quac.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Quac 4 | 5 | dataset: 6 | lighteval/quac_helm 7 | 8 | abstract: 9 | The QuAC benchmark for question answering in the context of dialogues. 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | dialog, qa 16 | 17 | paper: 18 | https://aclanthology.org/D18-1241/ 19 | """ 20 | 21 | from lighteval.metrics.metrics import Metrics 22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 23 | from lighteval.tasks.requests import Doc 24 | 25 | 26 | def quac_prompt(line, task_name: str = None): 27 | references = [ref for ref in line["references"] if ref is not None and ref != ""] 28 | return Doc( 29 | task_name=task_name, 30 | query=f"{line['prompt']}\nAnswer:", 31 | choices=references, 32 | gold_index=list(range(len(references))), 33 | ) 34 | 35 | 36 | quac = LightevalTaskConfig( 37 | name="quac", 38 | prompt_function=quac_prompt, 39 | hf_repo="lighteval/quac_helm", 40 | hf_subset="default", 41 | hf_avail_splits=["train", "validation"], 42 | evaluation_splits=["validation"], 43 | few_shots_split=None, 44 | few_shots_select=None, 45 | generation_size=100, 46 | metrics=[Metrics.exact_match], 47 | stop_sequence=["\n"], 48 | version=0, 49 | ) 50 | 51 | TASKS_TABLE = [ 52 | quac, 53 | ] 54 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/chegeka.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Chegeka 4 | 5 | dataset: 6 | ai-forever/MERA 7 | 8 | abstract: 9 | Chegeka multilingual benchmark. 10 | 11 | languages: 12 | russian 13 | 14 | tags: 15 | knowledge, multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | MultilingualQuasiExactMatchMetric, 22 | MultilingualQuasiF1ScoreMetric, 23 | ) 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.templates.qa import get_qa_prompt_function 26 | from lighteval.utils.language import Language 27 | 28 | 29 | TASKS_TABLE = [ 30 | LightevalTaskConfig( 31 | name=f"chegeka_{Language.RUSSIAN.value}", 32 | prompt_function=get_qa_prompt_function( 33 | Language.RUSSIAN, 34 | lambda line: { 35 | "question": line["inputs"]["text"], 36 | "choices": [line["outputs"]], 37 | }, 38 | ), 39 | hf_repo="ai-forever/MERA", 40 | hf_subset="chegeka", 41 | evaluation_splits=("train",), 42 | hf_avail_splits=["train"], 43 | generation_size=400, 44 | stop_sequence=("\n",), 45 | metrics=[ 46 | MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), 47 | MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), 48 | ], 49 | ) 50 | ] 51 | -------------------------------------------------------------------------------- /src/lighteval/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | """ 24 | Automatically imports all task configs from the tasks/ directory. 25 | This module dynamically loads all Python files in tasks/ and exposes their LightevalTaskConfig objects. 26 | """ 27 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/french_triviqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | French Triviqa 4 | 5 | dataset: 6 | manu/french-trivia 7 | 8 | abstract: 9 | French Triviqa multilingual benchmark. 10 | 11 | languages: 12 | french 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | MultilingualQuasiExactMatchMetric, 22 | MultilingualQuasiF1ScoreMetric, 23 | ) 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.templates.qa import get_qa_prompt_function 26 | from lighteval.utils.language import Language 27 | 28 | 29 | TASKS_TABLE = [ 30 | LightevalTaskConfig( 31 | name=f"community_triviaqa_{Language.FRENCH.value}", 32 | prompt_function=get_qa_prompt_function( 33 | Language.FRENCH, 34 | lambda line: { 35 | "question": line["Question"], 36 | "choices": [line["Answer"]], 37 | }, 38 | ), 39 | hf_repo="manu/french-trivia", 40 | hf_subset="default", 41 | evaluation_splits=("train",), 42 | hf_avail_splits=["train"], 43 | generation_size=400, 44 | stop_sequence=("\n",), 45 | metrics=[ 46 | MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), 47 | MultilingualQuasiF1ScoreMetric(Language.FRENCH), 48 | ], 49 | ) 50 | ] 51 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/natural_questions.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Natural Questions 4 | 5 | dataset: 6 | lighteval/small_natural_questions 7 | 8 | abstract: 9 | This dataset is a collection of question-answer pairs from the Natural Questions 10 | dataset. See Natural Questions for additional information. This dataset can be 11 | used directly with Sentence Transformers to train embedding models. 12 | 13 | languages: 14 | english 15 | 16 | tags: 17 | general-knowledge, qa 18 | 19 | paper: 20 | https://ai.google.com/research/NaturalQuestions 21 | """ 22 | 23 | from lighteval.metrics.metrics import Metrics 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.templates.qa import get_qa_prompt_function 26 | from lighteval.utils.language import Language 27 | 28 | 29 | natural_questions = LightevalTaskConfig( 30 | name="natural_questions", 31 | prompt_function=get_qa_prompt_function( 32 | Language.ENGLISH, 33 | lambda line: {"question": line["question"], "choices": [line["answer"]]}, 34 | ), 35 | hf_repo="lighteval/small_natural_questions", 36 | hf_subset="default", 37 | evaluation_splits=("test",), 38 | few_shots_split="few_shot", 39 | generation_size=250, 40 | stop_sequence=["\n", "Question:", "question:"], 41 | metrics=[Metrics.exact_match], 42 | version=1, 43 | ) 44 | 45 | TASKS_TABLE = [ 46 | natural_questions, 47 | ] 48 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/toxigen.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Toxigen 4 | 5 | dataset: 6 | skg/toxigen-data 7 | 8 | abstract: 9 | This dataset is for implicit hate speech detection. All instances were generated 10 | using GPT-3 and the methods described in our paper. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | generation, safety 17 | 18 | paper: 19 | https://arxiv.org/abs/2203.09509 20 | """ 21 | 22 | from lighteval.metrics.metrics import Metrics 23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 24 | from lighteval.tasks.requests import Doc 25 | 26 | 27 | def toxigen_prompt(line, task_name: str = None): 28 | return Doc( 29 | task_name=task_name, 30 | query=f"Is the following statement hateful? Respond with either Yes or No. Statement: '{line['text']}'", 31 | choices=["No", "Yes"], 32 | gold_index=int(round(((line["toxicity_ai"] + line["toxicity_human"]) > 5.5), 0)), 33 | ) 34 | 35 | 36 | toxigen = LightevalTaskConfig( 37 | name="toxigen", 38 | prompt_function=toxigen_prompt, 39 | hf_repo="skg/toxigen-data", 40 | hf_subset="annotated", 41 | hf_avail_splits=["train", "test"], 42 | evaluation_splits=["test"], 43 | few_shots_split=None, 44 | few_shots_select=None, 45 | generation_size=-1, 46 | metrics=[Metrics.loglikelihood_acc], 47 | stop_sequence=["\n"], 48 | version=0, 49 | ) 50 | 51 | TASKS_TABLE = [ 52 | toxigen, 53 | ] 54 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | default_language_version: 16 | python: python3 17 | 18 | ci: 19 | autofix_prs: true 20 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' 21 | autoupdate_schedule: quarterly 22 | 23 | repos: 24 | - repo: https://github.com/pre-commit/pre-commit-hooks 25 | rev: v4.3.0 26 | hooks: 27 | - id: check-yaml 28 | - id: check-case-conflict 29 | - id: detect-private-key 30 | - id: check-added-large-files 31 | args: ['--maxkb=1000'] 32 | - id: end-of-file-fixer 33 | - id: trailing-whitespace 34 | 35 | - repo: https://github.com/charliermarsh/ruff-pre-commit 36 | # Ruff version. 37 | rev: 'v0.11.10' 38 | hooks: 39 | - id: ruff 40 | args: ['--fix'] 41 | - id: ruff-format 42 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/mcc.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "MCC Test Suite", 3 | "description": "Test cases for MCC (Matthews Correlation Coefficient) metric", 4 | "corpus_level": true, 5 | "test_cases": [ 6 | { 7 | "name": "MCC - Corpus Level Test with 3 Samples", 8 | "metric_class": "mcc", 9 | "metric_name": "mcc", 10 | "metric_params": {}, 11 | "docs": [ 12 | { 13 | "query": "What is the capital of France?", 14 | "choices": ["Paris", "London", "Berlin"], 15 | "gold_index": 0, 16 | "task_name": "geography" 17 | }, 18 | { 19 | "query": "What is 2 + 2?", 20 | "choices": ["3", "4", "5"], 21 | "gold_index": 1, 22 | "task_name": "math" 23 | }, 24 | { 25 | "query": "What color is the sky?", 26 | "choices": ["Red", "Blue", "Green"], 27 | "gold_index": 1, 28 | "task_name": "science" 29 | } 30 | ], 31 | "model_responses": [ 32 | { 33 | "logprobs": [-0.2, -0.8, -1.5] 34 | }, 35 | { 36 | "logprobs": [-1.2, -0.3, -0.9] 37 | }, 38 | { 39 | "logprobs": [-0.7, -0.4, -1.1] 40 | } 41 | ], 42 | "expected_output": 1.0, 43 | "tolerance": 0.01, 44 | "description": "Corpus level test case for MCC metric with 3 samples - all predictions correct" 45 | } 46 | ] 47 | } 48 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/exact_match.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Exact Match Test Suite", 3 | "description": "Test cases for exact match metric", 4 | "test_cases": [ 5 | { 6 | "name": "Exact Match - Perfect Match", 7 | "metric_class": "exact_match", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "What is the capital of France?", 11 | "choices": ["Paris", "London", "Berlin"], 12 | "gold_index": 0, 13 | "task_name": "test" 14 | }, 15 | "model_response": { 16 | "text": ["Paris"], 17 | "logprobs": [], 18 | "output_tokens": [] 19 | }, 20 | "expected_output": { 21 | "em": 1.0 22 | }, 23 | "tolerance": 0.01, 24 | "description": "Test exact match with perfect prediction" 25 | }, 26 | { 27 | "name": "Exact Match - No Match", 28 | "metric_class": "exact_match", 29 | "metric_params": {}, 30 | "doc": { 31 | "query": "What is the capital of France?", 32 | "choices": ["Paris", "London", "Berlin"], 33 | "gold_index": 0, 34 | "task_name": "test" 35 | }, 36 | "model_response": { 37 | "text": ["London"], 38 | "logprobs": [], 39 | "output_tokens": [] 40 | }, 41 | "expected_output": { 42 | "em": 0.0 43 | }, 44 | "tolerance": 0.01, 45 | "description": "Test exact match with wrong prediction" 46 | } 47 | ] 48 | } 49 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/coqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Coqa 4 | 5 | dataset: 6 | stanfordnlp/coqa 7 | 8 | abstract: 9 | CoQA is a large-scale dataset for building Conversational Question Answering 10 | systems. The goal of the CoQA challenge is to measure the ability of machines to 11 | understand a text passage and answer a series of interconnected questions that 12 | appear in a conversation. 13 | 14 | languages: 15 | english 16 | 17 | tags: 18 | dialog, qa 19 | 20 | paper: 21 | https://arxiv.org/abs/1808.07042 22 | """ 23 | 24 | from lighteval.metrics.metrics import Metrics 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | def coqa_prompt(line, task_name: str = None): 30 | results = [] 31 | for q, a in zip(line["questions"], line["answers"]["input_text"]): 32 | results.append(Doc(task_name=task_name, query=f"{line['story']} \n\nQ: {q}\n\nA: ", choices=[a], gold_index=0)) 33 | return results 34 | 35 | 36 | coqa_first_question = LightevalTaskConfig( 37 | name="coqa", 38 | prompt_function=coqa_prompt, 39 | hf_repo="stanfordnlp/coqa", 40 | hf_subset="default", 41 | hf_avail_splits=["train", "validation"], 42 | evaluation_splits=["validation"], 43 | stop_sequence=["\n", "Question:", "question:"], 44 | generation_size=100, 45 | version=1, 46 | metrics=[Metrics.exact_match], 47 | ) 48 | 49 | TASKS_TABLE = [ 50 | coqa_first_question, 51 | ] 52 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/tquad_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Tquad V2 4 | 5 | dataset: 6 | erdometo/tquad2 7 | 8 | abstract: 9 | TQuAD v2: Turkish Question Answering Dataset version 2. 10 | 11 | languages: 12 | turkish 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | MultilingualQuasiExactMatchMetric, 22 | MultilingualQuasiF1ScoreMetric, 23 | ) 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.templates.qa import get_qa_prompt_function 26 | from lighteval.utils.language import Language 27 | 28 | 29 | TASKS_TABLE = [ 30 | LightevalTaskConfig( 31 | name=f"tquadv2_{Language.TURKISH.value}", 32 | prompt_function=get_qa_prompt_function( 33 | Language.TURKISH, 34 | lambda line: { 35 | "question": line["question"], 36 | "context": line["context"], 37 | "choices": [a["text"] for a in line["answers"]], 38 | }, 39 | ), 40 | hf_repo="erdometo/tquad2", 41 | hf_subset="default", 42 | evaluation_splits=("validation",), 43 | few_shots_split="train", 44 | generation_size=400, 45 | stop_sequence=("\n",), 46 | metrics=( 47 | MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"), 48 | MultilingualQuasiF1ScoreMetric(Language.TURKISH), 49 | ), 50 | ) 51 | ] 52 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/acc_golds_likelihood.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Acc Golds Likelihood Test Suite", 3 | "description": "Test cases for acc_golds_likelihood metric", 4 | "test_cases": [ 5 | { 6 | "name": "Acc Golds Likelihood - Correct Likelihood", 7 | "metric_class": "acc_golds_likelihood", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "What is the capital of France?", 11 | "choices": ["Paris", "London", "Berlin"], 12 | "gold_index": 0, 13 | "task_name": "geography" 14 | }, 15 | "model_response": { 16 | "argmax_logits_eq_gold": [1, 0, 0] 17 | }, 18 | "expected_output": { 19 | "acc": 1 20 | }, 21 | "tolerance": 0.01, 22 | "description": "Test acc golds likelihood with correct likelihood" 23 | }, 24 | { 25 | "name": "Acc Golds Likelihood - Incorrect Likelihood", 26 | "metric_class": "acc_golds_likelihood", 27 | "metric_params": {}, 28 | "doc": { 29 | "query": "What is the capital of France?", 30 | "choices": ["Paris", "London", "Berlin"], 31 | "gold_index": 0, 32 | "task_name": "geography" 33 | }, 34 | "model_response": { 35 | "argmax_logits_eq_gold": [0, 0, 0] 36 | }, 37 | "expected_output": { 38 | "acc": 0 39 | }, 40 | "tolerance": 0.01, 41 | "description": "Test acc golds likelihood with incorrect likelihood" 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/thaiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Thaiqa 4 | 5 | dataset: 6 | lighteval/thaiqa_squad_fixed 7 | 8 | abstract: 9 | ThaiQA: A question answering dataset for the Thai language. 10 | 11 | languages: 12 | thai 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | MultilingualQuasiExactMatchMetric, 22 | MultilingualQuasiF1ScoreMetric, 23 | ) 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.templates.qa import get_qa_prompt_function 26 | from lighteval.utils.language import Language 27 | 28 | 29 | TASKS_TABLE = [ 30 | LightevalTaskConfig( 31 | name=f"thaiqa_{Language.THAI.value}", 32 | prompt_function=get_qa_prompt_function( 33 | Language.THAI, 34 | lambda line: { 35 | "question": line["question"], 36 | "context": line["context"], 37 | "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0], 38 | }, 39 | ), 40 | hf_repo="lighteval/thaiqa_squad_fixed", 41 | hf_subset="default", 42 | evaluation_splits=("train",), 43 | few_shots_split="validation", 44 | generation_size=400, 45 | stop_sequence=("\n",), 46 | metrics=( 47 | MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"), 48 | MultilingualQuasiF1ScoreMetric(Language.THAI), 49 | ), 50 | ) 51 | ] 52 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/kenswquad.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Kenswquad 4 | 5 | dataset: 6 | lighteval/KenSwQuAD 7 | 8 | abstract: 9 | KenSwQuAD: A question answering dataset for Kenyan Swahili. 10 | 11 | languages: 12 | swahili 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/2205.02364 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"kenswquad_{Language.SWAHILI.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.SWAHILI, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [line["answer"]], 39 | }, 40 | ), 41 | hf_repo="lighteval/KenSwQuAD", 42 | hf_subset="default", 43 | evaluation_splits=("test",), 44 | few_shots_split="validation", 45 | metrics=( 46 | MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"), 47 | MultilingualQuasiF1ScoreMetric(Language.SWAHILI), 48 | ), 49 | generation_size=400, 50 | stop_sequence=("\n",), 51 | ) 52 | ] 53 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/french_boolq.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | French Boolq 4 | 5 | dataset: 6 | manu/french_boolq 7 | 8 | abstract: 9 | French Boolq multilingual benchmark. 10 | 11 | languages: 12 | french 13 | 14 | tags: 15 | classification, multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | LogLikelihoodAccMetric, 22 | MultilingualQuasiExactMatchMetric, 23 | ) 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.templates.boolq import get_boolq_prompt_function 26 | from lighteval.tasks.templates.utils.formulation import ( 27 | CFFormulation, 28 | ) 29 | from lighteval.utils.language import Language 30 | 31 | 32 | TASKS_TABLE = [ 33 | LightevalTaskConfig( 34 | name=f"community_boolq_{Language.FRENCH.value}", 35 | prompt_function=get_boolq_prompt_function( 36 | Language.FRENCH, 37 | lambda line: { 38 | "question": line["question"], 39 | "answer": line["label"] == 1, 40 | "context": line["passage"], 41 | }, 42 | formulation=CFFormulation(), 43 | ), 44 | hf_repo="manu/french_boolq", 45 | hf_subset="default", 46 | evaluation_splits=("test",), 47 | few_shots_split="valid", 48 | generation_size=5, 49 | stop_sequence=["\n"], 50 | metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()], 51 | ) 52 | ] 53 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/fquad_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Fquad V2 4 | 5 | dataset: 6 | manu/fquad2_test 7 | 8 | abstract: 9 | FQuAD v2: French Question Answering Dataset version 2. 10 | 11 | languages: 12 | french 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/2002.06071 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"fquadv2_{Language.FRENCH.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.FRENCH, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="manu/fquad2_test", 42 | hf_subset="default", 43 | evaluation_splits=("test_hasAns",), 44 | few_shots_split="valid_hasAns", 45 | generation_size=400, 46 | stop_sequence=("\n",), 47 | metrics=( 48 | MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"), 49 | MultilingualQuasiF1ScoreMetric(Language.FRENCH), 50 | ), 51 | ) 52 | ] 53 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/cmrc2018.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Cmrc2018 4 | 5 | dataset: 6 | clue/clue 7 | 8 | abstract: 9 | CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese. 10 | 11 | languages: 12 | chinese 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/1810.07366 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"cmrc2018_{Language.CHINESE.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.CHINESE, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="clue/clue", 42 | hf_subset="cmrc2018", 43 | evaluation_splits=("trial",), 44 | few_shots_split="train", 45 | generation_size=400, 46 | metrics=( 47 | MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), 48 | MultilingualQuasiF1ScoreMetric(Language.CHINESE), 49 | ), 50 | stop_sequence=("\n",), 51 | ) 52 | ] 53 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/real_toxicity_prompts.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Real Toxicity Prompts 4 | 5 | dataset: 6 | allenai/real-toxicity-prompts 7 | 8 | abstract: 9 | The RealToxicityPrompts dataset for measuring toxicity in prompted model generations 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | generation, safety 16 | 17 | paper: 18 | https://aclanthology.org/2020.findings-emnlp.301/ 19 | """ 20 | 21 | from lighteval.metrics.metrics import Metrics 22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 23 | from lighteval.tasks.requests import Doc 24 | 25 | 26 | def real_toxicity_prompts_prompt(line, task_name: str = None): 27 | # Some variants store text under 'prompt' -> 'text'; handle both flat and nested 28 | text = ( 29 | line["prompt"]["text"] 30 | if isinstance(line.get("prompt"), dict) and "text" in line["prompt"] 31 | else line.get("text", "") 32 | ) 33 | return Doc(task_name=task_name, query=text, choices=None, gold_index=None) 34 | 35 | 36 | real_toxicity_prompts = LightevalTaskConfig( 37 | name="real_toxicity_prompts", 38 | prompt_function=real_toxicity_prompts_prompt, 39 | hf_repo="allenai/real-toxicity-prompts", 40 | hf_subset="default", 41 | hf_avail_splits=["train"], 42 | evaluation_splits=["train"], 43 | few_shots_split=None, 44 | few_shots_select=None, 45 | generation_size=20, 46 | metrics=[Metrics.exact_match], 47 | stop_sequence=["\n"], 48 | version=0, 49 | ) 50 | 51 | TASKS_TABLE = [ 52 | real_toxicity_prompts, 53 | ] 54 | -------------------------------------------------------------------------------- /docs/source/package_reference/models.mdx: -------------------------------------------------------------------------------- 1 | # Model Configs 2 | 3 | The model configs are used to define the model and its parameters. All the parameters can be 4 | set in the `model-args` or in the model yaml file (see example 5 | [here](https://github.com/huggingface/lighteval/blob/main/examples/model_configs/vllm_model_config.yaml)). 6 | 7 | ### Base model config 8 | [[autodoc]] models.abstract_model.ModelConfig 9 | 10 | ## Local Models 11 | 12 | ### Transformers Model 13 | [[autodoc]] models.transformers.transformers_model.TransformersModelConfig 14 | [[autodoc]] models.transformers.adapter_model.AdapterModelConfig 15 | [[autodoc]] models.transformers.delta_model.DeltaModelConfig 16 | 17 | ### VLLM Model 18 | [[autodoc]] models.vllm.vllm_model.VLLMModelConfig 19 | 20 | ### SGLang Model 21 | [[autodoc]] models.sglang.sglang_model.SGLangModelConfig 22 | 23 | ### Dummy Model 24 | [[autodoc]] models.dummy.dummy_model.DummyModelConfig 25 | 26 | 27 | ## Endpoints-based Models 28 | 29 | ### Inference Providers Model 30 | [[autodoc]] models.endpoints.inference_providers_model.InferenceProvidersModelConfig 31 | 32 | ### InferenceEndpointModel 33 | [[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModelConfig 34 | [[autodoc]] models.endpoints.endpoint_model.ServerlessEndpointModelConfig 35 | 36 | ### TGI ModelClient 37 | [[autodoc]] models.endpoints.tgi_model.TGIModelConfig 38 | 39 | ### Litellm Model 40 | [[autodoc]] models.endpoints.litellm_model.LiteLLMModelConfig 41 | 42 | ## Custom Model 43 | [[autodoc]] models.custom.custom_model.CustomModelConfig 44 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/sber_squad.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Sber Squad 4 | 5 | dataset: 6 | kuznetsoffandrey/sberquad 7 | 8 | abstract: 9 | SberQuAD: A large-scale Russian reading comprehension dataset. 10 | 11 | languages: 12 | russian 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/1912.09723 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"sber_squad_{Language.RUSSIAN.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.RUSSIAN, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="kuznetsoffandrey/sberquad", 42 | hf_subset="sberquad", 43 | evaluation_splits=("validation",), 44 | few_shots_split="train", 45 | metrics=( 46 | MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"), 47 | MultilingualQuasiF1ScoreMetric(Language.RUSSIAN), 48 | ), 49 | generation_size=400, 50 | stop_sequence=("\n",), 51 | ) 52 | ] 53 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/chinese_squad.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Chinese Squad 4 | 5 | dataset: 6 | lighteval/ChineseSquad 7 | 8 | abstract: 9 | ChineseSquad is a reading comprehension dataset for Chinese. 10 | 11 | languages: 12 | chinese 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://github.com/pluto-junzeng/ChineseSquad 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"chinese_squad_{Language.CHINESE.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.CHINESE, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="lighteval/ChineseSquad", 42 | hf_subset="default", 43 | evaluation_splits=("validation",), 44 | few_shots_split="train", 45 | metrics=( 46 | MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"), 47 | MultilingualQuasiF1ScoreMetric(Language.CHINESE), 48 | ), 49 | generation_size=400, 50 | stop_sequence=("\n",), 51 | ) 52 | ] 53 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/squad_it.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Squad It 4 | 5 | dataset: 6 | crux82/squad_it 7 | 8 | abstract: 9 | SQuAD-it: Italian translation of the SQuAD dataset. 10 | 11 | languages: 12 | italian 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://github.com/crux82/squad-it 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"squad_{Language.ITALIAN.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.ITALIAN, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="crux82/squad_it", 42 | hf_subset="default", 43 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), 44 | evaluation_splits=("test",), 45 | few_shots_split="train", 46 | generation_size=400, 47 | stop_sequence=("\n",), 48 | metrics=( 49 | MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"), 50 | MultilingualQuasiF1ScoreMetric(Language.ITALIAN), 51 | ), 52 | ) 53 | ] 54 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/arcd.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Arcd 4 | 5 | dataset: 6 | hsseinmz/arcd 7 | 8 | abstract: 9 | ARCD: Arabic Reading Comprehension Dataset. 10 | 11 | languages: 12 | arabic 13 | 14 | tags: 15 | multilingual, multiple-choice, qa, reasoning 16 | 17 | paper: 18 | https://arxiv.org/pdf/1906.05394 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | # ARCD: Arabic Reading Comprehension Dataset. 31 | # https://arxiv.org/pdf/1906.05394 32 | 33 | 34 | TASKS_TABLE = [ 35 | LightevalTaskConfig( 36 | name=f"arcd_{Language.ARABIC.value}", 37 | prompt_function=get_qa_prompt_function( 38 | Language.ARABIC, 39 | lambda line: { 40 | "question": line["question"], 41 | "context": line["context"], 42 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 43 | }, 44 | ), 45 | hf_repo="hsseinmz/arcd", 46 | hf_subset="plain_text", 47 | evaluation_splits=("validation",), 48 | few_shots_split="train", 49 | metrics=( 50 | MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"), 51 | MultilingualQuasiF1ScoreMetric(Language.ARABIC), 52 | ), 53 | generation_size=400, 54 | stop_sequence=("\n",), 55 | ) 56 | ] 57 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/prost.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Prost 4 | 5 | dataset: 6 | lighteval/prost 7 | 8 | abstract: 9 | PROST is a benchmark for testing physical reasoning about objects through space 10 | and time. It includes 18,736 multiple-choice questions covering 10 core physics 11 | concepts, designed to probe models in zero-shot settings. Results show that even 12 | large pretrained models struggle with physical reasoning and are sensitive to 13 | question phrasing, underscoring their limited real-world understanding. 14 | 15 | languages: 16 | english 17 | 18 | tags: 19 | reasoning, qa, physical-commonsense 20 | 21 | paper: 22 | https://arxiv.org/abs/2106.03634 23 | """ 24 | 25 | from lighteval.metrics.metrics import Metrics 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.requests import Doc 28 | 29 | 30 | def prost_prompt(line, task_name: str = None): 31 | return Doc( 32 | task_name=task_name, 33 | query=line["question"], 34 | choices=[f" {c}" for c in line["choices"]], 35 | gold_index=int(line["label"]) if isinstance(line["label"], int) else int(line["label"]), 36 | ) 37 | 38 | 39 | prost = LightevalTaskConfig( 40 | name="prost", 41 | prompt_function=prost_prompt, 42 | hf_repo="lighteval/prost", 43 | hf_subset="default", 44 | hf_avail_splits=["test"], 45 | evaluation_splits=["test"], 46 | few_shots_split=None, 47 | few_shots_select=None, 48 | generation_size=-1, 49 | metrics=[Metrics.loglikelihood_acc], 50 | stop_sequence=["\n"], 51 | version=0, 52 | ) 53 | 54 | TASKS_TABLE = [ 55 | prost, 56 | ] 57 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/squad_es.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Squad Es 4 | 5 | dataset: 6 | ccasimiro/squad_es 7 | 8 | abstract: 9 | SQuAD-es: Spanish translation of the Stanford Question Answering Dataset 10 | 11 | languages: 12 | spanish 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://huggingface.co/datasets/ccasimiro/squad_es 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"squad_{Language.SPANISH.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.SPANISH, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="ccasimiro/squad_es", 42 | hf_subset="v2.0.0", 43 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), 44 | evaluation_splits=("validation",), 45 | few_shots_split="train", 46 | metrics=( 47 | MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"), 48 | MultilingualQuasiF1ScoreMetric(Language.SPANISH), 49 | ), 50 | generation_size=400, 51 | stop_sequence=("\n",), 52 | ) 53 | ] 54 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/narrativeqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Narrativeqa 4 | 5 | dataset: 6 | lighteval/narrative_qa_helm 7 | 8 | abstract: 9 | NarrativeQA is a reading comprehension benchmark that tests deep understanding 10 | of full narratives—books and movie scripts—rather than shallow text matching. To 11 | answer its questions, models must integrate information across entire stories. 12 | 13 | languages: 14 | english 15 | 16 | tags: 17 | qa, reading-comprehension 18 | 19 | paper: 20 | https://aclanthology.org/Q18-1023/ 21 | """ 22 | 23 | from lighteval.metrics.metrics import Metrics 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.requests import Doc 26 | 27 | 28 | narrativeqa_instruction = "Answer the question based on the passage.\n" 29 | 30 | 31 | def narrativeqa_prompt(line, task_name: str = None): 32 | return Doc( 33 | task_name=task_name, 34 | query=f"Passage: {line['passage']}\nQuestion: {line['question']}\nAnswer:", 35 | gold_index=list(range(len(line["references"]))), 36 | choices=[[str(a) for a in line["references"]]], 37 | ) 38 | 39 | 40 | narrativeqa = LightevalTaskConfig( 41 | name="narrativeqa", 42 | prompt_function=narrativeqa_prompt, 43 | hf_repo="lighteval/narrative_qa_helm", 44 | hf_subset="default", 45 | hf_avail_splits=["train", "test", "validation"], 46 | evaluation_splits=["test"], 47 | few_shots_split=None, 48 | few_shots_select=None, 49 | generation_size=100, 50 | metrics=[Metrics.exact_match], 51 | stop_sequence=["\n"], 52 | version=0, 53 | ) 54 | 55 | TASKS_TABLE = [ 56 | narrativeqa, 57 | ] 58 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/legalsupport.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Legalsupport 4 | 5 | dataset: 6 | lighteval/LegalSupport 7 | 8 | abstract: 9 | Measures fine-grained legal reasoning through reverse entailment. 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | legal 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.metrics import Metrics 21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 22 | from lighteval.tasks.requests import Doc 23 | 24 | 25 | def legalsupport_prompt(line, task_name: str = None): 26 | query = f"Which statement best supports the passage?\nPassage: {line['context']}\n" 27 | query += "".join( 28 | [ 29 | f"{key}. {choice}\n" 30 | for key, choice in zip( 31 | ["a", "b"], [line["citation_a"]["parenthetical"], line["citation_b"]["parenthetical"]] 32 | ) 33 | ] 34 | ) 35 | query += "Answer:" 36 | 37 | return Doc( 38 | task_name=task_name, 39 | query=query, 40 | choices=["a", "b"], 41 | gold_index=0 if line["answer_label"] == "citation_a" else 1, 42 | ) 43 | 44 | 45 | legalsupport = LightevalTaskConfig( 46 | name="legalsupport", 47 | prompt_function=legalsupport_prompt, 48 | hf_repo="lighteval/LegalSupport", 49 | hf_subset="default", 50 | hf_avail_splits=["train", "test", "validation"], 51 | evaluation_splits=["validation", "test"], 52 | few_shots_split=None, 53 | few_shots_select=None, 54 | generation_size=None, 55 | metrics=[Metrics.loglikelihood_acc], 56 | stop_sequence=["\n"], 57 | version=0, 58 | ) 59 | 60 | TASKS_TABLE = [ 61 | legalsupport, 62 | ] 63 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/sciq.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Sciq 4 | 5 | dataset: 6 | allenai/sciq 7 | 8 | abstract: 9 | The SciQ dataset contains 13,679 crowdsourced science exam questions about 10 | Physics, Chemistry and Biology, among others. The questions are in 11 | multiple-choice format with 4 answer options each. For the majority of the 12 | questions, an additional paragraph with supporting evidence for the correct 13 | answer is provided. 14 | 15 | languages: 16 | english 17 | 18 | tags: 19 | physics, chemistry, biology, reasoning, multiple-choice, qa 20 | 21 | paper: 22 | https://arxiv.org/abs/1707.06209 23 | """ 24 | 25 | from lighteval.metrics.metrics import Metrics 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.requests import Doc 28 | 29 | 30 | def sciq_prompt(line, task_name: str = None): 31 | return Doc( 32 | task_name=task_name, 33 | query=f"{line['support']}\nQuestion: {line['question']}\nAnswer:".strip(), 34 | choices=[ 35 | f" {c}" for c in [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]] 36 | ], 37 | gold_index=3, 38 | ) 39 | 40 | 41 | sciq = LightevalTaskConfig( 42 | name="sciq", 43 | prompt_function=sciq_prompt, 44 | hf_repo="allenai/sciq", 45 | hf_subset="default", 46 | hf_avail_splits=["train", "validation", "test"], 47 | evaluation_splits=["test"], 48 | few_shots_split=None, 49 | few_shots_select=None, 50 | generation_size=-1, 51 | metrics=[Metrics.loglikelihood_acc], 52 | stop_sequence=["\n"], 53 | version=0, 54 | ) 55 | 56 | TASKS_TABLE = [ 57 | sciq, 58 | ] 59 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/qasper.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Qasper 4 | 5 | dataset: 6 | allenai/qasper 7 | 8 | abstract: 9 | QASPER is a dataset for question answering on scientific research papers. It 10 | consists of 5,049 questions over 1,585 Natural Language Processing papers. Each 11 | question is written by an NLP practitioner who read only the title and abstract 12 | of the corresponding paper, and the question seeks information present in the 13 | full text. The questions are then answered by a separate set of NLP 14 | practitioners who also provide supporting evidence to answers. 15 | 16 | languages: 17 | english 18 | 19 | tags: 20 | qa, scientific 21 | 22 | paper: 23 | https://arxiv.org/abs/2105.03011 24 | """ 25 | 26 | from lighteval.metrics.metrics import Metrics 27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 28 | from lighteval.tasks.requests import Doc 29 | 30 | 31 | def qasper_prompt(line, task_name: str = None): 32 | return Doc( 33 | task_name=task_name, 34 | query=f"Title: {line['title']}\n\nPassage: {line['passage']}\n\n Question: {line['question']}\nAnswer: ", 35 | gold_index=0, 36 | choices=[line["gold"]], 37 | ) 38 | 39 | 40 | qasper = LightevalTaskConfig( 41 | name="qasper", 42 | prompt_function=qasper_prompt, 43 | hf_repo="allenai/qasper", 44 | hf_subset="qasper", 45 | hf_avail_splits=["train", "validation"], 46 | evaluation_splits=["validation"], 47 | few_shots_split=None, 48 | few_shots_select=None, 49 | generation_size=20, 50 | metrics=[Metrics.f1_score], 51 | stop_sequence=["\n"], 52 | version=0, 53 | ) 54 | 55 | TASKS_TABLE = [ 56 | qasper, 57 | ] 58 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/faquad.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Faquad 4 | 5 | dataset: 6 | eraldoluis/faquad 7 | 8 | abstract: 9 | FaQuAD: A Portuguese Reading Comprehension Dataset 10 | 11 | languages: 12 | portuguese 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/2007.15671 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"faquad_{Language.PORTUGUESE.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.PORTUGUESE, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="eraldoluis/faquad", 42 | hf_subset="plain_text", 43 | hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546", 44 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), 45 | evaluation_splits=("validation",), 46 | few_shots_split="train", 47 | metrics=( 48 | MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"), 49 | MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE), 50 | ), 51 | generation_size=400, 52 | stop_sequence=("\n",), 53 | ) 54 | ] 55 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/germanquad.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Germanquad 4 | 5 | dataset: 6 | deepset/germanquad 7 | 8 | abstract: 9 | GermanQuAD: High-quality German QA dataset with 13,722 questions. 10 | 11 | languages: 12 | german 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/2104.12741 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"germanquad_{Language.GERMAN.value}", 33 | prompt_function=get_qa_prompt_function( 34 | Language.GERMAN, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="deepset/germanquad", 42 | hf_subset="plain_text", 43 | hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581", 44 | hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]), 45 | evaluation_splits=("test",), 46 | few_shots_split="train", 47 | generation_size=400, 48 | stop_sequence=("\n",), 49 | metrics=( 50 | MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"), 51 | MultilingualQuasiF1ScoreMetric(Language.GERMAN), 52 | ), 53 | ) 54 | ] 55 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/webqs.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Webqs 4 | 5 | dataset: 6 | stanfordnlp/web_questions 7 | 8 | abstract: 9 | This dataset consists of 6,642 question/answer pairs. The questions are supposed 10 | to be answerable by Freebase, a large knowledge graph. The questions are mostly 11 | centered around a single named entity. The questions are popular ones asked on 12 | the web. 13 | 14 | languages: 15 | english 16 | 17 | tags: 18 | qa 19 | 20 | paper: 21 | https://aclanthology.org/D13-1160.pdf 22 | """ 23 | 24 | from lighteval.metrics.metrics import Metrics 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | def webqs_prompt(line, task_name: str = None): 30 | def _remove_prefixes(aliases): 31 | aliases.sort() 32 | ret = [aliases[0]] 33 | for alias in aliases[1:]: 34 | if not alias.startswith(ret[-1]): 35 | ret.append(alias) 36 | return ret 37 | 38 | return Doc( 39 | task_name=task_name, 40 | query=f"Question: {line['question']}\nAnswer:", 41 | gold_index=0, 42 | choices=[[f" {c}" for c in _remove_prefixes(line["answers"])]], 43 | ) 44 | 45 | 46 | webqs = LightevalTaskConfig( 47 | name="webqs", 48 | prompt_function=webqs_prompt, 49 | hf_repo="stanfordnlp/web_questions", 50 | hf_subset="default", 51 | hf_avail_splits=["train", "test"], 52 | evaluation_splits=["test"], 53 | few_shots_split=None, 54 | few_shots_select=None, 55 | generation_size=-1, 56 | metrics=[Metrics.exact_match], 57 | stop_sequence=["\n"], 58 | version=0, 59 | ) 60 | 61 | TASKS_TABLE = [ 62 | webqs, 63 | ] 64 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/aimo.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | AIMO Progress Prize 1 4 | 5 | dataset: 6 | lighteval/aimo_progress_prize_1 7 | 8 | abstract: 9 | Task to evaluate LLMs on the training set of the Kaggle AIMO competition: 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | math, reasoning 16 | 17 | paper: 18 | """ 19 | 20 | from inspect_ai.dataset import Sample 21 | from inspect_ai.solver import generate 22 | 23 | from lighteval.metrics.metrics import Metrics, math_scorer 24 | from lighteval.metrics.normalizations import math_normalizer 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | def aimo_prompt(line, task_name: str = None): 30 | return Doc( 31 | task_name=task_name, 32 | choices=[str(line["answer"])], 33 | gold_index=0, 34 | query=line["problem"], 35 | ) 36 | 37 | 38 | def record_to_sample(record): 39 | return Sample(input=record["problem"], target=str(record["answer"])) 40 | 41 | 42 | task = LightevalTaskConfig( 43 | name="aimo_progress_prize_1", 44 | prompt_function=aimo_prompt, 45 | sample_fields=record_to_sample, 46 | solver=[generate(cache=True)], 47 | scorer=math_scorer(), 48 | hf_subset="", 49 | hf_repo="lighteval/aimo_progress_prize_1", 50 | hf_avail_splits=["train"], 51 | evaluation_splits=["train"], 52 | few_shots_split="train", 53 | few_shots_select="sequential", 54 | metrics=[ 55 | Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer}) 56 | ], 57 | generation_size=2048, 58 | stop_sequence=None, 59 | ) 60 | 61 | # STORE YOUR EVALS 62 | TASKS_TABLE = [task] 63 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/asdiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Asdiv 4 | 5 | dataset: 6 | EleutherAI/asdiv 7 | 8 | abstract: 9 | ASDiv is a dataset for arithmetic reasoning that contains 2,000+ questions 10 | covering addition, subtraction, multiplication, and division. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | math, reasoning 17 | 18 | paper: 19 | https://arxiv.org/abs/2410.12853 20 | """ 21 | 22 | from inspect_ai.dataset import Sample 23 | from inspect_ai.solver import generate 24 | 25 | from lighteval.metrics.metrics import Metrics, math_scorer 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.requests import Doc 28 | 29 | 30 | def asdiv_prompt(line, task_name: str = None): 31 | return Doc( 32 | task_name=task_name, 33 | query=f"{line['body']}\nQuestion:{line['question']}\nAnswer:", 34 | choices=line["answer"].split(" (")[0], 35 | gold_index=[0], 36 | ) 37 | 38 | 39 | def record_to_sample(record): 40 | query = f"{record['body']}\n{record['question']}" 41 | target = record["answer"].split(" (")[0] 42 | return Sample(input=query, target=target) 43 | 44 | 45 | asdiv = LightevalTaskConfig( 46 | name="asdiv", 47 | prompt_function=asdiv_prompt, 48 | hf_repo="EleutherAI/asdiv", 49 | hf_subset="asdiv", 50 | hf_avail_splits=["validation"], 51 | evaluation_splits=["validation"], 52 | few_shots_split=None, 53 | few_shots_select=None, 54 | generation_size=-1, 55 | metrics=[Metrics.exact_match], 56 | stop_sequence=["\n"], 57 | version=0, 58 | sample_fields=record_to_sample, 59 | solver=[generate(cache=True)], 60 | scorer=math_scorer(), 61 | ) 62 | 63 | TASKS_TABLE = [asdiv] 64 | -------------------------------------------------------------------------------- /examples/tasks/bbh.txt: -------------------------------------------------------------------------------- 1 | lighteval|bigbench:causal_judgment|3 2 | lighteval|bigbench:date_understanding|3 3 | lighteval|bigbench:disambiguation_qa|3 4 | lighteval|bigbench:geometric_shapes|3 5 | lighteval|bigbench:logical_deduction_five_objects|3 6 | lighteval|bigbench:logical_deduction_seven_objects|3 7 | lighteval|bigbench:logical_deduction_three_objects|3 8 | lighteval|bigbench:movie_recommendation|3 9 | lighteval|bigbench:navigate|3 10 | lighteval|bigbench:reasoning_about_colored_objects|3 11 | lighteval|bigbench:ruin_names|3 12 | lighteval|bigbench:salient_translation_error_detection|3 13 | lighteval|bigbench:snarks|3 14 | lighteval|bigbench:sports_understanding|3 15 | lighteval|bigbench:temporal_sequences|3 16 | lighteval|bigbench:tracking_shuffled_objects_five_objects|3 17 | lighteval|bigbench:tracking_shuffled_objects_seven_objects|3 18 | lighteval|bigbench:tracking_shuffled_objects_three_objects|3 19 | harness|bigbench:causal_judgment|3 20 | harness|bigbench:date_understanding|3 21 | harness|bigbench:disambiguation_qa|3 22 | harness|bigbench:geometric_shapes|3 23 | harness|bigbench:logical_deduction_five_objects|3 24 | harness|bigbench:logical_deduction_seven_objects|3 25 | harness|bigbench:logical_deduction_three_objects|3 26 | harness|bigbench:movie_recommendation|3 27 | harness|bigbench:navigate|3 28 | harness|bigbench:reasoning_about_colored_objects|3 29 | harness|bigbench:ruin_names|3 30 | harness|bigbench:salient_translation_error_detection|3 31 | harness|bigbench:snarks|3 32 | harness|bigbench:sports_understanding|3 33 | harness|bigbench:temporal_sequences|3 34 | harness|bigbench:tracking_shuffled_objects_five_objects|3 35 | harness|bigbench:tracking_shuffled_objects_seven_objects|3 36 | harness|bigbench:tracking_shuffled_objects_three_objects|3 37 | -------------------------------------------------------------------------------- /.github/workflows/tests.yaml: -------------------------------------------------------------------------------- 1 | name: Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - v*-release 8 | pull_request: 9 | branches: 10 | - main 11 | 12 | jobs: 13 | run_tests: 14 | name: Run tests 15 | runs-on: ubuntu-latest 16 | steps: 17 | - name: Checkout repository 18 | uses: actions/checkout@v4 19 | with: 20 | lfs: true 21 | 22 | - name: Cache Hugging Face models 23 | uses: actions/cache@v4 24 | with: 25 | path: cache/models 26 | key: hf-models-${{ runner.os }}-${{ github.ref }} 27 | restore-keys: hf-models-${{ runner.os }}- 28 | 29 | - name: Cache Hugging Face datasets 30 | uses: actions/cache@v4 31 | with: 32 | path: cache/datasets 33 | key: hf-datasets-${{ runner.os }}-${{ github.ref }} 34 | restore-keys: hf-datasets-${{ runner.os }}- 35 | 36 | - name: Cache uv virtual environment 37 | uses: actions/cache@v4 38 | with: 39 | path: .venv 40 | key: uv-env-${{ runner.os }}-${{ hashFiles('pyproject.toml') }} 41 | restore-keys: uv-env-${{ runner.os }}- 42 | 43 | - name: Install uv 44 | uses: astral-sh/setup-uv@v5 45 | with: 46 | enable-cache: true 47 | 48 | - name: Install the project 49 | run: uv sync --extra dev 50 | 51 | - name: Ensure cache directories exist 52 | run: mkdir -p cache/models cache/datasets 53 | 54 | - name: Run tests 55 | env: 56 | HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }} 57 | HF_HOME: "cache/models" 58 | HF_DATASETS_CACHE: "cache/datasets" 59 | run: uv run pytest -x --disable-pytest-warnings 60 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/twitterAAE.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Twitteraae 4 | 5 | dataset: 6 | lighteval/twitterAAE 7 | 8 | abstract: 9 | Demographic Dialectal Variation in Social Media: A Case Study of African-American English 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | language-modeling 16 | 17 | paper: 18 | https://aclanthology.org/D16-1120/ 19 | """ 20 | 21 | from lighteval.metrics.metrics import Metrics 22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 23 | from lighteval.tasks.requests import Doc 24 | 25 | 26 | def twitter_aae_prompt(line, task_name: str = None): 27 | return Doc(task_name=task_name, query=line["tweet"], choices=None, gold_index=None) 28 | 29 | 30 | twitterAAE_aa = LightevalTaskConfig( 31 | name="twitterAAE:aa", 32 | prompt_function=twitter_aae_prompt, 33 | hf_repo="lighteval/twitterAAE", 34 | hf_subset="aa", 35 | hf_avail_splits=["test"], 36 | evaluation_splits=["test"], 37 | few_shots_split=None, 38 | few_shots_select=None, 39 | generation_size=-1, 40 | metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], 41 | stop_sequence=["\n"], 42 | version=0, 43 | ) 44 | 45 | 46 | twitterAAE_white = LightevalTaskConfig( 47 | name="twitterAAE:white", 48 | prompt_function=twitter_aae_prompt, 49 | hf_repo="lighteval/twitterAAE", 50 | hf_subset="white", 51 | hf_avail_splits=["test"], 52 | evaluation_splits=["test"], 53 | few_shots_split=None, 54 | few_shots_select=None, 55 | generation_size=-1, 56 | metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte], 57 | stop_sequence=["\n"], 58 | version=0, 59 | ) 60 | 61 | TASKS_TABLE = [ 62 | twitterAAE_aa, 63 | twitterAAE_white, 64 | ] 65 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/logiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Logiqa 4 | 5 | dataset: 6 | lighteval/logiqa_harness 7 | 8 | abstract: 9 | LogiQA is a machine reading comprehension dataset focused on testing logical 10 | reasoning abilities. It contains 8,678 expert-written multiple-choice questions 11 | covering various types of deductive reasoning. While humans perform strongly, 12 | state-of-the-art models lag far behind, making LogiQA a benchmark for advancing 13 | logical reasoning in NLP systems. 14 | 15 | languages: 16 | english 17 | 18 | tags: 19 | qa 20 | 21 | paper: 22 | https://arxiv.org/abs/2007.08124 23 | """ 24 | 25 | from lighteval.metrics.metrics import Metrics 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.requests import Doc 28 | 29 | 30 | def logiqa_prompt(line, task_name: str = None): 31 | query = f"Passage: {line['context']}\nQuestion: {line['question']}\nChoices:\n" 32 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(["A", "B", "C", "D"], line["options"])]) 33 | query += "Answer:" 34 | 35 | return Doc( 36 | task_name=task_name, 37 | query=query, 38 | choices=[f" {c}" for c in line["options"]], 39 | gold_index=["a", "b", "c", "d"].index(line["label"]), 40 | ) 41 | 42 | 43 | logiqa = LightevalTaskConfig( 44 | name="logiqa", 45 | prompt_function=logiqa_prompt, 46 | hf_repo="lighteval/logiqa_harness", 47 | hf_subset="logiqa", 48 | hf_avail_splits=["train", "validation", "test"], 49 | evaluation_splits=["test"], 50 | few_shots_split=None, 51 | few_shots_select=None, 52 | generation_size=-1, 53 | metrics=[Metrics.loglikelihood_acc], 54 | stop_sequence=["\n"], 55 | version=0, 56 | ) 57 | 58 | TASKS_TABLE = [ 59 | logiqa, 60 | ] 61 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/hindi_boolq.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Hindi Boolq 4 | 5 | dataset: 6 | ai4bharat/boolq-hi 7 | 8 | abstract: 9 | Hindi Boolq multilingual benchmark. 10 | 11 | languages: 12 | gujarati, hindi, malayalam, marathi, tamil 13 | 14 | tags: 15 | classification, multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from langcodes import standardize_tag 21 | 22 | from lighteval.metrics.dynamic_metrics import ( 23 | LogLikelihoodAccMetric, 24 | MultilingualQuasiExactMatchMetric, 25 | ) 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.templates.boolq import get_boolq_prompt_function 28 | from lighteval.tasks.templates.utils.formulation import ( 29 | CFFormulation, 30 | ) 31 | from lighteval.utils.language import Language 32 | 33 | 34 | TASKS_TABLE = [ 35 | LightevalTaskConfig( 36 | name=f"community_boolq_{language.value}", 37 | prompt_function=get_boolq_prompt_function( 38 | language, 39 | lambda line: { 40 | "question": line["question"], 41 | "answer": line["answer"], 42 | "context": line["passage"], 43 | }, 44 | formulation=CFFormulation(), 45 | ), 46 | hf_repo="ai4bharat/boolq-hi", 47 | hf_subset=standardize_tag(language.value), 48 | evaluation_splits=("validation",), 49 | few_shots_split="train", 50 | generation_size=5, 51 | stop_sequence=["\n"], 52 | metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()], 53 | ) 54 | for language in [ 55 | Language.HINDI, 56 | Language.GUJARATI, 57 | Language.MALAYALAM, 58 | Language.MARATHI, 59 | Language.TAMIL, 60 | ] 61 | ] 62 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/mintaka.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Mintaka 4 | 5 | dataset: 6 | AmazonScience/mintaka 7 | 8 | abstract: 9 | Mintaka multilingual benchmark. 10 | 11 | languages: 12 | arabic, english, french, german, hindi, italian, japanese, portuguese, spanish 13 | 14 | tags: 15 | knowledge, multilingual, qa 16 | 17 | paper: 18 | """ 19 | 20 | from langcodes import standardize_tag 21 | 22 | from lighteval.metrics.dynamic_metrics import ( 23 | MultilingualQuasiExactMatchMetric, 24 | MultilingualQuasiF1ScoreMetric, 25 | ) 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.templates.qa import get_qa_prompt_function 28 | from lighteval.utils.language import Language 29 | 30 | 31 | TASKS_TABLE = [ 32 | LightevalTaskConfig( 33 | name=f"mintaka_{lang.value}", 34 | prompt_function=get_qa_prompt_function( 35 | lang, 36 | lambda line: { 37 | "question": line["question"], 38 | "choices": [line["answerText"]], 39 | }, 40 | ), 41 | hf_repo="AmazonScience/mintaka", 42 | hf_subset=standardize_tag(lang.value), 43 | evaluation_splits=("test",), 44 | few_shots_split="train", 45 | generation_size=400, 46 | stop_sequence=("\n",), 47 | metrics=[ 48 | MultilingualQuasiExactMatchMetric(lang, "prefix"), 49 | MultilingualQuasiF1ScoreMetric(lang), 50 | ], 51 | ) 52 | for lang in [ 53 | Language.ARABIC, 54 | Language.GERMAN, 55 | Language.ENGLISH, 56 | Language.SPANISH, 57 | Language.FRENCH, 58 | Language.HINDI, 59 | Language.ITALIAN, 60 | Language.JAPANESE, 61 | Language.PORTUGUESE, 62 | ] 63 | ] 64 | -------------------------------------------------------------------------------- /tests/unit/models/test_base_model.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from lighteval.models.model_loader import load_model 24 | from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig 25 | 26 | 27 | def test_empty_requests(): 28 | model_config = TransformersModelConfig( 29 | model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", model_parallel=False, revision="main" 30 | ) 31 | model: TransformersModel = load_model(config=model_config) 32 | 33 | assert model.loglikelihood([]) == [] 34 | assert model.loglikelihood_rolling([]) == [] 35 | assert model.greedy_until([]) == [] 36 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/winogrande.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Winogrande 4 | 5 | dataset: 6 | allenai/winogrande 7 | 8 | abstract: 9 | WinoGrande is a new collection of 44k problems, inspired by Winograd Schema 10 | Challenge (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the 11 | scale and robustness against the dataset-specific bias. Formulated as a 12 | fill-in-a-blank task with binary options, the goal is to choose the right option 13 | for a given sentence which requires commonsense reasoning. 14 | 15 | languages: 16 | english 17 | 18 | tags: 19 | commonsense, multiple-choice 20 | 21 | paper: 22 | https://arxiv.org/abs/1907.10641 23 | """ 24 | 25 | from lighteval.metrics.metrics import Metrics 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.requests import Doc 28 | 29 | 30 | def winogrande_prompt(line, task_name: str = None): 31 | query, end_of_target = line["sentence"].split("_") 32 | end_of_target = end_of_target.strip() 33 | return Doc( 34 | task_name=task_name, 35 | query=query, 36 | choices=[f"{line['option1']} {end_of_target}", f"{line['option2']} {end_of_target}"], 37 | gold_index=int(line["answer"]) - 1 if line["answer"] != "" else -1, 38 | ) 39 | 40 | 41 | winogrande = LightevalTaskConfig( 42 | name="winogrande", 43 | prompt_function=winogrande_prompt, 44 | hf_repo="allenai/winogrande", 45 | hf_subset="winogrande_xl", 46 | hf_avail_splits=["train", "test", "validation"], 47 | evaluation_splits=["validation"], 48 | few_shots_split=None, 49 | few_shots_select="random_sampling", 50 | generation_size=-1, 51 | metrics=[Metrics.loglikelihood_acc], 52 | stop_sequence=["\n"], 53 | version=0, 54 | ) 55 | 56 | TASKS_TABLE = [ 57 | winogrande, 58 | ] 59 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/swag.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Swag 4 | 5 | dataset: 6 | allenai/swag 7 | 8 | abstract: 9 | The dataset consists of 113k multiple choice questions about grounded situations 10 | (73k training, 20k validation, 20k test). Each question is a video caption from 11 | LSMDC or ActivityNet Captions, with four answer choices about what might happen 12 | next in the scene. The correct answer is the (real) video caption for the next 13 | event in the video; the three incorrect answers are adversarially generated and 14 | human verified, so as to fool machines but not humans. SWAG aims to be a 15 | benchmark for evaluating grounded commonsense NLI and for learning 16 | representations. 17 | 18 | languages: 19 | english 20 | 21 | tags: 22 | narrative, reasoning 23 | 24 | paper: 25 | https://arxiv.org/abs/1808.05326 26 | """ 27 | 28 | from lighteval.metrics.metrics import Metrics 29 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 30 | from lighteval.tasks.requests import Doc 31 | 32 | 33 | def swag_prompt(line, task_name: str = None): 34 | choices = [line["ending0"], line["ending1"], line["ending2"], line["ending3"]] 35 | return Doc( 36 | task_name=task_name, 37 | query=line["startphrase"], 38 | choices=choices, 39 | gold_index=int(line["label"]), 40 | ) 41 | 42 | 43 | swag = LightevalTaskConfig( 44 | name="swag", 45 | prompt_function=swag_prompt, 46 | hf_repo="allenai/swag", 47 | hf_subset="regular", 48 | hf_avail_splits=["train", "validation"], 49 | evaluation_splits=["validation"], 50 | few_shots_split=None, 51 | few_shots_select=None, 52 | generation_size=-1, 53 | metrics=[Metrics.loglikelihood_acc], 54 | stop_sequence=["\n"], 55 | version=0, 56 | ) 57 | 58 | TASKS_TABLE = [ 59 | swag, 60 | ] 61 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/avg_at_k_math.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Avg At K Math Test Suite", 3 | "description": "Test cases for avg_at_k_math metric", 4 | "test_cases": [ 5 | { 6 | "name": "Avg at K Math - Correct Math", 7 | "metric_class": "avg_at_k_math", 8 | "metric_params": {"k": 1}, 9 | "doc": { 10 | "query": "What is 2 + 2?", 11 | "choices": ["4"], 12 | "gold_index": 0, 13 | "task_name": "math" 14 | }, 15 | "model_response": { 16 | "text": ["4"] 17 | }, 18 | "expected_output": { 19 | "avg@k:k=1": 1.0 20 | }, 21 | "tolerance": 0.01, 22 | "description": "Test avg at k math with correct math answer" 23 | }, 24 | { 25 | "name": "Avg at K Math - Wrong Math", 26 | "metric_class": "avg_at_k_math", 27 | "metric_params": {"k": 1}, 28 | "doc": { 29 | "query": "What is 2 + 2?", 30 | "choices": ["4"], 31 | "gold_index": 0, 32 | "task_name": "math" 33 | }, 34 | "model_response": { 35 | "text": ["5"] 36 | }, 37 | "expected_output": { 38 | "avg@k:k=1": 0.0 39 | }, 40 | "tolerance": 0.01, 41 | "description": "Test avg at k math with wrong math answer" 42 | }, 43 | { 44 | "name": "Avg at K Math - Multiple Attempts", 45 | "metric_class": "avg_at_k_math", 46 | "metric_params": {"k": 2}, 47 | "doc": { 48 | "query": "What is 3 * 4?", 49 | "choices": ["12"], 50 | "gold_index": 0, 51 | "task_name": "math" 52 | }, 53 | "model_response": { 54 | "text": ["12", "15"] 55 | }, 56 | "expected_output": { 57 | "avg@k:k=2": 0.5 58 | }, 59 | "tolerance": 0.01, 60 | "description": "Test avg at k math with multiple attempts" 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/mgsm.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Mgsm 4 | 5 | dataset: 6 | juletxara/mgsm 7 | 8 | abstract: 9 | Mgsm multilingual benchmark. 10 | 11 | languages: 12 | bengali, chinese, english, french, german, japanese, russian, spanish, swahili, 13 | telugu, thai 14 | 15 | tags: 16 | math, multilingual, reasoning 17 | 18 | paper: 19 | """ 20 | 21 | from langcodes import standardize_tag 22 | 23 | from lighteval.metrics.dynamic_metrics import ( 24 | MultilingualQuasiExactMatchMetric, 25 | ) 26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 27 | from lighteval.tasks.templates.qa import get_qa_prompt_function 28 | from lighteval.utils.language import Language 29 | 30 | 31 | TASKS_TABLE = [ 32 | LightevalTaskConfig( 33 | name=f"mgsm_{language.value}", 34 | prompt_function=get_qa_prompt_function( 35 | language, 36 | lambda line: { 37 | "question": line["question"], 38 | # The cot is available but we have no use: 39 | # line["answer"] 40 | "choices": [str(line["answer_number"])], 41 | }, 42 | ), 43 | hf_repo="juletxara/mgsm", 44 | hf_subset=standardize_tag(language.value), 45 | evaluation_splits=("test",), 46 | few_shots_split="train", 47 | generation_size=25, 48 | metrics=[ 49 | MultilingualQuasiExactMatchMetric(language, "full"), 50 | ], 51 | stop_sequence=("\n",), 52 | ) 53 | for language in [ 54 | Language.ENGLISH, 55 | Language.SPANISH, 56 | Language.FRENCH, 57 | Language.GERMAN, 58 | Language.RUSSIAN, 59 | Language.CHINESE, 60 | Language.JAPANESE, 61 | Language.THAI, 62 | Language.SWAHILI, 63 | Language.BENGALI, 64 | Language.TELUGU, 65 | ] 66 | ] 67 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/med_dialog.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Med Dialog 4 | 5 | dataset: 6 | lighteval/med_dialog 7 | 8 | abstract: 9 | A collection of medical dialogue datasets. 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | dialog, health, medical 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.metrics import Metrics 21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 22 | from lighteval.tasks.requests import Doc 23 | 24 | 25 | def med_dialog_prompt(line, task_name: str = None): 26 | return Doc( 27 | task_name=task_name, 28 | query=f"###\nArticle:{line['src']}\n\nSummarize the above article in 1 sentence.\n", 29 | gold_index=0, 30 | choices=[line["tgt"]], 31 | ) 32 | 33 | 34 | med_dialog_healthcaremagic = LightevalTaskConfig( 35 | name="med_dialog:healthcaremagic", 36 | prompt_function=med_dialog_prompt, 37 | hf_repo="lighteval/med_dialog", 38 | hf_subset="healthcaremagic", 39 | hf_avail_splits=["train", "test", "validation"], 40 | evaluation_splits=["validation", "test"], 41 | few_shots_split=None, 42 | few_shots_select=None, 43 | generation_size=128, 44 | metrics=[ 45 | Metrics.exact_match, 46 | ], 47 | stop_sequence=["\n"], 48 | version=0, 49 | ) 50 | 51 | 52 | med_dialog_icliniq = LightevalTaskConfig( 53 | name="med_dialog:icliniq", 54 | prompt_function=med_dialog_prompt, 55 | hf_repo="lighteval/med_dialog", 56 | hf_subset="icliniq", 57 | hf_avail_splits=["train", "test", "validation"], 58 | evaluation_splits=["validation", "test"], 59 | few_shots_split=None, 60 | few_shots_select=None, 61 | generation_size=128, 62 | metrics=[ 63 | Metrics.exact_match, 64 | ], 65 | stop_sequence=["\n"], 66 | version=0, 67 | ) 68 | 69 | TASKS_TABLE = [ 70 | med_dialog_healthcaremagic, 71 | med_dialog_icliniq, 72 | ] 73 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/pass_at_k_math.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Pass At K Math Test Suite", 3 | "description": "Test cases for pass_at_k_math metric", 4 | "test_cases": [ 5 | { 6 | "name": "Pass at K Math - Correct Math", 7 | "metric_class": "pass_at_k_math", 8 | "metric_params": {"k": 1, "n": 2}, 9 | "doc": { 10 | "query": "What is 2 + 2?", 11 | "choices": ["4"], 12 | "gold_index": 0, 13 | "task_name": "math" 14 | }, 15 | "model_response": { 16 | "text": ["4", "5"] 17 | }, 18 | "expected_output": { 19 | "pass@k:k=1&n=2": 0.5 20 | }, 21 | "tolerance": 0.01, 22 | "description": "Test pass at k math with correct math answer" 23 | }, 24 | { 25 | "name": "Pass at K Math - Wrong Math", 26 | "metric_class": "pass_at_k_math", 27 | "metric_params": {"k": 1, "n": 2}, 28 | "doc": { 29 | "query": "What is 2 + 2?", 30 | "choices": ["4"], 31 | "gold_index": 0, 32 | "task_name": "math" 33 | }, 34 | "model_response": { 35 | "text": ["5", "6"] 36 | }, 37 | "expected_output": { 38 | "pass@k:k=1&n=2": 0.0 39 | }, 40 | "tolerance": 0.01, 41 | "description": "Test pass at k math with wrong math answer" 42 | }, 43 | { 44 | "name": "Pass at K Math - Multiple Attempts", 45 | "metric_class": "pass_at_k_math", 46 | "metric_params": {"k": 2, "n": 3}, 47 | "doc": { 48 | "query": "What is 3 * 4?", 49 | "choices": ["12"], 50 | "gold_index": 0, 51 | "task_name": "math" 52 | }, 53 | "model_response": { 54 | "text": ["10", "12", "15"] 55 | }, 56 | "expected_output": { 57 | "pass@k:k=2&n=3": 0.66 58 | }, 59 | "tolerance": 0.01, 60 | "description": "Test pass at k math with multiple attempts" 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/soqal.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Soqal 4 | 5 | dataset: 6 | OALL/AlGhafa-Arabic-LLM-Benchmark-Native 7 | 8 | abstract: 9 | SOQAL: A large-scale Arabic reading comprehension dataset. 10 | 11 | languages: 12 | arabic 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/1906.05394 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | LogLikelihoodAccMetric, 23 | ) 24 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.multilingual.adapters import ( 27 | alghafa_adapter, 28 | ) 29 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation 30 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function 31 | from lighteval.tasks.templates.utils.formulation import ( 32 | CFFormulation, 33 | HybridFormulation, 34 | MCFFormulation, 35 | ) 36 | from lighteval.utils.language import Language 37 | 38 | 39 | TASKS_TABLE = [ 40 | LightevalTaskConfig( 41 | name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}", 42 | hf_subset="multiple_choice_grounded_statement_soqal_task", 43 | prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), 44 | evaluation_splits=["test"], 45 | few_shots_split="validation", 46 | hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native", 47 | metrics=get_metrics_for_formulation( 48 | formulation, 49 | [ 50 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), 51 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 52 | ], 53 | ), 54 | ) 55 | for formulation in [ 56 | MCFFormulation(), 57 | CFFormulation(), 58 | HybridFormulation(), 59 | ] 60 | ] 61 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Piqa 4 | 5 | dataset: 6 | ybisk/piqa 7 | 8 | abstract: 9 | PIQA is a benchmark for testing physical commonsense reasoning. It contains 10 | questions requiring this kind of physical commonsense reasoning. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | commonsense, multiple-choice, qa 17 | 18 | paper: 19 | https://arxiv.org/abs/1911.11641 20 | """ 21 | 22 | from string import ascii_uppercase 23 | 24 | from lighteval.metrics.metrics import Metrics 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | def piqa_prompt(line, task_name: str = None): 30 | letters = list(ascii_uppercase)[:2] 31 | query = "The following are multiple choice questions (with answers) about common sense.\n" 32 | query += f"Question: {line['goal']}\n" 33 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(letters, [line["sol1"], line["sol2"]])]) 34 | query += "Answer: " 35 | 36 | gold_ix = int(line["label"]) 37 | is_few_shots = line.get("__few_shots", False) 38 | return Doc( 39 | task_name=task_name, 40 | query=query, 41 | choices=letters if not is_few_shots else [line["sol1"], line["sol2"]], 42 | gold_index=gold_ix, 43 | instruction="The following are multiple choice questions (with answers) about common sense.\n", 44 | ) 45 | 46 | 47 | piqa = LightevalTaskConfig( 48 | name="piqa", 49 | prompt_function=piqa_prompt, 50 | hf_repo="ybisk/piqa", 51 | hf_subset="plain_text", 52 | hf_avail_splits=["train", "test", "validation"], 53 | evaluation_splits=["validation", "test"], 54 | few_shots_split=None, 55 | few_shots_select=None, 56 | generation_size=1, 57 | metrics=[ 58 | Metrics.exact_match, 59 | ], 60 | stop_sequence=["\n"], 61 | version=0, 62 | ) 63 | 64 | TASKS_TABLE = [ 65 | piqa, 66 | ] 67 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/thai_exams.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Thai Exams 4 | 5 | dataset: 6 | scb10x/thai_exam 7 | 8 | abstract: 9 | Thai Exams multilingual benchmark. 10 | 11 | languages: 12 | thai 13 | 14 | tags: 15 | knowledge, multilingual, multiple-choice 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | LogLikelihoodAccMetric, 22 | ) 23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.multilingual.adapters import ( 26 | thai_exams_adapter, 27 | ) 28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation 29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function 30 | from lighteval.tasks.templates.utils.formulation import ( 31 | CFFormulation, 32 | HybridFormulation, 33 | MCFFormulation, 34 | ) 35 | from lighteval.utils.language import Language 36 | 37 | 38 | THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"] 39 | 40 | 41 | TASKS_TABLE = [ 42 | LightevalTaskConfig( 43 | name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}", 44 | prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation), 45 | hf_repo="scb10x/thai_exam", 46 | hf_subset=subset, 47 | evaluation_splits=("test",), 48 | few_shots_split="train", 49 | metrics=get_metrics_for_formulation( 50 | formulation, 51 | [ 52 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), 53 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 54 | ], 55 | ), 56 | ) 57 | for subset in THAI_EXAMS_SUBSETS 58 | for formulation in [ 59 | MCFFormulation(), 60 | CFFormulation(), 61 | HybridFormulation(), 62 | ] 63 | ] 64 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Hellaswag 4 | 5 | dataset: 6 | Rowan/hellaswag 7 | 8 | abstract: 9 | HellaSwag is a commonsense inference benchmark designed to challenge language 10 | models with adversarially filtered multiple-choice questions. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | multiple-choice, narrative, reasoning 17 | 18 | paper: 19 | https://arxiv.org/abs/1905.07830 20 | """ 21 | 22 | from string import ascii_uppercase 23 | 24 | from lighteval.metrics.metrics import Metrics 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | def hellaswag_prompt(line, task_name: str = None): 30 | query = "The following are multiple choice questions (with answers) about common sense.\n\n" 31 | query += f"Question: {line['activity_label']}: {line['ctx_a']} {line['ctx_b'].capitalize()}\n" 32 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, line["endings"])]) 33 | query += "Answer:" 34 | 35 | gold_ix = int(line["label"]) if line["label"] != "" else -1 36 | return Doc( 37 | task_name=task_name, 38 | query=query, 39 | choices=[" " + i for i in ascii_uppercase[: len(line["endings"])]], 40 | gold_index=gold_ix, 41 | instruction="The following are multiple choice questions (with answers) about common sense.\n\n", 42 | ) 43 | 44 | 45 | hellaswag = LightevalTaskConfig( 46 | name="hellaswag", 47 | prompt_function=hellaswag_prompt, 48 | hf_repo="Rowan/hellaswag", 49 | hf_subset="default", 50 | hf_avail_splits=["train", "test", "validation"], 51 | evaluation_splits=["validation"], 52 | few_shots_split=None, 53 | few_shots_select=None, 54 | generation_size=1, 55 | metrics=[ 56 | Metrics.exact_match, 57 | ], 58 | stop_sequence=["\n"], 59 | version=0, 60 | ) 61 | 62 | TASKS_TABLE = [ 63 | hellaswag, 64 | ] 65 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/storycloze.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Storycloze 4 | 5 | dataset: 6 | MoE-UNC/story_cloze 7 | 8 | abstract: 9 | A Corpus and Cloze Evaluation for Deeper Understanding of 10 | Commonsense Stories 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | narrative, reasoning 17 | 18 | paper: 19 | https://arxiv.org/abs/1604.01696 20 | """ 21 | 22 | from lighteval.metrics.metrics import Metrics 23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 24 | from lighteval.tasks.requests import Doc 25 | 26 | 27 | def storycloze_prompt(line, task_name: str = None): 28 | context = "\n".join( 29 | [line["input_sentence_1"], line["input_sentence_2"], line["input_sentence_3"], line["input_sentence_4"]] 30 | ) 31 | choices = [line["sentence_quiz1"], line["sentence_quiz2"]] 32 | gold = int(line["answer_right_ending"]) - 1 33 | return Doc(task_name=task_name, query=context + "\n", choices=choices, gold_index=gold) 34 | 35 | 36 | storycloze_2016 = LightevalTaskConfig( 37 | name="storycloze:2016", 38 | prompt_function=storycloze_prompt, 39 | hf_repo="MoE-UNC/story_cloze", 40 | hf_subset="2016", 41 | hf_avail_splits=["validation"], 42 | evaluation_splits=["validation"], 43 | few_shots_split=None, 44 | few_shots_select=None, 45 | generation_size=-1, 46 | metrics=[Metrics.exact_match], 47 | stop_sequence=["\n"], 48 | version=0, 49 | ) 50 | 51 | 52 | storycloze_2018 = LightevalTaskConfig( 53 | name="storycloze:2018", 54 | prompt_function=storycloze_prompt, 55 | hf_repo="MoE-UNC/story_cloze", 56 | hf_subset="2018", 57 | hf_avail_splits=["validation"], 58 | evaluation_splits=["validation"], 59 | few_shots_split=None, 60 | few_shots_select=None, 61 | generation_size=-1, 62 | metrics=[Metrics.exact_match], 63 | stop_sequence=["\n"], 64 | version=0, 65 | ) 66 | 67 | TASKS_TABLE = [ 68 | storycloze_2016, 69 | storycloze_2018, 70 | ] 71 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/squad_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Squad V2 4 | 5 | dataset: 6 | rajpurkar/squad_v2 7 | 8 | abstract: 9 | Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset, 10 | consisting of questions posed by crowdworkers on a set of Wikipedia articles, 11 | where the answer to every question is a segment of text, or span, from the 12 | corresponding reading passage, or the question might be unanswerable. 13 | SQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000 14 | unanswerable questions written adversarially by crowdworkers to look similar to 15 | answerable ones. To do well on SQuAD2.0, systems must not only answer questions 16 | when possible, but also determine when no answer is supported by the paragraph 17 | and abstain from answering. 18 | 19 | languages: 20 | english 21 | 22 | tags: 23 | qa 24 | 25 | paper: 26 | https://arxiv.org/abs/1806.03822 27 | """ 28 | 29 | from lighteval.metrics.metrics import Metrics 30 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 31 | from lighteval.tasks.templates.qa import get_qa_prompt_function 32 | from lighteval.utils.language import Language 33 | 34 | 35 | squad_v2 = LightevalTaskConfig( 36 | name="squad_v2", 37 | prompt_function=get_qa_prompt_function( 38 | Language.ENGLISH, 39 | lambda line: { 40 | "question": line["question"], 41 | "context": line["context"], 42 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 43 | }, 44 | ), 45 | hf_repo="rajpurkar/squad_v2", 46 | hf_subset="squad_v2", 47 | hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0), 48 | evaluation_splits=("validation",), 49 | few_shots_split="train", 50 | stop_sequence=["\n", "Question:", "question:"], 51 | generation_size=200, 52 | metrics=[Metrics.exact_match], 53 | version=1, 54 | ) 55 | 56 | TASKS_TABLE = [ 57 | squad_v2, 58 | ] 59 | -------------------------------------------------------------------------------- /tests/unit/models/test_abstract_model.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | from transformers import AutoTokenizer 24 | 25 | from lighteval.models.dummy.dummy_model import DummyModel, DummyModelConfig 26 | 27 | 28 | def test_tok_encode_pair(): 29 | model = DummyModel(config=DummyModelConfig(seed=42)) 30 | model._tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M") 31 | context = "答案:" 32 | continuation = ["1"] 33 | non_pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=False) 34 | pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=True) 35 | # Non-pairwise merged ":1" to one token 36 | assert non_pairwise_tokens == ([[6, 47873]], [[34871]]) 37 | # Pairwise separated ":" and "1" 38 | assert pairwise_tokens == ([[6, 47873, 13]], [[82]]) 39 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Hellaswag Tel 4 | 5 | dataset: 6 | LightFury9/hellaswag-telugu 7 | 8 | abstract: 9 | Hellaswag Tel multilingual benchmark. 10 | 11 | languages: 12 | telugu 13 | 14 | tags: 15 | multilingual, multiple-choice, reasoning 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | LogLikelihoodAccMetric, 22 | ) 23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation 26 | from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function 27 | from lighteval.tasks.templates.utils.formulation import ( 28 | CFFormulation, 29 | HybridFormulation, 30 | MCFFormulation, 31 | ) 32 | from lighteval.utils.language import Language 33 | 34 | 35 | TASKS_TABLE = [ 36 | LightevalTaskConfig( 37 | name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}", 38 | prompt_function=get_hellaswag_prompt_function( 39 | language=Language.TELUGU, 40 | adapter=lambda line: { 41 | "ctx_a": line["ctx_a"], 42 | "continuations": line["endings"], 43 | "gold_idx": int(line["label"]), 44 | }, 45 | formulation=formulation, 46 | ), 47 | hf_repo="LightFury9/hellaswag-telugu", 48 | hf_subset="default", 49 | evaluation_splits=("valid",), 50 | few_shots_split="train", 51 | metrics=get_metrics_for_formulation( 52 | formulation, 53 | [ 54 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), 55 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 56 | ], 57 | ), 58 | ) 59 | for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] 60 | ] 61 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/arabic_arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Arabic Arc 4 | 5 | dataset: 6 | OALL/AlGhafa-Arabic-LLM-Benchmark-Translated 7 | 8 | abstract: 9 | Arabic Arc multilingual benchmark. 10 | 11 | languages: 12 | arabic 13 | 14 | tags: 15 | multilingual, multiple-choice, reasoning 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | LogLikelihoodAccMetric, 22 | ) 23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.multilingual.adapters import ( 26 | alghafa_adapter, 27 | ) 28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation 29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function 30 | from lighteval.tasks.templates.utils.formulation import ( 31 | CFFormulation, 32 | HybridFormulation, 33 | MCFFormulation, 34 | ) 35 | from lighteval.utils.language import Language 36 | 37 | 38 | TASKS_TABLE = [ 39 | LightevalTaskConfig( 40 | name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy", 41 | prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation), 42 | hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated", 43 | hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff", 44 | hf_subset="arc_easy_ar", 45 | evaluation_splits=["test"], 46 | few_shots_split="validation", 47 | few_shots_select="sequential", 48 | metrics=get_metrics_for_formulation( 49 | formulation, 50 | [ 51 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), 52 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 53 | ], 54 | ), 55 | ) 56 | for formulation in [ 57 | MCFFormulation(), 58 | CFFormulation(), 59 | HybridFormulation(), 60 | ] 61 | ] 62 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/mathqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Mathqa 4 | 5 | dataset: 6 | allenai/math_qa 7 | 8 | abstract: 9 | large-scale dataset of math word problems. Our dataset is gathered by using a 10 | new representation language to annotate over the AQuA-RAT dataset with 11 | fully-specified operational programs. AQuA-RAT has provided the questions, 12 | options, rationale, and the correct options. 13 | 14 | languages: 15 | english 16 | 17 | tags: 18 | math, qa, reasoning 19 | 20 | paper: 21 | https://arxiv.org/abs/1905.13319 22 | """ 23 | 24 | from lighteval.metrics.metrics import Metrics 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | def mathqa_prompt(line, task_name: str = None): 30 | query = f"Problem: {line['Problem']}\n" 31 | query += "Options:\n" 32 | query += "".join( 33 | [ 34 | f"{key}) {choice}\n" 35 | for key, choice in zip( 36 | ["a", "b", "c", "d", "e"], 37 | [line["option_a"], line["option_b"], line["option_c"], line["option_d"], line["option_e"]], 38 | ) 39 | ] 40 | ) 41 | query += "Answer:" 42 | return Doc( 43 | task_name=task_name, 44 | query=query, 45 | choices=[ 46 | f" {c}" for c in [line["option_a"], line["option_b"], line["option_c"], line["option_d"], line["option_e"]] 47 | ], 48 | gold_index=["a", "b", "c", "d", "e"].index(line["correct"]), 49 | ) 50 | 51 | 52 | mathqa = LightevalTaskConfig( 53 | name="mathqa", 54 | prompt_function=mathqa_prompt, 55 | hf_repo="allenai/math_qa", 56 | hf_subset="default", 57 | hf_avail_splits=["train", "validation", "test"], 58 | evaluation_splits=["test"], 59 | few_shots_split=None, 60 | few_shots_select=None, 61 | generation_size=-1, 62 | metrics=[Metrics.loglikelihood_acc], 63 | stop_sequence=["\n"], 64 | version=0, 65 | ) 66 | 67 | TASKS_TABLE = [ 68 | mathqa, 69 | ] 70 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/triviaqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Triviaqa 4 | 5 | dataset: 6 | mandarjoshi/trivia_qa 7 | 8 | abstract: 9 | TriviaqQA is a reading comprehension dataset containing over 650K 10 | question-answer-evidence triples. TriviaqQA includes 95K question-answer pairs 11 | authored by trivia enthusiasts and independently gathered evidence documents, 12 | six per question on average, that provide high quality distant supervision for 13 | answering the questions. 14 | 15 | languages: 16 | english 17 | 18 | tags: 19 | qa 20 | 21 | paper: 22 | https://arxiv.org/abs/1705.03551 23 | """ 24 | 25 | import string 26 | 27 | from lighteval.metrics.metrics import Metrics 28 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 29 | from lighteval.tasks.requests import Doc 30 | 31 | 32 | def triviaqa_prompt(line, task_name: str = None): 33 | def _remove_prefixes(aliases): 34 | aliases.sort() 35 | ret = [aliases[0]] 36 | for alias in aliases[1:]: 37 | if not alias.startswith(ret[-1]): 38 | ret.append(alias) 39 | return ret 40 | 41 | list_of_candidates = [ 42 | alias.lower().translate(str.maketrans("", "", string.punctuation)) 43 | for alias in _remove_prefixes(line["answer"]["aliases"]) 44 | ] 45 | 46 | return Doc( 47 | task_name=task_name, 48 | query=f"Question: {line['question']}\nAnswer:", 49 | gold_index=0, 50 | choices=[list_of_candidates], 51 | ) 52 | 53 | 54 | triviaqa = LightevalTaskConfig( 55 | name="triviaqa", 56 | prompt_function=triviaqa_prompt, 57 | hf_repo="mandarjoshi/trivia_qa", 58 | hf_subset="rc.nocontext", 59 | hf_avail_splits=["train", "test", "validation"], 60 | evaluation_splits=["validation"], 61 | few_shots_split=None, 62 | few_shots_select=None, 63 | generation_size=20, 64 | metrics=[Metrics.exact_match], 65 | stop_sequence=["\n", ".", ","], 66 | version=0, 67 | ) 68 | 69 | TASKS_TABLE = [ 70 | triviaqa, 71 | ] 72 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/utils/task_utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric 25 | from lighteval.metrics.utils.metric_utils import Metric 26 | from lighteval.tasks.templates.utils.formulation import Formulation, MCFFormulation 27 | 28 | 29 | def normalize_subset(subset: str) -> str: 30 | return subset.replace(" ", "_").replace("(", "").replace(")", "").lower() 31 | 32 | 33 | def get_metrics_for_formulation(formulation: Formulation, metrics: list[Metric]) -> list[Metric]: 34 | """Choose the appropriate metrics for the given formulation otherwise fallback to the original metrics.""" 35 | match formulation: 36 | # 37 | case MCFFormulation(choice_prefix="Letters"): 38 | return [LogLikelihoodAccMetric(normalization=None)] 39 | case _: 40 | return metrics 41 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/simpleqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Simpleqa 4 | 5 | dataset: 6 | lighteval/SimpleQA 7 | 8 | abstract: 9 | A factuality benchmark called SimpleQA that measures the ability for language 10 | models to answer short, fact-seeking questions. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | factuality, general-knowledge, qa 17 | 18 | paper: 19 | https://openai.com/index/introducing-simpleqa/ 20 | 21 | starred: 22 | true 23 | """ 24 | 25 | from inspect_ai.dataset import Sample 26 | from inspect_ai.scorer import model_graded_fact 27 | from inspect_ai.solver import generate 28 | 29 | from lighteval.metrics.metrics import Metrics 30 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 31 | from lighteval.tasks.requests import Doc 32 | 33 | 34 | def simpleqa_prompt(line, task_name: str = None): 35 | query = f"Question: {line['question']}\n" 36 | query += "".join( 37 | [f"\n{key}. {choice}" for key, choice in zip(["A", "B", "C", "D", "E", "F"], line["choices"]["text"])] 38 | ) 39 | query += "\nAnswer:" 40 | return Doc( 41 | task_name=task_name, 42 | query=query, 43 | choices=line["choices"]["text"], 44 | gold_index=line["choices"]["label"].index(line["answerKey"]), 45 | ) 46 | 47 | 48 | def record_to_sample(record): 49 | query = record["problem"] 50 | target = record["answer"] 51 | return Sample(input=query, target=target) 52 | 53 | 54 | simpleqa = LightevalTaskConfig( 55 | name="simpleqa", 56 | prompt_function=simpleqa_prompt, 57 | hf_repo="lighteval/SimpleQA", 58 | hf_subset="default", 59 | hf_avail_splits=["test"], 60 | evaluation_splits=["test"], 61 | few_shots_split="few_shot", 62 | few_shots_select=None, 63 | generation_size=2048, 64 | metrics=[Metrics.exact_match], 65 | stop_sequence=["\n"], 66 | version=0, 67 | sample_fields=record_to_sample, 68 | solver=[generate(cache=True)], 69 | scorer=model_graded_fact(), 70 | ) 71 | 72 | TASKS_TABLE = [ 73 | simpleqa, 74 | ] 75 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/avg_at_k.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Avg At K Test Suite", 3 | "description": "Test cases for avg_at_k metric", 4 | "test_cases": [ 5 | { 6 | "name": "Avg at K - Correct in Top K", 7 | "metric_class": "avg_at_k", 8 | "metric_params": {"k": 2}, 9 | "doc": { 10 | "query": "What is the capital of France?", 11 | "choices": ["London", "Paris", "Berlin"], 12 | "gold_index": 1, 13 | "task_name": "geography" 14 | }, 15 | "model_response": { 16 | "text": ["Paris", "London", "Berlin"] 17 | }, 18 | "expected_output": { 19 | "avg@k:k=2": 0.5 20 | }, 21 | "tolerance": 0.01, 22 | "description": "Test avg at k with correct answer in top k" 23 | }, 24 | { 25 | "name": "Avg at K - Not in Top K", 26 | "metric_class": "avg_at_k", 27 | "metric_params": {"k": 1}, 28 | "doc": { 29 | "query": "What is the capital of France?", 30 | "choices": ["London", "Paris", "Berlin"], 31 | "gold_index": 1, 32 | "task_name": "geography" 33 | }, 34 | "model_response": { 35 | "text": ["London", "Berlin", "Paris"] 36 | }, 37 | "expected_output": { 38 | "avg@k:k=1": 0.0 39 | }, 40 | "tolerance": 0.01, 41 | "description": "Test avg at k with correct answer not in top k" 42 | }, 43 | { 44 | "name": "Avg at K - Multiple Correct", 45 | "metric_class": "avg_at_k", 46 | "metric_params": {"k": 3}, 47 | "doc": { 48 | "query": "Which are European capitals?", 49 | "choices": ["London", "Paris", "Tokyo", "Berlin"], 50 | "gold_index": [0, 1, 3], 51 | "task_name": "geography" 52 | }, 53 | "model_response": { 54 | "text": ["Paris", "London", "Berlin", "Tokyo"] 55 | }, 56 | "expected_output": { 57 | "avg@k:k=3": 0.33 58 | }, 59 | "tolerance": 0.01, 60 | "description": "Test avg at k with multiple correct answers" 61 | } 62 | ] 63 | } 64 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/entity_data_imputation.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Entity Data Imputation 4 | 5 | dataset: 6 | lighteval/Buy, lighteval/Restaurant 7 | 8 | abstract: 9 | Scenario that tests the ability to impute missing entities in a data table. 10 | 11 | languages: 12 | english 13 | 14 | tags: 15 | reasoning 16 | 17 | paper: 18 | https://ieeexplore.ieee.org/document/9458712 19 | """ 20 | 21 | from lighteval.metrics.metrics import Metrics 22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 23 | from lighteval.tasks.requests import Doc 24 | 25 | 26 | def entity_data_imputation_prompt(line, task_name: str = None): 27 | return Doc( 28 | task_name=task_name, 29 | query=f"What is the missing value?\n{line['text']}\nAnswer:", 30 | choices=[line["gold"]], 31 | gold_index=0, 32 | instruction="What is the missing value?\n", 33 | ) 34 | 35 | 36 | entity_data_imputation_Buy = LightevalTaskConfig( 37 | name="entity_data_imputation:Buy", 38 | prompt_function=entity_data_imputation_prompt, 39 | hf_repo="lighteval/Buy", 40 | hf_subset="default", 41 | hf_avail_splits=["train", "test", "valid"], 42 | evaluation_splits=["valid", "test"], 43 | few_shots_split=None, 44 | few_shots_select=None, 45 | generation_size=5, 46 | metrics=[ 47 | Metrics.exact_match, 48 | ], 49 | stop_sequence=["\n"], 50 | version=0, 51 | ) 52 | 53 | 54 | entity_data_imputation_Restaurant = LightevalTaskConfig( 55 | name="entity_data_imputation:Restaurant", 56 | prompt_function=entity_data_imputation_prompt, 57 | hf_repo="lighteval/Restaurant", 58 | hf_subset="default", 59 | hf_avail_splits=["train"], 60 | evaluation_splits=["train"], 61 | few_shots_split=None, 62 | few_shots_select=None, 63 | generation_size=5, 64 | metrics=[ 65 | Metrics.exact_match, 66 | ], 67 | stop_sequence=["\n"], 68 | version=0, 69 | ) 70 | 71 | TASKS_TABLE = [ 72 | entity_data_imputation_Buy, 73 | entity_data_imputation_Restaurant, 74 | ] 75 | -------------------------------------------------------------------------------- /tests/unit/metrics/test_cases/drop.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Drop Test Suite", 3 | "description": "Test cases for drop metric", 4 | "test_cases": [ 5 | { 6 | "name": "DROP - Correct Answer", 7 | "metric_class": "drop", 8 | "metric_params": {}, 9 | "doc": { 10 | "query": "What is 2 + 2?", 11 | "specific": { 12 | "golds_no_preprocessing": ["4"] 13 | }, 14 | "choices": ["4"], 15 | "gold_index": 0, 16 | "task_name": "math" 17 | }, 18 | "model_response": { 19 | "text": ["4"] 20 | }, 21 | "expected_output": { 22 | "em": 1.0, 23 | "f1": 1.0 24 | }, 25 | "tolerance": 0.01, 26 | "description": "Test DROP with correct answer" 27 | }, 28 | { 29 | "name": "DROP - Wrong Answer", 30 | "metric_class": "drop", 31 | "metric_params": {}, 32 | "doc": { 33 | "query": "What is 2 + 2?", 34 | "specific": { 35 | "golds_no_preprocessing": ["4"] 36 | }, 37 | "choices": ["4"], 38 | "gold_index": 0, 39 | "task_name": "math" 40 | }, 41 | "model_response": { 42 | "text": ["5"] 43 | }, 44 | "expected_output": { 45 | "em": 0.0, 46 | "f1": 0.0 47 | }, 48 | "tolerance": 0.01, 49 | "description": "Test DROP with wrong answer" 50 | }, 51 | { 52 | "name": "DROP - Partial Match", 53 | "metric_class": "drop", 54 | "metric_params": {}, 55 | "doc": { 56 | "query": "What is the sum of 2 and 2?", 57 | "specific": { 58 | "golds_no_preprocessing": ["4", "four"] 59 | }, 60 | "choices": ["4", "four"], 61 | "gold_index": 0, 62 | "task_name": "math" 63 | }, 64 | "model_response": { 65 | "text": ["4"] 66 | }, 67 | "expected_output": { 68 | "em": 1.0, 69 | "f1": 1.0 70 | }, 71 | "tolerance": 0.01, 72 | "description": "Test DROP with partial match" 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Hellaswag Hin 4 | 5 | dataset: 6 | ai4bharat/hellaswag-hi 7 | 8 | abstract: 9 | Hellaswag Hin multilingual benchmark. 10 | 11 | languages: 12 | hindi 13 | 14 | tags: 15 | multilingual, multiple-choice, reasoning 16 | 17 | paper: 18 | """ 19 | 20 | from lighteval.metrics.dynamic_metrics import ( 21 | LogLikelihoodAccMetric, 22 | ) 23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm 24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 25 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation 26 | from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function 27 | from lighteval.tasks.templates.utils.formulation import ( 28 | CFFormulation, 29 | HybridFormulation, 30 | MCFFormulation, 31 | ) 32 | from lighteval.utils.language import Language 33 | 34 | 35 | TASKS_TABLE = [ 36 | LightevalTaskConfig( 37 | name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}", 38 | prompt_function=get_hellaswag_prompt_function( 39 | language=Language.HINDI, 40 | adapter=lambda line: { 41 | "ctx_a": line["ctx_a"], 42 | "continuations": line["endings"], 43 | "gold_idx": int(line["label"]), 44 | }, 45 | formulation=formulation, 46 | ), 47 | hf_repo="ai4bharat/hellaswag-hi", 48 | hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]), 49 | hf_subset="hi", 50 | evaluation_splits=("validation",), 51 | few_shots_split="validation", 52 | metrics=get_metrics_for_formulation( 53 | formulation, 54 | [ 55 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 56 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), 57 | ], 58 | ), 59 | ) 60 | for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()] 61 | ] 62 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/covid_dialogue.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Covid Dialogue 4 | 5 | dataset: 6 | lighteval/covid_dialogue 7 | 8 | abstract: 9 | The COVID-19 Dialogue dataset is a collection of 500+ dialogues between 10 | doctors and patients during the COVID-19 pandemic. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | dialog, medical 17 | 18 | paper: 19 | https://arxiv.org/abs/2004.06561 20 | """ 21 | 22 | from inspect_ai.dataset import Sample 23 | from inspect_ai.scorer import model_graded_fact 24 | from inspect_ai.solver import generate, system_message 25 | 26 | from lighteval.metrics.metrics import Metrics 27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 28 | from lighteval.tasks.requests import Doc 29 | 30 | 31 | PROMPT = "Generate a response given a patient's questions and concerns." 32 | 33 | 34 | def covid_dialogue_prompt(line, task_name: str = None): 35 | return Doc( 36 | task_name=task_name, 37 | query=f"Generate a response given a patient's questions and concerns.\nPatient: {line['query']}\nDoctor: ", 38 | choices=[line["answer"]], 39 | gold_index=0, 40 | instruction="Generate a response given a patient's questions and concerns.\n", 41 | ) 42 | 43 | 44 | def record_to_sample(record): 45 | query = record["query"] 46 | target = record["answer"] 47 | return Sample(input=query, target=target) 48 | 49 | 50 | covid_dialogue = LightevalTaskConfig( 51 | name="covid_dialogue", 52 | prompt_function=covid_dialogue_prompt, 53 | hf_repo="lighteval/covid_dialogue", 54 | hf_subset="default", 55 | hf_avail_splits=["train", "test", "validation"], 56 | evaluation_splits=["validation", "test"], 57 | few_shots_split=None, 58 | few_shots_select=None, 59 | generation_size=128, 60 | metrics=[Metrics.exact_match], 61 | stop_sequence=["\n"], 62 | version=0, 63 | sample_fields=record_to_sample, 64 | solver=[system_message(PROMPT), generate(cache=True)], 65 | scorer=model_graded_fact(), 66 | ) 67 | 68 | TASKS_TABLE = [ 69 | covid_dialogue, 70 | ] 71 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/tydiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Tydiqa 4 | 5 | dataset: 6 | google-research-datasets/tydiqa 7 | 8 | abstract: 9 | Other QA tasks for RC TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages. https://arxiv.org/abs/2003.05002 10 | 11 | languages: 12 | arabic, bengali, english, finnish, indonesian, japanese, korean, russian, swahili, telugu, thai 13 | 14 | tags: 15 | multilingual, qa 16 | 17 | paper: 18 | https://arxiv.org/abs/2003.05002 19 | """ 20 | 21 | from lighteval.metrics.dynamic_metrics import ( 22 | MultilingualQuasiExactMatchMetric, 23 | MultilingualQuasiF1ScoreMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"tydiqa_{language.value}", 33 | prompt_function=get_qa_prompt_function( 34 | language, 35 | lambda line: { 36 | "question": line["question"], 37 | "context": line["context"], 38 | "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0], 39 | }, 40 | ), 41 | hf_repo="google-research-datasets/tydiqa", 42 | hf_subset="secondary_task", 43 | evaluation_splits=("validation",), 44 | few_shots_split="train", 45 | generation_size=400, 46 | stop_sequence=("\n",), 47 | metrics=( 48 | MultilingualQuasiExactMatchMetric(language, "prefix"), 49 | MultilingualQuasiF1ScoreMetric(language), 50 | ), 51 | ) 52 | for language in [ 53 | Language.ENGLISH, 54 | Language.ARABIC, 55 | Language.BENGALI, 56 | Language.FINNISH, 57 | Language.INDONESIAN, 58 | Language.JAPANESE, 59 | Language.KOREAN, 60 | Language.SWAHILI, 61 | Language.RUSSIAN, 62 | Language.TELUGU, 63 | Language.THAI, 64 | ] 65 | ] 66 | -------------------------------------------------------------------------------- /src/lighteval/tasks/templates/utils/adapter_utils.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | # Copyright (c) 2024 The HuggingFace Team 4 | 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | 23 | 24 | from typing import Any, Callable, Mapping, TypeVar 25 | 26 | 27 | AdapterReturnTypeVar = TypeVar("AdapterReturnTypeVar") 28 | 29 | 30 | def create_adapter_from_dict( 31 | adapter: Mapping[str, Any] | Callable[[dict], AdapterReturnTypeVar], 32 | ) -> Callable[[dict], AdapterReturnTypeVar]: 33 | """Creates adapter function for the template input from a dict. 34 | 35 | Args: 36 | adapter: Dict of the form {key: value} where value is key in the input dict to get. 37 | 38 | Returns: 39 | Callable[[dict], AdapterReturnTypeVar]: A function that adapts dictionary input to the expected format 40 | """ 41 | if not isinstance(adapter, Mapping): 42 | return adapter 43 | 44 | def adapter_fn(line: dict): 45 | return {key: line[value] for key, value in adapter.items()} 46 | 47 | return adapter_fn # type: ignore 48 | -------------------------------------------------------------------------------- /docs/source/_toctree.yml: -------------------------------------------------------------------------------- 1 | - sections: 2 | - local: index 3 | title: 🤗 Lighteval 4 | - local: installation 5 | title: Installation 6 | - local: quicktour 7 | title: Quicktour 8 | title: Getting started 9 | - sections: 10 | - local: inspect-ai 11 | title: Examples using Inspect-AI 12 | - local: saving-and-reading-results 13 | title: Save and read results 14 | - local: caching 15 | title: Caching 16 | - local: using-the-python-api 17 | title: Use the Python API 18 | - local: adding-a-custom-task 19 | title: Add a custom task 20 | - local: adding-a-new-metric 21 | title: Add a custom metric 22 | - local: evaluating-a-custom-model 23 | title: Evaluate a custom model 24 | - local: use-inference-providers-as-backend 25 | title: Use HF's inference providers as backend 26 | - local: use-litellm-as-backend 27 | title: Use litellm as backend 28 | - local: use-vllm-as-backend 29 | title: Use vllm as backend 30 | - local: use-sglang-as-backend 31 | title: Use SGLang as backend 32 | - local: use-huggingface-inference-endpoints-or-tgi-as-backend 33 | title: Use Hugging Face inference endpoints or TGI as backend 34 | - local: contributing-to-multilingual-evaluations 35 | title: Contributing to multilingual evaluations 36 | title: Guides 37 | - sections: 38 | - local: metric-list 39 | title: Available Metrics 40 | - local: available-tasks 41 | title: Available Tasks 42 | title: API 43 | - sections: 44 | - sections: 45 | - local: package_reference/evaluation_tracker 46 | title: EvaluationTracker 47 | - local: package_reference/models 48 | title: Model Configs 49 | - local: package_reference/pipeline 50 | title: Pipeline 51 | title: Main classes 52 | - local: package_reference/metrics 53 | title: Metrics 54 | - local: package_reference/tasks 55 | title: Tasks 56 | - local: package_reference/logging 57 | title: Logging 58 | - local: package_reference/models_outputs 59 | title: ModelResponse 60 | - local: package_reference/doc 61 | title: Doc 62 | title: Reference 63 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/afri_mgsm.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Afri Mgsm 4 | 5 | dataset: 6 | masakhane/afrimgsm 7 | 8 | abstract: 9 | African MGSM: MGSM for African Languages 10 | 11 | languages: 12 | amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona, 13 | sotho, swahili, twi, wolof, xhosa, yoruba, zulu 14 | 15 | tags: 16 | math, multilingual, reasoning 17 | 18 | paper: 19 | https://arxiv.org/abs/2406.03368. 20 | """ 21 | 22 | from lighteval.metrics.dynamic_metrics import ( 23 | MultilingualQuasiExactMatchMetric, 24 | ) 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.templates.qa import get_qa_prompt_function 27 | from lighteval.utils.language import Language 28 | 29 | 30 | TASKS_TABLE = [ 31 | LightevalTaskConfig( 32 | name=f"afri_mgsm_{language.value}", 33 | prompt_function=get_qa_prompt_function( 34 | language, 35 | lambda line: { 36 | "question": line["question"], 37 | # The cot is available but we have no use: 38 | # line["answer"] 39 | "choices": [str(line["answer_number"])], 40 | }, 41 | ), 42 | hf_repo="masakhane/afrimgsm", 43 | hf_subset=language.value, 44 | evaluation_splits=("test",), 45 | few_shots_split="train", 46 | generation_size=25, 47 | metrics=[ 48 | MultilingualQuasiExactMatchMetric(language, "full"), 49 | ], 50 | stop_sequence=("\n",), 51 | ) 52 | for language in [ 53 | Language.AMHARIC, 54 | # Language.EWE, 55 | Language.FRENCH, 56 | # Language.HAUSA, 57 | # Language.IGBO, 58 | # Language.KINYARWANDA, 59 | # Language.LINGALA, 60 | # Language.LUGANDA, 61 | # Language.OROMO, 62 | # Language.SHONA, 63 | # Language.SOTHO, 64 | Language.SWAHILI, 65 | # Language.TWI, 66 | # Language.WOLOF, 67 | # Language.XHOSA, 68 | Language.YORUBA, 69 | # Language.ZULU, 70 | ] 71 | ] 72 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/babi_qa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Babi Qa 4 | 5 | dataset: 6 | facebook/babi_qa 7 | 8 | abstract: 9 | The bAbI benchmark for measuring understanding and reasoning, evaluates reading 10 | comprehension via question answering. 11 | 12 | languages: 13 | english 14 | 15 | tags: 16 | qa, reasoning 17 | 18 | paper: 19 | https://arxiv.org/abs/1502.05698 20 | """ 21 | 22 | import json 23 | 24 | from lighteval.metrics.metrics import Metrics 25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 26 | from lighteval.tasks.requests import Doc 27 | 28 | 29 | # TODO: clean dataset and convert to inspect-ai 30 | 31 | 32 | def babi_qa_prompt(line, task_name: str = None): 33 | def process_path(path: str) -> str: 34 | steps = path.split(",") 35 | directions = {"s": "south", "n": "north", "e": "east", "w": "west"} 36 | path = " ".join([directions[step] for step in steps]) 37 | return path 38 | 39 | if isinstance(line["story"], dict): 40 | line = line["story"] 41 | else: 42 | line = json.loads(line["story"]) 43 | 44 | results = [] 45 | story = [] 46 | for type, text, answer in zip(line["type"], line["text"], line["answer"]): 47 | if type == "supporting fact": 48 | story.append(text) 49 | elif type == "question": 50 | text = text.replace("_", process_path(answer)) 51 | query = "\n".join(story) + f"\nQuestion: {text}\nAnswer: " 52 | results.append(Doc(task_name=task_name, query=query, choices=[answer], gold_index=0)) 53 | story = [] 54 | return results 55 | 56 | 57 | babi_qa = LightevalTaskConfig( 58 | name="babi_qa", 59 | prompt_function=babi_qa_prompt, 60 | hf_repo="facebook/babi_qa", 61 | hf_subset="en-valid-qa1", 62 | hf_avail_splits=["train", "test", "validation"], 63 | evaluation_splits=["validation", "test"], 64 | few_shots_split=None, 65 | few_shots_select=None, 66 | generation_size=-1, 67 | metrics=[Metrics.exact_match], 68 | stop_sequence=["\n"], 69 | version=0, 70 | ) 71 | 72 | TASKS_TABLE = [babi_qa] 73 | -------------------------------------------------------------------------------- /src/lighteval/tasks/tasks/openbookqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Openbookqa 4 | 5 | dataset: 6 | allenai/openbookqa 7 | 8 | abstract: 9 | OpenBookQA is a question-answering dataset modeled after open-book exams for 10 | assessing human understanding of a subject. It contains multiple-choice 11 | questions that require combining facts from a given open book with broad common 12 | knowledge. The task tests language models' ability to leverage provided 13 | information and apply common sense reasoning. 14 | 15 | languages: 16 | english 17 | 18 | tags: 19 | multiple-choice, qa 20 | 21 | paper: 22 | https://arxiv.org/abs/1809.02789 23 | """ 24 | 25 | from string import ascii_uppercase 26 | 27 | from lighteval.metrics.metrics import Metrics 28 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 29 | from lighteval.tasks.requests import Doc 30 | 31 | 32 | def openbookqa_prompt(line, task_name: str = None): 33 | query = "The following are multiple choice questions (with answers) about common sense.\n" 34 | query += f"Question: {line['question_stem']}\n" 35 | query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, line["choices"]["text"])]) 36 | query += "Answer: " 37 | 38 | gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip()) 39 | return Doc( 40 | task_name=task_name, 41 | query=query, 42 | choices=list(ascii_uppercase[: len(line["choices"]["text"])]), 43 | gold_index=gold_ix, 44 | instruction="The following are multiple choice questions (with answers) about common sense.\n", 45 | ) 46 | 47 | 48 | openbookqa = LightevalTaskConfig( 49 | name="openbookqa", 50 | prompt_function=openbookqa_prompt, 51 | hf_repo="allenai/openbookqa", 52 | hf_subset="main", 53 | hf_avail_splits=["train", "test", "validation"], 54 | evaluation_splits=["validation", "test"], 55 | few_shots_split=None, 56 | few_shots_select=None, 57 | generation_size=1, 58 | metrics=[ 59 | Metrics.exact_match, 60 | ], 61 | stop_sequence=["\n"], 62 | version=0, 63 | ) 64 | 65 | TASKS_TABLE = [ 66 | openbookqa, 67 | ] 68 | -------------------------------------------------------------------------------- /docs/source/caching.mdx: -------------------------------------------------------------------------------- 1 | # Caching System 2 | 3 | Lighteval includes a caching system that can significantly speed up evaluations by storing and reusing model predictions. 4 | This is especially useful when running the same evaluation multiple times, or comparing different evaluation metrics on the same model outputs. 5 | 6 | ## How It Works 7 | 8 | The caching system caches the predictions of the model for now (we will add tokenized input caching later). 9 | It stores model responses objects (generations, logits, probabilities) for evaluation samples. 10 | 11 | ### Cache Structure 12 | 13 | Cached data is stored on disk using HuggingFace datasets in the following structure: 14 | 15 | ``` 16 | .cache/ 17 | └── huggingface/ 18 | └── lighteval/ 19 | └── predictions/ 20 | └── {model_name}/ 21 | └── {model_hash}/ 22 | └── {task_name}.parquet 23 | ``` 24 | 25 | Where: 26 | - `model_name`: The model name (path on the hub or local path) 27 | - `model_hash`: Hash of the model configuration to ensure cache invalidation when parameters change 28 | - `task_name`: Name of the evaluation task 29 | 30 | ### Cache Recreation 31 | 32 | A new cache is automatically created when: 33 | - Model configuration changes (different parameters, quantization, etc.) 34 | - Model weights change (different revision, checkpoint, etc.) 35 | - Generation parameters change (temperature, max_tokens, etc.) 36 | 37 | This ensures that cached results are always consistent with your current model setup. 38 | 39 | ## Using Caching 40 | 41 | ### Automatic Caching 42 | 43 | All built-in model classes in Lighteval automatically support caching. No additional configuration is needed. 44 | For custom models you need to add a cache to the model class and decorators on all functions. 45 | 46 | ## Cache Management 47 | 48 | ### Clearing Cache 49 | 50 | To clear cache for a specific model, delete the corresponding directory: 51 | 52 | ```bash 53 | rm -rf ~/.cache/huggingface/lighteval/predictions/{model_name}/{model_hash}/ 54 | ``` 55 | 56 | To clear all caches: 57 | 58 | ```bash 59 | rm -rf ~/.cache/huggingface/lighteval/predictions 60 | ``` 61 | -------------------------------------------------------------------------------- /src/lighteval/tasks/multilingual/tasks/openbook_es.py: -------------------------------------------------------------------------------- 1 | """ 2 | name: 3 | Openbook Es 4 | 5 | dataset: 6 | BSC-LT/openbookqa-es 7 | 8 | abstract: 9 | Spanish version of OpenBookQA from BSC Language Technology group 10 | 11 | languages: 12 | spanish 13 | 14 | tags: 15 | multilingual, multiple-choice, reasoning 16 | 17 | paper: 18 | https://huggingface.co/datasets/BSC-LT/openbookqa-es 19 | """ 20 | 21 | from string import ascii_uppercase 22 | 23 | from lighteval.metrics.dynamic_metrics import ( 24 | LogLikelihoodAccMetric, 25 | ) 26 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm 27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig 28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation 29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function 30 | from lighteval.tasks.templates.utils.formulation import ( 31 | CFFormulation, 32 | HybridFormulation, 33 | MCFFormulation, 34 | ) 35 | from lighteval.utils.language import Language 36 | 37 | 38 | TASKS_TABLE = [ 39 | LightevalTaskConfig( 40 | name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}", 41 | prompt_function=get_mcq_prompt_function( 42 | Language.SPANISH, 43 | lambda line: { 44 | "question": line["question_stem"], 45 | "choices": line["choices"]["text"], 46 | "gold_idx": ascii_uppercase.index(line["answerKey"]), 47 | }, 48 | formulation=formulation, 49 | ), 50 | hf_repo="BSC-LT/openbookqa-es", 51 | hf_subset="default", 52 | evaluation_splits=("test",), 53 | few_shots_split="validation", 54 | metrics=get_metrics_for_formulation( 55 | formulation, 56 | [ 57 | LogLikelihoodAccMetric(normalization=LogProbTokenNorm()), 58 | LogLikelihoodAccMetric(normalization=LogProbCharNorm()), 59 | ], 60 | ), 61 | ) 62 | for formulation in [ 63 | MCFFormulation(), 64 | CFFormulation(), 65 | HybridFormulation(), 66 | ] 67 | ] 68 | --------------------------------------------------------------------------------