├── src
    └── lighteval
    │   ├── py.typed
    │   ├── tasks
    │       ├── tasks
    │       │   ├── jeopardy.py
    │       │   ├── pubmedqa.py
    │       │   ├── quac.py
    │       │   ├── natural_questions.py
    │       │   ├── toxigen.py
    │       │   ├── coqa.py
    │       │   ├── real_toxicity_prompts.py
    │       │   ├── prost.py
    │       │   ├── narrativeqa.py
    │       │   ├── legalsupport.py
    │       │   ├── sciq.py
    │       │   ├── qasper.py
    │       │   ├── webqs.py
    │       │   ├── aimo.py
    │       │   ├── asdiv.py
    │       │   ├── twitterAAE.py
    │       │   ├── logiqa.py
    │       │   ├── winogrande.py
    │       │   ├── swag.py
    │       │   ├── med_dialog.py
    │       │   ├── piqa.py
    │       │   ├── hellaswag.py
    │       │   ├── storycloze.py
    │       │   ├── squad_v2.py
    │       │   ├── mathqa.py
    │       │   ├── triviaqa.py
    │       │   ├── simpleqa.py
    │       │   ├── entity_data_imputation.py
    │       │   ├── covid_dialogue.py
    │       │   ├── babi_qa.py
    │       │   └── openbookqa.py
    │       ├── templates
    │       │   ├── __init__.py
    │       │   └── utils
    │       │   │   ├── __init__.py
    │       │   │   └── adapter_utils.py
    │       ├── multilingual
    │       │   ├── __init__.py
    │       │   ├── utils
    │       │   │   ├── __init__.py
    │       │   │   └── task_utils.py
    │       │   └── tasks
    │       │   │   ├── cmath.py
    │       │   │   ├── chegeka.py
    │       │   │   ├── french_triviqa.py
    │       │   │   ├── tquad_v2.py
    │       │   │   ├── thaiqa.py
    │       │   │   ├── kenswquad.py
    │       │   │   ├── french_boolq.py
    │       │   │   ├── fquad_v2.py
    │       │   │   ├── cmrc2018.py
    │       │   │   ├── sber_squad.py
    │       │   │   ├── chinese_squad.py
    │       │   │   ├── squad_it.py
    │       │   │   ├── arcd.py
    │       │   │   ├── squad_es.py
    │       │   │   ├── faquad.py
    │       │   │   ├── germanquad.py
    │       │   │   ├── hindi_boolq.py
    │       │   │   ├── mintaka.py
    │       │   │   ├── mgsm.py
    │       │   │   ├── soqal.py
    │       │   │   ├── thai_exams.py
    │       │   │   ├── hellaswag_tel.py
    │       │   │   ├── arabic_arc.py
    │       │   │   ├── hellaswag_hin.py
    │       │   │   ├── tydiqa.py
    │       │   │   ├── afri_mgsm.py
    │       │   │   └── openbook_es.py
    │       └── __init__.py
    │   ├── utils
    │       └── __init__.py
    │   ├── metrics
    │       └── imports
    │       │   └── __init__.py
    │   └── __init__.py
├── docs
    └── source
    │   ├── package_reference
    │       ├── doc.mdx
    │       ├── evaluation_tracker.mdx
    │       ├── models_outputs.mdx
    │       ├── pipeline.mdx
    │       ├── logging.mdx
    │       ├── tasks.mdx
    │       └── models.mdx
    │   ├── available-tasks.mdx
    │   ├── _toctree.yml
    │   └── caching.mdx
├── MANIFEST.in
├── examples
    ├── tasks
    │   ├── serbian_task_group
    │   │   ├── sr_all_inclusive.txt
    │   │   ├── sr_custom_task.txt
    │   │   ├── sr_qa_knowledge.txt
    │   │   ├── sr_arc.txt
    │   │   ├── sr_mmlu_business_professional.txt
    │   │   ├── sr_commonsense_reasoning.txt
    │   │   ├── sr_mmlu_social_sciences.txt
    │   │   ├── sr_mmlu_ethics_philosophy.txt
    │   │   ├── sr_misc.txt
    │   │   ├── sr_mmlu_math_logic.txt
    │   │   ├── sr_mmlu_college_level.txt
    │   │   └── sr_mmlu_high_school_level.txt
    │   ├── all_german_rag_evals.txt
    │   ├── fine_tasks
    │   │   ├── cf
    │   │   │   ├── th.txt
    │   │   │   ├── te.txt
    │   │   │   ├── fr.txt
    │   │   │   ├── tr.txt
    │   │   │   ├── sw.txt
    │   │   │   ├── hi.txt
    │   │   │   ├── ru.txt
    │   │   │   ├── zh.txt
    │   │   │   └── ar.txt
    │   │   └── mcf
    │   │   │   ├── th.txt
    │   │   │   ├── te.txt
    │   │   │   ├── fr.txt
    │   │   │   ├── tr.txt
    │   │   │   ├── sw.txt
    │   │   │   ├── hi.txt
    │   │   │   ├── ru.txt
    │   │   │   ├── zh.txt
    │   │   │   └── ar.txt
    │   ├── all_filipino_tasks.txt
    │   └── bbh.txt
    ├── model_configs
    │   ├── inference_providers.yaml
    │   ├── transformers_vlm_model.yaml
    │   ├── tgi_model.yaml
    │   ├── litellm_model.yaml
    │   ├── quantized_model.yaml
    │   ├── transformers_model.yaml
    │   ├── sglang_model_config.yaml
    │   ├── vllm_model_config.yaml
    │   ├── peft_model.yaml
    │   └── endpoint_model.yaml
    ├── nanotron
    │   └── lighteval_config_override_template.yaml
    └── test_tasks.txt
├── .gitattributes
├── Makefile
├── tests
    ├── reference_scores
    │   ├── harness_metrics.json
    │   ├── harness_prompts.json
    │   ├── Qwen2.5-VL-3B-Instruct-results-vlm.json
    │   ├── Qwen2.5-VL-7B-Instruct-results-vlm.json
    │   ├── SmolLM2-1.7B-Instruct-results-vllm.json
    │   └── SmolLM2-1.7B-Instruct-results-accelerate.json
    ├── reference_details
    │   ├── SmolLM2-1.7B-Instruct-vllm
    │   │   ├── details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_hellaswag|10_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet
    │   │   ├── details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet
    │   │   └── details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet
    │   ├── SmolLM2-1.7B-Instruct-transformers
    │   │   ├── details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_hellaswag|10_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet
    │   │   ├── details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet
    │   │   └── details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet
    │   └── Qwen2.5-VL-3B-Instruct-vlm
    │   │   └── details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet
    ├── unit
    │   ├── metrics
    │   │   ├── pytest.ini
    │   │   └── test_cases
    │   │   │   ├── rouge1.json
    │   │   │   ├── simpleqa_judge.json
    │   │   │   ├── bert_score.json
    │   │   │   ├── bits_per_byte.json
    │   │   │   ├── byte_perplexity.json
    │   │   │   ├── expr_gold_metric.json
    │   │   │   ├── prediction_perplexity.json
    │   │   │   ├── mcc.json
    │   │   │   ├── exact_match.json
    │   │   │   ├── acc_golds_likelihood.json
    │   │   │   ├── avg_at_k_math.json
    │   │   │   ├── pass_at_k_math.json
    │   │   │   ├── avg_at_k.json
    │   │   │   └── drop.json
    │   └── models
    │   │   ├── test_base_model.py
    │   │   └── test_abstract_model.py
    ├── conftest.py
    ├── __init__.py
    └── slow_tests
    │   └── __init__.py
├── .github
    ├── workflows
    │   ├── pr_style_bot.yaml
    │   ├── trufflehog.yml
    │   ├── doc-build.yml
    │   ├── doc-pr-build.yml
    │   ├── doc-pr-upload.yml
    │   ├── quality.yaml
    │   ├── slow_tests.yaml
    │   └── tests.yaml
    ├── ISSUE_TEMPLATE
    │   ├── evaluation-task-request.md
    │   ├── feature-request.md
    │   └── bug_report.md
    └── release.yml
├── LICENSE
├── setup.py
└── .pre-commit-config.yaml


/src/lighteval/py.typed:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/doc.mdx:
--------------------------------------------------------------------------------
1 | # Doc
2 | 
3 | [[autodoc]] tasks.requests.Doc
4 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include src/lighteval/tasks/tasks_table.jsonl
2 | include src/lighteval/metrics/*.jsonl
3 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_all_inclusive.txt:
--------------------------------------------------------------------------------
1 | # MMLU (All-inclusive Task Entry)
2 | community|serbian_evals:mmlu|0
3 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_custom_task.txt:
--------------------------------------------------------------------------------
1 | # Serbian Evaluations - Custom/Other Task
2 | community|serbian_evals:oz_eval|0
3 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/evaluation_tracker.mdx:
--------------------------------------------------------------------------------
1 | # EvaluationTracker
2 | 
3 | [[autodoc]] logging.evaluation_tracker.EvaluationTracker
4 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_qa_knowledge.txt:
--------------------------------------------------------------------------------
1 | # Question Answering and Knowledge
2 | community|serbian_evals:boolq|0
3 | community|serbian_evals:openbook|0
4 | 


--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | *.json filter=lfs diff=lfs merge=lfs -text
2 | tests/unit/metrics/test_cases/*.json -filter -diff -merge text
3 | *.parquet filter=lfs diff=lfs merge=lfs -text
4 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: style format
 2 | 
 3 | 
 4 | style:
 5 | 	ruff format .
 6 | 	ruff check --fix .
 7 | 
 8 | 
 9 | quality:
10 | 	ruff format --check .
11 | 	ruff check .
12 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_arc.txt:
--------------------------------------------------------------------------------
1 | # Serbian Evaluations - ARC (AI2 Reasoning Challenge)
2 | community|serbian_evals:arc_easy|0
3 | community|serbian_evals:arc_challenge|0
4 | 


--------------------------------------------------------------------------------
/tests/reference_scores/harness_metrics.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c2080305011a7ac8b0895ec1fbb26b45af4e3dced6272abf67156ebf57656f88
3 | size 48360080
4 | 


--------------------------------------------------------------------------------
/tests/reference_scores/harness_prompts.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:059a48631d4243cda36d067db50350639c12b0a88fb209f76bbcd0c3ff266ffb
3 | size 20244711
4 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_business_professional.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Business Professional)
2 | community|serbian_evals:mmlu_marketing|0
3 | community|serbian_evals:mmlu_manadzment|0
4 | 


--------------------------------------------------------------------------------
/tests/reference_scores/Qwen2.5-VL-3B-Instruct-results-vlm.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5b38f65703ddd426111ba45e6f6b8b82ee2049c7e754e977a5c6269aa2d94ade
3 | size 3968
4 | 


--------------------------------------------------------------------------------
/tests/reference_scores/Qwen2.5-VL-7B-Instruct-results-vlm.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d07d8341188999f359a530e1dae4cd8ec3936d4046232a68b90a56c9f2994b3c
3 | size 3083
4 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_commonsense_reasoning.txt:
--------------------------------------------------------------------------------
1 | # Commonsense Reasoning
2 | community|serbian_evals:hellaswag|0
3 | community|serbian_evals:piqa|0
4 | community|serbian_evals:winogrande|0
5 | 


--------------------------------------------------------------------------------
/tests/reference_scores/SmolLM2-1.7B-Instruct-results-vllm.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4dcc899c5963df3e98cc9d144f3c582edda227d8d9e2c24fabc1f794a4fab524
3 | size 47986
4 | 


--------------------------------------------------------------------------------
/tests/reference_scores/SmolLM2-1.7B-Instruct-results-accelerate.json:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:55b420e5ff6061d2d2d66c3f9ce8cab541820766b1bd7becc0d7b290b99144b6
3 | size 47858
4 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/models_outputs.mdx:
--------------------------------------------------------------------------------
1 | # Model's Output
2 | 
3 | All models will generate an ouput per Doc supplied to the `generation` or `loglikelihood` fuctions.
4 | 
5 | [[autodoc]] lighteval.models.model_output.ModelResponse
6 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_gsm8k_test|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df34c40c43eeea4355e86ec505b053db421189b2082c670409e66d93defdd0d1
3 | size 39054
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_hellaswag|10_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:973fa1740490bf212831075ac9842dd88a31db7aa422e240c01eafb840979207
3 | size 88599
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-ar|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:29588411e9390fe550e3ca353e0d7c89e381d25673ced35399f5896e0c613216
3 | size 50719
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-lr|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:450f5a17118613189a749f0fd9f7807265b43733482367e969b04ae7971a749c
3 | size 55774
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:bd4eed73faaf58a18a302a1d2f0c8b8b8e1fbd482a5fd4ca77c375f1e3082f0e
3 | size 109931
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_arc:challenge|25_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7bab826310f526d7aaa9c5e15ff50314524d54847de37699b762adec3c57fb78
3 | size 144793
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_truthfulqa:mc|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ec46468169068183da1c57ace7064fcfa8664e4acad3a76f2d37e260468b67ee
3 | size 32367
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_gsm8k_test|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:419f252eddb158e185b515b39ca9e1784f7b4122a620a2a67034178bb1ea6abb
3 | size 35694
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_hellaswag|10_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:2c74caa34cd1c6b227a1d66dcda7a0c61986435f925f17cf81e676f8c542d146
3 | size 67250
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:aqua-rat|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6337b98efe5a6d10a02d4c13d78bcff056797c65999dbfb8ef5ab128f88fe4cf
3 | size 26482
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:logiqa-en|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1a998148dbb28f6826861479e8d9fc7bf7f73b0ab6921dc9a6da811e70eddba4
3 | size 45688
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:lsat-rc|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5fab7e1da1cdd0e8f66831b57bcda28b6350c7cf16c9905417746278a6f30f31
3 | size 148786
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:snarks|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:00a0ea645c10c6a8e0d55408f20bf59f95c3cd135afaebc5df0d1fbb89c3b93d
3 | size 37857
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/Qwen2.5-VL-3B-Instruct-vlm/details_mmmu_pro:standard-4|0_2025-11-05T15-23-34.026089.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5eda1dbcf8c9604005ce8c27239a57e5f41f852dbd3da13656d94b01b325f16c
3 | size 11538690
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:aqua-rat|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ef968329bee498b3387ec8df3677ca9bbac72e90599efbe7f78db23f4227b2f6
3 | size 21785
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:logiqa-en|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5bb00dd8c872d95dd5b2999d788ece8c34d43b5b5ea4ef8f0859ba535d7b8cbf
3 | size 34021
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-ar|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:161eb467544ca6273231945067a8d70aedb0e4e6c3eba4e8b40532cd1c37e6b0
3 | size 30662
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-lr|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:da3a3dcc9ae24c6f3bc80f0ce72b64d09a5ab19d7803b168a8f1ad540f7f66c1
3 | size 39332
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:lsat-rc|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fb3787ff3e796b49199e8d438a30ce768438b6d4fc5df5941e393bfd1fdf2ae6
3 | size 74124
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8a20696f4036b3a6e2b41b309f405d9a5445c4573150fc7e7bec15f28fd77bdf
3 | size 72441
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_arc:challenge|25_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:803a445f6d563c2d2097df7c7772cd81b3df0bacc9e702caaab5f0dde7fe5b25
3 | size 87624
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_truthfulqa:mc|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:d0886aca82f5687e1fd742c0c5b9fe046fb20622d9e67818d21d7300de27e746
3 | size 26034
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:navigate|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e62a25828dddf557091b9dcbc065f2c9e36fdf0c8d365dd1976584fc3f516eae
3 | size 34538
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:ruin_names|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:480bb8570ec01a173ebe85649989dc9a8ab64a4a2de2152d82bc344787bfffee
3 | size 34511
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:college_chemistry|5_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:fa71775ee3b585b1612f2bbbd8327ba4f564c9eddcdce63533fd6d11c67c5d95
3 | size 50977
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_mmlu:us_foreign_policy|5_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:c5cae6c52cee4a49e9cf28920ec303235a15f2b9f075b4f2deff65bd220aea77
3 | size 52510
4 | 


--------------------------------------------------------------------------------
/examples/tasks/all_german_rag_evals.txt:
--------------------------------------------------------------------------------
1 | community|german_rag_eval:choose_question_by_context|0
2 | community|german_rag_eval:choose_context_by_question|0
3 | community|german_rag_eval:question_answer_match|0
4 | community|german_rag_eval:context_question_match|0
5 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:navigate|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:6baa807a3c85b3ca43708f5810f3898dd517db806ce4106124d4913b0fcea8b0
3 | size 28834
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:snarks|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:13659b3b6bcd5c2744fc3b33d8752589a1f6c52b2ed8ee17c6a3a4f28cd46908
3 | size 29149
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:college_chemistry|5_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:25ea151a418829d528da4f762102052308b1cbb15b00d7190d5d0b9fd033436d
3 | size 37684
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_mmlu:us_foreign_policy|5_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:df6206dba3812d089f03078b97d067eb851eee03fd2aa295cbac2551f82837c0
3 | size 38453
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_agieval:sat-en-without-passage|0_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:61c1a8942675c523f499e19b01432fb0ce7b0cb8bbd910fe0a16b2b60bb7e80c
3 | size 32704
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:causal_judgment|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4b020f10c882cb482d6e2ac597e14d662a5d38f873f5f7eada8c839c5e13b476
3 | size 72052
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:5ecdb28b097666cd4ccb0a208708a2076779e06145719ebd24e75060a272bdcf
3 | size 49571
4 | 


--------------------------------------------------------------------------------
/examples/model_configs/inference_providers.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "meta-llama/Llama-3.1-8B-Instruct"
3 |   provider: "nebius"
4 |   timeout: null
5 |   proxies: null
6 |   parallel_calls_count: 20
7 |   generation_parameters:
8 |     temperature: 0.4
9 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:ruin_names|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:7fe3c64179df68407c6fe6c6ad75b3f4a83a1edd286ca81da3fe96bcb5b21e9b
3 | size 27971
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:date_understanding|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:a8a1f1ef9ba8e3a58d2538f2f2e016769155f2b6c18da49454849c8b276cd398
3 | size 39423
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:13fa9f4b64152c7140112b607f6dfddb5f9f755646bbef0b9cc5516a9c0e6de4
3 | size 38263
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:88e7d66c2396ab8a3f56ae9f4a342037a0f13f4ed83982312fdc7424eb74f60b
3 | size 36502
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:226b2d3fc783dcfecf3c634558746b1314f9f80a32a259c9fe174332fb1e3173
3 | size 50277
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_agieval:sat-en-without-passage|0_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:47204bfa1d6843f06ef6c08bb66d85adceab6457295f03303b7cd39bc7e4dd37
3 | size 25864
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:causal_judgment|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:eb844c74f574b377e4b27110dbdf0c28c227a96f4e8d1c0eac52578f4608bc49
3 | size 47558
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:date_understanding|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:b5ce79da0c3657667830882fa28ce623cb463bf5fb3c5e1367d6a5c13c480973
3 | size 30006
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:disambiguation_qa|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4d4919aa444d52a1589883329eb3fdbb583b029a6213d4af13aa17c11a835399
3 | size 30932
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:geometric_shapes|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:0dffe2c495874fa53e0289b2307161107c54e9d15c9a8aa39016c990f7d62f8f
3 | size 32464
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:temporal_sequences|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:23ffe95306670d3a737b30bf34866734dcba717742011a424cc0230377f52363
3 | size 34393
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:movie_recommendation|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:64975666dfc61cd3a3a515a88134775c0f90cff1e1b9120a8ab9c8861c68bb99
3 | size 29221
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:4582e35192caeae218a50aa76738582d360914fd96cc9a4c3608d3683c44c33a
3 | size 47557
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1a4b6292fb5df093df5ac43fba76b0af5b00337e0d2579a9c2b2f6398007b842
3 | size 56164
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:ed82ceb8a3c05ae2c47b1769b333173e15069eb83710bc5d66918abb4ef4b7e7
3 | size 69137
4 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/pipeline.mdx:
--------------------------------------------------------------------------------
 1 | # Pipeline
 2 | 
 3 | ## Pipeline
 4 | 
 5 | [[autodoc]] pipeline.Pipeline
 6 | 
 7 | ## PipelineParameters
 8 | 
 9 | [[autodoc]] pipeline.PipelineParameters
10 | 
11 | ## ParallelismManager
12 | 
13 | [[autodoc]] pipeline.ParallelismManager
14 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_social_sciences.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Social Sciences)
2 | community|serbian_evals:mmlu_globalne_cinjenice|0
3 | community|serbian_evals:mmlu_logicke_zablude|0
4 | community|serbian_evals:mmlu_sociologija|0
5 | community|serbian_evals:mmlu_human_aging|0
6 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_five_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1d10ce12e8f76b5ce3113273124e7683e5c5bddde6063cd3cbf25d495cffa6ba
3 | size 34653
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:logical_deduction_seven_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e15bcad77e0453d7e987b4bf5216639b625f9df63341dfce4246dab88b87ca35
3 | size 38176
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:f6dd8f8d104f1a4252685019e5413ce9ecfc4611bb819ff627e77be296afc581
3 | size 52493
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-vllm/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-52-08.352779.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:9651280724d245b37a7c3dde465c5a384de7b12055b9474696d533d58330b240
3 | size 59838
4 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_ethics_philosophy.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Ethics, Philosophy)
2 | community|serbian_evals:mmlu_moralni_sporovi|0
3 | community|serbian_evals:mmlu_moralne_dileme|0
4 | community|serbian_evals:mmlu_filozofija|0
5 | community|serbian_evals:mmlu_svetska_religija|0
6 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:salient_translation_error_detection|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:e34151ea0415cb442b47d334448abf127c8f1747da78a5a9977ff78ed2d831b5
3 | size 49337
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_five_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:1d7e589d611391395b2990a29e55bdd856ab440d45cba22fcd190936daf391dd
3 | size 34842
4 | 


--------------------------------------------------------------------------------
/tests/reference_details/SmolLM2-1.7B-Instruct-transformers/details_bigbench_hard:tracking_shuffled_objects_seven_objects|3_2025-11-05T14-43-47.148527.parquet:
--------------------------------------------------------------------------------
1 | version https://git-lfs.github.com/spec/v1
2 | oid sha256:8e116d939941d57db2e5515114bec4890b56b6a35a5a2e49c809e6361b947337
3 | size 37387
4 | 


--------------------------------------------------------------------------------
/examples/model_configs/transformers_vlm_model.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   model_name: "Qwen/Qwen2.5-VL-3B-Instruct"
 3 |   revision: "main"
 4 |   dtype: "float16"
 5 |   compile: false
 6 |   model_parallel: false
 7 |   batch_size: 1
 8 |   use_fast_image_processor: true
 9 |   generation_parameters:
10 |     temperature: 0.0
11 |     top_p: 0.9
12 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_misc.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Miscellaneous)
2 | community|serbian_evals:mmlu_anatomija|0
3 | community|serbian_evals:mmlu_astronomija|0
4 | community|serbian_evals:mmlu_poslovna_etika|0
5 | community|serbian_evals:mmlu_kliničko_znanje|0
6 | community|serbian_evals:mmlu_razno|0
7 | community|serbian_evals:mmlu_elektrotehnika|0
8 | 


--------------------------------------------------------------------------------
/examples/model_configs/tgi_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   inference_server_address: "http://localhost:8080" # Replace with your actual TGI server address
3 |   inference_server_auth: null
4 |   model_name: null # Optional, only required if the TGI container was launched with model_id pointing to a local directory
5 |   generation_parameters:
6 |     temperature: 0.1
7 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/th.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|meta_mmlu_tha_cf|0
 3 | lighteval|m3exams_tha_cf|0
 4 | 
 5 | # Reading Comprehension (RC)
 6 | lighteval|belebele_tha_Thai_cf|0
 7 | lighteval|thaiqa_tha|0
 8 | lighteval|xquad_tha|0
 9 | 
10 | # Natural Language Understanding (NLU)
11 | lighteval|community_hellaswag_tha_cf|0
12 | lighteval|xnli2.0_tha_cf|0
13 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/th.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|meta_mmlu_tha_mcf|5
 3 | lighteval|m3exams_tha_mcf|5
 4 | 
 5 | # Reading Comprehension (RC)
 6 | lighteval|belebele_tha_Thai_mcf|5
 7 | lighteval|thaiqa_tha|5
 8 | lighteval|xquad_tha|5
 9 | 
10 | # Natural Language Understanding (NLU)
11 | lighteval|community_hellaswag_tha_mcf|5
12 | lighteval|xnli2.0_tha_mcf|5
13 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_math_logic.txt:
--------------------------------------------------------------------------------
1 | # MMLU (Math, Logic)
2 | community|serbian_evals:mmlu_abstract_algebra|0
3 | community|serbian_evals:mmlu_osnovna_matematika|0
4 | community|serbian_evals:mmlu_formalna_logika|0
5 | community|serbian_evals:mmlu_konceptualna_fizika|0
6 | community|serbian_evals:mmlu_metrika_ekonomije|0
7 | community|serbian_evals:mmlu_masinsko_ucenje|0
8 | 


--------------------------------------------------------------------------------
/examples/model_configs/litellm_model.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   model_name: "openai/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B"
 3 |   provider: "openai"
 4 |   base_url: "https://router.huggingface.co/hf-inference/v1"
 5 |   generation_parameters:
 6 |     temperature: 0.5
 7 |     max_new_tokens: 256
 8 |     top_p: 0.9
 9 |     seed: 0
10 |     repetition_penalty: 1.0
11 |     frequency_penalty: 0.0
12 | 


--------------------------------------------------------------------------------
/.github/workflows/pr_style_bot.yaml:
--------------------------------------------------------------------------------
 1 | name: PR Style Bot
 2 | 
 3 | on:
 4 |   issue_comment:
 5 |     types: [created]
 6 | 
 7 | permissions:
 8 |   pull-requests: write
 9 | 
10 | jobs:
11 |   style:
12 |     uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
13 |     with:
14 |       python_quality_dependencies: "[quality]"
15 |     secrets:
16 |       bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }}
17 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/te.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|mlmm_mmlu_tel_cf|0
 3 | 
 4 | # Reading Comprehension (RC)
 5 | lighteval|belebele_tel_Telu_cf|0
 6 | lighteval|indicqa_tel|0
 7 | 
 8 | # Reasoning (RES)
 9 | lighteval|indicxcopa_tel_cf|0
10 | 
11 | # Natural Language Understanding (NLU)
12 | lighteval|community_hellaswag_tel_cf|0
13 | lighteval|indicnxnli_tel_cf|0
14 | lighteval|xstory_cloze_tel_cf|0
15 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/te.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|mlmm_mmlu_tel_mcf|5
 3 | 
 4 | # Reading Comprehension (RC)
 5 | lighteval|belebele_tel_Telu_mcf|5
 6 | lighteval|indicqa_tel|5
 7 | 
 8 | # Reasoning (RES)
 9 | lighteval|indicxcopa_tel_mcf|5
10 | 
11 | # Natural Language Understanding (NLU)
12 | lighteval|community_hellaswag_tel_mcf|5
13 | lighteval|indicnxnli_tel_mcf|5
14 | lighteval|xstory_cloze_tel_mcf|5
15 | 


--------------------------------------------------------------------------------
/examples/model_configs/quantized_model.yaml:
--------------------------------------------------------------------------------
1 | model_parameters:
2 |   model_name: "HuggingFaceH4/zephyr-7b-beta" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ...
3 |   revision: "main" # revision to use
4 |   dtype: "4bit"  # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
5 |   compile: true
6 |   batch_size: 1 # batch size to use
7 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_college_level.txt:
--------------------------------------------------------------------------------
1 | # MMLU (College Level Tasks)
2 | community|serbian_evals:mmlu_fakultet_biologija|0
3 | community|serbian_evals:mmlu_fakultet_hemija|0
4 | community|serbian_evals:mmlu_fakultet_racunari|0
5 | community|serbian_evals:mmlu_fakultet_matematika|0
6 | community|serbian_evals:mmlu_fakultet_medicina|0
7 | community|serbian_evals:mmlu_fakultet_fizika|0
8 | community|serbian_evals:mmlu_sigurnost_racunara|0
9 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/evaluation-task-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Evaluation/task request
 3 | about: Suggest a new evaluation you want us to add
 4 | title: "[EVAL]"
 5 | labels: new task
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Evaluation short description
11 | - Why is this evaluation interesting?
12 | - How used is it in the community?
13 | 
14 | ## Evaluation metadata
15 | Provide all available
16 | - Paper url:
17 | - Github url:
18 | - Dataset url:
19 | 


--------------------------------------------------------------------------------
/.github/workflows/trufflehog.yml:
--------------------------------------------------------------------------------
 1 | on:
 2 |   push:
 3 | 
 4 | name: Scan Secret Leaks
 5 | 
 6 | permissions:
 7 |   contents: read
 8 | 
 9 | jobs:
10 |   trufflehog:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |     - name: Checkout code
14 |       uses: actions/checkout@v4
15 |       with:
16 |         fetch-depth: 0
17 |     - name: Secret Scanning
18 |       uses: trufflesecurity/trufflehog@main
19 |       with:
20 |         extra_args: --only-verified
21 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/fr.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|meta_mmlu_fra_cf|0
 3 | lighteval|mlmm_arc_fra_cf:challenge|0
 4 | lighteval|mintaka_fra|0
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_fra_Latn_cf|0
 8 | lighteval|fquadv2_fra|0
 9 | 
10 | # Reasoning (RES)
11 | lighteval|xcodah_fra_cf|0
12 | lighteval|xcsqa_fra_cf|0
13 | 
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_fra_cf|0
16 | lighteval|xnli2.0_fra_cf|0
17 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/tr.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|community_arc_tur_cf:easy|0
 3 | lighteval|exams_tur_cf|0
 4 | lighteval|community_mmlu_tur_cf|0
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_tur_Latn_cf|0
 8 | lighteval|tquadv2_tur|0
 9 | lighteval|xquad_tur|0
10 | 
11 | # Reasoning (RES)
12 | lighteval|xcopa_tur_cf|0
13 | 
14 | # Natural Language Understanding (NLU)
15 | lighteval|community_hellaswag_tur_cf|0
16 | lighteval|xnli2.0_tur_cf|0
17 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/fr.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|meta_mmlu_fra_mcf|5
 3 | lighteval|mlmm_arc_fra_mcf:challenge|5
 4 | lighteval|mintaka_fra|5
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_fra_Latn_mcf|5
 8 | lighteval|fquadv2_fra|5
 9 | 
10 | # Reasoning (RES)
11 | lighteval|xcodah_fra_mcf|5
12 | lighteval|xcsqa_fra_mcf|5
13 | 
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_fra_mcf|5
16 | lighteval|xnli2.0_fra_mcf|5
17 | 


--------------------------------------------------------------------------------
/.github/workflows/doc-build.yml:
--------------------------------------------------------------------------------
 1 | name: Build Documentation
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - doc-builder*
 8 |       - v*-release
 9 | 
10 | jobs:
11 |   build:
12 |     uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
13 |     with:
14 |       commit_sha: ${{ github.sha }}
15 |       package: lighteval
16 |     secrets:
17 |       token: ${{ secrets.HUGGINGFACE_PUSH }}
18 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
19 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/tr.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|community_arc_tur_mcf:easy|5
 3 | lighteval|exams_tur_mcf|5
 4 | lighteval|community_mmlu_tur_mcf|5
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_tur_Latn_mcf|5
 8 | lighteval|tquadv2_tur|5
 9 | lighteval|xquad_tur|5
10 | 
11 | # Reasoning (RES)
12 | lighteval|xcopa_tur_mcf|5
13 | 
14 | # Natural Language Understanding (NLU)
15 | lighteval|community_hellaswag_tur_mcf|5
16 | lighteval|xnli2.0_tur_mcf|5
17 | 


--------------------------------------------------------------------------------
/.github/workflows/doc-pr-build.yml:
--------------------------------------------------------------------------------
 1 | name: Build PR Documentation
 2 | 
 3 | on:
 4 |   pull_request:
 5 | 
 6 | concurrency:
 7 |   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
 8 |   cancel-in-progress: true
 9 | 
10 | jobs:
11 |   build:
12 |     uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
13 |     with:
14 |       commit_sha: ${{ github.event.pull_request.head.sha }}
15 |       pr_number: ${{ github.event.number }}
16 |       package: lighteval
17 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/sw.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|community_arc_swa_cf:easy|0
 3 | lighteval|m3exams_swa_cf|0
 4 | lighteval|openai_mmlu_swa_cf|0
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_swh_Latn_cf|0
 8 | lighteval|kenswquad_swa|0
 9 | lighteval|tydiqa_swa|0
10 | 
11 | # Reasoning (RES)
12 | lighteval|xcsqa_swa_cf|0
13 | lighteval|xcopa_swa_cf|0
14 | 
15 | # Natural Language Understanding (NLU)
16 | lighteval|xnli2.0_swa_cf|0
17 | lighteval|xstory_cloze_swa_cf|0
18 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/sw.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|community_arc_swa_mcf:easy|5
 3 | lighteval|m3exams_swa_mcf|5
 4 | lighteval|openai_mmlu_swa_mcf|5
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_swh_Latn_mcf|5
 8 | lighteval|kenswquad_swa|5
 9 | lighteval|tydiqa_swa|5
10 | 
11 | # Reasoning (RES)
12 | lighteval|xcsqa_swa_mcf|5
13 | lighteval|xcopa_swa_mcf|5
14 | 
15 | # Natural Language Understanding (NLU)
16 | lighteval|xnli2.0_swa_mcf|5
17 | lighteval|xstory_cloze_swa_mcf|5
18 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/hi.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|meta_mmlu_hin_cf|0
 3 | lighteval|community_arc_hin_cf:easy|0
 4 | 
 5 | # Reading Comprehension (RC)
 6 | lighteval|belebele_hin_Deva_cf|0
 7 | lighteval|indicqa_hin|0
 8 | 
 9 | # Reasoning (RES)
10 | lighteval|xcodah_hin_cf|0
11 | lighteval|indicxcopa_hin_cf|0
12 | lighteval|xcsqa_hin_cf|0
13 | 
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_hin_cf|0
16 | lighteval|indicnxnli_hin_cf|0
17 | lighteval|xstory_cloze_hin_cf|0
18 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/logging.mdx:
--------------------------------------------------------------------------------
 1 | # Logging
 2 | 
 3 | ## EvaluationTracker
 4 | [[autodoc]] logging.evaluation_tracker.EvaluationTracker
 5 | 
 6 | ## GeneralConfigLogger
 7 | [[autodoc]] logging.info_loggers.GeneralConfigLogger
 8 | ## DetailsLogger
 9 | [[autodoc]] logging.info_loggers.DetailsLogger
10 | ## MetricsLogger
11 | [[autodoc]] logging.info_loggers.MetricsLogger
12 | ## VersionsLogger
13 | [[autodoc]] logging.info_loggers.VersionsLogger
14 | ## TaskConfigLogger
15 | [[autodoc]] logging.info_loggers.TaskConfigLogger
16 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/hi.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|meta_mmlu_hin_mcf|5
 3 | lighteval|community_arc_hin_mcf:easy|5
 4 | 
 5 | # Reading Comprehension (RC)
 6 | lighteval|belebele_hin_Deva_mcf|5
 7 | lighteval|indicqa_hin|5
 8 | 
 9 | # Reasoning (RES)
10 | lighteval|xcodah_hin_mcf|5
11 | lighteval|indicxcopa_hin_mcf|5
12 | lighteval|xcsqa_hin_mcf|5
13 | 
14 | # Natural Language Understanding (NLU)
15 | lighteval|mlmm_hellaswag_hin_mcf|5
16 | lighteval|indicnxnli_hin_mcf|5
17 | lighteval|xstory_cloze_hin_mcf|5
18 | 


--------------------------------------------------------------------------------
/examples/model_configs/transformers_model.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct"
 3 |   revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3"
 4 |   dtype: "float16"
 5 |   compile: false
 6 |   model_parallel: false
 7 |   batch_size: 1
 8 |   continuous_batching: false
 9 |   model_loading_kwargs:
10 |     attn_implementation: "eager"
11 |     #tp_plan: "auto"
12 |   generation_parameters:
13 |     #num_blocks: 4096
14 |     #block_size: 64
15 |     #max_new_tokens: 256
16 |     temperature: 0.0
17 |     top_p: 0.9
18 | 


--------------------------------------------------------------------------------
/.github/workflows/doc-pr-upload.yml:
--------------------------------------------------------------------------------
 1 | name: Upload PR Documentation
 2 | 
 3 | on:
 4 |   workflow_run:
 5 |     workflows: ["Build PR Documentation"]
 6 |     types:
 7 |       - completed
 8 | 
 9 | jobs:
10 |   build:
11 |     uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
12 |     with:
13 |       package_name: lighteval
14 |     secrets:
15 |       hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
16 |       comment_bot_app_id: ${{ secrets.COMMENT_BOT_APP_ID }}
17 |       comment_bot_secret_pem: ${{ secrets.COMMENT_BOT_SECRET_PEM }}
18 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/pytest.ini:
--------------------------------------------------------------------------------
 1 | [tool:pytest]
 2 | testpaths = .
 3 | python_files = test_*.py
 4 | python_classes = Test*
 5 | python_functions = test_*
 6 | addopts =
 7 |     -v
 8 |     --tb=short
 9 |     --strict-markers
10 |     --disable-warnings
11 | markers =
12 |     slow: marks tests as slow (deselect with '-m "not slow"')
13 |     unit: marks tests as unit tests
14 |     integration: marks tests as integration tests
15 |     automated: marks tests as automated metric tests
16 | filterwarnings =
17 |     ignore::DeprecationWarning
18 |     ignore::PendingDeprecationWarning
19 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/ru.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|mlmm_arc_rus_cf:challenge|0
 3 | lighteval|rummlu_rus_cf|0
 4 | lighteval|mera_openbookqa_rus_cf|0
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_rus_Cyrl_cf|0
 8 | lighteval|tydiqa_rus|0
 9 | lighteval|sber_squad_rus|0
10 | lighteval|xquad_rus|0
11 | 
12 | # Reasoning (RES)
13 | lighteval|parus_rus_cf|0
14 | lighteval|xcodah_rus_cf|0
15 | lighteval|xcsqa_rus_cf|0
16 | 
17 | # Natural Language Understanding (NLU)
18 | lighteval|mlmm_hellaswag_rus_cf|0
19 | lighteval|xnli2.0_rus_cf|0
20 | lighteval|xstory_cloze_rus_cf|0
21 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: "[FT] "
 5 | labels: feature request
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Issue encountered
11 | Is your feature request related to a problem? Please provide a clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | ## Solution/Feature
14 | A clear and concise description of what you want to happen.
15 | 
16 | ## Possible alternatives
17 | A clear and concise description of any alternative solutions or features you've considered.
18 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/ru.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|mlmm_arc_rus_mcf:challenge|5
 3 | lighteval|rummlu_rus_mcf|5
 4 | lighteval|mera_openbookqa_rus_mcf|5
 5 | 
 6 | # Reading Comprehension (RC)
 7 | lighteval|belebele_rus_Cyrl_mcf|5
 8 | lighteval|tydiqa_rus|5
 9 | lighteval|sber_squad_rus|5
10 | lighteval|xquad_rus|5
11 | 
12 | # Reasoning (RES)
13 | lighteval|parus_rus_mcf|0
14 | lighteval|xcodah_rus_mcf|5
15 | lighteval|xcsqa_rus_mcf|5
16 | 
17 | # Natural Language Understanding (NLU)
18 | lighteval|mlmm_hellaswag_rus_mcf|0
19 | lighteval|xnli2.0_rus_mcf|5
20 | lighteval|xstory_cloze_rus_mcf|5
21 | 


--------------------------------------------------------------------------------
/.github/release.yml:
--------------------------------------------------------------------------------
 1 | changelog:
 2 |   exclude:
 3 |     labels:
 4 |       - ignore-for-release
 5 |   categories:
 6 |     - title: New Features 🎉
 7 |       labels:
 8 |         - feature
 9 |     - title: Enhancement ⚙️
10 |       labels:
11 |         - enhancement
12 |     - title: Documentation 📚
13 |       labels:
14 |         - documentation
15 |     - title: New Tasks
16 |       labels:
17 |         - new-task
18 |     - title: Task and Metrics changes 🛠️
19 |       labels:
20 |         - task-update
21 |     - title: Bug Fixes 🐛
22 |       labels:
23 |         - bug
24 |     - title: Other Changes
25 |       labels:
26 |         - "*"
27 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/zh.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|agieval_zho_cf|0
 3 | lighteval|ceval_zho_cf|0
 4 | lighteval|cmmlu_zho_cf|0
 5 | lighteval|m3exams_zho_cf|0
 6 | 
 7 | # Reading Comprehension (RC)
 8 | lighteval|belebele_zho_Hans_cf|0
 9 | lighteval|c3_zho_cf|0
10 | lighteval|cmrc2018_zho|0
11 | lighteval|chinese_squad_zho|0
12 | 
13 | # Reasoning (RES)
14 | lighteval|xcodah_zho_cf|0
15 | lighteval|xcopa_zho_cf|0
16 | lighteval|xcsqa_zho_cf|0
17 | 
18 | # Natural Language Understanding (NLU)
19 | lighteval|mlmm_hellaswag_zho_cf|0
20 | lighteval|ocnli_zho_cf|0
21 | lighteval|xwinograd_zho_cf|0
22 | lighteval|xstory_cloze_zho_cf|0
23 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/zh.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|agieval_zho_mcf|5
 3 | lighteval|ceval_zho_mcf|5
 4 | lighteval|cmmlu_zho_mcf|5
 5 | lighteval|m3exams_zho_mcf|5
 6 | 
 7 | # Reading Comprehension (RC)
 8 | lighteval|belebele_zho_Hans_mcf|5
 9 | lighteval|c3_zho_mcf|5
10 | lighteval|cmrc2018_zho|5
11 | lighteval|chinese_squad_zho|5
12 | 
13 | # Reasoning (RES)
14 | lighteval|xcodah_zho_mcf|5
15 | lighteval|xcopa_zho_mcf|5
16 | lighteval|xcsqa_zho_mcf|5
17 | 
18 | # Natural Language Understanding (NLU)
19 | lighteval|mlmm_hellaswag_zho_mcf|5
20 | lighteval|ocnli_zho_mcf|5
21 | lighteval|xwinograd_zho_mcf|5
22 | lighteval|xstory_cloze_zho_mcf|5
23 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/tasks.mdx:
--------------------------------------------------------------------------------
 1 | # Tasks
 2 | 
 3 | ## LightevalTask
 4 | ### LightevalTaskConfig
 5 | [[autodoc]] tasks.lighteval_task.LightevalTaskConfig
 6 | ### LightevalTask
 7 | [[autodoc]] tasks.lighteval_task.LightevalTask
 8 | 
 9 | ## PromptManager
10 | [[autodoc]] tasks.prompt_manager.PromptManager
11 | 
12 | ## Registry
13 | [[autodoc]] tasks.registry.Registry
14 | 
15 | ## Doc
16 | [[autodoc]] tasks.requests.Doc
17 | 
18 | ## Datasets
19 | [[autodoc]] data.DynamicBatchDataset
20 | [[autodoc]] data.LoglikelihoodDataset
21 | [[autodoc]] data.GenerativeTaskDataset
22 | [[autodoc]] data.GenerativeTaskDatasetNanotron
23 | [[autodoc]] data.GenDistributedSampler
24 | 


--------------------------------------------------------------------------------
/examples/nanotron/lighteval_config_override_template.yaml:
--------------------------------------------------------------------------------
 1 | # As of right now auto batch size doesn't work, so we use some default
 2 | batch_size: 8
 3 | generation: null
 4 | logging:
 5 |   output_dir: "outputs"
 6 |   save_details: false
 7 |   push_to_hub: false
 8 |   public_run: false
 9 |   results_org: null
10 |   tensorboard_metric_prefix: "eval"
11 | parallelism:
12 |   dp: 1
13 |   pp: 1
14 |   pp_engine: 1f1b
15 |   tp: 1
16 |   tp_linear_async_communication: false
17 |   tp_mode: ALL_REDUCE
18 | tasks:
19 |   dataset_loading_processes: 8
20 |   max_samples: 10
21 |   multichoice_continuations_start_space: null
22 |   num_fewshot_seeds: null
23 |   tasks: lighteval|gsm8k|5
24 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/cf/ar.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|exams_ara_cf|0
 3 | lighteval|mmlu_ara_cf|0
 4 | lighteval|alghafa_arc_ara_cf:easy|0
 5 | lighteval|alghafa_sciqa_ara_cf|0
 6 | 
 7 | # Reading Comprehension (RC)
 8 | lighteval|belebele_arb_Arab_cf|0
 9 | lighteval|soqal_ara_cf|0
10 | lighteval|mlqa_ara|0
11 | lighteval|tydiqa_ara|0
12 | lighteval|alghafa_race_ara_cf|0
13 | lighteval|arcd_ara|0
14 | 
15 | # Reasoning (RES)
16 | lighteval|xcodah_ara_cf|0
17 | lighteval|alghafa_piqa_ara_cf|0
18 | lighteval|xcsqa_ara_cf|0
19 | 
20 | # Natural Language Understanding (NLU)
21 | lighteval|xnli2.0_ara_cf|0
22 | lighteval|mlmm_hellaswag_ara_cf|0
23 | lighteval|xstory_cloze_ara_cf|0
24 | 


--------------------------------------------------------------------------------
/examples/tasks/fine_tasks/mcf/ar.txt:
--------------------------------------------------------------------------------
 1 | # General Knowledge (GK)
 2 | lighteval|exams_ara_mcf|5
 3 | lighteval|mmlu_ara_mcf|5
 4 | lighteval|alghafa_arc_ara_mcf:easy|5
 5 | lighteval|alghafa_sciqa_ara_mcf|5
 6 | 
 7 | # Reading Comprehension (RC)
 8 | lighteval|belebele_arb_Arab_mcf|5
 9 | lighteval|soqal_ara_mcf|5
10 | lighteval|mlqa_ara|5
11 | lighteval|tydiqa_ara|5
12 | lighteval|alghafa_race_ara_mcf|5
13 | lighteval|arcd_ara|5
14 | 
15 | # Reasoning (RES)
16 | lighteval|xcodah_ara_mcf|5
17 | lighteval|alghafa_piqa_ara_mcf|5
18 | lighteval|xcsqa_ara_mcf|5
19 | 
20 | # Natural Language Understanding (NLU)
21 | lighteval|xnli2.0_ara_mcf|5
22 | lighteval|mlmm_hellaswag_ara_mcf|5
23 | lighteval|xstory_cloze_ara_mcf|5
24 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve lighteval!
 4 | title: "[BUG] "
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | ## Describe the bug
11 | A clear and concise description of what the bug is.
12 | 
13 | ## To Reproduce
14 | Please provide all the steps needed to reproduce the behavior, or provide a minimal working example if needed. We will ignore issues missing this section.
15 | 
16 | ## Expected behavior
17 | A clear and concise description of what you expected to happen.
18 | 
19 | ## Version info
20 | Please provide your operating system, lighteval version or commit if you installed from main, and pip/conda environment if your problem concerns dependencies.
21 | 


--------------------------------------------------------------------------------
/tests/conftest.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | import pytest
 6 | 
 7 | 
 8 | def pytest_addoption(parser):
 9 |     parser.addoption("--runslow", action="store_true", default=False, help="run slow tests")
10 | 
11 | 
12 | def pytest_configure(config):
13 |     config.addinivalue_line("markers", "slow: mark test as slow to run")
14 | 
15 | 
16 | def pytest_collection_modifyitems(config, items):
17 |     if config.getoption("--runslow"):
18 |         # --runslow given in cli: do not skip slow tests
19 |         return
20 |     skip_slow = pytest.mark.skip(reason="need --runslow option to run")
21 |     for item in items:
22 |         if "slow" in item.keywords:
23 |             item.add_marker(skip_slow)
24 | 


--------------------------------------------------------------------------------
/examples/tasks/serbian_task_group/sr_mmlu_high_school_level.txt:
--------------------------------------------------------------------------------
 1 | # MMLU (High School Level Tasks)
 2 | community|serbian_evals:mmlu_srednja_skola_biologija|0
 3 | community|serbian_evals:mmlu_srednja_skola_hemija|0
 4 | community|serbian_evals:mmlu_srednja_skola_racunari|0
 5 | community|serbian_evals:mmlu_srednja_skola_istorija_evrope|0
 6 | community|serbian_evals:mmlu_srednja_skola_geografija|0
 7 | community|serbian_evals:mmlu_srednja_skola_matematika|0
 8 | community|serbian_evals:mmlu_srednja_skola_mikroekonomija|0
 9 | community|serbian_evals:mmlu_srednja_skola_fizika|0
10 | community|serbian_evals:mmlu_srednja_skola_psihologija|0
11 | community|serbian_evals:mmlu_srednja_skola_statistika|0
12 | community|serbian_evals:mmlu_srednja_skola_svetska_istorija|0
13 | 


--------------------------------------------------------------------------------
/.github/workflows/quality.yaml:
--------------------------------------------------------------------------------
 1 | name: Quality
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - v*-release
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 | 
12 | jobs:
13 | 
14 |   check_code_quality:
15 |     name: Check code quality
16 |     runs-on: ubuntu-latest
17 |     steps:
18 |       - name: Checkout code
19 |         uses: actions/checkout@v2
20 |       - name: Setup Python environment
21 |         uses: actions/setup-python@v2
22 |         with:
23 |           python-version: '3.10'
24 |       - name: Install dependencies
25 |         run: |
26 |           python -m pip install --upgrade pip
27 |           python -m pip install ".[quality]"
28 |       - name: Code quality
29 |         run: |
30 |           make quality
31 | 


--------------------------------------------------------------------------------
/docs/source/available-tasks.mdx:
--------------------------------------------------------------------------------
 1 | # Available tasks
 2 | 
 3 | Browse and inspect tasks available in LightEval.
 4 | <iframe
 5 | 	src="https://openevals-open-benchmark-index.hf.space"
 6 | 	frameborder="0"
 7 | 	width="850"
 8 | 	height="450"
 9 | ></iframe>
10 | 
11 | 
12 | 
13 | List all tasks:
14 | 
15 | ```bash
16 | lighteval tasks list
17 | ```
18 | 
19 | Extract tasks details:
20 | 
21 | ```bash
22 | lighteval tasks dump
23 | ```
24 | 
25 | Store the tasks details in a JSON file:
26 | 
27 | ```bash
28 | lighteval tasks dump > tasks.json
29 | ```
30 | 
31 | ### Inspect specific tasks
32 | 
33 | Inspect a task to view its config, metrics, and requirements:
34 | 
35 | ```bash
36 | lighteval tasks inspect <task_name>
37 | ```
38 | 
39 | Example:
40 | ```bash
41 | lighteval tasks inspect truthfulqa:mc
42 | ```
43 | 


--------------------------------------------------------------------------------
/examples/model_configs/sglang_model_config.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   model_name: "HuggingFaceTB/SmolLM-1.7B-Instruct"
 3 |   dtype: "auto"
 4 |   tp_size: 1
 5 |   dp_size: 1
 6 |   context_length: null
 7 |   random_seed: 1
 8 |   trust_remote_code: False
 9 |   device: "cuda"
10 |   skip_tokenizer_init: False
11 |   kv_cache_dtype: "auto"
12 |   add_special_tokens: True
13 |   pairwise_tokenization: False
14 |   sampling_backend: null
15 |   attention_backend: null
16 |   mem_fraction_static: 0.8
17 |   chunked_prefill_size: 4096
18 |   generation_parameters:
19 |     max_new_tokens: 1024
20 |     min_new_tokens: 0
21 |     temperature: 1.0
22 |     top_k: 50
23 |     min_p: 0.0
24 |     top_p: 1.0
25 |     presence_penalty: 0.0
26 |     repetition_penalty: 1.0
27 |     frequency_penalty: 0.0
28 | metrics_options:
29 |   yo: null
30 | 


--------------------------------------------------------------------------------
/examples/tasks/all_filipino_tasks.txt:
--------------------------------------------------------------------------------
 1 | community|readability_ceb_mcf|0
 2 | community|kalahi_tgl_mcf|0
 3 | community|kalahi_tgl_hybrid|0
 4 | community|cebuaner_ceb_mcf|0
 5 | community|universalner_tgl_mcf|0
 6 | community|universalner_ceb_mcf|0
 7 | community|tlunifiedner_tgl_mcf|0
 8 | community|stingraybench_correctness_tgl_mcf|0
 9 | community|stingraybench_semantic_appropriateness_tgl_mcf|0
10 | community|tatoeba_ceb|0
11 | community|tatoeba_tgl|0
12 | community|ntrex128_fil|0
13 | community|tico19_tgl|0
14 | community|dengue_filipino_fil|0
15 | community|include_tgl_mcf|0
16 | community|newsphnli_fil_mcf|0
17 | community|belebele_ceb_mcf|0
18 | community|belebele_fil_mcf|0
19 | community|sib200_ceb_mcf|0
20 | community|sib200_tgl_mcf|0
21 | community|firecs_fil_mcf|0
22 | community|global_mmlu_all_tgl_mcf|0
23 | community|balita_tgl_mcf|0
24 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/rouge1.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "ROUGE1 Test Suite",
 3 |   "description": "Test cases for ROUGE1 metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "ROUGE Score",
 7 |       "metric_class": "rouge1",
 8 |       "metric_params": {
 9 |       },
10 |       "doc": {
11 |         "query": "Summarize the text",
12 |         "choices": ["The quick brown fox jumps over the lazy dog"],
13 |         "gold_index": 0,
14 |         "task_name": "test"
15 |       },
16 |       "model_response": {
17 |         "text": ["The quick brown fox jumps over the lazy dog"],
18 |         "logprobs": [],
19 |         "output_tokens": []
20 |       },
21 |       "expected_output": {
22 |         "rouge1": 1
23 |       },
24 |       "tolerance": 0.01,
25 |       "description": "Test ROUGE score with perfect match"
26 |     }
27 |   ]
28 | }
29 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/simpleqa_judge.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Simpleqa Judge Test Suite",
 3 |   "description": "Test cases for simpleqa_judge metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Simpleqa Judge - Basic Test",
 7 |       "metric_class": "simpleqa_judge",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "Test query for simpleqa_judge",
11 |         "choices": [
12 |           "Test choice 1",
13 |           "Test choice 2",
14 |           "Test choice 3"
15 |         ],
16 |         "gold_index": 0,
17 |         "task_name": "test"
18 |       },
19 |       "model_response": {
20 |         "text": [
21 |           "Test choice 1"
22 |         ]
23 |       },
24 |       "expected_output": {
25 |         "simpleqa_judge": 1.0
26 |       },
27 |       "tolerance": 0.01,
28 |       "description": "Basic test case for simpleqa_judge metric"
29 |     }
30 |   ]
31 | }
32 | 


--------------------------------------------------------------------------------
/examples/test_tasks.txt:
--------------------------------------------------------------------------------
 1 | arc:challenge|25
 2 | truthfulqa:mc|0
 3 | hellaswag|10
 4 | mmlu:college_chemistry|5
 5 | mmlu:us_foreign_policy|5
 6 | agieval:aqua-rat|0
 7 | agieval:logiqa-en|0
 8 | agieval:lsat-ar|0
 9 | agieval:lsat-lr|0
10 | agieval:lsat-rc|0
11 | agieval:sat-en-without-passage|0
12 | agieval:sat-en|0
13 | bigbench_hard:causal_judgment|3
14 | bigbench_hard:date_understanding|3
15 | bigbench_hard:disambiguation_qa|3
16 | bigbench_hard:geometric_shapes|3
17 | bigbench_hard:logical_deduction_five_objects|3
18 | bigbench_hard:logical_deduction_seven_objects|3
19 | bigbench_hard:movie_recommendation|3
20 | bigbench_hard:navigate|3
21 | bigbench_hard:ruin_names|3
22 | bigbench_hard:salient_translation_error_detection|3
23 | bigbench_hard:snarks|3
24 | bigbench_hard:temporal_sequences|3
25 | bigbench_hard:tracking_shuffled_objects_five_objects|3
26 | bigbench_hard:tracking_shuffled_objects_seven_objects|3
27 | gsm8k_test|0
28 | 


--------------------------------------------------------------------------------
/examples/model_configs/vllm_model_config.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   model_name: "HuggingFaceTB/SmolLM2-1.7B-Instruct"
 3 |   revision: "57aa3c6599c53705406c648e7acca7e11dc45ea3"
 4 |   dtype: "float16"
 5 |   tensor_parallel_size: 1
 6 |   data_parallel_size: 1
 7 |   pipeline_parallel_size: 1
 8 |   gpu_memory_utilization: 0.6
 9 |   max_model_length: null
10 |   swap_space: 4
11 |   seed: 42
12 |   trust_remote_code: False
13 |   add_special_tokens: True
14 |   multichoice_continuations_start_space: False
15 |   pairwise_tokenization: False
16 |   subfolder: null
17 |   max_num_seqs: 1
18 |   max_num_batched_tokens: 8192
19 |   is_async: false
20 |   generation_parameters:
21 |     presence_penalty: 0.0
22 |     repetition_penalty: 1.0
23 |     frequency_penalty: 0.0
24 |     temperature: 0.0
25 |     top_k: null
26 |     min_p: 0.0
27 |     top_p: 0.9
28 |     seed: 42
29 |     stop_tokens: null
30 |     max_new_tokens: 2048
31 |     min_new_tokens: 0
32 | 


--------------------------------------------------------------------------------
/examples/model_configs/peft_model.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   model_name: "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B" # pretrained=model_name,trust_remote_code=boolean,revision=revision_to_use,model_parallel=True ... For a PEFT model, the pretrained model should be the one trained with PEFT and the base model below will contain the original model on which the adapters will be applied.
 3 |   tokenizer: null # name of tokenizer to use if different from the model's default
 4 |   subfolder: null # subfolder in the model's directory to use
 5 |   dtype: "float16"  # Specifying the model to be loaded in 4 bit uses BitsAndBytesConfig. The other option is to use "8bit" quantization.
 6 |   compile: true
 7 |   revision: "main" # revision to use
 8 |   trust_remote_code: true # Trust remote code
 9 |   model_parallel: null # Model parallel
10 |   max_length: 2048 # maximum length of the input text and the generated text
11 | 
12 |   # should go in generation
13 |   max_generation_toks: 256 # maximum number of tokens to generate
14 |   batch_size: 10 # batch size to use
15 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Hugging Face
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/bert_score.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Bert Score Test Suite",
 3 |   "description": "Test cases for bert_score metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Bert Score - Basic Test",
 7 |       "metric_class": "bert_score",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "Test query for bert_score",
11 |         "choices": [
12 |           "Test choice 1",
13 |           "Test choice 2",
14 |           "Test choice 3"
15 |         ],
16 |         "gold_index": 0,
17 |         "task_name": "test"
18 |       },
19 |       "model_response": {
20 |         "text": [
21 |           "Test choice 1"
22 |         ],
23 |         "logprobs": [
24 |           0.5,
25 |           0.3,
26 |           0.2
27 |         ],
28 |         "output_tokens": [
29 |           [
30 |             1
31 |           ],
32 |           [
33 |             2
34 |           ],
35 |           [
36 |             3
37 |           ]
38 |         ]
39 |       },
40 |       "expected_output": {
41 |         "result": 1.0
42 |       },
43 |       "tolerance": 0.01,
44 |       "description": "Basic test case for bert_score metric"
45 |     }
46 |   ]
47 | }
48 | 


--------------------------------------------------------------------------------
/.github/workflows/slow_tests.yaml:
--------------------------------------------------------------------------------
 1 | name: Slow end to end tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - v*-release
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 | 
12 | jobs:
13 |   run_tests:
14 |     name: Run tests
15 |     runs-on: 'aws-g4dn-2xlarge-use1-public-80'
16 |     steps:
17 |       - name: Install Git LFS
18 |         run: |
19 |           if ! command -v git-lfs &> /dev/null; then
20 |             echo "Installing Git LFS..."
21 |             sudo apt-get update && sudo apt-get install -y git-lfs
22 |             git lfs install
23 |           else
24 |             echo "Git LFS already installed."
25 |           fi
26 | 
27 |       - name: Checkout repository
28 |         uses: actions/checkout@v4
29 |         with:
30 |           lfs: true
31 | 
32 |       - name: Install uv
33 |         uses: astral-sh/setup-uv@v5
34 |         with:
35 |           enable-cache: true
36 | 
37 |       - name: Install the project
38 |         run: uv sync --extra dev
39 | 
40 | 
41 |       - name: run nvidia-smi
42 |         run: nvidia-smi
43 | 
44 |       - name: Run tests
45 |         run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/
46 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/bits_per_byte.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Bits Per Byte Test Suite",
 3 |   "description": "Test cases for bits_per_byte metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Bits Per Byte - Basic Test",
 7 |       "metric_class": "bits_per_byte",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "Test query for bits_per_byte",
11 |         "choices": [
12 |           "Test choice 1",
13 |           "Test choice 2",
14 |           "Test choice 3"
15 |         ],
16 |         "gold_index": 0,
17 |         "task_name": "test"
18 |       },
19 |       "model_response": {
20 |         "text": [
21 |           "Test choice 1"
22 |         ],
23 |         "logprobs": [
24 |           0.5,
25 |           0.3,
26 |           0.2
27 |         ],
28 |         "output_tokens": [
29 |           [
30 |             1
31 |           ],
32 |           [
33 |             2
34 |           ],
35 |           [
36 |             3
37 |           ]
38 |         ]
39 |       },
40 |       "expected_output": {
41 |         "bits_per_byte": 1.0
42 |       },
43 |       "tolerance": 0.01,
44 |       "description": "Basic test case for bits_per_byte metric"
45 |     }
46 |   ]
47 | }
48 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/jeopardy.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Jeopardy
 4 | 
 5 | dataset:
 6 | openaccess-ai-collective/jeopardy
 7 | 
 8 | abstract:
 9 | Jeopardy is a dataset of questions and answers from the Jeopardy game show.
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | knowledge, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.metrics import Metrics
21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
22 | from lighteval.tasks.templates.qa import get_qa_prompt_function
23 | from lighteval.utils.language import Language
24 | 
25 | 
26 | jeopardy = LightevalTaskConfig(
27 |     name="jeopardy",
28 |     prompt_function=get_qa_prompt_function(
29 |         Language.ENGLISH,
30 |         lambda line: {
31 |             "question": line["question"],
32 |             "choices": [line["answer"]],
33 |         },
34 |     ),
35 |     hf_repo="openaccess-ai-collective/jeopardy",
36 |     hf_subset="default",
37 |     evaluation_splits=("train",),
38 |     few_shots_split="train",
39 |     generation_size=250,
40 |     stop_sequence=["\n", "Question:", "question:"],
41 |     metrics=[Metrics.exact_match],
42 |     version=1,
43 | )
44 | 
45 | TASKS_TABLE = [
46 |     jeopardy,
47 | ]
48 | 


--------------------------------------------------------------------------------
/src/lighteval/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/slow_tests/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/lighteval/metrics/imports/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/templates/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/byte_perplexity.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Byte Perplexity Test Suite",
 3 |   "description": "Test cases for byte_perplexity metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Byte Perplexity - Basic Test",
 7 |       "metric_class": "byte_perplexity",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "Test query for byte_perplexity",
11 |         "choices": [
12 |           "Test choice 1",
13 |           "Test choice 2",
14 |           "Test choice 3"
15 |         ],
16 |         "gold_index": 0,
17 |         "task_name": "test"
18 |       },
19 |       "model_response": {
20 |         "text": [
21 |           "Test choice 1"
22 |         ],
23 |         "logprobs": [
24 |           0.5,
25 |           0.3,
26 |           0.2
27 |         ],
28 |         "output_tokens": [
29 |           [
30 |             1
31 |           ],
32 |           [
33 |             2
34 |           ],
35 |           [
36 |             3
37 |           ]
38 |         ]
39 |       },
40 |       "expected_output": {
41 |         "byte_perplexity": 1.0
42 |       },
43 |       "tolerance": 0.01,
44 |       "description": "Basic test case for byte_perplexity metric"
45 |     }
46 |   ]
47 | }
48 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/expr_gold_metric.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Expr Gold Metric Test Suite",
 3 |   "description": "Test cases for expr_gold_metric metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Expr Gold Metric - Basic Test",
 7 |       "metric_class": "expr_gold_metric",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "Test query for expr_gold_metric",
11 |         "choices": [
12 |           "Test choice 1",
13 |           "Test choice 2",
14 |           "Test choice 3"
15 |         ],
16 |         "gold_index": 0,
17 |         "task_name": "test"
18 |       },
19 |       "model_response": {
20 |         "text": [
21 |           "Test choice 1"
22 |         ],
23 |         "logprobs": [
24 |           0.5,
25 |           0.3,
26 |           0.2
27 |         ],
28 |         "output_tokens": [
29 |           [
30 |             1
31 |           ],
32 |           [
33 |             2
34 |           ],
35 |           [
36 |             3
37 |           ]
38 |         ]
39 |       },
40 |       "expected_output": {
41 |         "extractive_match": 1.0
42 |       },
43 |       "tolerance": 0.01,
44 |       "description": "Basic test case for expr_gold_metric metric"
45 |     }
46 |   ]
47 | }
48 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/templates/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/prediction_perplexity.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Prediction Perplexity Test Suite",
 3 |   "description": "Test cases for prediction_perplexity metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Prediction Perplexity - Basic Test",
 7 |       "metric_class": "prediction_perplexity",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "Test query for prediction_perplexity",
11 |         "choices": [
12 |           "Test choice 1",
13 |           "Test choice 2",
14 |           "Test choice 3"
15 |         ],
16 |         "gold_index": 0,
17 |         "task_name": "test"
18 |       },
19 |       "model_response": {
20 |         "text": [
21 |           "Test choice 1"
22 |         ],
23 |         "logprobs": [
24 |           0.5,
25 |           0.3,
26 |           0.2
27 |         ],
28 |         "output_tokens": [
29 |           [
30 |             1
31 |           ],
32 |           [
33 |             2
34 |           ],
35 |           [
36 |             3
37 |           ]
38 |         ]
39 |       },
40 |       "expected_output": {
41 |         "ppl": 1.0
42 |       },
43 |       "tolerance": 0.01,
44 |       "description": "Basic test case for prediction_perplexity metric"
45 |     }
46 |   ]
47 | }
48 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | from setuptools import setup
24 | 
25 | 
26 | setup()
27 | 


--------------------------------------------------------------------------------
/examples/model_configs/endpoint_model.yaml:
--------------------------------------------------------------------------------
 1 | model_parameters:
 2 |   reuse_existing: false # if true, ignore all params in instance, and don't delete the endpoint after evaluation
 3 |   # endpoint_name: "llama-2-7B-lighteval" # needs to be lower case without special characters
 4 | 
 5 |   model_name: "meta-llama/Llama-2-7b-hf"
 6 |   revision: "main"  # defaults to "main"
 7 |   dtype: "float16" # can be any of "awq", "eetq", "gptq", "4bit' or "8bit" (will use bitsandbytes), "bfloat16" or "float16"
 8 |   accelerator: "gpu"
 9 |   region: "eu-west-1"
10 |   vendor: "aws"
11 |   instance_type: "nvidia-a10g"
12 |   instance_size: "x1"
13 |   framework: "pytorch"
14 |   endpoint_type: "protected"
15 |   namespace: null # The namespace under which to launch the endpoint. Defaults to the current user's namespace
16 |   image_url: null # Optionally specify the docker image to use when launching the endpoint model. E.g., launching models with later releases of the TGI container with support for newer models.
17 |   env_vars:
18 |     null # Optional environment variables to include when launching the endpoint. e.g., `MAX_INPUT_LENGTH: 2048`
19 |   generation_parameters:
20 |     max_new_tokens: 256 # maximum number of tokens to generate
21 |     temperature: 0.2
22 |     top_p: 0.9
23 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/pubmedqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Pubmedqa
 4 | 
 5 | dataset:
 6 | pubmed_qa
 7 | 
 8 | abstract:
 9 | PubMedQA is a dataset for biomedical research question answering.
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | biomedical, health, medical, qa
16 | 
17 | paper:
18 | https://pubmedqa.github.io/
19 | """
20 | 
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 | 
25 | 
26 | def pubmed_qa_prompt(line, task_name: str = None):
27 |     return Doc(
28 |         task_name=task_name,
29 |         query=f"{line['QUESTION']}\n{line['CONTEXTS']}\nAnswer: ",
30 |         choices=[line["final_decision"]],
31 |         gold_index=0,
32 |     )
33 | 
34 | 
35 | pubmedqa = LightevalTaskConfig(
36 |     name="pubmedqa",
37 |     prompt_function=pubmed_qa_prompt,
38 |     hf_repo="pubmed_qa",
39 |     hf_subset="pqa_labeled",
40 |     hf_avail_splits=["train"],
41 |     evaluation_splits=["train"],
42 |     few_shots_split=None,
43 |     few_shots_select=None,
44 |     generation_size=1,
45 |     metrics=[
46 |         Metrics.exact_match,
47 |     ],
48 |     stop_sequence=["\n"],
49 |     version=0,
50 | )
51 | 
52 | TASKS_TABLE = [
53 |     pubmedqa,
54 | ]
55 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/cmath.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Cmath
 4 | 
 5 | dataset:
 6 | weitianwen/cmath
 7 | 
 8 | abstract:
 9 | Cmath multilingual benchmark.
10 | 
11 | languages:
12 | chinese
13 | 
14 | tags:
15 | math, multilingual, reasoning
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     MultilingualQuasiExactMatchMetric,
22 | )
23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
24 | from lighteval.tasks.templates.qa import get_qa_prompt_function
25 | from lighteval.utils.language import Language
26 | 
27 | 
28 | TASKS_TABLE = [
29 |     LightevalTaskConfig(
30 |         name=f"cmath_{Language.CHINESE.value}",
31 |         prompt_function=get_qa_prompt_function(
32 |             Language.CHINESE,
33 |             lambda line: {
34 |                 "question": line["question"],
35 |                 "choices": [line["golden"]],
36 |             },
37 |         ),
38 |         hf_repo="weitianwen/cmath",
39 |         hf_subset="default",
40 |         evaluation_splits=("test",),
41 |         few_shots_split="validation",
42 |         generation_size=25,
43 |         metrics=[
44 |             MultilingualQuasiExactMatchMetric(Language.CHINESE, "full"),
45 |         ],
46 |         stop_sequence=("\n",),
47 |     )
48 | ]
49 | 


--------------------------------------------------------------------------------
/src/lighteval/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | import importlib.metadata
24 | 
25 | 
26 | __version__ = importlib.metadata.version(__package__ or __name__)
27 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/quac.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Quac
 4 | 
 5 | dataset:
 6 | lighteval/quac_helm
 7 | 
 8 | abstract:
 9 | The QuAC benchmark for question answering in the context of dialogues.
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | dialog, qa
16 | 
17 | paper:
18 | https://aclanthology.org/D18-1241/
19 | """
20 | 
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 | 
25 | 
26 | def quac_prompt(line, task_name: str = None):
27 |     references = [ref for ref in line["references"] if ref is not None and ref != ""]
28 |     return Doc(
29 |         task_name=task_name,
30 |         query=f"{line['prompt']}\nAnswer:",
31 |         choices=references,
32 |         gold_index=list(range(len(references))),
33 |     )
34 | 
35 | 
36 | quac = LightevalTaskConfig(
37 |     name="quac",
38 |     prompt_function=quac_prompt,
39 |     hf_repo="lighteval/quac_helm",
40 |     hf_subset="default",
41 |     hf_avail_splits=["train", "validation"],
42 |     evaluation_splits=["validation"],
43 |     few_shots_split=None,
44 |     few_shots_select=None,
45 |     generation_size=100,
46 |     metrics=[Metrics.exact_match],
47 |     stop_sequence=["\n"],
48 |     version=0,
49 | )
50 | 
51 | TASKS_TABLE = [
52 |     quac,
53 | ]
54 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/chegeka.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Chegeka
 4 | 
 5 | dataset:
 6 | ai-forever/MERA
 7 | 
 8 | abstract:
 9 | Chegeka multilingual benchmark.
10 | 
11 | languages:
12 | russian
13 | 
14 | tags:
15 | knowledge, multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     MultilingualQuasiExactMatchMetric,
22 |     MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 | 
28 | 
29 | TASKS_TABLE = [
30 |     LightevalTaskConfig(
31 |         name=f"chegeka_{Language.RUSSIAN.value}",
32 |         prompt_function=get_qa_prompt_function(
33 |             Language.RUSSIAN,
34 |             lambda line: {
35 |                 "question": line["inputs"]["text"],
36 |                 "choices": [line["outputs"]],
37 |             },
38 |         ),
39 |         hf_repo="ai-forever/MERA",
40 |         hf_subset="chegeka",
41 |         evaluation_splits=("train",),
42 |         hf_avail_splits=["train"],
43 |         generation_size=400,
44 |         stop_sequence=("\n",),
45 |         metrics=[
46 |             MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
47 |             MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
48 |         ],
49 |     )
50 | ]
51 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/__init__.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | """
24 | Automatically imports all task configs from the tasks/ directory.
25 | This module dynamically loads all Python files in tasks/ and exposes their LightevalTaskConfig objects.
26 | """
27 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/french_triviqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | French Triviqa
 4 | 
 5 | dataset:
 6 | manu/french-trivia
 7 | 
 8 | abstract:
 9 | French Triviqa multilingual benchmark.
10 | 
11 | languages:
12 | french
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     MultilingualQuasiExactMatchMetric,
22 |     MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 | 
28 | 
29 | TASKS_TABLE = [
30 |     LightevalTaskConfig(
31 |         name=f"community_triviaqa_{Language.FRENCH.value}",
32 |         prompt_function=get_qa_prompt_function(
33 |             Language.FRENCH,
34 |             lambda line: {
35 |                 "question": line["Question"],
36 |                 "choices": [line["Answer"]],
37 |             },
38 |         ),
39 |         hf_repo="manu/french-trivia",
40 |         hf_subset="default",
41 |         evaluation_splits=("train",),
42 |         hf_avail_splits=["train"],
43 |         generation_size=400,
44 |         stop_sequence=("\n",),
45 |         metrics=[
46 |             MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
47 |             MultilingualQuasiF1ScoreMetric(Language.FRENCH),
48 |         ],
49 |     )
50 | ]
51 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/natural_questions.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Natural Questions
 4 | 
 5 | dataset:
 6 | lighteval/small_natural_questions
 7 | 
 8 | abstract:
 9 | This dataset is a collection of question-answer pairs from the Natural Questions
10 | dataset. See Natural Questions for additional information. This dataset can be
11 | used directly with Sentence Transformers to train embedding models.
12 | 
13 | languages:
14 | english
15 | 
16 | tags:
17 | general-knowledge, qa
18 | 
19 | paper:
20 | https://ai.google.com/research/NaturalQuestions
21 | """
22 | 
23 | from lighteval.metrics.metrics import Metrics
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 | 
28 | 
29 | natural_questions = LightevalTaskConfig(
30 |     name="natural_questions",
31 |     prompt_function=get_qa_prompt_function(
32 |         Language.ENGLISH,
33 |         lambda line: {"question": line["question"], "choices": [line["answer"]]},
34 |     ),
35 |     hf_repo="lighteval/small_natural_questions",
36 |     hf_subset="default",
37 |     evaluation_splits=("test",),
38 |     few_shots_split="few_shot",
39 |     generation_size=250,
40 |     stop_sequence=["\n", "Question:", "question:"],
41 |     metrics=[Metrics.exact_match],
42 |     version=1,
43 | )
44 | 
45 | TASKS_TABLE = [
46 |     natural_questions,
47 | ]
48 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/toxigen.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Toxigen
 4 | 
 5 | dataset:
 6 | skg/toxigen-data
 7 | 
 8 | abstract:
 9 | This dataset is for implicit hate speech detection. All instances were generated
10 | using GPT-3 and the methods described in our paper.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | generation, safety
17 | 
18 | paper:
19 | https://arxiv.org/abs/2203.09509
20 | """
21 | 
22 | from lighteval.metrics.metrics import Metrics
23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
24 | from lighteval.tasks.requests import Doc
25 | 
26 | 
27 | def toxigen_prompt(line, task_name: str = None):
28 |     return Doc(
29 |         task_name=task_name,
30 |         query=f"Is the following statement hateful? Respond with either Yes or No. Statement: '{line['text']}'",
31 |         choices=["No", "Yes"],
32 |         gold_index=int(round(((line["toxicity_ai"] + line["toxicity_human"]) > 5.5), 0)),
33 |     )
34 | 
35 | 
36 | toxigen = LightevalTaskConfig(
37 |     name="toxigen",
38 |     prompt_function=toxigen_prompt,
39 |     hf_repo="skg/toxigen-data",
40 |     hf_subset="annotated",
41 |     hf_avail_splits=["train", "test"],
42 |     evaluation_splits=["test"],
43 |     few_shots_split=None,
44 |     few_shots_select=None,
45 |     generation_size=-1,
46 |     metrics=[Metrics.loglikelihood_acc],
47 |     stop_sequence=["\n"],
48 |     version=0,
49 | )
50 | 
51 | TASKS_TABLE = [
52 |     toxigen,
53 | ]
54 | 


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | # Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | default_language_version:
16 |   python: python3
17 | 
18 | ci:
19 |   autofix_prs: true
20 |   autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
21 |   autoupdate_schedule: quarterly
22 | 
23 | repos:
24 |   - repo: https://github.com/pre-commit/pre-commit-hooks
25 |     rev: v4.3.0
26 |     hooks:
27 |       - id: check-yaml
28 |       - id: check-case-conflict
29 |       - id: detect-private-key
30 |       - id: check-added-large-files
31 |         args: ['--maxkb=1000']
32 |       - id: end-of-file-fixer
33 |       - id: trailing-whitespace
34 | 
35 |   - repo: https://github.com/charliermarsh/ruff-pre-commit
36 |     # Ruff version.
37 |     rev: 'v0.11.10'
38 |     hooks:
39 |       - id: ruff
40 |         args: ['--fix']
41 |       - id: ruff-format
42 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/mcc.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "MCC Test Suite",
 3 |   "description": "Test cases for MCC (Matthews Correlation Coefficient) metric",
 4 |   "corpus_level": true,
 5 |   "test_cases": [
 6 |     {
 7 |       "name": "MCC - Corpus Level Test with 3 Samples",
 8 |       "metric_class": "mcc",
 9 |       "metric_name": "mcc",
10 |       "metric_params": {},
11 |       "docs": [
12 |         {
13 |           "query": "What is the capital of France?",
14 |           "choices": ["Paris", "London", "Berlin"],
15 |           "gold_index": 0,
16 |           "task_name": "geography"
17 |         },
18 |         {
19 |           "query": "What is 2 + 2?",
20 |           "choices": ["3", "4", "5"],
21 |           "gold_index": 1,
22 |           "task_name": "math"
23 |         },
24 |         {
25 |           "query": "What color is the sky?",
26 |           "choices": ["Red", "Blue", "Green"],
27 |           "gold_index": 1,
28 |           "task_name": "science"
29 |         }
30 |       ],
31 |       "model_responses": [
32 |         {
33 |           "logprobs": [-0.2, -0.8, -1.5]
34 |         },
35 |         {
36 |           "logprobs": [-1.2, -0.3, -0.9]
37 |         },
38 |         {
39 |           "logprobs": [-0.7, -0.4, -1.1]
40 |         }
41 |       ],
42 |       "expected_output": 1.0,
43 |       "tolerance": 0.01,
44 |       "description": "Corpus level test case for MCC metric with 3 samples - all predictions correct"
45 |     }
46 |   ]
47 | }
48 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/exact_match.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Exact Match Test Suite",
 3 |   "description": "Test cases for exact match metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Exact Match - Perfect Match",
 7 |       "metric_class": "exact_match",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "What is the capital of France?",
11 |         "choices": ["Paris", "London", "Berlin"],
12 |         "gold_index": 0,
13 |         "task_name": "test"
14 |       },
15 |       "model_response": {
16 |         "text": ["Paris"],
17 |         "logprobs": [],
18 |         "output_tokens": []
19 |       },
20 |       "expected_output": {
21 |         "em": 1.0
22 |       },
23 |       "tolerance": 0.01,
24 |       "description": "Test exact match with perfect prediction"
25 |     },
26 |     {
27 |       "name": "Exact Match - No Match",
28 |       "metric_class": "exact_match",
29 |       "metric_params": {},
30 |       "doc": {
31 |         "query": "What is the capital of France?",
32 |         "choices": ["Paris", "London", "Berlin"],
33 |         "gold_index": 0,
34 |         "task_name": "test"
35 |       },
36 |       "model_response": {
37 |         "text": ["London"],
38 |         "logprobs": [],
39 |         "output_tokens": []
40 |       },
41 |       "expected_output": {
42 |         "em": 0.0
43 |       },
44 |       "tolerance": 0.01,
45 |       "description": "Test exact match with wrong prediction"
46 |     }
47 |   ]
48 | }
49 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/coqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Coqa
 4 | 
 5 | dataset:
 6 | stanfordnlp/coqa
 7 | 
 8 | abstract:
 9 | CoQA is a large-scale dataset for building Conversational Question Answering
10 | systems. The goal of the CoQA challenge is to measure the ability of machines to
11 | understand a text passage and answer a series of interconnected questions that
12 | appear in a conversation.
13 | 
14 | languages:
15 | english
16 | 
17 | tags:
18 | dialog, qa
19 | 
20 | paper:
21 | https://arxiv.org/abs/1808.07042
22 | """
23 | 
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | def coqa_prompt(line, task_name: str = None):
30 |     results = []
31 |     for q, a in zip(line["questions"], line["answers"]["input_text"]):
32 |         results.append(Doc(task_name=task_name, query=f"{line['story']} \n\nQ: {q}\n\nA: ", choices=[a], gold_index=0))
33 |     return results
34 | 
35 | 
36 | coqa_first_question = LightevalTaskConfig(
37 |     name="coqa",
38 |     prompt_function=coqa_prompt,
39 |     hf_repo="stanfordnlp/coqa",
40 |     hf_subset="default",
41 |     hf_avail_splits=["train", "validation"],
42 |     evaluation_splits=["validation"],
43 |     stop_sequence=["\n", "Question:", "question:"],
44 |     generation_size=100,
45 |     version=1,
46 |     metrics=[Metrics.exact_match],
47 | )
48 | 
49 | TASKS_TABLE = [
50 |     coqa_first_question,
51 | ]
52 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/tquad_v2.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Tquad V2
 4 | 
 5 | dataset:
 6 | erdometo/tquad2
 7 | 
 8 | abstract:
 9 | TQuAD v2: Turkish Question Answering Dataset version 2.
10 | 
11 | languages:
12 | turkish
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     MultilingualQuasiExactMatchMetric,
22 |     MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 | 
28 | 
29 | TASKS_TABLE = [
30 |     LightevalTaskConfig(
31 |         name=f"tquadv2_{Language.TURKISH.value}",
32 |         prompt_function=get_qa_prompt_function(
33 |             Language.TURKISH,
34 |             lambda line: {
35 |                 "question": line["question"],
36 |                 "context": line["context"],
37 |                 "choices": [a["text"] for a in line["answers"]],
38 |             },
39 |         ),
40 |         hf_repo="erdometo/tquad2",
41 |         hf_subset="default",
42 |         evaluation_splits=("validation",),
43 |         few_shots_split="train",
44 |         generation_size=400,
45 |         stop_sequence=("\n",),
46 |         metrics=(
47 |             MultilingualQuasiExactMatchMetric(Language.TURKISH, "prefix"),
48 |             MultilingualQuasiF1ScoreMetric(Language.TURKISH),
49 |         ),
50 |     )
51 | ]
52 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/acc_golds_likelihood.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Acc Golds Likelihood Test Suite",
 3 |   "description": "Test cases for acc_golds_likelihood metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Acc Golds Likelihood - Correct Likelihood",
 7 |       "metric_class": "acc_golds_likelihood",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "What is the capital of France?",
11 |         "choices": ["Paris", "London", "Berlin"],
12 |         "gold_index": 0,
13 |         "task_name": "geography"
14 |       },
15 |       "model_response": {
16 |         "argmax_logits_eq_gold": [1, 0, 0]
17 |       },
18 |       "expected_output": {
19 |         "acc": 1
20 |       },
21 |       "tolerance": 0.01,
22 |       "description": "Test acc golds likelihood with correct likelihood"
23 |     },
24 |     {
25 |       "name": "Acc Golds Likelihood - Incorrect Likelihood",
26 |       "metric_class": "acc_golds_likelihood",
27 |       "metric_params": {},
28 |       "doc": {
29 |         "query": "What is the capital of France?",
30 |         "choices": ["Paris", "London", "Berlin"],
31 |         "gold_index": 0,
32 |         "task_name": "geography"
33 |       },
34 |       "model_response": {
35 |         "argmax_logits_eq_gold": [0, 0, 0]
36 |       },
37 |       "expected_output": {
38 |         "acc": 0
39 |       },
40 |       "tolerance": 0.01,
41 |       "description": "Test acc golds likelihood with incorrect likelihood"
42 |     }
43 |   ]
44 | }
45 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/thaiqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Thaiqa
 4 | 
 5 | dataset:
 6 | lighteval/thaiqa_squad_fixed
 7 | 
 8 | abstract:
 9 | ThaiQA: A question answering dataset for the Thai language.
10 | 
11 | languages:
12 | thai
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     MultilingualQuasiExactMatchMetric,
22 |     MultilingualQuasiF1ScoreMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.qa import get_qa_prompt_function
26 | from lighteval.utils.language import Language
27 | 
28 | 
29 | TASKS_TABLE = [
30 |     LightevalTaskConfig(
31 |         name=f"thaiqa_{Language.THAI.value}",
32 |         prompt_function=get_qa_prompt_function(
33 |             Language.THAI,
34 |             lambda line: {
35 |                 "question": line["question"],
36 |                 "context": line["context"],
37 |                 "choices": [ans for ans in line["answers"]["answer"] if len(ans) > 0],
38 |             },
39 |         ),
40 |         hf_repo="lighteval/thaiqa_squad_fixed",
41 |         hf_subset="default",
42 |         evaluation_splits=("train",),
43 |         few_shots_split="validation",
44 |         generation_size=400,
45 |         stop_sequence=("\n",),
46 |         metrics=(
47 |             MultilingualQuasiExactMatchMetric(Language.THAI, "prefix"),
48 |             MultilingualQuasiF1ScoreMetric(Language.THAI),
49 |         ),
50 |     )
51 | ]
52 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/kenswquad.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Kenswquad
 4 | 
 5 | dataset:
 6 | lighteval/KenSwQuAD
 7 | 
 8 | abstract:
 9 | KenSwQuAD: A question answering dataset for Kenyan Swahili.
10 | 
11 | languages:
12 | swahili
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/2205.02364
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"kenswquad_{Language.SWAHILI.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.SWAHILI,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [line["answer"]],
39 |             },
40 |         ),
41 |         hf_repo="lighteval/KenSwQuAD",
42 |         hf_subset="default",
43 |         evaluation_splits=("test",),
44 |         few_shots_split="validation",
45 |         metrics=(
46 |             MultilingualQuasiExactMatchMetric(Language.SWAHILI, "prefix"),
47 |             MultilingualQuasiF1ScoreMetric(Language.SWAHILI),
48 |         ),
49 |         generation_size=400,
50 |         stop_sequence=("\n",),
51 |     )
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/french_boolq.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | French Boolq
 4 | 
 5 | dataset:
 6 | manu/french_boolq
 7 | 
 8 | abstract:
 9 | French Boolq multilingual benchmark.
10 | 
11 | languages:
12 | french
13 | 
14 | tags:
15 | classification, multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     LogLikelihoodAccMetric,
22 |     MultilingualQuasiExactMatchMetric,
23 | )
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.templates.boolq import get_boolq_prompt_function
26 | from lighteval.tasks.templates.utils.formulation import (
27 |     CFFormulation,
28 | )
29 | from lighteval.utils.language import Language
30 | 
31 | 
32 | TASKS_TABLE = [
33 |     LightevalTaskConfig(
34 |         name=f"community_boolq_{Language.FRENCH.value}",
35 |         prompt_function=get_boolq_prompt_function(
36 |             Language.FRENCH,
37 |             lambda line: {
38 |                 "question": line["question"],
39 |                 "answer": line["label"] == 1,
40 |                 "context": line["passage"],
41 |             },
42 |             formulation=CFFormulation(),
43 |         ),
44 |         hf_repo="manu/french_boolq",
45 |         hf_subset="default",
46 |         evaluation_splits=("test",),
47 |         few_shots_split="valid",
48 |         generation_size=5,
49 |         stop_sequence=["\n"],
50 |         metrics=[MultilingualQuasiExactMatchMetric(Language.FRENCH, "full"), LogLikelihoodAccMetric()],
51 |     )
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/fquad_v2.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Fquad V2
 4 | 
 5 | dataset:
 6 | manu/fquad2_test
 7 | 
 8 | abstract:
 9 | FQuAD v2: French Question Answering Dataset version 2.
10 | 
11 | languages:
12 | french
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/2002.06071
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"fquadv2_{Language.FRENCH.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.FRENCH,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="manu/fquad2_test",
42 |         hf_subset="default",
43 |         evaluation_splits=("test_hasAns",),
44 |         few_shots_split="valid_hasAns",
45 |         generation_size=400,
46 |         stop_sequence=("\n",),
47 |         metrics=(
48 |             MultilingualQuasiExactMatchMetric(Language.FRENCH, "prefix"),
49 |             MultilingualQuasiF1ScoreMetric(Language.FRENCH),
50 |         ),
51 |     )
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/cmrc2018.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Cmrc2018
 4 | 
 5 | dataset:
 6 | clue/clue
 7 | 
 8 | abstract:
 9 | CMRC 2018: A span-extraction machine reading comprehension dataset for Chinese.
10 | 
11 | languages:
12 | chinese
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/1810.07366
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"cmrc2018_{Language.CHINESE.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.CHINESE,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="clue/clue",
42 |         hf_subset="cmrc2018",
43 |         evaluation_splits=("trial",),
44 |         few_shots_split="train",
45 |         generation_size=400,
46 |         metrics=(
47 |             MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
48 |             MultilingualQuasiF1ScoreMetric(Language.CHINESE),
49 |         ),
50 |         stop_sequence=("\n",),
51 |     )
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/real_toxicity_prompts.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Real Toxicity Prompts
 4 | 
 5 | dataset:
 6 | allenai/real-toxicity-prompts
 7 | 
 8 | abstract:
 9 | The RealToxicityPrompts dataset for measuring toxicity in prompted model generations
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | generation, safety
16 | 
17 | paper:
18 | https://aclanthology.org/2020.findings-emnlp.301/
19 | """
20 | 
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 | 
25 | 
26 | def real_toxicity_prompts_prompt(line, task_name: str = None):
27 |     # Some variants store text under 'prompt' -> 'text'; handle both flat and nested
28 |     text = (
29 |         line["prompt"]["text"]
30 |         if isinstance(line.get("prompt"), dict) and "text" in line["prompt"]
31 |         else line.get("text", "")
32 |     )
33 |     return Doc(task_name=task_name, query=text, choices=None, gold_index=None)
34 | 
35 | 
36 | real_toxicity_prompts = LightevalTaskConfig(
37 |     name="real_toxicity_prompts",
38 |     prompt_function=real_toxicity_prompts_prompt,
39 |     hf_repo="allenai/real-toxicity-prompts",
40 |     hf_subset="default",
41 |     hf_avail_splits=["train"],
42 |     evaluation_splits=["train"],
43 |     few_shots_split=None,
44 |     few_shots_select=None,
45 |     generation_size=20,
46 |     metrics=[Metrics.exact_match],
47 |     stop_sequence=["\n"],
48 |     version=0,
49 | )
50 | 
51 | TASKS_TABLE = [
52 |     real_toxicity_prompts,
53 | ]
54 | 


--------------------------------------------------------------------------------
/docs/source/package_reference/models.mdx:
--------------------------------------------------------------------------------
 1 | # Model Configs
 2 | 
 3 | The model configs are used to define the model and its parameters. All the parameters can be
 4 | set in the `model-args` or in the model yaml file (see example
 5 | [here](https://github.com/huggingface/lighteval/blob/main/examples/model_configs/vllm_model_config.yaml)).
 6 | 
 7 | ### Base model config
 8 | [[autodoc]] models.abstract_model.ModelConfig
 9 | 
10 | ## Local Models
11 | 
12 | ### Transformers Model
13 | [[autodoc]] models.transformers.transformers_model.TransformersModelConfig
14 | [[autodoc]] models.transformers.adapter_model.AdapterModelConfig
15 | [[autodoc]] models.transformers.delta_model.DeltaModelConfig
16 | 
17 | ### VLLM Model
18 | [[autodoc]] models.vllm.vllm_model.VLLMModelConfig
19 | 
20 | ### SGLang Model
21 | [[autodoc]] models.sglang.sglang_model.SGLangModelConfig
22 | 
23 | ### Dummy Model
24 | [[autodoc]] models.dummy.dummy_model.DummyModelConfig
25 | 
26 | 
27 | ## Endpoints-based Models
28 | 
29 | ### Inference Providers Model
30 | [[autodoc]] models.endpoints.inference_providers_model.InferenceProvidersModelConfig
31 | 
32 | ### InferenceEndpointModel
33 | [[autodoc]] models.endpoints.endpoint_model.InferenceEndpointModelConfig
34 | [[autodoc]] models.endpoints.endpoint_model.ServerlessEndpointModelConfig
35 | 
36 | ### TGI ModelClient
37 | [[autodoc]] models.endpoints.tgi_model.TGIModelConfig
38 | 
39 | ### Litellm Model
40 | [[autodoc]] models.endpoints.litellm_model.LiteLLMModelConfig
41 | 
42 | ## Custom Model
43 | [[autodoc]] models.custom.custom_model.CustomModelConfig
44 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/sber_squad.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Sber Squad
 4 | 
 5 | dataset:
 6 | kuznetsoffandrey/sberquad
 7 | 
 8 | abstract:
 9 | SberQuAD: A large-scale Russian reading comprehension dataset.
10 | 
11 | languages:
12 | russian
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/1912.09723
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"sber_squad_{Language.RUSSIAN.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.RUSSIAN,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="kuznetsoffandrey/sberquad",
42 |         hf_subset="sberquad",
43 |         evaluation_splits=("validation",),
44 |         few_shots_split="train",
45 |         metrics=(
46 |             MultilingualQuasiExactMatchMetric(Language.RUSSIAN, "prefix"),
47 |             MultilingualQuasiF1ScoreMetric(Language.RUSSIAN),
48 |         ),
49 |         generation_size=400,
50 |         stop_sequence=("\n",),
51 |     )
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/chinese_squad.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Chinese Squad
 4 | 
 5 | dataset:
 6 | lighteval/ChineseSquad
 7 | 
 8 | abstract:
 9 | ChineseSquad is a reading comprehension dataset for Chinese.
10 | 
11 | languages:
12 | chinese
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://github.com/pluto-junzeng/ChineseSquad
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"chinese_squad_{Language.CHINESE.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.CHINESE,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="lighteval/ChineseSquad",
42 |         hf_subset="default",
43 |         evaluation_splits=("validation",),
44 |         few_shots_split="train",
45 |         metrics=(
46 |             MultilingualQuasiExactMatchMetric(Language.CHINESE, "prefix"),
47 |             MultilingualQuasiF1ScoreMetric(Language.CHINESE),
48 |         ),
49 |         generation_size=400,
50 |         stop_sequence=("\n",),
51 |     )
52 | ]
53 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/squad_it.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Squad It
 4 | 
 5 | dataset:
 6 | crux82/squad_it
 7 | 
 8 | abstract:
 9 | SQuAD-it: Italian translation of the SQuAD dataset.
10 | 
11 | languages:
12 | italian
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://github.com/crux82/squad-it
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"squad_{Language.ITALIAN.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.ITALIAN,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="crux82/squad_it",
42 |         hf_subset="default",
43 |         hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
44 |         evaluation_splits=("test",),
45 |         few_shots_split="train",
46 |         generation_size=400,
47 |         stop_sequence=("\n",),
48 |         metrics=(
49 |             MultilingualQuasiExactMatchMetric(Language.ITALIAN, "prefix"),
50 |             MultilingualQuasiF1ScoreMetric(Language.ITALIAN),
51 |         ),
52 |     )
53 | ]
54 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/arcd.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Arcd
 4 | 
 5 | dataset:
 6 | hsseinmz/arcd
 7 | 
 8 | abstract:
 9 | ARCD: Arabic Reading Comprehension Dataset.
10 | 
11 | languages:
12 | arabic
13 | 
14 | tags:
15 | multilingual, multiple-choice, qa, reasoning
16 | 
17 | paper:
18 | https://arxiv.org/pdf/1906.05394
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | # ARCD: Arabic Reading Comprehension Dataset.
31 | # https://arxiv.org/pdf/1906.05394
32 | 
33 | 
34 | TASKS_TABLE = [
35 |     LightevalTaskConfig(
36 |         name=f"arcd_{Language.ARABIC.value}",
37 |         prompt_function=get_qa_prompt_function(
38 |             Language.ARABIC,
39 |             lambda line: {
40 |                 "question": line["question"],
41 |                 "context": line["context"],
42 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
43 |             },
44 |         ),
45 |         hf_repo="hsseinmz/arcd",
46 |         hf_subset="plain_text",
47 |         evaluation_splits=("validation",),
48 |         few_shots_split="train",
49 |         metrics=(
50 |             MultilingualQuasiExactMatchMetric(Language.ARABIC, "prefix"),
51 |             MultilingualQuasiF1ScoreMetric(Language.ARABIC),
52 |         ),
53 |         generation_size=400,
54 |         stop_sequence=("\n",),
55 |     )
56 | ]
57 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/prost.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Prost
 4 | 
 5 | dataset:
 6 | lighteval/prost
 7 | 
 8 | abstract:
 9 | PROST is a benchmark for testing physical reasoning about objects through space
10 | and time. It includes 18,736 multiple-choice questions covering 10 core physics
11 | concepts, designed to probe models in zero-shot settings. Results show that even
12 | large pretrained models struggle with physical reasoning and are sensitive to
13 | question phrasing, underscoring their limited real-world understanding.
14 | 
15 | languages:
16 | english
17 | 
18 | tags:
19 | reasoning, qa, physical-commonsense
20 | 
21 | paper:
22 | https://arxiv.org/abs/2106.03634
23 | """
24 | 
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 | 
29 | 
30 | def prost_prompt(line, task_name: str = None):
31 |     return Doc(
32 |         task_name=task_name,
33 |         query=line["question"],
34 |         choices=[f" {c}" for c in line["choices"]],
35 |         gold_index=int(line["label"]) if isinstance(line["label"], int) else int(line["label"]),
36 |     )
37 | 
38 | 
39 | prost = LightevalTaskConfig(
40 |     name="prost",
41 |     prompt_function=prost_prompt,
42 |     hf_repo="lighteval/prost",
43 |     hf_subset="default",
44 |     hf_avail_splits=["test"],
45 |     evaluation_splits=["test"],
46 |     few_shots_split=None,
47 |     few_shots_select=None,
48 |     generation_size=-1,
49 |     metrics=[Metrics.loglikelihood_acc],
50 |     stop_sequence=["\n"],
51 |     version=0,
52 | )
53 | 
54 | TASKS_TABLE = [
55 |     prost,
56 | ]
57 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/squad_es.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Squad Es
 4 | 
 5 | dataset:
 6 | ccasimiro/squad_es
 7 | 
 8 | abstract:
 9 | SQuAD-es: Spanish translation of the Stanford Question Answering Dataset
10 | 
11 | languages:
12 | spanish
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://huggingface.co/datasets/ccasimiro/squad_es
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"squad_{Language.SPANISH.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.SPANISH,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="ccasimiro/squad_es",
42 |         hf_subset="v2.0.0",
43 |         hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
44 |         evaluation_splits=("validation",),
45 |         few_shots_split="train",
46 |         metrics=(
47 |             MultilingualQuasiExactMatchMetric(Language.SPANISH, "prefix"),
48 |             MultilingualQuasiF1ScoreMetric(Language.SPANISH),
49 |         ),
50 |         generation_size=400,
51 |         stop_sequence=("\n",),
52 |     )
53 | ]
54 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/narrativeqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Narrativeqa
 4 | 
 5 | dataset:
 6 | lighteval/narrative_qa_helm
 7 | 
 8 | abstract:
 9 | NarrativeQA is a reading comprehension benchmark that tests deep understanding
10 | of full narratives—books and movie scripts—rather than shallow text matching. To
11 | answer its questions, models must integrate information across entire stories.
12 | 
13 | languages:
14 | english
15 | 
16 | tags:
17 | qa, reading-comprehension
18 | 
19 | paper:
20 | https://aclanthology.org/Q18-1023/
21 | """
22 | 
23 | from lighteval.metrics.metrics import Metrics
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.requests import Doc
26 | 
27 | 
28 | narrativeqa_instruction = "Answer the question based on the passage.\n"
29 | 
30 | 
31 | def narrativeqa_prompt(line, task_name: str = None):
32 |     return Doc(
33 |         task_name=task_name,
34 |         query=f"Passage: {line['passage']}\nQuestion: {line['question']}\nAnswer:",
35 |         gold_index=list(range(len(line["references"]))),
36 |         choices=[[str(a) for a in line["references"]]],
37 |     )
38 | 
39 | 
40 | narrativeqa = LightevalTaskConfig(
41 |     name="narrativeqa",
42 |     prompt_function=narrativeqa_prompt,
43 |     hf_repo="lighteval/narrative_qa_helm",
44 |     hf_subset="default",
45 |     hf_avail_splits=["train", "test", "validation"],
46 |     evaluation_splits=["test"],
47 |     few_shots_split=None,
48 |     few_shots_select=None,
49 |     generation_size=100,
50 |     metrics=[Metrics.exact_match],
51 |     stop_sequence=["\n"],
52 |     version=0,
53 | )
54 | 
55 | TASKS_TABLE = [
56 |     narrativeqa,
57 | ]
58 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/legalsupport.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Legalsupport
 4 | 
 5 | dataset:
 6 | lighteval/LegalSupport
 7 | 
 8 | abstract:
 9 | Measures fine-grained legal reasoning through reverse entailment.
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | legal
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.metrics import Metrics
21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
22 | from lighteval.tasks.requests import Doc
23 | 
24 | 
25 | def legalsupport_prompt(line, task_name: str = None):
26 |     query = f"Which statement best supports the passage?\nPassage: {line['context']}\n"
27 |     query += "".join(
28 |         [
29 |             f"{key}. {choice}\n"
30 |             for key, choice in zip(
31 |                 ["a", "b"], [line["citation_a"]["parenthetical"], line["citation_b"]["parenthetical"]]
32 |             )
33 |         ]
34 |     )
35 |     query += "Answer:"
36 | 
37 |     return Doc(
38 |         task_name=task_name,
39 |         query=query,
40 |         choices=["a", "b"],
41 |         gold_index=0 if line["answer_label"] == "citation_a" else 1,
42 |     )
43 | 
44 | 
45 | legalsupport = LightevalTaskConfig(
46 |     name="legalsupport",
47 |     prompt_function=legalsupport_prompt,
48 |     hf_repo="lighteval/LegalSupport",
49 |     hf_subset="default",
50 |     hf_avail_splits=["train", "test", "validation"],
51 |     evaluation_splits=["validation", "test"],
52 |     few_shots_split=None,
53 |     few_shots_select=None,
54 |     generation_size=None,
55 |     metrics=[Metrics.loglikelihood_acc],
56 |     stop_sequence=["\n"],
57 |     version=0,
58 | )
59 | 
60 | TASKS_TABLE = [
61 |     legalsupport,
62 | ]
63 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/sciq.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Sciq
 4 | 
 5 | dataset:
 6 | allenai/sciq
 7 | 
 8 | abstract:
 9 | The SciQ dataset contains 13,679 crowdsourced science exam questions about
10 | Physics, Chemistry and Biology, among others. The questions are in
11 | multiple-choice format with 4 answer options each. For the majority of the
12 | questions, an additional paragraph with supporting evidence for the correct
13 | answer is provided.
14 | 
15 | languages:
16 | english
17 | 
18 | tags:
19 | physics, chemistry, biology, reasoning, multiple-choice, qa
20 | 
21 | paper:
22 | https://arxiv.org/abs/1707.06209
23 | """
24 | 
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 | 
29 | 
30 | def sciq_prompt(line, task_name: str = None):
31 |     return Doc(
32 |         task_name=task_name,
33 |         query=f"{line['support']}\nQuestion: {line['question']}\nAnswer:".strip(),
34 |         choices=[
35 |             f" {c}" for c in [line["distractor1"], line["distractor2"], line["distractor3"], line["correct_answer"]]
36 |         ],
37 |         gold_index=3,
38 |     )
39 | 
40 | 
41 | sciq = LightevalTaskConfig(
42 |     name="sciq",
43 |     prompt_function=sciq_prompt,
44 |     hf_repo="allenai/sciq",
45 |     hf_subset="default",
46 |     hf_avail_splits=["train", "validation", "test"],
47 |     evaluation_splits=["test"],
48 |     few_shots_split=None,
49 |     few_shots_select=None,
50 |     generation_size=-1,
51 |     metrics=[Metrics.loglikelihood_acc],
52 |     stop_sequence=["\n"],
53 |     version=0,
54 | )
55 | 
56 | TASKS_TABLE = [
57 |     sciq,
58 | ]
59 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/qasper.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Qasper
 4 | 
 5 | dataset:
 6 | allenai/qasper
 7 | 
 8 | abstract:
 9 | QASPER is a dataset for question answering on scientific research papers. It
10 | consists of 5,049 questions over 1,585 Natural Language Processing papers. Each
11 | question is written by an NLP practitioner who read only the title and abstract
12 | of the corresponding paper, and the question seeks information present in the
13 | full text. The questions are then answered by a separate set of NLP
14 | practitioners who also provide supporting evidence to answers.
15 | 
16 | languages:
17 | english
18 | 
19 | tags:
20 | qa, scientific
21 | 
22 | paper:
23 | https://arxiv.org/abs/2105.03011
24 | """
25 | 
26 | from lighteval.metrics.metrics import Metrics
27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
28 | from lighteval.tasks.requests import Doc
29 | 
30 | 
31 | def qasper_prompt(line, task_name: str = None):
32 |     return Doc(
33 |         task_name=task_name,
34 |         query=f"Title: {line['title']}\n\nPassage: {line['passage']}\n\n Question: {line['question']}\nAnswer: ",
35 |         gold_index=0,
36 |         choices=[line["gold"]],
37 |     )
38 | 
39 | 
40 | qasper = LightevalTaskConfig(
41 |     name="qasper",
42 |     prompt_function=qasper_prompt,
43 |     hf_repo="allenai/qasper",
44 |     hf_subset="qasper",
45 |     hf_avail_splits=["train", "validation"],
46 |     evaluation_splits=["validation"],
47 |     few_shots_split=None,
48 |     few_shots_select=None,
49 |     generation_size=20,
50 |     metrics=[Metrics.f1_score],
51 |     stop_sequence=["\n"],
52 |     version=0,
53 | )
54 | 
55 | TASKS_TABLE = [
56 |     qasper,
57 | ]
58 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/faquad.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Faquad
 4 | 
 5 | dataset:
 6 | eraldoluis/faquad
 7 | 
 8 | abstract:
 9 | FaQuAD: A Portuguese Reading Comprehension Dataset
10 | 
11 | languages:
12 | portuguese
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/2007.15671
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"faquad_{Language.PORTUGUESE.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.PORTUGUESE,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="eraldoluis/faquad",
42 |         hf_subset="plain_text",
43 |         hf_revision="205ba826a2282a4a5aa9bd3651e55ee4f2da1546",
44 |         hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
45 |         evaluation_splits=("validation",),
46 |         few_shots_split="train",
47 |         metrics=(
48 |             MultilingualQuasiExactMatchMetric(Language.PORTUGUESE, "prefix"),
49 |             MultilingualQuasiF1ScoreMetric(Language.PORTUGUESE),
50 |         ),
51 |         generation_size=400,
52 |         stop_sequence=("\n",),
53 |     )
54 | ]
55 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/germanquad.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Germanquad
 4 | 
 5 | dataset:
 6 | deepset/germanquad
 7 | 
 8 | abstract:
 9 | GermanQuAD: High-quality German QA dataset with 13,722 questions.
10 | 
11 | languages:
12 | german
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/2104.12741
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"germanquad_{Language.GERMAN.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             Language.GERMAN,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="deepset/germanquad",
42 |         hf_subset="plain_text",
43 |         hf_revision="fff05ceaf2ffbe5b65c7e0c57e678f7b7e1a0581",
44 |         hf_filter=lambda line: any(len(ans) > 0 for ans in line["answers"]["text"]),
45 |         evaluation_splits=("test",),
46 |         few_shots_split="train",
47 |         generation_size=400,
48 |         stop_sequence=("\n",),
49 |         metrics=(
50 |             MultilingualQuasiExactMatchMetric(Language.GERMAN, "prefix"),
51 |             MultilingualQuasiF1ScoreMetric(Language.GERMAN),
52 |         ),
53 |     )
54 | ]
55 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/webqs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Webqs
 4 | 
 5 | dataset:
 6 | stanfordnlp/web_questions
 7 | 
 8 | abstract:
 9 | This dataset consists of 6,642 question/answer pairs. The questions are supposed
10 | to be answerable by Freebase, a large knowledge graph. The questions are mostly
11 | centered around a single named entity. The questions are popular ones asked on
12 | the web.
13 | 
14 | languages:
15 | english
16 | 
17 | tags:
18 | qa
19 | 
20 | paper:
21 | https://aclanthology.org/D13-1160.pdf
22 | """
23 | 
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | def webqs_prompt(line, task_name: str = None):
30 |     def _remove_prefixes(aliases):
31 |         aliases.sort()
32 |         ret = [aliases[0]]
33 |         for alias in aliases[1:]:
34 |             if not alias.startswith(ret[-1]):
35 |                 ret.append(alias)
36 |         return ret
37 | 
38 |     return Doc(
39 |         task_name=task_name,
40 |         query=f"Question: {line['question']}\nAnswer:",
41 |         gold_index=0,
42 |         choices=[[f" {c}" for c in _remove_prefixes(line["answers"])]],
43 |     )
44 | 
45 | 
46 | webqs = LightevalTaskConfig(
47 |     name="webqs",
48 |     prompt_function=webqs_prompt,
49 |     hf_repo="stanfordnlp/web_questions",
50 |     hf_subset="default",
51 |     hf_avail_splits=["train", "test"],
52 |     evaluation_splits=["test"],
53 |     few_shots_split=None,
54 |     few_shots_select=None,
55 |     generation_size=-1,
56 |     metrics=[Metrics.exact_match],
57 |     stop_sequence=["\n"],
58 |     version=0,
59 | )
60 | 
61 | TASKS_TABLE = [
62 |     webqs,
63 | ]
64 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/aimo.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | AIMO Progress Prize 1
 4 | 
 5 | dataset:
 6 | lighteval/aimo_progress_prize_1
 7 | 
 8 | abstract:
 9 | Task to evaluate LLMs on the training set of the Kaggle AIMO competition:
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | math, reasoning
16 | 
17 | paper:
18 | """
19 | 
20 | from inspect_ai.dataset import Sample
21 | from inspect_ai.solver import generate
22 | 
23 | from lighteval.metrics.metrics import Metrics, math_scorer
24 | from lighteval.metrics.normalizations import math_normalizer
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | def aimo_prompt(line, task_name: str = None):
30 |     return Doc(
31 |         task_name=task_name,
32 |         choices=[str(line["answer"])],
33 |         gold_index=0,
34 |         query=line["problem"],
35 |     )
36 | 
37 | 
38 | def record_to_sample(record):
39 |     return Sample(input=record["problem"], target=str(record["answer"]))
40 | 
41 | 
42 | task = LightevalTaskConfig(
43 |     name="aimo_progress_prize_1",
44 |     prompt_function=aimo_prompt,
45 |     sample_fields=record_to_sample,
46 |     solver=[generate(cache=True)],
47 |     scorer=math_scorer(),
48 |     hf_subset="",
49 |     hf_repo="lighteval/aimo_progress_prize_1",
50 |     hf_avail_splits=["train"],
51 |     evaluation_splits=["train"],
52 |     few_shots_split="train",
53 |     few_shots_select="sequential",
54 |     metrics=[
55 |         Metrics.exact_match(sample_params={"normalize_gold": math_normalizer, "normalize_pred": math_normalizer})
56 |     ],
57 |     generation_size=2048,
58 |     stop_sequence=None,
59 | )
60 | 
61 | # STORE YOUR EVALS
62 | TASKS_TABLE = [task]
63 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/asdiv.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Asdiv
 4 | 
 5 | dataset:
 6 | EleutherAI/asdiv
 7 | 
 8 | abstract:
 9 | ASDiv is a dataset for arithmetic reasoning that contains 2,000+ questions
10 | covering addition, subtraction, multiplication, and division.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | math, reasoning
17 | 
18 | paper:
19 | https://arxiv.org/abs/2410.12853
20 | """
21 | 
22 | from inspect_ai.dataset import Sample
23 | from inspect_ai.solver import generate
24 | 
25 | from lighteval.metrics.metrics import Metrics, math_scorer
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 | 
29 | 
30 | def asdiv_prompt(line, task_name: str = None):
31 |     return Doc(
32 |         task_name=task_name,
33 |         query=f"{line['body']}\nQuestion:{line['question']}\nAnswer:",
34 |         choices=line["answer"].split(" (")[0],
35 |         gold_index=[0],
36 |     )
37 | 
38 | 
39 | def record_to_sample(record):
40 |     query = f"{record['body']}\n{record['question']}"
41 |     target = record["answer"].split(" (")[0]
42 |     return Sample(input=query, target=target)
43 | 
44 | 
45 | asdiv = LightevalTaskConfig(
46 |     name="asdiv",
47 |     prompt_function=asdiv_prompt,
48 |     hf_repo="EleutherAI/asdiv",
49 |     hf_subset="asdiv",
50 |     hf_avail_splits=["validation"],
51 |     evaluation_splits=["validation"],
52 |     few_shots_split=None,
53 |     few_shots_select=None,
54 |     generation_size=-1,
55 |     metrics=[Metrics.exact_match],
56 |     stop_sequence=["\n"],
57 |     version=0,
58 |     sample_fields=record_to_sample,
59 |     solver=[generate(cache=True)],
60 |     scorer=math_scorer(),
61 | )
62 | 
63 | TASKS_TABLE = [asdiv]
64 | 


--------------------------------------------------------------------------------
/examples/tasks/bbh.txt:
--------------------------------------------------------------------------------
 1 | lighteval|bigbench:causal_judgment|3
 2 | lighteval|bigbench:date_understanding|3
 3 | lighteval|bigbench:disambiguation_qa|3
 4 | lighteval|bigbench:geometric_shapes|3
 5 | lighteval|bigbench:logical_deduction_five_objects|3
 6 | lighteval|bigbench:logical_deduction_seven_objects|3
 7 | lighteval|bigbench:logical_deduction_three_objects|3
 8 | lighteval|bigbench:movie_recommendation|3
 9 | lighteval|bigbench:navigate|3
10 | lighteval|bigbench:reasoning_about_colored_objects|3
11 | lighteval|bigbench:ruin_names|3
12 | lighteval|bigbench:salient_translation_error_detection|3
13 | lighteval|bigbench:snarks|3
14 | lighteval|bigbench:sports_understanding|3
15 | lighteval|bigbench:temporal_sequences|3
16 | lighteval|bigbench:tracking_shuffled_objects_five_objects|3
17 | lighteval|bigbench:tracking_shuffled_objects_seven_objects|3
18 | lighteval|bigbench:tracking_shuffled_objects_three_objects|3
19 | harness|bigbench:causal_judgment|3
20 | harness|bigbench:date_understanding|3
21 | harness|bigbench:disambiguation_qa|3
22 | harness|bigbench:geometric_shapes|3
23 | harness|bigbench:logical_deduction_five_objects|3
24 | harness|bigbench:logical_deduction_seven_objects|3
25 | harness|bigbench:logical_deduction_three_objects|3
26 | harness|bigbench:movie_recommendation|3
27 | harness|bigbench:navigate|3
28 | harness|bigbench:reasoning_about_colored_objects|3
29 | harness|bigbench:ruin_names|3
30 | harness|bigbench:salient_translation_error_detection|3
31 | harness|bigbench:snarks|3
32 | harness|bigbench:sports_understanding|3
33 | harness|bigbench:temporal_sequences|3
34 | harness|bigbench:tracking_shuffled_objects_five_objects|3
35 | harness|bigbench:tracking_shuffled_objects_seven_objects|3
36 | harness|bigbench:tracking_shuffled_objects_three_objects|3
37 | 


--------------------------------------------------------------------------------
/.github/workflows/tests.yaml:
--------------------------------------------------------------------------------
 1 | name: Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - v*-release
 8 |   pull_request:
 9 |     branches:
10 |       - main
11 | 
12 | jobs:
13 |   run_tests:
14 |     name: Run tests
15 |     runs-on: ubuntu-latest
16 |     steps:
17 |       - name: Checkout repository
18 |         uses: actions/checkout@v4
19 |         with:
20 |           lfs: true
21 | 
22 |       - name: Cache Hugging Face models
23 |         uses: actions/cache@v4
24 |         with:
25 |           path: cache/models
26 |           key: hf-models-${{ runner.os }}-${{ github.ref }}
27 |           restore-keys: hf-models-${{ runner.os }}-
28 | 
29 |       - name: Cache Hugging Face datasets
30 |         uses: actions/cache@v4
31 |         with:
32 |           path: cache/datasets
33 |           key: hf-datasets-${{ runner.os }}-${{ github.ref }}
34 |           restore-keys: hf-datasets-${{ runner.os }}-
35 | 
36 |       - name: Cache uv virtual environment
37 |         uses: actions/cache@v4
38 |         with:
39 |           path: .venv
40 |           key: uv-env-${{ runner.os }}-${{ hashFiles('pyproject.toml') }}
41 |           restore-keys: uv-env-${{ runner.os }}-
42 | 
43 |       - name: Install uv
44 |         uses: astral-sh/setup-uv@v5
45 |         with:
46 |           enable-cache: true
47 | 
48 |       - name: Install the project
49 |         run: uv sync --extra dev
50 | 
51 |       - name: Ensure cache directories exist
52 |         run: mkdir -p cache/models cache/datasets
53 | 
54 |       - name: Run tests
55 |         env:
56 |           HF_TEST_TOKEN: ${{ secrets.HF_TEST_TOKEN }}
57 |           HF_HOME: "cache/models"
58 |           HF_DATASETS_CACHE: "cache/datasets"
59 |         run: uv run pytest -x --disable-pytest-warnings
60 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/twitterAAE.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Twitteraae
 4 | 
 5 | dataset:
 6 | lighteval/twitterAAE
 7 | 
 8 | abstract:
 9 | Demographic Dialectal Variation in Social Media: A Case Study of African-American English
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | language-modeling
16 | 
17 | paper:
18 | https://aclanthology.org/D16-1120/
19 | """
20 | 
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 | 
25 | 
26 | def twitter_aae_prompt(line, task_name: str = None):
27 |     return Doc(task_name=task_name, query=line["tweet"], choices=None, gold_index=None)
28 | 
29 | 
30 | twitterAAE_aa = LightevalTaskConfig(
31 |     name="twitterAAE:aa",
32 |     prompt_function=twitter_aae_prompt,
33 |     hf_repo="lighteval/twitterAAE",
34 |     hf_subset="aa",
35 |     hf_avail_splits=["test"],
36 |     evaluation_splits=["test"],
37 |     few_shots_split=None,
38 |     few_shots_select=None,
39 |     generation_size=-1,
40 |     metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
41 |     stop_sequence=["\n"],
42 |     version=0,
43 | )
44 | 
45 | 
46 | twitterAAE_white = LightevalTaskConfig(
47 |     name="twitterAAE:white",
48 |     prompt_function=twitter_aae_prompt,
49 |     hf_repo="lighteval/twitterAAE",
50 |     hf_subset="white",
51 |     hf_avail_splits=["test"],
52 |     evaluation_splits=["test"],
53 |     few_shots_split=None,
54 |     few_shots_select=None,
55 |     generation_size=-1,
56 |     metrics=[Metrics.word_perplexity, Metrics.byte_perplexity, Metrics.bits_per_byte],
57 |     stop_sequence=["\n"],
58 |     version=0,
59 | )
60 | 
61 | TASKS_TABLE = [
62 |     twitterAAE_aa,
63 |     twitterAAE_white,
64 | ]
65 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/logiqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Logiqa
 4 | 
 5 | dataset:
 6 | lighteval/logiqa_harness
 7 | 
 8 | abstract:
 9 | LogiQA is a machine reading comprehension dataset focused on testing logical
10 | reasoning abilities. It contains 8,678 expert-written multiple-choice questions
11 | covering various types of deductive reasoning. While humans perform strongly,
12 | state-of-the-art models lag far behind, making LogiQA a benchmark for advancing
13 | logical reasoning in NLP systems.
14 | 
15 | languages:
16 | english
17 | 
18 | tags:
19 | qa
20 | 
21 | paper:
22 | https://arxiv.org/abs/2007.08124
23 | """
24 | 
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 | 
29 | 
30 | def logiqa_prompt(line, task_name: str = None):
31 |     query = f"Passage: {line['context']}\nQuestion: {line['question']}\nChoices:\n"
32 |     query += "".join([f"{key}. {choice}\n" for key, choice in zip(["A", "B", "C", "D"], line["options"])])
33 |     query += "Answer:"
34 | 
35 |     return Doc(
36 |         task_name=task_name,
37 |         query=query,
38 |         choices=[f" {c}" for c in line["options"]],
39 |         gold_index=["a", "b", "c", "d"].index(line["label"]),
40 |     )
41 | 
42 | 
43 | logiqa = LightevalTaskConfig(
44 |     name="logiqa",
45 |     prompt_function=logiqa_prompt,
46 |     hf_repo="lighteval/logiqa_harness",
47 |     hf_subset="logiqa",
48 |     hf_avail_splits=["train", "validation", "test"],
49 |     evaluation_splits=["test"],
50 |     few_shots_split=None,
51 |     few_shots_select=None,
52 |     generation_size=-1,
53 |     metrics=[Metrics.loglikelihood_acc],
54 |     stop_sequence=["\n"],
55 |     version=0,
56 | )
57 | 
58 | TASKS_TABLE = [
59 |     logiqa,
60 | ]
61 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/hindi_boolq.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Hindi Boolq
 4 | 
 5 | dataset:
 6 | ai4bharat/boolq-hi
 7 | 
 8 | abstract:
 9 | Hindi Boolq multilingual benchmark.
10 | 
11 | languages:
12 | gujarati, hindi, malayalam, marathi, tamil
13 | 
14 | tags:
15 | classification, multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from langcodes import standardize_tag
21 | 
22 | from lighteval.metrics.dynamic_metrics import (
23 |     LogLikelihoodAccMetric,
24 |     MultilingualQuasiExactMatchMetric,
25 | )
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.templates.boolq import get_boolq_prompt_function
28 | from lighteval.tasks.templates.utils.formulation import (
29 |     CFFormulation,
30 | )
31 | from lighteval.utils.language import Language
32 | 
33 | 
34 | TASKS_TABLE = [
35 |     LightevalTaskConfig(
36 |         name=f"community_boolq_{language.value}",
37 |         prompt_function=get_boolq_prompt_function(
38 |             language,
39 |             lambda line: {
40 |                 "question": line["question"],
41 |                 "answer": line["answer"],
42 |                 "context": line["passage"],
43 |             },
44 |             formulation=CFFormulation(),
45 |         ),
46 |         hf_repo="ai4bharat/boolq-hi",
47 |         hf_subset=standardize_tag(language.value),
48 |         evaluation_splits=("validation",),
49 |         few_shots_split="train",
50 |         generation_size=5,
51 |         stop_sequence=["\n"],
52 |         metrics=[MultilingualQuasiExactMatchMetric(language, "full"), LogLikelihoodAccMetric()],
53 |     )
54 |     for language in [
55 |         Language.HINDI,
56 |         Language.GUJARATI,
57 |         Language.MALAYALAM,
58 |         Language.MARATHI,
59 |         Language.TAMIL,
60 |     ]
61 | ]
62 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/mintaka.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Mintaka
 4 | 
 5 | dataset:
 6 | AmazonScience/mintaka
 7 | 
 8 | abstract:
 9 | Mintaka multilingual benchmark.
10 | 
11 | languages:
12 | arabic, english, french, german, hindi, italian, japanese, portuguese, spanish
13 | 
14 | tags:
15 | knowledge, multilingual, qa
16 | 
17 | paper:
18 | """
19 | 
20 | from langcodes import standardize_tag
21 | 
22 | from lighteval.metrics.dynamic_metrics import (
23 |     MultilingualQuasiExactMatchMetric,
24 |     MultilingualQuasiF1ScoreMetric,
25 | )
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.templates.qa import get_qa_prompt_function
28 | from lighteval.utils.language import Language
29 | 
30 | 
31 | TASKS_TABLE = [
32 |     LightevalTaskConfig(
33 |         name=f"mintaka_{lang.value}",
34 |         prompt_function=get_qa_prompt_function(
35 |             lang,
36 |             lambda line: {
37 |                 "question": line["question"],
38 |                 "choices": [line["answerText"]],
39 |             },
40 |         ),
41 |         hf_repo="AmazonScience/mintaka",
42 |         hf_subset=standardize_tag(lang.value),
43 |         evaluation_splits=("test",),
44 |         few_shots_split="train",
45 |         generation_size=400,
46 |         stop_sequence=("\n",),
47 |         metrics=[
48 |             MultilingualQuasiExactMatchMetric(lang, "prefix"),
49 |             MultilingualQuasiF1ScoreMetric(lang),
50 |         ],
51 |     )
52 |     for lang in [
53 |         Language.ARABIC,
54 |         Language.GERMAN,
55 |         Language.ENGLISH,
56 |         Language.SPANISH,
57 |         Language.FRENCH,
58 |         Language.HINDI,
59 |         Language.ITALIAN,
60 |         Language.JAPANESE,
61 |         Language.PORTUGUESE,
62 |     ]
63 | ]
64 | 


--------------------------------------------------------------------------------
/tests/unit/models/test_base_model.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | from lighteval.models.model_loader import load_model
24 | from lighteval.models.transformers.transformers_model import TransformersModel, TransformersModelConfig
25 | 
26 | 
27 | def test_empty_requests():
28 |     model_config = TransformersModelConfig(
29 |         model_name="hf-internal-testing/tiny-random-LlamaForCausalLM", model_parallel=False, revision="main"
30 |     )
31 |     model: TransformersModel = load_model(config=model_config)
32 | 
33 |     assert model.loglikelihood([]) == []
34 |     assert model.loglikelihood_rolling([]) == []
35 |     assert model.greedy_until([]) == []
36 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/winogrande.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Winogrande
 4 | 
 5 | dataset:
 6 | allenai/winogrande
 7 | 
 8 | abstract:
 9 | WinoGrande is a new collection of 44k problems, inspired by Winograd Schema
10 | Challenge (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the
11 | scale and robustness against the dataset-specific bias. Formulated as a
12 | fill-in-a-blank task with binary options, the goal is to choose the right option
13 | for a given sentence which requires commonsense reasoning.
14 | 
15 | languages:
16 | english
17 | 
18 | tags:
19 | commonsense, multiple-choice
20 | 
21 | paper:
22 | https://arxiv.org/abs/1907.10641
23 | """
24 | 
25 | from lighteval.metrics.metrics import Metrics
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.requests import Doc
28 | 
29 | 
30 | def winogrande_prompt(line, task_name: str = None):
31 |     query, end_of_target = line["sentence"].split("_")
32 |     end_of_target = end_of_target.strip()
33 |     return Doc(
34 |         task_name=task_name,
35 |         query=query,
36 |         choices=[f"{line['option1']} {end_of_target}", f"{line['option2']} {end_of_target}"],
37 |         gold_index=int(line["answer"]) - 1 if line["answer"] != "" else -1,
38 |     )
39 | 
40 | 
41 | winogrande = LightevalTaskConfig(
42 |     name="winogrande",
43 |     prompt_function=winogrande_prompt,
44 |     hf_repo="allenai/winogrande",
45 |     hf_subset="winogrande_xl",
46 |     hf_avail_splits=["train", "test", "validation"],
47 |     evaluation_splits=["validation"],
48 |     few_shots_split=None,
49 |     few_shots_select="random_sampling",
50 |     generation_size=-1,
51 |     metrics=[Metrics.loglikelihood_acc],
52 |     stop_sequence=["\n"],
53 |     version=0,
54 | )
55 | 
56 | TASKS_TABLE = [
57 |     winogrande,
58 | ]
59 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/swag.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Swag
 4 | 
 5 | dataset:
 6 | allenai/swag
 7 | 
 8 | abstract:
 9 | The dataset consists of 113k multiple choice questions about grounded situations
10 | (73k training, 20k validation, 20k test). Each question is a video caption from
11 | LSMDC or ActivityNet Captions, with four answer choices about what might happen
12 | next in the scene. The correct answer is the (real) video caption for the next
13 | event in the video; the three incorrect answers are adversarially generated and
14 | human verified, so as to fool machines but not humans. SWAG aims to be a
15 | benchmark for evaluating grounded commonsense NLI and for learning
16 | representations.
17 | 
18 | languages:
19 | english
20 | 
21 | tags:
22 | narrative, reasoning
23 | 
24 | paper:
25 | https://arxiv.org/abs/1808.05326
26 | """
27 | 
28 | from lighteval.metrics.metrics import Metrics
29 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
30 | from lighteval.tasks.requests import Doc
31 | 
32 | 
33 | def swag_prompt(line, task_name: str = None):
34 |     choices = [line["ending0"], line["ending1"], line["ending2"], line["ending3"]]
35 |     return Doc(
36 |         task_name=task_name,
37 |         query=line["startphrase"],
38 |         choices=choices,
39 |         gold_index=int(line["label"]),
40 |     )
41 | 
42 | 
43 | swag = LightevalTaskConfig(
44 |     name="swag",
45 |     prompt_function=swag_prompt,
46 |     hf_repo="allenai/swag",
47 |     hf_subset="regular",
48 |     hf_avail_splits=["train", "validation"],
49 |     evaluation_splits=["validation"],
50 |     few_shots_split=None,
51 |     few_shots_select=None,
52 |     generation_size=-1,
53 |     metrics=[Metrics.loglikelihood_acc],
54 |     stop_sequence=["\n"],
55 |     version=0,
56 | )
57 | 
58 | TASKS_TABLE = [
59 |     swag,
60 | ]
61 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/avg_at_k_math.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Avg At K Math Test Suite",
 3 |   "description": "Test cases for avg_at_k_math metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Avg at K Math - Correct Math",
 7 |       "metric_class": "avg_at_k_math",
 8 |       "metric_params": {"k": 1},
 9 |       "doc": {
10 |         "query": "What is 2 + 2?",
11 |         "choices": ["4"],
12 |         "gold_index": 0,
13 |         "task_name": "math"
14 |       },
15 |       "model_response": {
16 |         "text": ["4"]
17 |       },
18 |       "expected_output": {
19 |         "avg@k:k=1": 1.0
20 |       },
21 |       "tolerance": 0.01,
22 |       "description": "Test avg at k math with correct math answer"
23 |     },
24 |     {
25 |       "name": "Avg at K Math - Wrong Math",
26 |       "metric_class": "avg_at_k_math",
27 |       "metric_params": {"k": 1},
28 |       "doc": {
29 |         "query": "What is 2 + 2?",
30 |         "choices": ["4"],
31 |         "gold_index": 0,
32 |         "task_name": "math"
33 |       },
34 |       "model_response": {
35 |         "text": ["5"]
36 |       },
37 |       "expected_output": {
38 |         "avg@k:k=1": 0.0
39 |       },
40 |       "tolerance": 0.01,
41 |       "description": "Test avg at k math with wrong math answer"
42 |     },
43 |     {
44 |       "name": "Avg at K Math - Multiple Attempts",
45 |       "metric_class": "avg_at_k_math",
46 |       "metric_params": {"k": 2},
47 |       "doc": {
48 |         "query": "What is 3 * 4?",
49 |         "choices": ["12"],
50 |         "gold_index": 0,
51 |         "task_name": "math"
52 |       },
53 |       "model_response": {
54 |         "text": ["12", "15"]
55 |       },
56 |       "expected_output": {
57 |         "avg@k:k=2": 0.5
58 |       },
59 |       "tolerance": 0.01,
60 |       "description": "Test avg at k math with multiple attempts"
61 |     }
62 |   ]
63 | }
64 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/mgsm.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Mgsm
 4 | 
 5 | dataset:
 6 | juletxara/mgsm
 7 | 
 8 | abstract:
 9 | Mgsm multilingual benchmark.
10 | 
11 | languages:
12 | bengali, chinese, english, french, german, japanese, russian, spanish, swahili,
13 | telugu, thai
14 | 
15 | tags:
16 | math, multilingual, reasoning
17 | 
18 | paper:
19 | """
20 | 
21 | from langcodes import standardize_tag
22 | 
23 | from lighteval.metrics.dynamic_metrics import (
24 |     MultilingualQuasiExactMatchMetric,
25 | )
26 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
27 | from lighteval.tasks.templates.qa import get_qa_prompt_function
28 | from lighteval.utils.language import Language
29 | 
30 | 
31 | TASKS_TABLE = [
32 |     LightevalTaskConfig(
33 |         name=f"mgsm_{language.value}",
34 |         prompt_function=get_qa_prompt_function(
35 |             language,
36 |             lambda line: {
37 |                 "question": line["question"],
38 |                 # The cot is available but we have no use:
39 |                 # line["answer"]
40 |                 "choices": [str(line["answer_number"])],
41 |             },
42 |         ),
43 |         hf_repo="juletxara/mgsm",
44 |         hf_subset=standardize_tag(language.value),
45 |         evaluation_splits=("test",),
46 |         few_shots_split="train",
47 |         generation_size=25,
48 |         metrics=[
49 |             MultilingualQuasiExactMatchMetric(language, "full"),
50 |         ],
51 |         stop_sequence=("\n",),
52 |     )
53 |     for language in [
54 |         Language.ENGLISH,
55 |         Language.SPANISH,
56 |         Language.FRENCH,
57 |         Language.GERMAN,
58 |         Language.RUSSIAN,
59 |         Language.CHINESE,
60 |         Language.JAPANESE,
61 |         Language.THAI,
62 |         Language.SWAHILI,
63 |         Language.BENGALI,
64 |         Language.TELUGU,
65 |     ]
66 | ]
67 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/med_dialog.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Med Dialog
 4 | 
 5 | dataset:
 6 | lighteval/med_dialog
 7 | 
 8 | abstract:
 9 | A collection of medical dialogue datasets.
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | dialog, health, medical
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.metrics import Metrics
21 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
22 | from lighteval.tasks.requests import Doc
23 | 
24 | 
25 | def med_dialog_prompt(line, task_name: str = None):
26 |     return Doc(
27 |         task_name=task_name,
28 |         query=f"###\nArticle:{line['src']}\n\nSummarize the above article in 1 sentence.\n",
29 |         gold_index=0,
30 |         choices=[line["tgt"]],
31 |     )
32 | 
33 | 
34 | med_dialog_healthcaremagic = LightevalTaskConfig(
35 |     name="med_dialog:healthcaremagic",
36 |     prompt_function=med_dialog_prompt,
37 |     hf_repo="lighteval/med_dialog",
38 |     hf_subset="healthcaremagic",
39 |     hf_avail_splits=["train", "test", "validation"],
40 |     evaluation_splits=["validation", "test"],
41 |     few_shots_split=None,
42 |     few_shots_select=None,
43 |     generation_size=128,
44 |     metrics=[
45 |         Metrics.exact_match,
46 |     ],
47 |     stop_sequence=["\n"],
48 |     version=0,
49 | )
50 | 
51 | 
52 | med_dialog_icliniq = LightevalTaskConfig(
53 |     name="med_dialog:icliniq",
54 |     prompt_function=med_dialog_prompt,
55 |     hf_repo="lighteval/med_dialog",
56 |     hf_subset="icliniq",
57 |     hf_avail_splits=["train", "test", "validation"],
58 |     evaluation_splits=["validation", "test"],
59 |     few_shots_split=None,
60 |     few_shots_select=None,
61 |     generation_size=128,
62 |     metrics=[
63 |         Metrics.exact_match,
64 |     ],
65 |     stop_sequence=["\n"],
66 |     version=0,
67 | )
68 | 
69 | TASKS_TABLE = [
70 |     med_dialog_healthcaremagic,
71 |     med_dialog_icliniq,
72 | ]
73 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/pass_at_k_math.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Pass At K Math Test Suite",
 3 |   "description": "Test cases for pass_at_k_math metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Pass at K Math - Correct Math",
 7 |       "metric_class": "pass_at_k_math",
 8 |       "metric_params": {"k": 1, "n": 2},
 9 |       "doc": {
10 |         "query": "What is 2 + 2?",
11 |         "choices": ["4"],
12 |         "gold_index": 0,
13 |         "task_name": "math"
14 |       },
15 |       "model_response": {
16 |         "text": ["4", "5"]
17 |       },
18 |       "expected_output": {
19 |         "pass@k:k=1&n=2": 0.5
20 |       },
21 |       "tolerance": 0.01,
22 |       "description": "Test pass at k math with correct math answer"
23 |     },
24 |     {
25 |       "name": "Pass at K Math - Wrong Math",
26 |       "metric_class": "pass_at_k_math",
27 |       "metric_params": {"k": 1, "n": 2},
28 |       "doc": {
29 |         "query": "What is 2 + 2?",
30 |         "choices": ["4"],
31 |         "gold_index": 0,
32 |         "task_name": "math"
33 |       },
34 |       "model_response": {
35 |         "text": ["5", "6"]
36 |       },
37 |       "expected_output": {
38 |         "pass@k:k=1&n=2": 0.0
39 |       },
40 |       "tolerance": 0.01,
41 |       "description": "Test pass at k math with wrong math answer"
42 |     },
43 |     {
44 |       "name": "Pass at K Math - Multiple Attempts",
45 |       "metric_class": "pass_at_k_math",
46 |       "metric_params": {"k": 2, "n": 3},
47 |       "doc": {
48 |         "query": "What is 3 * 4?",
49 |         "choices": ["12"],
50 |         "gold_index": 0,
51 |         "task_name": "math"
52 |       },
53 |       "model_response": {
54 |         "text": ["10", "12", "15"]
55 |       },
56 |       "expected_output": {
57 |         "pass@k:k=2&n=3": 0.66
58 |       },
59 |       "tolerance": 0.01,
60 |       "description": "Test pass at k math with multiple attempts"
61 |     }
62 |   ]
63 | }
64 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/soqal.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Soqal
 4 | 
 5 | dataset:
 6 | OALL/AlGhafa-Arabic-LLM-Benchmark-Native
 7 | 
 8 | abstract:
 9 | SOQAL: A large-scale Arabic reading comprehension dataset.
10 | 
11 | languages:
12 | arabic
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/1906.05394
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     LogLikelihoodAccMetric,
23 | )
24 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.multilingual.adapters import (
27 |     alghafa_adapter,
28 | )
29 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
30 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
31 | from lighteval.tasks.templates.utils.formulation import (
32 |     CFFormulation,
33 |     HybridFormulation,
34 |     MCFFormulation,
35 | )
36 | from lighteval.utils.language import Language
37 | 
38 | 
39 | TASKS_TABLE = [
40 |     LightevalTaskConfig(
41 |         name=f"soqal_{Language.ARABIC.value}_{formulation.name.lower()}",
42 |         hf_subset="multiple_choice_grounded_statement_soqal_task",
43 |         prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
44 |         evaluation_splits=["test"],
45 |         few_shots_split="validation",
46 |         hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Native",
47 |         metrics=get_metrics_for_formulation(
48 |             formulation,
49 |             [
50 |                 LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
51 |                 LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
52 |             ],
53 |         ),
54 |     )
55 |     for formulation in [
56 |         MCFFormulation(),
57 |         CFFormulation(),
58 |         HybridFormulation(),
59 |     ]
60 | ]
61 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/piqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Piqa
 4 | 
 5 | dataset:
 6 | ybisk/piqa
 7 | 
 8 | abstract:
 9 | PIQA is a benchmark for testing physical commonsense reasoning. It contains
10 | questions requiring this kind of physical commonsense reasoning.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | commonsense, multiple-choice, qa
17 | 
18 | paper:
19 | https://arxiv.org/abs/1911.11641
20 | """
21 | 
22 | from string import ascii_uppercase
23 | 
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | def piqa_prompt(line, task_name: str = None):
30 |     letters = list(ascii_uppercase)[:2]
31 |     query = "The following are multiple choice questions (with answers) about common sense.\n"
32 |     query += f"Question: {line['goal']}\n"
33 |     query += "".join([f"{key}. {choice}\n" for key, choice in zip(letters, [line["sol1"], line["sol2"]])])
34 |     query += "Answer: "
35 | 
36 |     gold_ix = int(line["label"])
37 |     is_few_shots = line.get("__few_shots", False)
38 |     return Doc(
39 |         task_name=task_name,
40 |         query=query,
41 |         choices=letters if not is_few_shots else [line["sol1"], line["sol2"]],
42 |         gold_index=gold_ix,
43 |         instruction="The following are multiple choice questions (with answers) about common sense.\n",
44 |     )
45 | 
46 | 
47 | piqa = LightevalTaskConfig(
48 |     name="piqa",
49 |     prompt_function=piqa_prompt,
50 |     hf_repo="ybisk/piqa",
51 |     hf_subset="plain_text",
52 |     hf_avail_splits=["train", "test", "validation"],
53 |     evaluation_splits=["validation", "test"],
54 |     few_shots_split=None,
55 |     few_shots_select=None,
56 |     generation_size=1,
57 |     metrics=[
58 |         Metrics.exact_match,
59 |     ],
60 |     stop_sequence=["\n"],
61 |     version=0,
62 | )
63 | 
64 | TASKS_TABLE = [
65 |     piqa,
66 | ]
67 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/thai_exams.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Thai Exams
 4 | 
 5 | dataset:
 6 | scb10x/thai_exam
 7 | 
 8 | abstract:
 9 | Thai Exams multilingual benchmark.
10 | 
11 | languages:
12 | thai
13 | 
14 | tags:
15 | knowledge, multilingual, multiple-choice
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.adapters import (
26 |     thai_exams_adapter,
27 | )
28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
30 | from lighteval.tasks.templates.utils.formulation import (
31 |     CFFormulation,
32 |     HybridFormulation,
33 |     MCFFormulation,
34 | )
35 | from lighteval.utils.language import Language
36 | 
37 | 
38 | THAI_EXAMS_SUBSETS = ["a_level", "ic", "onet", "tgat", "tpat1"]
39 | 
40 | 
41 | TASKS_TABLE = [
42 |     LightevalTaskConfig(
43 |         name=f"thai_exams_{Language.THAI.value}_{formulation.name.lower()}:{subset}",
44 |         prompt_function=get_mcq_prompt_function(Language.THAI, thai_exams_adapter, formulation=formulation),
45 |         hf_repo="scb10x/thai_exam",
46 |         hf_subset=subset,
47 |         evaluation_splits=("test",),
48 |         few_shots_split="train",
49 |         metrics=get_metrics_for_formulation(
50 |             formulation,
51 |             [
52 |                 LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
53 |                 LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
54 |             ],
55 |         ),
56 |     )
57 |     for subset in THAI_EXAMS_SUBSETS
58 |     for formulation in [
59 |         MCFFormulation(),
60 |         CFFormulation(),
61 |         HybridFormulation(),
62 |     ]
63 | ]
64 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/hellaswag.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Hellaswag
 4 | 
 5 | dataset:
 6 | Rowan/hellaswag
 7 | 
 8 | abstract:
 9 | HellaSwag is a commonsense inference benchmark designed to challenge language
10 | models with adversarially filtered multiple-choice questions.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | multiple-choice, narrative, reasoning
17 | 
18 | paper:
19 | https://arxiv.org/abs/1905.07830
20 | """
21 | 
22 | from string import ascii_uppercase
23 | 
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | def hellaswag_prompt(line, task_name: str = None):
30 |     query = "The following are multiple choice questions (with answers) about common sense.\n\n"
31 |     query += f"Question: {line['activity_label']}: {line['ctx_a']} {line['ctx_b'].capitalize()}\n"
32 |     query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, line["endings"])])
33 |     query += "Answer:"
34 | 
35 |     gold_ix = int(line["label"]) if line["label"] != "" else -1
36 |     return Doc(
37 |         task_name=task_name,
38 |         query=query,
39 |         choices=[" " + i for i in ascii_uppercase[: len(line["endings"])]],
40 |         gold_index=gold_ix,
41 |         instruction="The following are multiple choice questions (with answers) about common sense.\n\n",
42 |     )
43 | 
44 | 
45 | hellaswag = LightevalTaskConfig(
46 |     name="hellaswag",
47 |     prompt_function=hellaswag_prompt,
48 |     hf_repo="Rowan/hellaswag",
49 |     hf_subset="default",
50 |     hf_avail_splits=["train", "test", "validation"],
51 |     evaluation_splits=["validation"],
52 |     few_shots_split=None,
53 |     few_shots_select=None,
54 |     generation_size=1,
55 |     metrics=[
56 |         Metrics.exact_match,
57 |     ],
58 |     stop_sequence=["\n"],
59 |     version=0,
60 | )
61 | 
62 | TASKS_TABLE = [
63 |     hellaswag,
64 | ]
65 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/storycloze.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Storycloze
 4 | 
 5 | dataset:
 6 | MoE-UNC/story_cloze
 7 | 
 8 | abstract:
 9 | A Corpus and Cloze Evaluation for Deeper Understanding of
10 | Commonsense Stories
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | narrative, reasoning
17 | 
18 | paper:
19 | https://arxiv.org/abs/1604.01696
20 | """
21 | 
22 | from lighteval.metrics.metrics import Metrics
23 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
24 | from lighteval.tasks.requests import Doc
25 | 
26 | 
27 | def storycloze_prompt(line, task_name: str = None):
28 |     context = "\n".join(
29 |         [line["input_sentence_1"], line["input_sentence_2"], line["input_sentence_3"], line["input_sentence_4"]]
30 |     )
31 |     choices = [line["sentence_quiz1"], line["sentence_quiz2"]]
32 |     gold = int(line["answer_right_ending"]) - 1
33 |     return Doc(task_name=task_name, query=context + "\n", choices=choices, gold_index=gold)
34 | 
35 | 
36 | storycloze_2016 = LightevalTaskConfig(
37 |     name="storycloze:2016",
38 |     prompt_function=storycloze_prompt,
39 |     hf_repo="MoE-UNC/story_cloze",
40 |     hf_subset="2016",
41 |     hf_avail_splits=["validation"],
42 |     evaluation_splits=["validation"],
43 |     few_shots_split=None,
44 |     few_shots_select=None,
45 |     generation_size=-1,
46 |     metrics=[Metrics.exact_match],
47 |     stop_sequence=["\n"],
48 |     version=0,
49 | )
50 | 
51 | 
52 | storycloze_2018 = LightevalTaskConfig(
53 |     name="storycloze:2018",
54 |     prompt_function=storycloze_prompt,
55 |     hf_repo="MoE-UNC/story_cloze",
56 |     hf_subset="2018",
57 |     hf_avail_splits=["validation"],
58 |     evaluation_splits=["validation"],
59 |     few_shots_split=None,
60 |     few_shots_select=None,
61 |     generation_size=-1,
62 |     metrics=[Metrics.exact_match],
63 |     stop_sequence=["\n"],
64 |     version=0,
65 | )
66 | 
67 | TASKS_TABLE = [
68 |     storycloze_2016,
69 |     storycloze_2018,
70 | ]
71 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/squad_v2.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Squad V2
 4 | 
 5 | dataset:
 6 | rajpurkar/squad_v2
 7 | 
 8 | abstract:
 9 | Stanford Question Answering Dataset (SQuAD) is a reading comprehension dataset,
10 | consisting of questions posed by crowdworkers on a set of Wikipedia articles,
11 | where the answer to every question is a segment of text, or span, from the
12 | corresponding reading passage, or the question might be unanswerable.
13 | SQuAD 2.0 combines the 100,000 questions in SQuAD1.1 with over 50,000
14 | unanswerable questions written adversarially by crowdworkers to look similar to
15 | answerable ones. To do well on SQuAD2.0, systems must not only answer questions
16 | when possible, but also determine when no answer is supported by the paragraph
17 | and abstain from answering.
18 | 
19 | languages:
20 | english
21 | 
22 | tags:
23 | qa
24 | 
25 | paper:
26 | https://arxiv.org/abs/1806.03822
27 | """
28 | 
29 | from lighteval.metrics.metrics import Metrics
30 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
31 | from lighteval.tasks.templates.qa import get_qa_prompt_function
32 | from lighteval.utils.language import Language
33 | 
34 | 
35 | squad_v2 = LightevalTaskConfig(
36 |     name="squad_v2",
37 |     prompt_function=get_qa_prompt_function(
38 |         Language.ENGLISH,
39 |         lambda line: {
40 |             "question": line["question"],
41 |             "context": line["context"],
42 |             "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
43 |         },
44 |     ),
45 |     hf_repo="rajpurkar/squad_v2",
46 |     hf_subset="squad_v2",
47 |     hf_filter=lambda line: any(ans for ans in line["answers"]["text"] if len(ans) > 0),
48 |     evaluation_splits=("validation",),
49 |     few_shots_split="train",
50 |     stop_sequence=["\n", "Question:", "question:"],
51 |     generation_size=200,
52 |     metrics=[Metrics.exact_match],
53 |     version=1,
54 | )
55 | 
56 | TASKS_TABLE = [
57 |     squad_v2,
58 | ]
59 | 


--------------------------------------------------------------------------------
/tests/unit/models/test_abstract_model.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | from transformers import AutoTokenizer
24 | 
25 | from lighteval.models.dummy.dummy_model import DummyModel, DummyModelConfig
26 | 
27 | 
28 | def test_tok_encode_pair():
29 |     model = DummyModel(config=DummyModelConfig(seed=42))
30 |     model._tokenizer = AutoTokenizer.from_pretrained("facebook/xglm-564M")
31 |     context = "答案："
32 |     continuation = ["1"]
33 |     non_pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=False)
34 |     pairwise_tokens = model.tok_encode_pair(context, continuation, pairwise=True)
35 |     # Non-pairwise merged "：1" to one token
36 |     assert non_pairwise_tokens == ([[6, 47873]], [[34871]])
37 |     # Pairwise separated "：" and "1"
38 |     assert pairwise_tokens == ([[6, 47873, 13]], [[82]])
39 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/hellaswag_tel.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Hellaswag Tel
 4 | 
 5 | dataset:
 6 | LightFury9/hellaswag-telugu
 7 | 
 8 | abstract:
 9 | Hellaswag Tel multilingual benchmark.
10 | 
11 | languages:
12 | telugu
13 | 
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
26 | from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
27 | from lighteval.tasks.templates.utils.formulation import (
28 |     CFFormulation,
29 |     HybridFormulation,
30 |     MCFFormulation,
31 | )
32 | from lighteval.utils.language import Language
33 | 
34 | 
35 | TASKS_TABLE = [
36 |     LightevalTaskConfig(
37 |         name=f"community_hellaswag_{Language.TELUGU.value}_{formulation.name.lower()}",
38 |         prompt_function=get_hellaswag_prompt_function(
39 |             language=Language.TELUGU,
40 |             adapter=lambda line: {
41 |                 "ctx_a": line["ctx_a"],
42 |                 "continuations": line["endings"],
43 |                 "gold_idx": int(line["label"]),
44 |             },
45 |             formulation=formulation,
46 |         ),
47 |         hf_repo="LightFury9/hellaswag-telugu",
48 |         hf_subset="default",
49 |         evaluation_splits=("valid",),
50 |         few_shots_split="train",
51 |         metrics=get_metrics_for_formulation(
52 |             formulation,
53 |             [
54 |                 LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
55 |                 LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
56 |             ],
57 |         ),
58 |     )
59 |     for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
60 | ]
61 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/arabic_arc.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Arabic Arc
 4 | 
 5 | dataset:
 6 | OALL/AlGhafa-Arabic-LLM-Benchmark-Translated
 7 | 
 8 | abstract:
 9 | Arabic Arc multilingual benchmark.
10 | 
11 | languages:
12 | arabic
13 | 
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.adapters import (
26 |     alghafa_adapter,
27 | )
28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
30 | from lighteval.tasks.templates.utils.formulation import (
31 |     CFFormulation,
32 |     HybridFormulation,
33 |     MCFFormulation,
34 | )
35 | from lighteval.utils.language import Language
36 | 
37 | 
38 | TASKS_TABLE = [
39 |     LightevalTaskConfig(
40 |         name=f"alghafa_arc_{Language.ARABIC.value}_{formulation.name.lower()}:easy",
41 |         prompt_function=get_mcq_prompt_function(Language.ARABIC, alghafa_adapter, formulation=formulation),
42 |         hf_repo="OALL/AlGhafa-Arabic-LLM-Benchmark-Translated",
43 |         hf_revision="08663706ee7cab30c4b7dc1bb00042a3227ce1ff",
44 |         hf_subset="arc_easy_ar",
45 |         evaluation_splits=["test"],
46 |         few_shots_split="validation",
47 |         few_shots_select="sequential",
48 |         metrics=get_metrics_for_formulation(
49 |             formulation,
50 |             [
51 |                 LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
52 |                 LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
53 |             ],
54 |         ),
55 |     )
56 |     for formulation in [
57 |         MCFFormulation(),
58 |         CFFormulation(),
59 |         HybridFormulation(),
60 |     ]
61 | ]
62 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/mathqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Mathqa
 4 | 
 5 | dataset:
 6 | allenai/math_qa
 7 | 
 8 | abstract:
 9 | large-scale dataset of math word problems.  Our dataset is gathered by using a
10 | new representation language to annotate over the AQuA-RAT dataset with
11 | fully-specified operational programs.  AQuA-RAT has provided the questions,
12 | options, rationale, and the correct options.
13 | 
14 | languages:
15 | english
16 | 
17 | tags:
18 | math, qa, reasoning
19 | 
20 | paper:
21 | https://arxiv.org/abs/1905.13319
22 | """
23 | 
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | def mathqa_prompt(line, task_name: str = None):
30 |     query = f"Problem: {line['Problem']}\n"
31 |     query += "Options:\n"
32 |     query += "".join(
33 |         [
34 |             f"{key}) {choice}\n"
35 |             for key, choice in zip(
36 |                 ["a", "b", "c", "d", "e"],
37 |                 [line["option_a"], line["option_b"], line["option_c"], line["option_d"], line["option_e"]],
38 |             )
39 |         ]
40 |     )
41 |     query += "Answer:"
42 |     return Doc(
43 |         task_name=task_name,
44 |         query=query,
45 |         choices=[
46 |             f" {c}" for c in [line["option_a"], line["option_b"], line["option_c"], line["option_d"], line["option_e"]]
47 |         ],
48 |         gold_index=["a", "b", "c", "d", "e"].index(line["correct"]),
49 |     )
50 | 
51 | 
52 | mathqa = LightevalTaskConfig(
53 |     name="mathqa",
54 |     prompt_function=mathqa_prompt,
55 |     hf_repo="allenai/math_qa",
56 |     hf_subset="default",
57 |     hf_avail_splits=["train", "validation", "test"],
58 |     evaluation_splits=["test"],
59 |     few_shots_split=None,
60 |     few_shots_select=None,
61 |     generation_size=-1,
62 |     metrics=[Metrics.loglikelihood_acc],
63 |     stop_sequence=["\n"],
64 |     version=0,
65 | )
66 | 
67 | TASKS_TABLE = [
68 |     mathqa,
69 | ]
70 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/triviaqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Triviaqa
 4 | 
 5 | dataset:
 6 | mandarjoshi/trivia_qa
 7 | 
 8 | abstract:
 9 | TriviaqQA is a reading comprehension dataset containing over 650K
10 | question-answer-evidence triples. TriviaqQA includes 95K question-answer pairs
11 | authored by trivia enthusiasts and independently gathered evidence documents,
12 | six per question on average, that provide high quality distant supervision for
13 | answering the questions.
14 | 
15 | languages:
16 | english
17 | 
18 | tags:
19 | qa
20 | 
21 | paper:
22 | https://arxiv.org/abs/1705.03551
23 | """
24 | 
25 | import string
26 | 
27 | from lighteval.metrics.metrics import Metrics
28 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
29 | from lighteval.tasks.requests import Doc
30 | 
31 | 
32 | def triviaqa_prompt(line, task_name: str = None):
33 |     def _remove_prefixes(aliases):
34 |         aliases.sort()
35 |         ret = [aliases[0]]
36 |         for alias in aliases[1:]:
37 |             if not alias.startswith(ret[-1]):
38 |                 ret.append(alias)
39 |         return ret
40 | 
41 |     list_of_candidates = [
42 |         alias.lower().translate(str.maketrans("", "", string.punctuation))
43 |         for alias in _remove_prefixes(line["answer"]["aliases"])
44 |     ]
45 | 
46 |     return Doc(
47 |         task_name=task_name,
48 |         query=f"Question: {line['question']}\nAnswer:",
49 |         gold_index=0,
50 |         choices=[list_of_candidates],
51 |     )
52 | 
53 | 
54 | triviaqa = LightevalTaskConfig(
55 |     name="triviaqa",
56 |     prompt_function=triviaqa_prompt,
57 |     hf_repo="mandarjoshi/trivia_qa",
58 |     hf_subset="rc.nocontext",
59 |     hf_avail_splits=["train", "test", "validation"],
60 |     evaluation_splits=["validation"],
61 |     few_shots_split=None,
62 |     few_shots_select=None,
63 |     generation_size=20,
64 |     metrics=[Metrics.exact_match],
65 |     stop_sequence=["\n", ".", ","],
66 |     version=0,
67 | )
68 | 
69 | TASKS_TABLE = [
70 |     triviaqa,
71 | ]
72 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/utils/task_utils.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | 
24 | from lighteval.metrics.dynamic_metrics import LogLikelihoodAccMetric
25 | from lighteval.metrics.utils.metric_utils import Metric
26 | from lighteval.tasks.templates.utils.formulation import Formulation, MCFFormulation
27 | 
28 | 
29 | def normalize_subset(subset: str) -> str:
30 |     return subset.replace(" ", "_").replace("(", "").replace(")", "").lower()
31 | 
32 | 
33 | def get_metrics_for_formulation(formulation: Formulation, metrics: list[Metric]) -> list[Metric]:
34 |     """Choose the appropriate metrics for the given formulation otherwise fallback to the original metrics."""
35 |     match formulation:
36 |         #
37 |         case MCFFormulation(choice_prefix="Letters"):
38 |             return [LogLikelihoodAccMetric(normalization=None)]
39 |         case _:
40 |             return metrics
41 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/simpleqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Simpleqa
 4 | 
 5 | dataset:
 6 | lighteval/SimpleQA
 7 | 
 8 | abstract:
 9 | A factuality benchmark called SimpleQA that measures the ability for language
10 | models to answer short, fact-seeking questions.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | factuality, general-knowledge, qa
17 | 
18 | paper:
19 | https://openai.com/index/introducing-simpleqa/
20 | 
21 | starred:
22 | true
23 | """
24 | 
25 | from inspect_ai.dataset import Sample
26 | from inspect_ai.scorer import model_graded_fact
27 | from inspect_ai.solver import generate
28 | 
29 | from lighteval.metrics.metrics import Metrics
30 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
31 | from lighteval.tasks.requests import Doc
32 | 
33 | 
34 | def simpleqa_prompt(line, task_name: str = None):
35 |     query = f"Question: {line['question']}\n"
36 |     query += "".join(
37 |         [f"\n{key}. {choice}" for key, choice in zip(["A", "B", "C", "D", "E", "F"], line["choices"]["text"])]
38 |     )
39 |     query += "\nAnswer:"
40 |     return Doc(
41 |         task_name=task_name,
42 |         query=query,
43 |         choices=line["choices"]["text"],
44 |         gold_index=line["choices"]["label"].index(line["answerKey"]),
45 |     )
46 | 
47 | 
48 | def record_to_sample(record):
49 |     query = record["problem"]
50 |     target = record["answer"]
51 |     return Sample(input=query, target=target)
52 | 
53 | 
54 | simpleqa = LightevalTaskConfig(
55 |     name="simpleqa",
56 |     prompt_function=simpleqa_prompt,
57 |     hf_repo="lighteval/SimpleQA",
58 |     hf_subset="default",
59 |     hf_avail_splits=["test"],
60 |     evaluation_splits=["test"],
61 |     few_shots_split="few_shot",
62 |     few_shots_select=None,
63 |     generation_size=2048,
64 |     metrics=[Metrics.exact_match],
65 |     stop_sequence=["\n"],
66 |     version=0,
67 |     sample_fields=record_to_sample,
68 |     solver=[generate(cache=True)],
69 |     scorer=model_graded_fact(),
70 | )
71 | 
72 | TASKS_TABLE = [
73 |     simpleqa,
74 | ]
75 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/avg_at_k.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Avg At K Test Suite",
 3 |   "description": "Test cases for avg_at_k metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "Avg at K - Correct in Top K",
 7 |       "metric_class": "avg_at_k",
 8 |       "metric_params": {"k": 2},
 9 |       "doc": {
10 |         "query": "What is the capital of France?",
11 |         "choices": ["London", "Paris", "Berlin"],
12 |         "gold_index": 1,
13 |         "task_name": "geography"
14 |       },
15 |       "model_response": {
16 |         "text": ["Paris", "London", "Berlin"]
17 |       },
18 |       "expected_output": {
19 |         "avg@k:k=2": 0.5
20 |       },
21 |       "tolerance": 0.01,
22 |       "description": "Test avg at k with correct answer in top k"
23 |     },
24 |     {
25 |       "name": "Avg at K - Not in Top K",
26 |       "metric_class": "avg_at_k",
27 |       "metric_params": {"k": 1},
28 |       "doc": {
29 |         "query": "What is the capital of France?",
30 |         "choices": ["London", "Paris", "Berlin"],
31 |         "gold_index": 1,
32 |         "task_name": "geography"
33 |       },
34 |       "model_response": {
35 |         "text": ["London", "Berlin", "Paris"]
36 |       },
37 |       "expected_output": {
38 |         "avg@k:k=1": 0.0
39 |       },
40 |       "tolerance": 0.01,
41 |       "description": "Test avg at k with correct answer not in top k"
42 |     },
43 |     {
44 |       "name": "Avg at K - Multiple Correct",
45 |       "metric_class": "avg_at_k",
46 |       "metric_params": {"k": 3},
47 |       "doc": {
48 |         "query": "Which are European capitals?",
49 |         "choices": ["London", "Paris", "Tokyo", "Berlin"],
50 |         "gold_index": [0, 1, 3],
51 |         "task_name": "geography"
52 |       },
53 |       "model_response": {
54 |         "text": ["Paris", "London", "Berlin", "Tokyo"]
55 |       },
56 |       "expected_output": {
57 |         "avg@k:k=3": 0.33
58 |       },
59 |       "tolerance": 0.01,
60 |       "description": "Test avg at k with multiple correct answers"
61 |     }
62 |   ]
63 | }
64 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/entity_data_imputation.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Entity Data Imputation
 4 | 
 5 | dataset:
 6 | lighteval/Buy, lighteval/Restaurant
 7 | 
 8 | abstract:
 9 | Scenario that tests the ability to impute missing entities in a data table.
10 | 
11 | languages:
12 | english
13 | 
14 | tags:
15 | reasoning
16 | 
17 | paper:
18 | https://ieeexplore.ieee.org/document/9458712
19 | """
20 | 
21 | from lighteval.metrics.metrics import Metrics
22 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
23 | from lighteval.tasks.requests import Doc
24 | 
25 | 
26 | def entity_data_imputation_prompt(line, task_name: str = None):
27 |     return Doc(
28 |         task_name=task_name,
29 |         query=f"What is the missing value?\n{line['text']}\nAnswer:",
30 |         choices=[line["gold"]],
31 |         gold_index=0,
32 |         instruction="What is the missing value?\n",
33 |     )
34 | 
35 | 
36 | entity_data_imputation_Buy = LightevalTaskConfig(
37 |     name="entity_data_imputation:Buy",
38 |     prompt_function=entity_data_imputation_prompt,
39 |     hf_repo="lighteval/Buy",
40 |     hf_subset="default",
41 |     hf_avail_splits=["train", "test", "valid"],
42 |     evaluation_splits=["valid", "test"],
43 |     few_shots_split=None,
44 |     few_shots_select=None,
45 |     generation_size=5,
46 |     metrics=[
47 |         Metrics.exact_match,
48 |     ],
49 |     stop_sequence=["\n"],
50 |     version=0,
51 | )
52 | 
53 | 
54 | entity_data_imputation_Restaurant = LightevalTaskConfig(
55 |     name="entity_data_imputation:Restaurant",
56 |     prompt_function=entity_data_imputation_prompt,
57 |     hf_repo="lighteval/Restaurant",
58 |     hf_subset="default",
59 |     hf_avail_splits=["train"],
60 |     evaluation_splits=["train"],
61 |     few_shots_split=None,
62 |     few_shots_select=None,
63 |     generation_size=5,
64 |     metrics=[
65 |         Metrics.exact_match,
66 |     ],
67 |     stop_sequence=["\n"],
68 |     version=0,
69 | )
70 | 
71 | TASKS_TABLE = [
72 |     entity_data_imputation_Buy,
73 |     entity_data_imputation_Restaurant,
74 | ]
75 | 


--------------------------------------------------------------------------------
/tests/unit/metrics/test_cases/drop.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "name": "Drop Test Suite",
 3 |   "description": "Test cases for drop metric",
 4 |   "test_cases": [
 5 |     {
 6 |       "name": "DROP - Correct Answer",
 7 |       "metric_class": "drop",
 8 |       "metric_params": {},
 9 |       "doc": {
10 |         "query": "What is 2 + 2?",
11 |         "specific": {
12 |           "golds_no_preprocessing": ["4"]
13 |         },
14 |         "choices": ["4"],
15 |         "gold_index": 0,
16 |         "task_name": "math"
17 |       },
18 |       "model_response": {
19 |         "text": ["4"]
20 |       },
21 |       "expected_output": {
22 |         "em": 1.0,
23 |         "f1": 1.0
24 |       },
25 |       "tolerance": 0.01,
26 |       "description": "Test DROP with correct answer"
27 |     },
28 |     {
29 |       "name": "DROP - Wrong Answer",
30 |       "metric_class": "drop",
31 |       "metric_params": {},
32 |       "doc": {
33 |         "query": "What is 2 + 2?",
34 |         "specific": {
35 |           "golds_no_preprocessing": ["4"]
36 |         },
37 |         "choices": ["4"],
38 |         "gold_index": 0,
39 |         "task_name": "math"
40 |       },
41 |       "model_response": {
42 |         "text": ["5"]
43 |       },
44 |       "expected_output": {
45 |         "em": 0.0,
46 |         "f1": 0.0
47 |       },
48 |       "tolerance": 0.01,
49 |       "description": "Test DROP with wrong answer"
50 |     },
51 |     {
52 |       "name": "DROP - Partial Match",
53 |       "metric_class": "drop",
54 |       "metric_params": {},
55 |       "doc": {
56 |         "query": "What is the sum of 2 and 2?",
57 |         "specific": {
58 |           "golds_no_preprocessing": ["4", "four"]
59 |         },
60 |         "choices": ["4", "four"],
61 |         "gold_index": 0,
62 |         "task_name": "math"
63 |       },
64 |       "model_response": {
65 |         "text": ["4"]
66 |       },
67 |       "expected_output": {
68 |         "em": 1.0,
69 |         "f1": 1.0
70 |       },
71 |       "tolerance": 0.01,
72 |       "description": "Test DROP with partial match"
73 |     }
74 |   ]
75 | }
76 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/hellaswag_hin.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Hellaswag Hin
 4 | 
 5 | dataset:
 6 | ai4bharat/hellaswag-hi
 7 | 
 8 | abstract:
 9 | Hellaswag Hin multilingual benchmark.
10 | 
11 | languages:
12 | hindi
13 | 
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 | 
17 | paper:
18 | """
19 | 
20 | from lighteval.metrics.dynamic_metrics import (
21 |     LogLikelihoodAccMetric,
22 | )
23 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
24 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
25 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
26 | from lighteval.tasks.templates.hellaswag import get_hellaswag_prompt_function
27 | from lighteval.tasks.templates.utils.formulation import (
28 |     CFFormulation,
29 |     HybridFormulation,
30 |     MCFFormulation,
31 | )
32 | from lighteval.utils.language import Language
33 | 
34 | 
35 | TASKS_TABLE = [
36 |     LightevalTaskConfig(
37 |         name=f"community_hellaswag_{Language.HINDI.value}_{formulation.name.lower()}",
38 |         prompt_function=get_hellaswag_prompt_function(
39 |             language=Language.HINDI,
40 |             adapter=lambda line: {
41 |                 "ctx_a": line["ctx_a"],
42 |                 "continuations": line["endings"],
43 |                 "gold_idx": int(line["label"]),
44 |             },
45 |             formulation=formulation,
46 |         ),
47 |         hf_repo="ai4bharat/hellaswag-hi",
48 |         hf_filter=lambda line: all(len(choice.strip()) > 0 for choice in line["endings"]),
49 |         hf_subset="hi",
50 |         evaluation_splits=("validation",),
51 |         few_shots_split="validation",
52 |         metrics=get_metrics_for_formulation(
53 |             formulation,
54 |             [
55 |                 LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
56 |                 LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
57 |             ],
58 |         ),
59 |     )
60 |     for formulation in [MCFFormulation(), CFFormulation(), HybridFormulation()]
61 | ]
62 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/covid_dialogue.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Covid Dialogue
 4 | 
 5 | dataset:
 6 | lighteval/covid_dialogue
 7 | 
 8 | abstract:
 9 | The COVID-19 Dialogue dataset is a collection of 500+ dialogues between
10 | doctors and patients during the COVID-19 pandemic.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | dialog, medical
17 | 
18 | paper:
19 | https://arxiv.org/abs/2004.06561
20 | """
21 | 
22 | from inspect_ai.dataset import Sample
23 | from inspect_ai.scorer import model_graded_fact
24 | from inspect_ai.solver import generate, system_message
25 | 
26 | from lighteval.metrics.metrics import Metrics
27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
28 | from lighteval.tasks.requests import Doc
29 | 
30 | 
31 | PROMPT = "Generate a response given a patient's questions and concerns."
32 | 
33 | 
34 | def covid_dialogue_prompt(line, task_name: str = None):
35 |     return Doc(
36 |         task_name=task_name,
37 |         query=f"Generate a response given a patient's questions and concerns.\nPatient: {line['query']}\nDoctor: ",
38 |         choices=[line["answer"]],
39 |         gold_index=0,
40 |         instruction="Generate a response given a patient's questions and concerns.\n",
41 |     )
42 | 
43 | 
44 | def record_to_sample(record):
45 |     query = record["query"]
46 |     target = record["answer"]
47 |     return Sample(input=query, target=target)
48 | 
49 | 
50 | covid_dialogue = LightevalTaskConfig(
51 |     name="covid_dialogue",
52 |     prompt_function=covid_dialogue_prompt,
53 |     hf_repo="lighteval/covid_dialogue",
54 |     hf_subset="default",
55 |     hf_avail_splits=["train", "test", "validation"],
56 |     evaluation_splits=["validation", "test"],
57 |     few_shots_split=None,
58 |     few_shots_select=None,
59 |     generation_size=128,
60 |     metrics=[Metrics.exact_match],
61 |     stop_sequence=["\n"],
62 |     version=0,
63 |     sample_fields=record_to_sample,
64 |     solver=[system_message(PROMPT), generate(cache=True)],
65 |     scorer=model_graded_fact(),
66 | )
67 | 
68 | TASKS_TABLE = [
69 |     covid_dialogue,
70 | ]
71 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/tydiqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Tydiqa
 4 | 
 5 | dataset:
 6 | google-research-datasets/tydiqa
 7 | 
 8 | abstract:
 9 | Other QA tasks for RC TyDi QA: A benchmark for information-seeking question answering in typologically diverse languages. https://arxiv.org/abs/2003.05002
10 | 
11 | languages:
12 | arabic, bengali, english, finnish, indonesian, japanese, korean, russian, swahili, telugu, thai
13 | 
14 | tags:
15 | multilingual, qa
16 | 
17 | paper:
18 | https://arxiv.org/abs/2003.05002
19 | """
20 | 
21 | from lighteval.metrics.dynamic_metrics import (
22 |     MultilingualQuasiExactMatchMetric,
23 |     MultilingualQuasiF1ScoreMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"tydiqa_{language.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             language,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 "context": line["context"],
38 |                 "choices": [ans for ans in line["answers"]["text"] if len(ans) > 0],
39 |             },
40 |         ),
41 |         hf_repo="google-research-datasets/tydiqa",
42 |         hf_subset="secondary_task",
43 |         evaluation_splits=("validation",),
44 |         few_shots_split="train",
45 |         generation_size=400,
46 |         stop_sequence=("\n",),
47 |         metrics=(
48 |             MultilingualQuasiExactMatchMetric(language, "prefix"),
49 |             MultilingualQuasiF1ScoreMetric(language),
50 |         ),
51 |     )
52 |     for language in [
53 |         Language.ENGLISH,
54 |         Language.ARABIC,
55 |         Language.BENGALI,
56 |         Language.FINNISH,
57 |         Language.INDONESIAN,
58 |         Language.JAPANESE,
59 |         Language.KOREAN,
60 |         Language.SWAHILI,
61 |         Language.RUSSIAN,
62 |         Language.TELUGU,
63 |         Language.THAI,
64 |     ]
65 | ]
66 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/templates/utils/adapter_utils.py:
--------------------------------------------------------------------------------
 1 | # MIT License
 2 | 
 3 | # Copyright (c) 2024 The HuggingFace Team
 4 | 
 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | # of this software and associated documentation files (the "Software"), to deal
 7 | # in the Software without restriction, including without limitation the rights
 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | # copies of the Software, and to permit persons to whom the Software is
10 | # furnished to do so, subject to the following conditions:
11 | 
12 | # The above copyright notice and this permission notice shall be included in all
13 | # copies or substantial portions of the Software.
14 | 
15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | # SOFTWARE.
22 | 
23 | 
24 | from typing import Any, Callable, Mapping, TypeVar
25 | 
26 | 
27 | AdapterReturnTypeVar = TypeVar("AdapterReturnTypeVar")
28 | 
29 | 
30 | def create_adapter_from_dict(
31 |     adapter: Mapping[str, Any] | Callable[[dict], AdapterReturnTypeVar],
32 | ) -> Callable[[dict], AdapterReturnTypeVar]:
33 |     """Creates adapter function for the template input from a dict.
34 | 
35 |     Args:
36 |         adapter: Dict of the form {key: value} where value is key in the input dict to get.
37 | 
38 |     Returns:
39 |         Callable[[dict], AdapterReturnTypeVar]: A function that adapts dictionary input to the expected format
40 |     """
41 |     if not isinstance(adapter, Mapping):
42 |         return adapter
43 | 
44 |     def adapter_fn(line: dict):
45 |         return {key: line[value] for key, value in adapter.items()}
46 | 
47 |     return adapter_fn  # type: ignore
48 | 


--------------------------------------------------------------------------------
/docs/source/_toctree.yml:
--------------------------------------------------------------------------------
 1 | - sections:
 2 |   - local: index
 3 |     title: 🤗 Lighteval
 4 |   - local: installation
 5 |     title: Installation
 6 |   - local: quicktour
 7 |     title: Quicktour
 8 |   title: Getting started
 9 | - sections:
10 |   - local: inspect-ai
11 |     title: Examples using Inspect-AI
12 |   - local: saving-and-reading-results
13 |     title: Save and read results
14 |   - local: caching
15 |     title: Caching
16 |   - local: using-the-python-api
17 |     title: Use the Python API
18 |   - local: adding-a-custom-task
19 |     title: Add a custom task
20 |   - local: adding-a-new-metric
21 |     title: Add a custom metric
22 |   - local: evaluating-a-custom-model
23 |     title: Evaluate a custom model
24 |   - local: use-inference-providers-as-backend
25 |     title: Use HF's inference providers as backend
26 |   - local: use-litellm-as-backend
27 |     title: Use litellm as backend
28 |   - local: use-vllm-as-backend
29 |     title: Use vllm as backend
30 |   - local: use-sglang-as-backend
31 |     title: Use SGLang as backend
32 |   - local: use-huggingface-inference-endpoints-or-tgi-as-backend
33 |     title: Use Hugging Face inference endpoints or TGI as backend
34 |   - local: contributing-to-multilingual-evaluations
35 |     title: Contributing to multilingual evaluations
36 |   title: Guides
37 | - sections:
38 |   - local: metric-list
39 |     title: Available Metrics
40 |   - local: available-tasks
41 |     title: Available Tasks
42 |   title: API
43 | - sections:
44 |   - sections:
45 |     - local: package_reference/evaluation_tracker
46 |       title: EvaluationTracker
47 |     - local: package_reference/models
48 |       title: Model Configs
49 |     - local: package_reference/pipeline
50 |       title: Pipeline
51 |     title: Main classes
52 |   - local: package_reference/metrics
53 |     title: Metrics
54 |   - local: package_reference/tasks
55 |     title: Tasks
56 |   - local: package_reference/logging
57 |     title: Logging
58 |   - local: package_reference/models_outputs
59 |     title: ModelResponse
60 |   - local: package_reference/doc
61 |     title: Doc
62 |   title: Reference
63 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/afri_mgsm.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Afri Mgsm
 4 | 
 5 | dataset:
 6 | masakhane/afrimgsm
 7 | 
 8 | abstract:
 9 | African MGSM: MGSM for African Languages
10 | 
11 | languages:
12 | amharic, ewe, french, hausa, igbo, kinyarwanda, lingala, luganda, oromo, shona,
13 | sotho, swahili, twi, wolof, xhosa, yoruba, zulu
14 | 
15 | tags:
16 | math, multilingual, reasoning
17 | 
18 | paper:
19 | https://arxiv.org/abs/2406.03368.
20 | """
21 | 
22 | from lighteval.metrics.dynamic_metrics import (
23 |     MultilingualQuasiExactMatchMetric,
24 | )
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.templates.qa import get_qa_prompt_function
27 | from lighteval.utils.language import Language
28 | 
29 | 
30 | TASKS_TABLE = [
31 |     LightevalTaskConfig(
32 |         name=f"afri_mgsm_{language.value}",
33 |         prompt_function=get_qa_prompt_function(
34 |             language,
35 |             lambda line: {
36 |                 "question": line["question"],
37 |                 # The cot is available but we have no use:
38 |                 # line["answer"]
39 |                 "choices": [str(line["answer_number"])],
40 |             },
41 |         ),
42 |         hf_repo="masakhane/afrimgsm",
43 |         hf_subset=language.value,
44 |         evaluation_splits=("test",),
45 |         few_shots_split="train",
46 |         generation_size=25,
47 |         metrics=[
48 |             MultilingualQuasiExactMatchMetric(language, "full"),
49 |         ],
50 |         stop_sequence=("\n",),
51 |     )
52 |     for language in [
53 |         Language.AMHARIC,
54 |         # Language.EWE,
55 |         Language.FRENCH,
56 |         # Language.HAUSA,
57 |         # Language.IGBO,
58 |         # Language.KINYARWANDA,
59 |         # Language.LINGALA,
60 |         # Language.LUGANDA,
61 |         # Language.OROMO,
62 |         # Language.SHONA,
63 |         # Language.SOTHO,
64 |         Language.SWAHILI,
65 |         # Language.TWI,
66 |         # Language.WOLOF,
67 |         # Language.XHOSA,
68 |         Language.YORUBA,
69 |         # Language.ZULU,
70 |     ]
71 | ]
72 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/babi_qa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Babi Qa
 4 | 
 5 | dataset:
 6 | facebook/babi_qa
 7 | 
 8 | abstract:
 9 | The bAbI benchmark for measuring understanding and reasoning, evaluates reading
10 | comprehension via question answering.
11 | 
12 | languages:
13 | english
14 | 
15 | tags:
16 | qa, reasoning
17 | 
18 | paper:
19 | https://arxiv.org/abs/1502.05698
20 | """
21 | 
22 | import json
23 | 
24 | from lighteval.metrics.metrics import Metrics
25 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
26 | from lighteval.tasks.requests import Doc
27 | 
28 | 
29 | # TODO: clean dataset and convert to inspect-ai
30 | 
31 | 
32 | def babi_qa_prompt(line, task_name: str = None):
33 |     def process_path(path: str) -> str:
34 |         steps = path.split(",")
35 |         directions = {"s": "south", "n": "north", "e": "east", "w": "west"}
36 |         path = " ".join([directions[step] for step in steps])
37 |         return path
38 | 
39 |     if isinstance(line["story"], dict):
40 |         line = line["story"]
41 |     else:
42 |         line = json.loads(line["story"])
43 | 
44 |     results = []
45 |     story = []
46 |     for type, text, answer in zip(line["type"], line["text"], line["answer"]):
47 |         if type == "supporting fact":
48 |             story.append(text)
49 |         elif type == "question":
50 |             text = text.replace("_", process_path(answer))
51 |             query = "\n".join(story) + f"\nQuestion: {text}\nAnswer: "
52 |             results.append(Doc(task_name=task_name, query=query, choices=[answer], gold_index=0))
53 |             story = []
54 |     return results
55 | 
56 | 
57 | babi_qa = LightevalTaskConfig(
58 |     name="babi_qa",
59 |     prompt_function=babi_qa_prompt,
60 |     hf_repo="facebook/babi_qa",
61 |     hf_subset="en-valid-qa1",
62 |     hf_avail_splits=["train", "test", "validation"],
63 |     evaluation_splits=["validation", "test"],
64 |     few_shots_split=None,
65 |     few_shots_select=None,
66 |     generation_size=-1,
67 |     metrics=[Metrics.exact_match],
68 |     stop_sequence=["\n"],
69 |     version=0,
70 | )
71 | 
72 | TASKS_TABLE = [babi_qa]
73 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/tasks/openbookqa.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Openbookqa
 4 | 
 5 | dataset:
 6 | allenai/openbookqa
 7 | 
 8 | abstract:
 9 | OpenBookQA is a question-answering dataset modeled after open-book exams for
10 | assessing human understanding of a subject. It contains multiple-choice
11 | questions that require combining facts from a given open book with broad common
12 | knowledge. The task tests language models' ability to leverage provided
13 | information and apply common sense reasoning.
14 | 
15 | languages:
16 | english
17 | 
18 | tags:
19 | multiple-choice, qa
20 | 
21 | paper:
22 | https://arxiv.org/abs/1809.02789
23 | """
24 | 
25 | from string import ascii_uppercase
26 | 
27 | from lighteval.metrics.metrics import Metrics
28 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
29 | from lighteval.tasks.requests import Doc
30 | 
31 | 
32 | def openbookqa_prompt(line, task_name: str = None):
33 |     query = "The following are multiple choice questions (with answers) about common sense.\n"
34 |     query += f"Question: {line['question_stem']}\n"
35 |     query += "".join([f"{key}. {choice}\n" for key, choice in zip(ascii_uppercase, line["choices"]["text"])])
36 |     query += "Answer: "
37 | 
38 |     gold_ix = ["A", "B", "C", "D", "E"].index(line["answerKey"].strip())
39 |     return Doc(
40 |         task_name=task_name,
41 |         query=query,
42 |         choices=list(ascii_uppercase[: len(line["choices"]["text"])]),
43 |         gold_index=gold_ix,
44 |         instruction="The following are multiple choice questions (with answers) about common sense.\n",
45 |     )
46 | 
47 | 
48 | openbookqa = LightevalTaskConfig(
49 |     name="openbookqa",
50 |     prompt_function=openbookqa_prompt,
51 |     hf_repo="allenai/openbookqa",
52 |     hf_subset="main",
53 |     hf_avail_splits=["train", "test", "validation"],
54 |     evaluation_splits=["validation", "test"],
55 |     few_shots_split=None,
56 |     few_shots_select=None,
57 |     generation_size=1,
58 |     metrics=[
59 |         Metrics.exact_match,
60 |     ],
61 |     stop_sequence=["\n"],
62 |     version=0,
63 | )
64 | 
65 | TASKS_TABLE = [
66 |     openbookqa,
67 | ]
68 | 


--------------------------------------------------------------------------------
/docs/source/caching.mdx:
--------------------------------------------------------------------------------
 1 | # Caching System
 2 | 
 3 | Lighteval includes a caching system that can significantly speed up evaluations by storing and reusing model predictions.
 4 | This is especially useful when running the same evaluation multiple times, or comparing different evaluation metrics on the same model outputs.
 5 | 
 6 | ## How It Works
 7 | 
 8 | The caching system caches the predictions of the model for now (we will add tokenized input caching later).
 9 | It stores model responses objects (generations, logits, probabilities) for evaluation samples.
10 | 
11 | ### Cache Structure
12 | 
13 | Cached data is stored on disk using HuggingFace datasets in the following structure:
14 | 
15 | ```
16 | .cache/
17 | └── huggingface/
18 |     └── lighteval/
19 |         └── predictions/
20 |             └── {model_name}/
21 |                 └── {model_hash}/
22 |                     └── {task_name}.parquet
23 | ```
24 | 
25 | Where:
26 | - `model_name`: The model name (path on the hub or local path)
27 | - `model_hash`: Hash of the model configuration to ensure cache invalidation when parameters change
28 | - `task_name`: Name of the evaluation task
29 | 
30 | ### Cache Recreation
31 | 
32 | A new cache is automatically created when:
33 | - Model configuration changes (different parameters, quantization, etc.)
34 | - Model weights change (different revision, checkpoint, etc.)
35 | - Generation parameters change (temperature, max_tokens, etc.)
36 | 
37 | This ensures that cached results are always consistent with your current model setup.
38 | 
39 | ## Using Caching
40 | 
41 | ### Automatic Caching
42 | 
43 | All built-in model classes in Lighteval automatically support caching. No additional configuration is needed.
44 | For custom models you need to add a cache to the model class and decorators on all functions.
45 | 
46 | ## Cache Management
47 | 
48 | ### Clearing Cache
49 | 
50 | To clear cache for a specific model, delete the corresponding directory:
51 | 
52 | ```bash
53 | rm -rf ~/.cache/huggingface/lighteval/predictions/{model_name}/{model_hash}/
54 | ```
55 | 
56 | To clear all caches:
57 | 
58 | ```bash
59 | rm -rf ~/.cache/huggingface/lighteval/predictions
60 | ```
61 | 


--------------------------------------------------------------------------------
/src/lighteval/tasks/multilingual/tasks/openbook_es.py:
--------------------------------------------------------------------------------
 1 | """
 2 | name:
 3 | Openbook Es
 4 | 
 5 | dataset:
 6 | BSC-LT/openbookqa-es
 7 | 
 8 | abstract:
 9 | Spanish version of OpenBookQA from BSC Language Technology group
10 | 
11 | languages:
12 | spanish
13 | 
14 | tags:
15 | multilingual, multiple-choice, reasoning
16 | 
17 | paper:
18 | https://huggingface.co/datasets/BSC-LT/openbookqa-es
19 | """
20 | 
21 | from string import ascii_uppercase
22 | 
23 | from lighteval.metrics.dynamic_metrics import (
24 |     LogLikelihoodAccMetric,
25 | )
26 | from lighteval.metrics.normalizations import LogProbCharNorm, LogProbTokenNorm
27 | from lighteval.tasks.lighteval_task import LightevalTaskConfig
28 | from lighteval.tasks.multilingual.utils.task_utils import get_metrics_for_formulation
29 | from lighteval.tasks.templates.multichoice import get_mcq_prompt_function
30 | from lighteval.tasks.templates.utils.formulation import (
31 |     CFFormulation,
32 |     HybridFormulation,
33 |     MCFFormulation,
34 | )
35 | from lighteval.utils.language import Language
36 | 
37 | 
38 | TASKS_TABLE = [
39 |     LightevalTaskConfig(
40 |         name=f"openbookqa_{Language.SPANISH.value}_{formulation.name.lower()}",
41 |         prompt_function=get_mcq_prompt_function(
42 |             Language.SPANISH,
43 |             lambda line: {
44 |                 "question": line["question_stem"],
45 |                 "choices": line["choices"]["text"],
46 |                 "gold_idx": ascii_uppercase.index(line["answerKey"]),
47 |             },
48 |             formulation=formulation,
49 |         ),
50 |         hf_repo="BSC-LT/openbookqa-es",
51 |         hf_subset="default",
52 |         evaluation_splits=("test",),
53 |         few_shots_split="validation",
54 |         metrics=get_metrics_for_formulation(
55 |             formulation,
56 |             [
57 |                 LogLikelihoodAccMetric(normalization=LogProbTokenNorm()),
58 |                 LogLikelihoodAccMetric(normalization=LogProbCharNorm()),
59 |             ],
60 |         ),
61 |     )
62 |     for formulation in [
63 |         MCFFormulation(),
64 |         CFFormulation(),
65 |         HybridFormulation(),
66 |     ]
67 | ]
68 | 


--------------------------------------------------------------------------------