├── .gitignore ├── LICENSE ├── README.md ├── assembly └── ca_module.py ├── categories.py ├── datautils.py ├── disassembly └── cd_module.py ├── eval.py ├── imgs ├── llama_1_results.png ├── llama_2_results.png └── qllm.png ├── lm_eval ├── __init__.py ├── base.py ├── datasets │ ├── README.md │ ├── __init__.py │ ├── asdiv │ │ ├── __init__.py │ │ ├── asdiv.py │ │ └── dataset_infos.json │ ├── coqa │ │ ├── __init__.py │ │ ├── coqa.py │ │ └── dataset_infos.json │ ├── drop │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── drop.py │ ├── headqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── headqa.py │ ├── hendrycks_ethics │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_ethics.py │ ├── hendrycks_math │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_math.py │ ├── logiqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── logiqa.py │ ├── mutual │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── mutual.py │ ├── pile │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── pile.py │ ├── quac │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── quac.py │ ├── sat_analogies │ │ ├── __init__.py │ │ └── sat_analogies.py │ ├── triviaqa │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── triviaqa.py │ └── unscramble │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── unscramble.py ├── decontamination │ ├── __init__.py │ ├── archiver.py │ ├── decontaminate.py │ └── janitor.py ├── evaluator copy.py ├── evaluator.py ├── metrics.py ├── models │ ├── __init__.py │ ├── dummy.py │ ├── gpt2.py │ ├── gpt3.py │ ├── huggingface.py │ └── textsynth.py ├── tasks │ ├── __init__.py │ ├── anli.py │ ├── arc.py │ ├── arithmetic.py │ ├── asdiv.py │ ├── blimp.py │ ├── cbt.py │ ├── coqa.py │ ├── crowspairs.py │ ├── drop.py │ ├── glue.py │ ├── gsm8k.py │ ├── headqa.py │ ├── hellaswag.py │ ├── hendrycks_ethics.py │ ├── hendrycks_math.py │ ├── hendrycks_test.py │ ├── lambada.py │ ├── lambada_cloze.py │ ├── lambada_multilingual.py │ ├── logiqa.py │ ├── mathqa.py │ ├── mc_taco.py │ ├── mutual.py │ ├── naturalqs.py │ ├── openbookqa.py │ ├── pile.py │ ├── piqa.py │ ├── prost.py │ ├── pubmedqa.py │ ├── qa4mre.py │ ├── qasper.py │ ├── quac.py │ ├── race.py │ ├── sat.py │ ├── sciq.py │ ├── squad.py │ ├── storycloze.py │ ├── superglue.py │ ├── swag.py │ ├── toxigen.py │ ├── translation.py │ ├── triviaqa.py │ ├── truthfulqa.py │ ├── unscramble.py │ ├── webqs.py │ ├── wikitext.py │ ├── winogrande.py │ └── wsc273.py └── utils.py ├── main.py ├── models ├── LMClass.py ├── int_llama_layer.py ├── int_opt_layer.py ├── int_qllm_llama_layer.py ├── models_utils.py └── transformation.py ├── parallel_utils.py ├── pyproject.toml ├── quantize ├── __init__.py ├── int_linear.py ├── int_linear_lora.py ├── int_matmul.py ├── learnable_norm.py ├── qllm.py └── quantizer.py ├── reassembly └── cr_module.py ├── scripts ├── llama-13b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-2-13b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-2-70b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-2-7b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-30b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-65b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh └── llama-7b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── train_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | build 3 | dist 4 | *.txt 5 | *.pt 6 | *egg-info* 7 | tmp 8 | output 9 | *.pyc 10 | .idea 11 | *.zip 12 | cache/ 13 | temp/ 14 | checkpoints/ 15 | 
huggingface/ 16 | log/ 17 | act_scales/ 18 | act_shifts/ 19 | temp.sh 20 | output/ 21 | .vscode/ 22 | plot/ 23 | wandb/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models (ICLR 2024) 2 | 3 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | [![arXiv](https://img.shields.io/badge/QLLM-2310.08041-b31b1b.svg)](https://arxiv.org/abs/2310.08041) 5 | 6 | This is the official PyTorch implementation of [QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models](https://arxiv.org/abs/2310.08041). 7 | 8 | By [Jing Liu](https://jing-liu.com/), [Ruihao Gong](https://xhplus.github.io/), [Xiuying Wei](https://wimh966.github.io/), [Zhiwei Dong](https://zwdong.com.cn/), [Jianfei Cai](https://jianfei-cai.github.io/), and [Bohan Zhuang](https://bohanzhuang.github.io/). 9 | 10 | ![qllm](imgs/qllm.png) 11 | 12 | We propose QLLM, an accurate and efficient low-bitwidth post-training quantization method designed for LLMs. 13 | 14 | ## 📰 News 15 | - [10-03-2024] Released the code! 🌟 16 | - [17-01-2024] QLLM is accepted at ICLR 2024! 👏 17 | 18 | ## 📖 Contents 19 | - [Install](#🛠-install) 20 | - [Usage](#⚙️-usage) 21 | - [Results](#📋-results) 22 | - [Citation](#📝-citation) 23 | - [License](#🧾-license) 24 | - [Acknowledgement](#🙏-acknowledgement) 25 | 26 | ## 🛠 Install 27 | ``` 28 | conda create -n qllm python=3.10 -y 29 | conda activate qllm 30 | git clone https://github.com/ModelTC/QLLM 31 | cd QLLM 32 | pip install --upgrade pip 33 | pip install -e . 34 | ``` 35 | 36 | ## ⚙️ Usage 37 | We provide the training scripts in the `scripts` folder. For example, to perform W4A8 quantization for LLaMA-7B, run 38 | ``` 39 | sh scripts/llama-7b/w4a8.sh 40 | ``` 41 | Remember to change the model path `model` and the output path `output_dir` in the script. 42 | 43 | ## 📋 Results 44 | * QLLM achieves SoTA performance in weight-activation quantization 45 | 46 | ![weight_activation_llama_1](imgs/llama_1_results.png) 47 | ![weight_activation_llama_2](imgs/llama_2_results.png) 48 | 49 | ## 📝 Citation 50 | If you find our `QLLM` useful in your research, please consider citing the following paper: 51 | ``` 52 | @inproceedings{liu2024qllm, 53 | title = {{QLLM}: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models}, 54 | author = {Liu, Jing and Gong, Ruihao and Wei, Xiuying and Dong, Zhiwei and Cai, Jianfei and Zhuang, Bohan}, 55 | booktitle = {International Conference on Learning Representations (ICLR)}, 56 | year = {2024}, 57 | } 58 | ``` 59 | 60 | ## 🧾 License 61 | This repository is released under the Apache 2.0 license as found in the [LICENSE](./LICENSE) file. 62 | 63 | ## 🙏 Acknowledgement 64 | This repository is built upon [OmniQuant](https://github.com/OpenGVLab/OmniQuant). We thank the authors for open-sourcing their code.
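As an illustrative aside (not part of the repository files), the channel disassembly used by QLLM and implemented in `disassembly/cd_module.py` can be sketched in a few lines of PyTorch: an outlier input channel is split into k copies scaled by 1/k, and the matching weight column is duplicated, so the layer output is preserved exactly. The tensor sizes, the outlier index, and the split factor k below are arbitrary toy values chosen for the sketch.

```
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)    # toy activations: 4 tokens, 8 input channels
w = torch.randn(16, 8)   # toy linear weight: 16 output channels, 8 input channels

outlier, k = 3, 4        # assume channel 3 is an outlier; split it into 4 sub-channels

# Disassemble the activation channel into k copies, each scaled by 1/k.
x_split = torch.cat(
    [x[:, :outlier], x[:, outlier:outlier + 1].repeat(1, k) / k, x[:, outlier + 1:]], dim=1
)
# Duplicate the matching weight column so the matrix product is unchanged.
w_split = torch.cat(
    [w[:, :outlier], w[:, outlier:outlier + 1].repeat(1, k), w[:, outlier + 1:]], dim=1
)

print(torch.allclose(x @ w.t(), x_split @ w_split.t(), atol=1e-6))  # -> True
```

`CDModule.forward` realizes the same idea with `torch.repeat_interleave` and precomputed per-channel scaling factors, after `find_outlier_channels` has selected a threshold that keeps the number of additional channels within the budget set by `channel_ratio`.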
-------------------------------------------------------------------------------- /categories.py: -------------------------------------------------------------------------------- 1 | subcategories = { 2 | "abstract_algebra": ["math"], 3 | "anatomy": ["health"], 4 | "astronomy": ["physics"], 5 | "business_ethics": ["business"], 6 | "clinical_knowledge": ["health"], 7 | "college_biology": ["biology"], 8 | "college_chemistry": ["chemistry"], 9 | "college_computer_science": ["computer science"], 10 | "college_mathematics": ["math"], 11 | "college_medicine": ["health"], 12 | "college_physics": ["physics"], 13 | "computer_security": ["computer science"], 14 | "conceptual_physics": ["physics"], 15 | "econometrics": ["economics"], 16 | "electrical_engineering": ["engineering"], 17 | "elementary_mathematics": ["math"], 18 | "formal_logic": ["philosophy"], 19 | "global_facts": ["other"], 20 | "high_school_biology": ["biology"], 21 | "high_school_chemistry": ["chemistry"], 22 | "high_school_computer_science": ["computer science"], 23 | "high_school_european_history": ["history"], 24 | "high_school_geography": ["geography"], 25 | "high_school_government_and_politics": ["politics"], 26 | "high_school_macroeconomics": ["economics"], 27 | "high_school_mathematics": ["math"], 28 | "high_school_microeconomics": ["economics"], 29 | "high_school_physics": ["physics"], 30 | "high_school_psychology": ["psychology"], 31 | "high_school_statistics": ["math"], 32 | "high_school_us_history": ["history"], 33 | "high_school_world_history": ["history"], 34 | "human_aging": ["health"], 35 | "human_sexuality": ["culture"], 36 | "international_law": ["law"], 37 | "jurisprudence": ["law"], 38 | "logical_fallacies": ["philosophy"], 39 | "machine_learning": ["computer science"], 40 | "management": ["business"], 41 | "marketing": ["business"], 42 | "medical_genetics": ["health"], 43 | "miscellaneous": ["other"], 44 | "moral_disputes": ["philosophy"], 45 | "moral_scenarios": ["philosophy"], 46 | "nutrition": ["health"], 47 | "philosophy": ["philosophy"], 48 | "prehistory": ["history"], 49 | "professional_accounting": ["other"], 50 | "professional_law": ["law"], 51 | "professional_medicine": ["health"], 52 | "professional_psychology": ["psychology"], 53 | "public_relations": ["politics"], 54 | "security_studies": ["politics"], 55 | "sociology": ["culture"], 56 | "us_foreign_policy": ["politics"], 57 | "virology": ["health"], 58 | "world_religions": ["philosophy"], 59 | } 60 | 61 | categories = { 62 | "STEM": [ 63 | "physics", 64 | "chemistry", 65 | "biology", 66 | "computer science", 67 | "math", 68 | "engineering", 69 | ], 70 | "humanities": ["history", "philosophy", "law"], 71 | "social sciences": ["politics", "culture", "economics", "geography", "psychology"], 72 | "other (business, health, misc.)": ["other", "business", "health"], 73 | } 74 | -------------------------------------------------------------------------------- /disassembly/cd_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CDModule(nn.Module): 6 | def __init__(self, channel_ratio): 7 | super().__init__() 8 | self.channel_ratio = channel_ratio 9 | self.outlier_channel_idx = None 10 | self.num_disassembly = None 11 | self.scaling_factors = None 12 | self.num_additional_channels = 0 13 | 14 | def find_threshold_uniform(self, x_max): 15 | x_max = x_max.float() 16 | num_channels = x_max.numel() 17 | channel_constraint = int(num_channels * self.channel_ratio) 18 | 
channelmax_max = x_max.max() 19 | channelmax_min = x_max.min() 20 | 21 | th = channelmax_max 22 | step_num = max(100, int(channelmax_max / 0.5)) 23 | step = (channelmax_max - channelmax_min) / step_num 24 | while th >= channelmax_min: 25 | num_disassembly = torch.ceil(x_max / th) 26 | num_disassembly = torch.clamp(num_disassembly, min=1.0) 27 | num_additional_channels = num_disassembly.int().sum().item() - num_channels 28 | if num_additional_channels > channel_constraint: 29 | th += step 30 | break 31 | else: 32 | th -= step 33 | print("Find threshold {} using uniform method".format(th)) 34 | return th 35 | 36 | def find_outlier_channels(self, x_min, x_max): 37 | with torch.no_grad(): 38 | x_max = torch.maximum(x_min.abs(), x_max) 39 | th = self.find_threshold_uniform(x_max) 40 | outlier_channel_idx = (x_max > th).nonzero().view(-1) 41 | num_disassembly = torch.ceil(x_max / th) 42 | num_disassembly = torch.clamp(num_disassembly, min=1.0) 43 | scaling_factors = (1.0 / num_disassembly).repeat_interleave(num_disassembly.int()) 44 | if len(outlier_channel_idx) != 0: 45 | del self.outlier_channel_idx 46 | del self.num_disassembly 47 | del self.scaling_factors 48 | self.register_buffer("outlier_channel_idx", outlier_channel_idx) 49 | self.register_buffer("num_disassembly", num_disassembly) 50 | self.register_buffer("scaling_factors", scaling_factors) 51 | 52 | def forward(self, x): 53 | if self.outlier_channel_idx is not None: 54 | if x.ndim == 2: 55 | x = x.unsqueeze(0) 56 | B, N, C = x.shape 57 | x = x.view(B * N, C) 58 | x = torch.repeat_interleave(x, self.num_disassembly.int(), dim=1) 59 | x = x * self.scaling_factors.unsqueeze(0) 60 | C = x.shape[1] 61 | x = x.view(B, N, C) 62 | return x 63 | -------------------------------------------------------------------------------- /imgs/llama_1_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/imgs/llama_1_results.png -------------------------------------------------------------------------------- /imgs/llama_2_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/imgs/llama_2_results.png -------------------------------------------------------------------------------- /imgs/qllm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/imgs/qllm.png -------------------------------------------------------------------------------- /lm_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/README.md: -------------------------------------------------------------------------------- 1 | # datasets 2 | 3 | This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. 
For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not. 4 | 5 | __NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path. 6 | 7 | 8 | __WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed. 9 | -------------------------------------------------------------------------------- /lm_eval/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/asdiv/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/asdiv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ASDIV dataset.""" 15 | 16 | 17 | import os 18 | import xml.etree.ElementTree as ET 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | _DESCRIPTION = """\ 35 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 36 | patterns and problem types) English math word problem (MWP) corpus for evaluating 37 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 38 | remain limited either in language usage patterns or in problem types. We thus present 39 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 40 | types taught in elementary school. Each MWP is annotated with its problem type and grade 41 | level (for indicating the level of difficulty). 
42 | """ 43 | 44 | _HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset" 45 | 46 | # TODO: Add the licence for the dataset here if you can find it 47 | _LICENSE = "" 48 | 49 | _URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip" 50 | 51 | 52 | class ASDiv(datasets.GeneratorBasedBuilder): 53 | """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers""" 54 | 55 | VERSION = datasets.Version("0.0.1") 56 | 57 | BUILDER_CONFIGS = [ 58 | datasets.BuilderConfig( 59 | name="asdiv", 60 | version=VERSION, 61 | description="A diverse corpus for evaluating and developing english math word problem solvers", 62 | ) 63 | ] 64 | 65 | def _info(self): 66 | features = datasets.Features( 67 | { 68 | "body": datasets.Value("string"), 69 | "question": datasets.Value("string"), 70 | "solution_type": datasets.Value("string"), 71 | "answer": datasets.Value("string"), 72 | "formula": datasets.Value("string"), 73 | } 74 | ) 75 | return datasets.DatasetInfo( 76 | description=_DESCRIPTION, 77 | features=features, 78 | homepage=_HOMEPAGE, 79 | license=_LICENSE, 80 | citation=_CITATION, 81 | ) 82 | 83 | def _split_generators(self, dl_manager): 84 | urls = _URLS 85 | data_dir = dl_manager.download_and_extract(urls) 86 | base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50" 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.VALIDATION, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "filepath": os.path.join( 93 | data_dir, base_filepath, "dataset", "ASDiv.xml" 94 | ), 95 | "split": datasets.Split.VALIDATION, 96 | }, 97 | ), 98 | ] 99 | 100 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 101 | def _generate_examples(self, filepath, split): 102 | tree = ET.parse(filepath) 103 | root = tree.getroot() 104 | for key, problem in enumerate(root.iter("Problem")): 105 | yield key, { 106 | "body": problem.find("Body").text, 107 | "question": problem.find("Question").text, 108 | "solution_type": problem.find("Solution-Type").text, 109 | "answer": problem.find("Answer").text, 110 | "formula": problem.find("Formula").text, 111 | } 112 | -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. 
Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/coqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. 
Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/drop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/drop/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/drop/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"drop": 
{"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/headqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/headqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/headqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. 
The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_ethics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/hendrycks_ethics/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_math/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/hendrycks_math/__init__.py -------------------------------------------------------------------------------- 
/lm_eval/datasets/hendrycks_math/hendrycks_math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """MATH dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | import pathlib 20 | 21 | import datasets 22 | 23 | 24 | _CITATION = """\ 25 | @article{hendrycksmath2021, 26 | title={Measuring Mathematical Problem Solving With the Math Dataset}, 27 | author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, 28 | journal={NeurIPS}, 29 | year={2021} 30 | } 31 | """ 32 | 33 | _DESCRIPTION = """\ 34 | MATH is a dataset of 12,500 challenging competition mathematics problems. Each 35 | problem in Math has a full step-by-step solution which can be used to teach 36 | models to generate answer derivations and explanations. 37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/hendrycks/math" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" 45 | 46 | _NAMES = [ 47 | "algebra", 48 | "counting_and_probability", 49 | "geometry", 50 | "intermediate_algebra", 51 | "number_theory", 52 | "prealgebra", 53 | "precalculus", 54 | ] 55 | 56 | 57 | class HendrycksMath(datasets.GeneratorBasedBuilder): 58 | """MATH is a dataset of 12,500 challenging competition mathematics problems.""" 59 | 60 | VERSION = datasets.Version("0.0.1") 61 | 62 | BUILDER_CONFIGS = [ 63 | datasets.BuilderConfig(name=name, version=version, description=name) 64 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 65 | ] 66 | 67 | def _info(self): 68 | features = datasets.Features( 69 | { 70 | "problem": datasets.Value("string"), 71 | "level": datasets.Value("string"), 72 | "type": datasets.Value("string"), 73 | "solution": datasets.Value("string"), 74 | } 75 | ) 76 | return datasets.DatasetInfo( 77 | description=_DESCRIPTION, 78 | features=features, 79 | homepage=_HOMEPAGE, 80 | license=_LICENSE, 81 | citation=_CITATION, 82 | ) 83 | 84 | def _split_generators(self, dl_manager): 85 | urls = _URLS 86 | data_dir = dl_manager.download_and_extract(urls) 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.TRAIN, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "basepath": os.path.join( 93 | data_dir, "MATH", "train", self.config.name 94 | ), 95 | "split": "train", 96 | }, 97 | ), 98 | datasets.SplitGenerator( 99 | name=datasets.Split.TEST, 100 | # These kwargs will be passed to _generate_examples 101 | gen_kwargs={ 102 | "basepath": os.path.join( 103 | data_dir, "MATH", "test", self.config.name 104 | ), 105 | "split": "test", 106 | }, 107 | ), 108 | ] 109 | 110 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 111 | def _generate_examples(self, 
basepath, split): 112 | key = 0 113 | for file in sorted(pathlib.Path(basepath).iterdir()): 114 | with open(file, "r", encoding="utf-8") as f: 115 | data = json.load(f) 116 | yield key, { 117 | "problem": data["problem"], 118 | "level": data["level"], 119 | "type": data["type"], 120 | "solution": data["solution"], 121 | } 122 | key += 1 123 | -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/logiqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/logiqa.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """LogiQA dataset.""" 15 | 16 | 17 | import datasets 18 | 19 | 20 | _CITATION = """\ 21 | @misc{liu2020logiqa, 22 | title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 23 | author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, 24 | year={2020}, 25 | eprint={2007.08124}, 26 | archivePrefix={arXiv}, 27 | primaryClass={cs.CL} 28 | } 29 | """ 30 | 31 | _DESCRIPTION = """\ 32 | LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA 33 | instances, covering multiple types of deductive reasoning. Results show that state- 34 | of-the-art neural models perform by far worse than human ceiling. The dataset can 35 | also serve as a benchmark for reinvestigating logical AI under the deep learning 36 | NLP setting. 37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/lgw863/LogiQA-dataset" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = { 45 | "train": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt", 46 | "validation": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt", 47 | "test": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt", 48 | } 49 | 50 | 51 | class Logiqa(datasets.GeneratorBasedBuilder): 52 | """LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning""" 53 | 54 | VERSION = datasets.Version("0.0.1") 55 | 56 | BUILDER_CONFIGS = [ 57 | datasets.BuilderConfig( 58 | name="logiqa", version=VERSION, description="The LogiQA dataset." 
59 | ), 60 | ] 61 | 62 | def _info(self): 63 | features = datasets.Features( 64 | { 65 | "label": datasets.Value("string"), 66 | "context": datasets.Value("string"), 67 | "question": datasets.Value("string"), 68 | "options": datasets.features.Sequence(datasets.Value("string")), 69 | } 70 | ) 71 | return datasets.DatasetInfo( 72 | description=_DESCRIPTION, 73 | features=features, 74 | homepage=_HOMEPAGE, 75 | license=_LICENSE, 76 | citation=_CITATION, 77 | ) 78 | 79 | def _split_generators(self, dl_manager): 80 | urls = { 81 | "train": _URLS["train"], 82 | "test": _URLS["test"], 83 | "validation": _URLS["validation"], 84 | } 85 | data_dir = dl_manager.download_and_extract(urls) 86 | return [ 87 | datasets.SplitGenerator( 88 | name=datasets.Split.TRAIN, 89 | # These kwargs will be passed to _generate_examples 90 | gen_kwargs={ 91 | "filepath": data_dir["train"], 92 | "split": "train", 93 | }, 94 | ), 95 | datasets.SplitGenerator( 96 | name=datasets.Split.TEST, 97 | # These kwargs will be passed to _generate_examples 98 | gen_kwargs={"filepath": data_dir["test"], "split": "test"}, 99 | ), 100 | datasets.SplitGenerator( 101 | name=datasets.Split.VALIDATION, 102 | # These kwargs will be passed to _generate_examples 103 | gen_kwargs={ 104 | "filepath": data_dir["validation"], 105 | "split": "validation", 106 | }, 107 | ), 108 | ] 109 | 110 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 111 | def _generate_examples(self, filepath, split): 112 | def normalize(text): 113 | return text.replace(".", ". ").strip() 114 | 115 | with open(filepath, encoding="utf-8") as f: 116 | data = f.read().strip().split("\n\n") 117 | for key, row in enumerate(data): 118 | example = row.split("\n") 119 | yield key, { 120 | "label": example[0].strip(), 121 | "context": normalize(example[1]), 122 | "question": normalize(example[2]), 123 | "options": [normalize(option[2:]) for option in example[3:]], 124 | } 125 | -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/mutual/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": 
"mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/mutual.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """MuTual dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | from pathlib import Path 20 | 21 | import datasets 22 | 23 | 24 | _CITATION = """\ 25 | @inproceedings{mutual, 26 | title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", 27 | author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , 28 | booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", 29 | year = "2020", 30 | publisher = "Association for Computational Linguistics", 31 | } 32 | """ 33 | 34 | _DESCRIPTION = """\ 35 | MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is 36 | modified from Chinese high school English listening comprehension test data. 37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/Nealcly/MuTual" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = "https://github.com/Nealcly/MuTual/archive/master.zip" 45 | 46 | 47 | class Mutual(datasets.GeneratorBasedBuilder): 48 | """MuTual: A Dataset for Multi-Turn Dialogue Reasoning""" 49 | 50 | VERSION = datasets.Version("0.0.1") 51 | 52 | BUILDER_CONFIGS = [ 53 | datasets.BuilderConfig( 54 | name="mutual", version=VERSION, description="The MuTual dataset." 55 | ), 56 | datasets.BuilderConfig( 57 | name="mutual_plus", 58 | version=VERSION, 59 | description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", 60 | ), 61 | ] 62 | 63 | def _info(self): 64 | features = datasets.Features( 65 | { 66 | "answers": datasets.Value("string"), 67 | "options": datasets.features.Sequence(datasets.Value("string")), 68 | "article": datasets.Value("string"), 69 | "id": datasets.Value("string"), 70 | } 71 | ) 72 | return datasets.DatasetInfo( 73 | description=f"{_DESCRIPTION}\n{self.config.description}", 74 | features=features, 75 | homepage=_HOMEPAGE, 76 | license=_LICENSE, 77 | citation=_CITATION, 78 | ) 79 | 80 | def _split_generators(self, dl_manager): 81 | urls = _URLS 82 | data_dir = dl_manager.download_and_extract(urls) 83 | return [ 84 | datasets.SplitGenerator( 85 | name=datasets.Split.TRAIN, 86 | # These kwargs will be passed to _generate_examples 87 | gen_kwargs={ 88 | "basepath": os.path.join( 89 | data_dir, "MuTual-master", "data", self.config.name, "train" 90 | ), 91 | "split": "train", 92 | }, 93 | ), 94 | datasets.SplitGenerator( 95 | name=datasets.Split.TEST, 96 | # These kwargs will be passed to _generate_examples 97 | gen_kwargs={ 98 | "basepath": os.path.join( 99 | data_dir, "MuTual-master", "data", self.config.name, "test" 100 | ), 101 | "split": "test", 102 | }, 103 | ), 104 | datasets.SplitGenerator( 105 | name=datasets.Split.VALIDATION, 106 | # These kwargs will be passed to _generate_examples 107 | gen_kwargs={ 108 | "basepath": os.path.join( 109 | data_dir, "MuTual-master", "data", self.config.name, "dev" 110 | ), 111 | "split": "dev", 112 | }, 113 | ), 114 | ] 115 | 116 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 117 | def _generate_examples(self, basepath, split): 118 | # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. 119 | # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
120 | key = 0 121 | for file in sorted(Path(basepath).iterdir()): 122 | if file.suffix != ".txt": 123 | continue 124 | with open(file, "r", encoding="utf-8") as f: 125 | data_str = f.read() 126 | # Ignore the occasional empty file. 127 | if not data_str: 128 | continue 129 | data = json.loads(data_str) 130 | yield key, { 131 | "answers": data["answers"], 132 | "options": data["options"], 133 | "article": data["article"], 134 | "id": data["id"], 135 | } 136 | key += 1 137 | -------------------------------------------------------------------------------- /lm_eval/datasets/pile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/pile/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/pile/pile.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Pile dataset.""" 15 | 16 | 17 | import json 18 | 19 | import datasets 20 | 21 | 22 | _CITATION = """\ 23 | @article{pile, 24 | title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling}, 25 | author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor}, 26 | journal={arXiv preprint arXiv:2101.00027}, 27 | year={2020} 28 | } 29 | """ 30 | 31 | _DESCRIPTION = """\ 32 | The Pile is a 825 GiB diverse, open source language modeling data set that consists 33 | of 22 smaller, high-quality datasets combined together. To score well on Pile 34 | BPB (bits per byte), a model must be able to understand many disparate domains 35 | including books, github repositories, webpages, chat logs, and medical, physics, 36 | math, computer science, and philosophy papers. 
37 | """ 38 | 39 | _HOMEPAGE = "https://pile.eleuther.ai/" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = { 45 | "validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst", 46 | "test": "https://the-eye.eu/public/AI/pile/test.jsonl.zst", 47 | } 48 | 49 | _NAMES = { 50 | "pile_arxiv": "ArXiv", 51 | "pile_books3": "Books3", 52 | "pile_bookcorpus2": "BookCorpus2", 53 | "pile_dm-mathematics": "DM Mathematics", 54 | "pile_enron": "Enron Emails", 55 | "pile_europarl": "EuroParl", 56 | "pile_freelaw": "FreeLaw", 57 | "pile_github": "Github", 58 | "pile_gutenberg": "Gutenberg (PG-19)", 59 | "pile_hackernews": "HackerNews", 60 | "pile_nih-exporter": "NIH ExPorter", 61 | "pile_opensubtitles": "OpenSubtitles", 62 | "pile_openwebtext2": "OpenWebText2", 63 | "pile_philpapers": "PhilPapers", 64 | "pile_pile-cc": "Pile-CC", 65 | "pile_pubmed-abstracts": "PubMed Abstracts", 66 | "pile_pubmed-central": "PubMed Central", 67 | "pile_stackexchange": "StackExchange", 68 | "pile_upsto": "USPTO Backgrounds", 69 | "pile_ubuntu-irc": "Ubuntu IRC", 70 | "pile_wikipedia": "Wikipedia (en)", 71 | "pile_youtubesubtitles": "YoutubeSubtitles", 72 | } 73 | 74 | 75 | class Pile(datasets.GeneratorBasedBuilder): 76 | """The Pile is a 825 GiB diverse, open source language modeling dataset.""" 77 | 78 | VERSION = datasets.Version("0.0.1") 79 | 80 | BUILDER_CONFIGS = [ 81 | datasets.BuilderConfig(name=name, version=version, description=_NAMES[name]) 82 | for name, version in zip(_NAMES.keys(), [VERSION] * len(_NAMES)) 83 | ] 84 | 85 | def _info(self): 86 | features = datasets.Features( 87 | { 88 | "text": datasets.Value("string"), 89 | } 90 | ) 91 | return datasets.DatasetInfo( 92 | description=f"{_DESCRIPTION}\n{self.config.description}", 93 | features=features, 94 | homepage=_HOMEPAGE, 95 | license=_LICENSE, 96 | citation=_CITATION, 97 | ) 98 | 99 | def _split_generators(self, dl_manager): 100 | urls = {"validation": _URLS["validation"], "test": _URLS["test"]} 101 | data_dir = dl_manager.download_and_extract(urls) 102 | return [ 103 | datasets.SplitGenerator( 104 | name=datasets.Split.TEST, 105 | # These kwargs will be passed to _generate_examples 106 | gen_kwargs={"filepath": data_dir["test"], "split": "test"}, 107 | ), 108 | datasets.SplitGenerator( 109 | name=datasets.Split.VALIDATION, 110 | # These kwargs will be passed to _generate_examples 111 | gen_kwargs={ 112 | "filepath": data_dir["validation"], 113 | "split": "validation", 114 | }, 115 | ), 116 | ] 117 | 118 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 119 | def _generate_examples(self, filepath, split): 120 | with open(filepath, encoding="utf-8") as f: 121 | for key, row in enumerate(f): 122 | data = json.loads(row) 123 | if data["meta"]["pile_set_name"] == _NAMES[self.config.name]: 124 | yield key, { 125 | "text": data["text"], 126 | } 127 | -------------------------------------------------------------------------------- /lm_eval/datasets/quac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/quac/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/quac/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, 
understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/quac/quac.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # TODO: Address all TODOs and remove all explanatory comments 15 | """QuAC dataset.""" 16 | 17 | 18 | import json 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @article{choi2018quac, 25 | title={Quac: Question answering in context}, 26 | author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, 27 | journal={arXiv preprint arXiv:1808.07036}, 28 | year={2018} 29 | } 30 | """ 31 | 32 | _DESCRIPTION = """\ 33 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 34 | participating in information seeking dialog. 
Data instances consist of an interactive 35 | dialog between two crowd workers: (1) a student who poses a sequence of freeform 36 | questions to learn as much as possible about a hidden Wikipedia text, and (2) 37 | a teacher who answers the questions by providing short excerpts (spans) from the text. 38 | """ 39 | 40 | _HOMEPAGE = "https://quac.ai/" 41 | 42 | # TODO: Add the licence for the dataset here if you can find it 43 | _LICENSE = "" 44 | 45 | _URLS = { 46 | "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json", 47 | "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json", 48 | } 49 | 50 | 51 | class Quac(datasets.GeneratorBasedBuilder): 52 | """Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog.""" 53 | 54 | VERSION = datasets.Version("1.1.0") 55 | 56 | BUILDER_CONFIGS = [ 57 | datasets.BuilderConfig( 58 | name="quac", version=VERSION, description="The QuAC dataset" 59 | ), 60 | ] 61 | 62 | def _info(self): 63 | features = datasets.Features( 64 | { 65 | "title": datasets.Value("string"), 66 | "section_title": datasets.Value("string"), 67 | "paragraph": datasets.Value("string"), 68 | "question": datasets.Value("string"), 69 | "answer": datasets.Value("string"), 70 | } 71 | ) 72 | return datasets.DatasetInfo( 73 | description=_DESCRIPTION, 74 | features=features, 75 | homepage=_HOMEPAGE, 76 | license=_LICENSE, 77 | citation=_CITATION, 78 | ) 79 | 80 | def _split_generators(self, dl_manager): 81 | urls = {"train": _URLS["train"], "validation": _URLS["validation"]} 82 | data_dir = dl_manager.download_and_extract(urls) 83 | return [ 84 | datasets.SplitGenerator( 85 | name=datasets.Split.TRAIN, 86 | # These kwargs will be passed to _generate_examples 87 | gen_kwargs={ 88 | "filepath": data_dir["train"], 89 | "split": "train", 90 | }, 91 | ), 92 | datasets.SplitGenerator( 93 | name=datasets.Split.VALIDATION, 94 | # These kwargs will be passed to _generate_examples 95 | gen_kwargs={"filepath": data_dir["validation"], "split": "validation"}, 96 | ), 97 | ] 98 | 99 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 100 | def _generate_examples(self, filepath, split): 101 | with open(filepath, encoding="utf-8") as f: 102 | data = json.load(f)["data"] 103 | key = 0 104 | for row in data: 105 | paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "") 106 | qas = row["paragraphs"][0]["qas"] 107 | qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas] 108 | for (question, answer) in qa_pairs: 109 | # Yields examples as (key, example) tuples 110 | yield key, { 111 | "title": row["title"], 112 | "section_title": row["section_title"], 113 | "paragraph": paragraph, 114 | "question": question, 115 | "answer": answer, 116 | } 117 | key += 1 118 | -------------------------------------------------------------------------------- /lm_eval/datasets/sat_analogies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/sat_analogies/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/sat_analogies/sat_analogies.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """SAT Analogy Questions dataset.""" 15 | 16 | 17 | import os 18 | 19 | import datasets 20 | 21 | 22 | _CITATION = """\ 23 | @article{article, 24 | author = {Turney, Peter}, 25 | year = {2006}, 26 | month = {09}, 27 | pages = {379-416}, 28 | title = {Similarity of Semantic Relations}, 29 | volume = {32}, 30 | journal = {Computational Linguistics}, 31 | doi = {10.1162/coli.2006.32.3.379} 32 | } 33 | """ 34 | 35 | _DESCRIPTION = """\ 36 | SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 37 | multiple-choice analogy questions; 5 choices per question. 38 | """ 39 | 40 | _HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)" 41 | 42 | # TODO: Add the licence for the dataset here if you can find it 43 | _LICENSE = "" 44 | 45 | 46 | class SatAnalogies(datasets.GeneratorBasedBuilder): 47 | """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions.""" 48 | 49 | VERSION = datasets.Version("0.0.1") 50 | 51 | BUILDER_CONFIGS = [ 52 | datasets.BuilderConfig( 53 | name="sat_analogies", 54 | version=VERSION, 55 | description="The SAT Analogy Questions dataset", 56 | ), 57 | ] 58 | 59 | @property 60 | def manual_download_instructions(self): 61 | return ( 62 | "To use SAT Analogy Questions you have to download it manually. Please " 63 | "email Peter Turney to request the data (https://www.apperceptual.com). " 64 | "Once you receive a download link for the dataset, supply the local path " 65 | "as the `data_dir` arg: " 66 | "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`" 67 | ) 68 | 69 | def _info(self): 70 | features = datasets.Features( 71 | { 72 | "source": datasets.Value("string"), 73 | "stem": datasets.Value("string"), 74 | "choices": datasets.features.Sequence(datasets.Value("string")), 75 | "solution": datasets.Value("string"), 76 | } 77 | ) 78 | return datasets.DatasetInfo( 79 | description=_DESCRIPTION, 80 | features=features, 81 | homepage=_HOMEPAGE, 82 | license=_LICENSE, 83 | citation=_CITATION, 84 | ) 85 | 86 | def _split_generators(self, dl_manager): 87 | data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) 88 | if not os.path.exists(data_dir): 89 | raise FileNotFoundError( 90 | f"{data_dir} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('matinf', data_dir=...)` that includes SAT-package-V3.txt. 
Manual download instructions: {self.manual_download_instructions}" 91 | ) 92 | return [ 93 | datasets.SplitGenerator( 94 | name=datasets.Split.VALIDATION, 95 | # These kwargs will be passed to _generate_examples 96 | gen_kwargs={ 97 | "filepath": os.path.join(data_dir, "SAT-package-V3.txt"), 98 | }, 99 | ) 100 | ] 101 | 102 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 103 | def _generate_examples(self, filepath): 104 | data = [] 105 | with open(filepath, "r", encoding="utf-8") as f: 106 | record = [] 107 | for line in f: 108 | line = line.strip() 109 | if len(line) == 0 and record: 110 | data.append(record) 111 | record = [] 112 | elif len(line) > 0 and line[0] == "#": 113 | # Skip comments. 114 | continue 115 | else: 116 | record.append(line) 117 | data.append(record) 118 | for key, record in enumerate(data): 119 | source = record[-8] 120 | stem = record[-7] 121 | choices = record[-6:-1] 122 | solution = record[-1] 123 | yield key, { 124 | "source": source, 125 | "stem": stem, 126 | "choices": choices, 127 | "solution": solution, 128 | } 129 | -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | dataset_info: 3 | features: 4 | - name: question_id 5 | dtype: string 6 | - name: question_source 7 | dtype: string 8 | - name: question 9 | dtype: string 10 | - name: answer 11 | struct: 12 | - name: aliases 13 | sequence: string 14 | - name: value 15 | dtype: string 16 | - name: search_results 17 | sequence: 18 | - name: description 19 | dtype: string 20 | - name: filename 21 | dtype: string 22 | - name: rank 23 | dtype: int32 24 | - name: title 25 | dtype: string 26 | - name: url 27 | dtype: string 28 | - name: search_context 29 | dtype: string 30 | config_name: triviaqa 31 | splits: 32 | - name: train 33 | num_bytes: 1270894387 34 | num_examples: 87622 35 | - name: validation 36 | num_bytes: 163755044 37 | num_examples: 11313 38 | download_size: 632549060 39 | dataset_size: 1434649431 40 | --- 41 | -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/triviaqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/unscramble/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/unscramble.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
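# Each config downloads a single <config_name>.jsonl.gz file from the GPT-3 data
# repository; every line is a JSON object with "context" and "completion" fields,
# and all examples are exposed under a single validation split.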
14 | """Unscramble dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{NEURIPS2020_1457c0d6, 25 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 26 | booktitle = {Advances in Neural Information Processing Systems}, 27 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 28 | pages = {1877--1901}, 29 | publisher = {Curran Associates, Inc.}, 30 | title = {Language Models are Few-Shot Learners}, 31 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 32 | volume = {33}, 33 | year = {2020} 34 | } 35 | """ 36 | 37 | _DESCRIPTION = """\ 38 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 39 | involves giving the model a word distorted by some combination of scrambling, 40 | addition, or deletion of characters, and asking it to recover the original word. 41 | """ 42 | 43 | _HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data" 44 | 45 | # TODO: Add the licence for the dataset here if you can find it 46 | _LICENSE = "" 47 | 48 | _BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data" 49 | 50 | 51 | _DESCRIPTIONS = { 52 | "mid_word_1_anagrams": "Anagrams of all but the first and last letter.", 53 | "mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.", 54 | "cycle_letters_in_word": "Cycle letters in the word.", 55 | "random_insertion_in_word": "Random insertions in the word that must be removed.", 56 | "reversed_words": "Words spelled backwards that must be reversed.", 57 | } 58 | _NAMES = _DESCRIPTIONS.keys() 59 | 60 | 61 | class Unscramble(datasets.GeneratorBasedBuilder): 62 | """Unscramble is a small battery of 5 “character manipulation” tasks.""" 63 | 64 | VERSION = datasets.Version("0.0.1") 65 | 66 | BUILDER_CONFIGS = [ 67 | datasets.BuilderConfig( 68 | name=name, version=version, description=_DESCRIPTIONS[name] 69 | ) 70 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 71 | ] 72 | 73 | def _info(self): 74 | features = datasets.Features( 75 | { 76 | "context": datasets.Value("string"), 77 | "completion": datasets.Value("string"), 78 | } 79 | ) 80 | return datasets.DatasetInfo( 81 | description=_DESCRIPTION, 82 | features=features, 83 | homepage=_HOMEPAGE, 84 | license=_LICENSE, 85 | citation=_CITATION, 86 | ) 87 | 88 | def _split_generators(self, dl_manager): 89 | urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz") 90 | data_dir = dl_manager.download_and_extract(urls) 91 | return [ 92 | datasets.SplitGenerator( 93 | name=datasets.Split.VALIDATION, 94 | # These kwargs will be passed to _generate_examples 95 | gen_kwargs={ 96 | "filepath": data_dir, 97 | "split": "validation", 98 | }, 99 | ), 100 | ] 101 | 102 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 103 | def _generate_examples(self, filepath, split): 104 | with open(filepath, encoding="utf-8") as f: 105 
| for key, row in enumerate(f): 106 | data = json.loads(row) 107 | yield key, { 108 | "context": data["context"], 109 | "completion": data["completion"], 110 | } 111 | -------------------------------------------------------------------------------- /lm_eval/decontamination/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/decontamination/__init__.py -------------------------------------------------------------------------------- /lm_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import gpt2 2 | from . import gpt3 3 | from . import huggingface 4 | from . import textsynth 5 | from . import dummy 6 | 7 | MODEL_REGISTRY = { 8 | "hf": gpt2.HFLM, 9 | "hf-causal": huggingface.AutoCausalLM, 10 | "hf-seq2seq": huggingface.AutoSeq2SeqLM, 11 | "gpt2": gpt2.GPT2LM, 12 | "gpt3": gpt3.GPT3LM, 13 | "textsynth": textsynth.TextSynthLM, 14 | "dummy": dummy.DummyLM, 15 | } 16 | 17 | 18 | def get_model(model_name): 19 | return MODEL_REGISTRY[model_name] 20 | -------------------------------------------------------------------------------- /lm_eval/models/dummy.py: -------------------------------------------------------------------------------- 1 | import random 2 | from lm_eval.base import LM 3 | 4 | 5 | class DummyLM(LM): 6 | def __init__(self): 7 | pass 8 | 9 | @classmethod 10 | def create_from_arg_string(cls, arg_string, additional_config=None): 11 | return cls() 12 | 13 | def loglikelihood(self, requests): 14 | res = [] 15 | 16 | for _ in requests: 17 | res.append((-random.random(), False)) 18 | 19 | return res 20 | 21 | def greedy_until(self, requests): 22 | res = [] 23 | 24 | for ctx, _ in requests: 25 | res.append("lol") 26 | assert ctx.strip() != "" 27 | 28 | return res 29 | 30 | def loglikelihood_rolling(self, requests): 31 | res = [] 32 | 33 | for _ in requests: 34 | res.append(-random.random()) 35 | 36 | return res 37 | -------------------------------------------------------------------------------- /lm_eval/models/gpt2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from lm_eval.base import BaseLM 4 | 5 | 6 | class HFLM(BaseLM): 7 | def __init__( 8 | self, 9 | device="cuda", 10 | pretrained="gpt2", 11 | revision="main", 12 | low_cpu_mem_usage=None, 13 | subfolder=None, 14 | tokenizer=None, 15 | batch_size=1, 16 | ): 17 | super().__init__() 18 | 19 | assert isinstance(device, str) 20 | assert isinstance(pretrained, str) 21 | assert isinstance(batch_size, int) 22 | 23 | if device: 24 | if device not in ["cuda", "cpu"]: 25 | device = int(device) 26 | self._device = torch.device(device) 27 | print(f"Using device '{device}'") 28 | else: 29 | print("Device not specified") 30 | print(f"Cuda Available? 
{torch.cuda.is_available()}") 31 | self._device = ( 32 | torch.device("cuda") 33 | if torch.cuda.is_available() 34 | else torch.device("cpu") 35 | ) 36 | 37 | # TODO: update this to be less of a hack once subfolder is fixed in HF 38 | revision = revision + ("/" + subfolder if subfolder is not None else "") 39 | 40 | self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained( 41 | pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage 42 | ).to(self.device) 43 | self.gpt2.eval() 44 | 45 | self.tokenizer = transformers.AutoTokenizer.from_pretrained( 46 | pretrained if tokenizer is None else tokenizer, 47 | revision=revision, 48 | ) 49 | 50 | assert isinstance( 51 | self.tokenizer, 52 | ( 53 | transformers.GPT2Tokenizer, 54 | transformers.GPT2TokenizerFast, 55 | transformers.T5Tokenizer, 56 | transformers.T5TokenizerFast, 57 | ), 58 | ), "this tokenizer has not been checked for compatibility yet!" 59 | 60 | self.vocab_size = self.tokenizer.vocab_size 61 | 62 | if isinstance( 63 | self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast) 64 | ): 65 | assert self.tokenizer.encode("hello\n\nhello") == [ 66 | 31373, 67 | 198, 68 | 198, 69 | 31373, 70 | ], self.tokenizer.encode("hello\n\nhello") 71 | 72 | # multithreading and batching 73 | self.batch_size_per_gpu = batch_size # todo: adaptive batch size 74 | 75 | # TODO: fix multi-gpu 76 | # gpus = torch.cuda.device_count() 77 | # if gpus > 1: 78 | # self.gpt2 = nn.DataParallel(self.gpt2) 79 | 80 | @property 81 | def eot_token_id(self): 82 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 83 | return self.tokenizer.eos_token_id 84 | 85 | @property 86 | def max_length(self): 87 | try: 88 | return self.gpt2.config.n_ctx 89 | except AttributeError: 90 | # gptneoconfig doesn't have n_ctx apparently 91 | return self.gpt2.config.max_position_embeddings 92 | 93 | @property 94 | def max_gen_toks(self): 95 | return 256 96 | 97 | @property 98 | def batch_size(self): 99 | # TODO: fix multi-gpu 100 | return self.batch_size_per_gpu # * gpus 101 | 102 | @property 103 | def device(self): 104 | # TODO: fix multi-gpu 105 | return self._device 106 | 107 | def tok_encode(self, string: str): 108 | return self.tokenizer.encode(string, add_special_tokens=False) 109 | 110 | def tok_decode(self, tokens): 111 | return self.tokenizer.decode(tokens) 112 | 113 | def _model_call(self, inps): 114 | """ 115 | inps: a torch tensor of shape [batch, sequence] 116 | the size of sequence may vary from call to call 117 | 118 | returns: a torch tensor of shape [batch, sequence, vocab] with the 119 | logits returned from the model 120 | """ 121 | with torch.no_grad(): 122 | return self.gpt2(inps)[0] 123 | 124 | def _model_generate(self, context, max_length, eos_token_id): 125 | return self.gpt2.generate( 126 | context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False 127 | ) 128 | 129 | 130 | # for backwards compatibility 131 | GPT2LM = HFLM 132 | -------------------------------------------------------------------------------- /lm_eval/models/textsynth.py: -------------------------------------------------------------------------------- 1 | """ TextSynth API 2 | Implementation provided by Fabrice Bellard: 3 | https://github.com/EleutherAI/lm-evaluation-harness/issues/295 4 | 5 | In order to use the API, you must have a valid TextSynth account and 6 | enough credits. 
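The API secret key is read from the TEXTSYNTH_API_SECRET_KEY environment variable.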
7 | 8 | Example usage: 9 | 10 | python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa 11 | 12 | Homepage: https://textsynth.com/index.html 13 | """ 14 | import logging 15 | import os 16 | import requests as _requests 17 | import time 18 | from tqdm import tqdm 19 | from lm_eval.base import BaseLM 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def textsynth_completion(**kwargs): 26 | """Query TextSynth API for completion. 27 | Retry with back-off until they respond. 28 | """ 29 | backoff_time = 3 30 | while True: 31 | try: 32 | return _requests.post(**kwargs) 33 | except _requests.exceptions.RequestException: 34 | import traceback 35 | 36 | traceback.print_exc() 37 | time.sleep(backoff_time) 38 | backoff_time *= 1.5 39 | 40 | 41 | class TextSynthLM(BaseLM): 42 | def __init__(self, engine, truncate=False): 43 | """ 44 | :param engine: str 45 | TextSynth API engine (e.g. `gptj_6B`) 46 | :param truncate: bool 47 | Truncate input if too long (if False and input is too long, throw error) 48 | """ 49 | super().__init__() 50 | 51 | self.engine = engine 52 | self.truncate = truncate 53 | self.api_url = "https://api.textsynth.com" 54 | # Read from environment variable TEXTSYNTH_API_SECRET_KEY 55 | self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"] 56 | 57 | @property 58 | def eot_token_id(self): 59 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 60 | raise NotImplementedError() 61 | 62 | @property 63 | def max_length(self): 64 | # NOTE: Turn on truncation to avoid errors on long inputs. 65 | return 2048 66 | 67 | @property 68 | def max_gen_toks(self): 69 | return 256 70 | 71 | @property 72 | def batch_size(self): 73 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 74 | raise NotImplementedError() 75 | 76 | @property 77 | def device(self): 78 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 79 | raise NotImplementedError() 80 | 81 | def tok_encode(self, string: str): 82 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 83 | raise NotImplementedError() 84 | 85 | def tok_decode(self, tokens): 86 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 87 | raise NotImplementedError() 88 | 89 | def loglikelihood(self, requests): 90 | res = [] 91 | for context, continuation in tqdm(requests): 92 | response = textsynth_completion( 93 | url=self.api_url + "/v1/engines/" + self.engine + "/logprob", 94 | headers={"Authorization": "Bearer " + self.api_key}, 95 | json={"context": context, "continuation": continuation}, 96 | ) 97 | resp = response.json() 98 | if "logprob" in resp: 99 | logprob = resp["logprob"] 100 | is_greedy = resp["is_greedy"] 101 | res.append((logprob, is_greedy)) 102 | else: 103 | logger.error( 104 | f"The following response does not contain `logprobs`. Got:\n{resp}" 105 | ) 106 | assert False 107 | return res 108 | 109 | def loglikelihood_rolling(self, requests): 110 | # TODO: The TextSynth API does not support tokenized inputs so we cannot 111 | # manually partition long contexts into smaller rolling windows as 112 | # done for other models derived from `BaseLM`. Override this method 113 | # with a windowing scheme that works for direct string inputs. 114 | raise NotImplementedError( 115 | "`loglikelihood_rolling` is currently not supported due to lack of " 116 | "input tokenization support from TextSynth." 
117 | ) 118 | 119 | def greedy_until(self, requests): 120 | if not requests: 121 | return [] 122 | 123 | res = [] 124 | for request in tqdm(requests): 125 | inp = request[0] 126 | until = request[1] 127 | response = textsynth_completion( 128 | url=self.api_url + "/v1/engines/" + self.engine + "/completions", 129 | headers={"Authorization": "Bearer " + self.api_key}, 130 | json={ 131 | "prompt": inp, 132 | "max_tokens": self.max_gen_toks, 133 | "top_k": 1, 134 | "stop": until, 135 | }, 136 | ) 137 | resp = response.json() 138 | if "text" in resp: 139 | s = resp["text"] 140 | res.append(s) 141 | else: 142 | logger.error( 143 | f"The following response does not contain generated `text`. " 144 | "Got:\n{resp}" 145 | ) 146 | assert False 147 | return res 148 | 149 | def _model_call(self, inps): 150 | # Isn't used because we override _loglikelihood_tokens 151 | raise NotImplementedError() 152 | 153 | def _model_generate(self, context, max_length, eos_token_id): 154 | # Isn't used because we override greedy_until 155 | raise NotImplementedError() 156 | -------------------------------------------------------------------------------- /lm_eval/tasks/anli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adversarial NLI: A New Benchmark for Natural Language Understanding 3 | https://arxiv.org/pdf/1910.14599.pdf 4 | 5 | Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial 6 | human-and-model-in-the-loop procedure. It consists of three rounds that progressively 7 | increase in difficulty and complexity, and each question-answer includes annotator- 8 | provided explanations. 9 | 10 | Homepage: "https://github.com/facebookresearch/anli" 11 | """ 12 | import numpy as np 13 | from lm_eval.base import rf, Task 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{nie-etal-2020-adversarial, 19 | title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding", 20 | author = "Nie, Yixin and 21 | Williams, Adina and 22 | Dinan, Emily and 23 | Bansal, Mohit and 24 | Weston, Jason and 25 | Kiela, Douwe", 26 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", 27 | year = "2020", 28 | publisher = "Association for Computational Linguistics", 29 | } 30 | """ 31 | 32 | 33 | class ANLIBase(Task): 34 | VERSION = 0 35 | DATASET_PATH = "anli" 36 | DATASET_NAME = None 37 | SPLIT = None 38 | 39 | def has_training_docs(self): 40 | return True 41 | 42 | def has_validation_docs(self): 43 | return True 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def training_docs(self): 49 | if self.has_training_docs(): 50 | if self._training_docs is None: 51 | self._training_docs = list(self.dataset["train_r" + str(self.SPLIT)]) 52 | return self._training_docs 53 | 54 | def validation_docs(self): 55 | if self.has_validation_docs(): 56 | return self.dataset["dev_r" + str(self.SPLIT)] 57 | 58 | def test_docs(self): 59 | if self.has_test_docs(): 60 | return self.dataset["test_r" + str(self.SPLIT)] 61 | 62 | def doc_to_text(self, doc): 63 | # OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning 64 | # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly 65 | # appended onto the question, with no "Answer:" or even a newline. Do we *really* 66 | # want to do it exactly as OA did? 
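# A purely hypothetical rendering of the prompt built below:
# "It rained all night.\nQuestion: The streets are wet. True, False, or Neither?\nAnswer:"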
67 | return ( 68 | doc["premise"] 69 | + "\nQuestion: " 70 | + doc["hypothesis"] 71 | + " True, False, or Neither?\nAnswer:" 72 | ) 73 | 74 | def should_decontaminate(self): 75 | return True 76 | 77 | def doc_to_decontamination_query(self, doc): 78 | return doc["premise"] 79 | 80 | def doc_to_target(self, doc): 81 | # True = entailment 82 | # False = contradiction 83 | # Neither = neutral 84 | return " " + ["True", "Neither", "False"][doc["label"]] 85 | 86 | def construct_requests(self, doc, ctx): 87 | """Uses RequestFactory to construct Requests and returns an iterable of 88 | Requests which will be sent to the LM. 89 | 90 | :param doc: 91 | The document as returned from training_docs, validation_docs, or test_docs. 92 | :param ctx: str 93 | The context string, generated by fewshot_context. This includes the natural 94 | language description, as well as the few shot examples, and the question 95 | part of the document for `doc`. 96 | """ 97 | ll_true, _ = rf.loglikelihood(ctx, " True") 98 | ll_neither, _ = rf.loglikelihood(ctx, " Neither") 99 | ll_false, _ = rf.loglikelihood(ctx, " False") 100 | return ll_true, ll_neither, ll_false 101 | 102 | def process_results(self, doc, results): 103 | """Take a single document and the LM results and evaluates, returning a 104 | dict where keys are the names of submetrics and values are the values of 105 | the metric for that one document 106 | 107 | :param doc: 108 | The document as returned from training_docs, validation_docs, or test_docs. 109 | :param results: 110 | The results of the requests created in construct_requests. 111 | """ 112 | gold = doc["label"] 113 | pred = np.argmax(results) 114 | return {"acc": pred == gold} 115 | 116 | def aggregation(self): 117 | """ 118 | :returns: {str: [float] -> float} 119 | A dictionary where keys are the names of submetrics and values are 120 | functions that aggregate a list of metrics 121 | """ 122 | return {"acc": mean} 123 | 124 | def higher_is_better(self): 125 | """ 126 | :returns: {str: bool} 127 | A dictionary where keys are the names of submetrics and values are 128 | whether a higher value of the submetric is better 129 | """ 130 | return {"acc": True} 131 | 132 | 133 | class ANLIRound1(ANLIBase): 134 | SPLIT = 1 135 | 136 | 137 | class ANLIRound2(ANLIBase): 138 | SPLIT = 2 139 | 140 | 141 | class ANLIRound3(ANLIBase): 142 | SPLIT = 3 143 | -------------------------------------------------------------------------------- /lm_eval/tasks/arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge 3 | https://arxiv.org/pdf/1803.05457.pdf 4 | 5 | The ARC dataset consists of 7,787 science exam questions drawn from a variety 6 | of sources, including science questions provided under license by a research 7 | partner affiliated with AI2. These are text-only, English language exam questions 8 | that span several grade levels as indicated in the files. Each question has a 9 | multiple choice structure (typically 4 answer options). The questions are sorted 10 | into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and 11 | a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions. 12 | 13 | Homepage: https://allenai.org/data/arc 14 | """ 15 | 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @article{Clark2018ThinkYH, 21 | title={Think you have Solved Question Answering? 
Try ARC, the AI2 Reasoning Challenge}, 22 | author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord}, 23 | journal={ArXiv}, 24 | year={2018}, 25 | volume={abs/1803.05457} 26 | } 27 | """ 28 | 29 | 30 | class ARCEasy(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "ai2_arc" 33 | DATASET_NAME = "ARC-Easy" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | # NOTE: Some `doc["answerKey"]`s are in numeric string format being one 57 | # of {'1', '2', '3', '4', '5'}. We map them back to letters. 58 | num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"} 59 | doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"]) 60 | out_doc = { 61 | "id": doc["id"], 62 | "query": "Question: " + doc["question"] + "\nAnswer:", 63 | "choices": doc["choices"]["text"], 64 | "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]), 65 | } 66 | return out_doc 67 | 68 | def doc_to_text(self, doc): 69 | return doc["query"] 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["query"] 76 | 77 | 78 | class ARCChallenge(ARCEasy): 79 | DATASET_PATH = "ai2_arc" 80 | DATASET_NAME = "ARC-Challenge" 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/arithmetic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | A small battery of 10 tests that involve asking language models a simple arithmetic 6 | problem in natural language. 7 | 8 | Homepage: https://github.com/openai/gpt-3/tree/master/data 9 | """ 10 | from lm_eval.base import Task, rf 11 | from lm_eval.metrics import mean 12 | 13 | 14 | _CITATION = """ 15 | @inproceedings{NEURIPS2020_1457c0d6, 16 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 17 | booktitle = {Advances in Neural Information Processing Systems}, 18 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}, 19 | pages = {1877--1901}, 20 | publisher = {Curran Associates, Inc.}, 21 | title = {Language Models are Few-Shot Learners}, 22 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 23 | volume = {33}, 24 | year = {2020} 25 | } 26 | """ 27 | 28 | 29 | class Arithmetic(Task): 30 | VERSION = 0 31 | DATASET_PATH = "EleutherAI/arithmetic" 32 | 33 | def has_training_docs(self): 34 | return False 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | return NotImplemented 44 | 45 | def validation_docs(self): 46 | return self.dataset["validation"] 47 | 48 | def test_docs(self): 49 | return NotImplemented 50 | 51 | def doc_to_text(self, doc): 52 | return doc["context"] 53 | 54 | def should_decontaminate(self): 55 | return True 56 | 57 | def doc_to_decontamination_query(self, doc): 58 | return doc["context"] 59 | 60 | def doc_to_target(self, doc): 61 | return doc["completion"] 62 | 63 | def construct_requests(self, doc, ctx): 64 | ll, is_prediction = rf.loglikelihood(ctx, doc["completion"]) 65 | return is_prediction 66 | 67 | def process_results(self, doc, results): 68 | (is_prediction,) = results 69 | return {"acc": is_prediction} 70 | 71 | def aggregation(self): 72 | return { 73 | "acc": mean, 74 | } 75 | 76 | def higher_is_better(self): 77 | return {"acc": True} 78 | 79 | 80 | class Arithmetic2DPlus(Arithmetic): 81 | DATASET_NAME = "arithmetic_2da" 82 | 83 | 84 | class Arithmetic2DMinus(Arithmetic): 85 | DATASET_NAME = "arithmetic_2ds" 86 | 87 | 88 | class Arithmetic3DPlus(Arithmetic): 89 | DATASET_NAME = "arithmetic_3da" 90 | 91 | 92 | class Arithmetic3DMinus(Arithmetic): 93 | DATASET_NAME = "arithmetic_3ds" 94 | 95 | 96 | class Arithmetic4DPlus(Arithmetic): 97 | DATASET_NAME = "arithmetic_4da" 98 | 99 | 100 | class Arithmetic4DMinus(Arithmetic): 101 | DATASET_NAME = "arithmetic_4ds" 102 | 103 | 104 | class Arithmetic5DPlus(Arithmetic): 105 | DATASET_NAME = "arithmetic_5da" 106 | 107 | 108 | class Arithmetic5DMinus(Arithmetic): 109 | DATASET_NAME = "arithmetic_5ds" 110 | 111 | 112 | class Arithmetic2DMultiplication(Arithmetic): 113 | DATASET_NAME = "arithmetic_2dm" 114 | 115 | 116 | class Arithmetic1DComposite(Arithmetic): 117 | DATASET_NAME = "arithmetic_1dc" 118 | -------------------------------------------------------------------------------- /lm_eval/tasks/asdiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers 3 | https://arxiv.org/abs/2106.15772 4 | 5 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 6 | patterns and problem types) English math word problem (MWP) corpus for evaluating 7 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 8 | remain limited either in language usage patterns or in problem types. We thus present 9 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 10 | types taught in elementary school. Each MWP is annotated with its problem type and grade 11 | level (for indicating the level of difficulty). 12 | 13 | NOTE: We currently ignore formulas for answer generation. 
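Only the numeric value that precedes the parenthesized formula in each answer field is
used as the target, and accuracy records whether that target is the model's greedy
continuation.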
14 | 15 | Homepage: https://github.com/chaochun/nlu-asdiv-dataset 16 | """ 17 | import inspect 18 | import lm_eval.datasets.asdiv.asdiv 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | 35 | class Asdiv(Task): 36 | VERSION = 0 37 | DATASET_PATH = inspect.getfile(lm_eval.datasets.asdiv.asdiv) 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return True 44 | 45 | def has_test_docs(self): 46 | return False 47 | 48 | def training_docs(self): 49 | raise NotImplementedError("This dataset has no training docs") 50 | 51 | def validation_docs(self): 52 | return self.dataset["validation"] 53 | 54 | def test_docs(self): 55 | raise NotImplementedError("This dataset has no test docs") 56 | 57 | def fewshot_context( 58 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 59 | ): 60 | assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def doc_to_text(self, doc): 66 | # TODO: add solution-type 67 | return doc["body"] + "\n" + "Question:" + doc["question"] + "\n" + "Answer:" 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["body"] + " " + doc["question"] 74 | 75 | def doc_to_target(self, doc): 76 | # TODO: add formula 77 | 78 | answer = doc["answer"].split(" (")[0] 79 | return " " + answer 80 | 81 | def construct_requests(self, doc, ctx): 82 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 83 | return ll, is_greedy 84 | 85 | def process_results(self, doc, results): 86 | ll, is_greedy = results 87 | 88 | return {"acc": int(is_greedy)} 89 | 90 | def aggregation(self): 91 | return {"acc": mean} 92 | 93 | def higher_is_better(self): 94 | return {"acc": True} 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/cbt.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Children’s Book Test (CBT) from the paper: 3 | https://research.fb.com/wp-content/uploads/2016/11/the_goldilocks_principle_reading_children_s_books_with_explicit_memory_representations.pdf 4 | 5 | The Children's Book Test (CBT) is test of how well language models capture 6 | meaning in children's books. Unlike standard language modelling benchmarks, 7 | it distinguishes the task of predicting syntactic function words from that 8 | of predicting lower-frequency words, which carry greater semantic content. 9 | 10 | NOTE: This evaluation is based on the (context + query) question-answering variant 11 | used by the Recurrent Language Models described in the paper. See section 4.4. 
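Each candidate answer is scored by the log-likelihood of the full passage and query with
the candidate substituted for the XXXXX placeholder; the highest-scoring option is taken
as the prediction.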
12 | 13 | Homepage: https://github.com/facebookresearch/ParlAI/tree/main/parlai/tasks/cbt 14 | """ 15 | import numpy as np 16 | from lm_eval.base import rf, Task 17 | from lm_eval.metrics import mean 18 | 19 | 20 | _CITATION = """ 21 | @misc{hill2016goldilocks, 22 | title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations}, 23 | author={Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston}, 24 | year={2016}, 25 | eprint={1511.02301}, 26 | archivePrefix={arXiv}, 27 | primaryClass={cs.CL} 28 | } 29 | """ 30 | 31 | 32 | class CBTBase(Task): 33 | VERSION = 0 34 | DATASET_PATH = "cbt" 35 | DATASET_NAME = None 36 | 37 | def has_training_docs(self): 38 | return True 39 | 40 | def has_validation_docs(self): 41 | return True 42 | 43 | def has_test_docs(self): 44 | return True 45 | 46 | def training_docs(self): 47 | if self._training_docs is None: 48 | self._training_docs = list(self.dataset["train"]) 49 | return self._training_docs 50 | 51 | def validation_docs(self): 52 | return self.dataset["validation"] 53 | 54 | def test_docs(self): 55 | return self.dataset["test"] 56 | 57 | def detokenize(self, text): 58 | text = text.replace(" '", "'") 59 | text = text.replace(" \n", "\n") 60 | text = text.replace("\n ", "\n") 61 | text = text.replace(" n't", "n't") 62 | text = text.replace("`` ", '"') 63 | text = text.replace("''", '"') 64 | # punctuation 65 | text = text.replace(" :", ":") 66 | text = text.replace(" ;", ";") 67 | text = text.replace(" !", "!") 68 | text = text.replace(" ?", "?") 69 | text = text.replace(" ,", ",") 70 | text = text.replace(" .", ".") 71 | return text 72 | 73 | def doc_to_text(self, doc): 74 | passage = " ".join(doc["sentences"]) 75 | text = "Passage: " + passage + "\nQuestion: " + doc["question"] 76 | return self.detokenize(text) 77 | 78 | def should_decontaminate(self): 79 | return True 80 | 81 | def doc_to_decontamination_query(self, doc): 82 | passage = " ".join(doc["sentences"]) 83 | return passage 84 | 85 | def doc_to_target(self, doc): 86 | return "" 87 | 88 | def fewshot_examples(self, k, rnd): 89 | assert ( 90 | k == 0 91 | ), f"CBT is only implemented for the zero-shot setting. Given k={k}." 92 | return super().fewshot_examples(k, rnd) 93 | 94 | def construct_requests(self, doc, ctx): 95 | """Uses RequestFactory to construct Requests and returns an iterable of 96 | Requests which will be sent to the LM. 97 | 98 | :param doc: 99 | The document as returned from training_docs, validation_docs, or test_docs. 100 | :param ctx: str 101 | The context string, generated by fewshot_context. This includes the natural 102 | language description, as well as the few shot examples, and the question 103 | part of the document for `doc`. 104 | """ 105 | lls = [] 106 | for option in doc["options"]: 107 | # Following Section 4.4 "Recurrent Language Models" in the CBT paper: 108 | # "we rank candidate [option] c based on p(q1 . . . qk−1, c, qk+1 . . . ql) 109 | # rather than simply p(q1 . . . qk−1, c)." 110 | lls.append(rf.loglikelihood("", ctx.replace("XXXXX", option))[0]) 111 | return lls 112 | 113 | def process_results(self, doc, results): 114 | """Take a single document and the LM results and evaluates, returning a 115 | dict where keys are the names of submetrics and values are the values of 116 | the metric for that one document 117 | 118 | :param doc: 119 | The document as returned from training_docs, validation_docs, or test_docs. 120 | :param results: 121 | The results of the requests created in construct_requests. 
122 | """ 123 | gold = doc["options"].index(doc["answer"]) 124 | pred = np.argmax(results) 125 | return {"acc": pred == gold} 126 | 127 | def aggregation(self): 128 | """ 129 | :returns: {str: [float] -> float} 130 | A dictionary where keys are the names of submetrics and values are 131 | functions that aggregate a list of metrics 132 | """ 133 | return {"acc": mean} 134 | 135 | def higher_is_better(self): 136 | """ 137 | :returns: {str: bool} 138 | A dictionary where keys are the names of submetrics and values are 139 | whether a higher value of the submetric is better 140 | """ 141 | return {"acc": True} 142 | 143 | 144 | class CBTCN(CBTBase): 145 | DATASET_NAME = "CN" 146 | 147 | 148 | class CBTNE(CBTBase): 149 | DATASET_NAME = "NE" 150 | -------------------------------------------------------------------------------- /lm_eval/tasks/gsm8k.py: -------------------------------------------------------------------------------- 1 | """ 2 | "Training Verifiers to Solve Math Word Problems" 3 | https://arxiv.org/abs/2110.14168 4 | 5 | State-of-the-art language models can match human performance on many tasks, but 6 | they still struggle to robustly perform multi-step mathematical reasoning. To 7 | diagnose the failures of current models and support research, we introduce GSM8K, 8 | a dataset of 8.5K high quality linguistically diverse grade school math word problems. 9 | We find that even the largest transformer models fail to achieve high test performance, 10 | despite the conceptual simplicity of this problem distribution. 11 | 12 | NOTE: See the official implementation of the task: 13 | https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 14 | for how to make use of the dataset's calculator annotations in your language 15 | model's sample/generation function. 16 | 17 | Homepage: https://github.com/openai/grade-school-math 18 | """ 19 | import re 20 | from lm_eval.base import Task, rf 21 | from lm_eval.metrics import mean 22 | 23 | 24 | _CITATION = """ 25 | @misc{cobbe2021training, 26 | title={Training Verifiers to Solve Math Word Problems}, 27 | author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, 28 | year={2021}, 29 | eprint={2110.14168}, 30 | archivePrefix={arXiv}, 31 | primaryClass={cs.LG} 32 | } 33 | """ 34 | 35 | 36 | ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 37 | INVALID_ANS = "[invalid]" 38 | 39 | 40 | class GradeSchoolMath8K(Task): 41 | VERSION = 0 42 | DATASET_PATH = "gsm8k" 43 | DATASET_NAME = "main" 44 | 45 | def has_training_docs(self): 46 | return True 47 | 48 | def has_validation_docs(self): 49 | return False 50 | 51 | def has_test_docs(self): 52 | return True 53 | 54 | def training_docs(self): 55 | return self.dataset["train"] 56 | 57 | def validation_docs(self): 58 | raise NotImplementedError 59 | 60 | def test_docs(self): 61 | return self.dataset["test"] 62 | 63 | def doc_to_text(self, doc): 64 | return "Question: " + doc["question"] + "\nAnswer:" 65 | 66 | def doc_to_target(self, doc): 67 | return " " + doc["answer"] 68 | 69 | def construct_requests(self, doc, ctx): 70 | """Uses RequestFactory to construct Requests and returns an iterable of 71 | Requests which will be sent to the LM. 72 | 73 | :param doc: 74 | The document as returned from training_docs, validation_docs, or test_docs. 75 | :param ctx: str 76 | The context string, generated by fewshot_context. 
This includes the natural 77 | language description, as well as the few shot examples, and the question 78 | part of the document for `doc`. 79 | """ 80 | # NOTE: The paper implements "verifiers" that assign a score to multiple 81 | # solutions and output the highest ranked solution. 82 | completion = rf.greedy_until(ctx, ["\n"]) 83 | return completion 84 | 85 | def _extract_answer(self, completion): 86 | match = ANS_RE.search(completion) 87 | if match: 88 | match_str = match.group(1).strip() 89 | match_str = match_str.replace(",", "") 90 | return match_str 91 | else: 92 | return INVALID_ANS 93 | 94 | def _is_correct(self, completion, answer): 95 | gold = self._extract_answer(answer) 96 | assert gold != INVALID_ANS, "No ground truth answer found in the document." 97 | return self._extract_answer(completion) == gold 98 | 99 | def process_results(self, doc, results): 100 | """Take a single document and the LM results and evaluates, returning a 101 | dict where keys are the names of submetrics and values are the values of 102 | the metric for that one document 103 | 104 | :param doc: 105 | The document as returned from training_docs, validation_docs, or test_docs. 106 | :param results: 107 | The results of the requests created in construct_requests. 108 | """ 109 | completion = results[0] 110 | answer = doc["answer"] 111 | return {"acc": self._is_correct(completion, answer)} 112 | 113 | def aggregation(self): 114 | """ 115 | :returns: {str: [float] -> float} 116 | A dictionary where keys are the names of submetrics and values are 117 | functions that aggregate a list of metrics 118 | """ 119 | return {"acc": mean} 120 | 121 | def higher_is_better(self): 122 | """ 123 | :returns: {str: bool} 124 | A dictionary where keys are the names of submetrics and values are 125 | whether a higher value of the submetric is better 126 | """ 127 | return {"acc": True} 128 | -------------------------------------------------------------------------------- /lm_eval/tasks/headqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering 3 | https://aclanthology.org/P19-1092.pdf 4 | 5 | HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to 6 | access a specialized position in the Spanish healthcare system, and are challenging 7 | even for highly specialized humans. 8 | 9 | Homepage: https://aghie.github.io/head-qa/ 10 | """ 11 | import inspect 12 | import lm_eval.datasets.headqa.headqa 13 | from lm_eval.base import MultipleChoiceTask 14 | 15 | 16 | _CITATION = """ 17 | @misc{liu2020interpretable, 18 | title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, 19 | author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. 
Yu}, 20 | year={2020}, 21 | eprint={2008.02434}, 22 | archivePrefix={arXiv}, 23 | primaryClass={cs.AI} 24 | } 25 | """ 26 | 27 | 28 | class HeadQABase(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.headqa.headqa) 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "id": doc["qid"], 55 | "query": "Question: " + doc["qtext"] + "\nAnswer:", 56 | "choices": [answer["atext"] for answer in doc["answers"]], 57 | "gold": int(doc["ra"]) - 1, 58 | } 59 | return out_doc 60 | 61 | def doc_to_text(self, doc): 62 | return doc["query"] 63 | 64 | def should_decontaminate(self): 65 | return True 66 | 67 | def doc_to_decontamination_query(self, doc): 68 | return doc["query"] 69 | 70 | 71 | class HeadQAEn(HeadQABase): 72 | DATASET_NAME = "en" 73 | 74 | 75 | class HeadQAEs(HeadQABase): 76 | DATASET_NAME = "es" 77 | 78 | 79 | # for backwards compatibility 80 | class HeadQAEsDeprecated(HeadQABase): 81 | DATASET_NAME = "es" 82 | 83 | def __init__(self): 84 | super().__init__() 85 | print( 86 | "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info." 87 | ) 88 | -------------------------------------------------------------------------------- /lm_eval/tasks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | HellaSwag: Can a Machine Really Finish Your Sentence? 3 | https://arxiv.org/pdf/1905.07830.pdf 4 | 5 | Hellaswag is a commonsense inference challenge dataset. Though its questions are 6 | trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is 7 | achieved via Adversarial Filtering (AF), a data collection paradigm wherein a 8 | series of discriminators iteratively select an adversarial set of machine-generated 9 | wrong answers. AF proves to be surprisingly robust. The key insight is to scale up 10 | the length and complexity of the dataset examples towards a critical 'Goldilocks' 11 | zone wherein generated text is ridiculous to humans, yet often misclassified by 12 | state-of-the-art models. 
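In this harness the task is framed as multiple choice: the four candidate endings
are ranked by log-likelihood after `preprocess` strips WikiHow artifacts such as
" [title]" from the text.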
13 | 14 | Homepage: https://rowanzellers.com/hellaswag/ 15 | """ 16 | 17 | import re 18 | from lm_eval.base import MultipleChoiceTask 19 | 20 | 21 | _CITATION = """ 22 | @inproceedings{zellers2019hellaswag, 23 | title={HellaSwag: Can a Machine Really Finish Your Sentence?}, 24 | author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, 25 | booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 26 | year={2019} 27 | } 28 | """ 29 | 30 | 31 | class HellaSwag(MultipleChoiceTask): 32 | VERSION = 0 33 | DATASET_PATH = "hellaswag" 34 | DATASET_NAME = None 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | if self._training_docs is None: 47 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 48 | return self._training_docs 49 | 50 | def validation_docs(self): 51 | return map(self._process_doc, self.dataset["validation"]) 52 | 53 | def _process_doc(self, doc): 54 | ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() 55 | out_doc = { 56 | "query": self.preprocess(doc["activity_label"] + ": " + ctx), 57 | "choices": [self.preprocess(ending) for ending in doc["endings"]], 58 | "gold": int(doc["label"]), 59 | } 60 | return out_doc 61 | 62 | @classmethod 63 | def preprocess(cls, text): 64 | text = text.strip() 65 | # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. 66 | text = text.replace(" [title]", ". ") 67 | text = re.sub("\\[.*?\\]", "", text) 68 | text = text.replace(" ", " ") 69 | return text 70 | 71 | def doc_to_text(self, doc): 72 | return doc["query"] 73 | 74 | def should_decontaminate(self): 75 | return True 76 | 77 | def doc_to_decontamination_query(self, doc): 78 | return doc["query"] 79 | -------------------------------------------------------------------------------- /lm_eval/tasks/hendrycks_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Measuring Massive Multitask Language Understanding 3 | https://arxiv.org/pdf/2009.03300.pdf 4 | 5 | The Hendryck's Test is a benchmark that measured a text model’s multitask accuracy. 6 | The test covers 57 tasks including elementary mathematics, US history, computer 7 | science, law, and more. To attain high accuracy on this test, models must possess 8 | extensive world knowledge and problem solving ability. By comprehensively evaluating 9 | the breadth and depth of a model’s academic and professional understanding, 10 | Hendryck's Test can be used to analyze models across many tasks and to identify 11 | important shortcomings. 
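In the implementation below, each of the 57 subjects becomes its own task named
`hendrycksTest-<subject>` (see `create_all_tasks`), and few-shot examples are drawn
from the dedicated dev split rather than from training data.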
12 | 13 | Homepage: https://github.com/hendrycks/test 14 | """ 15 | 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @article{hendryckstest2021, 21 | title={Measuring Massive Multitask Language Understanding}, 22 | author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt}, 23 | journal={Proceedings of the International Conference on Learning Representations (ICLR)}, 24 | year={2021} 25 | } 26 | """ 27 | 28 | 29 | SUBJECTS = [ 30 | "abstract_algebra", 31 | "anatomy", 32 | "astronomy", 33 | "business_ethics", 34 | "clinical_knowledge", 35 | "college_biology", 36 | "college_chemistry", 37 | "college_computer_science", 38 | "college_mathematics", 39 | "college_medicine", 40 | "college_physics", 41 | "computer_security", 42 | "conceptual_physics", 43 | "econometrics", 44 | "electrical_engineering", 45 | "elementary_mathematics", 46 | "formal_logic", 47 | "global_facts", 48 | "high_school_biology", 49 | "high_school_chemistry", 50 | "high_school_computer_science", 51 | "high_school_european_history", 52 | "high_school_geography", 53 | "high_school_government_and_politics", 54 | "high_school_macroeconomics", 55 | "high_school_mathematics", 56 | "high_school_microeconomics", 57 | "high_school_physics", 58 | "high_school_psychology", 59 | "high_school_statistics", 60 | "high_school_us_history", 61 | "high_school_world_history", 62 | "human_aging", 63 | "human_sexuality", 64 | "international_law", 65 | "jurisprudence", 66 | "logical_fallacies", 67 | "machine_learning", 68 | "management", 69 | "marketing", 70 | "medical_genetics", 71 | "miscellaneous", 72 | "moral_disputes", 73 | "moral_scenarios", 74 | "nutrition", 75 | "philosophy", 76 | "prehistory", 77 | "professional_accounting", 78 | "professional_law", 79 | "professional_medicine", 80 | "professional_psychology", 81 | "public_relations", 82 | "security_studies", 83 | "sociology", 84 | "us_foreign_policy", 85 | "virology", 86 | "world_religions", 87 | ] 88 | 89 | 90 | def create_all_tasks(): 91 | """Creates a dictionary of tasks from a list of subjects 92 | :return: {task_name: task} 93 | e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task} 94 | """ 95 | return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS} 96 | 97 | 98 | def create_task(subject): 99 | class HendrycksTest(GeneralHendrycksTest): 100 | def __init__(self): 101 | super().__init__(subject) 102 | 103 | return HendrycksTest 104 | 105 | 106 | class GeneralHendrycksTest(MultipleChoiceTask): 107 | VERSION = 0 108 | DATASET_PATH = "hendrycks_test" 109 | DATASET_NAME = None 110 | 111 | def __init__(self, subject): 112 | self.DATASET_NAME = subject 113 | super().__init__() 114 | 115 | def has_training_docs(self): 116 | return False 117 | 118 | def has_validation_docs(self): 119 | return True 120 | 121 | def has_test_docs(self): 122 | return True 123 | 124 | def validation_docs(self): 125 | return map(self._process_doc, self.dataset["validation"]) 126 | 127 | def test_docs(self): 128 | return map(self._process_doc, self.dataset["test"]) 129 | 130 | def _process_doc(self, doc): 131 | def format_example(doc, keys): 132 | """ 133 | Question: 134 | Choices: 135 | A. 136 | B. 137 | C. 138 | D. 139 | Answer: 140 | """ 141 | prompt = "Question: " + doc["question"] + "\nChoices:\n" 142 | prompt += "".join( 143 | [f"{key}. 
{choice}\n" for key, choice in zip(keys, doc["choices"])] 144 | ) 145 | prompt += "Answer:" 146 | return prompt 147 | 148 | keys = ["A", "B", "C", "D"] 149 | return { 150 | "query": format_example(doc, keys), 151 | "choices": doc["choices"], 152 | "gold": ( 153 | keys.index(doc["answer"]) 154 | if isinstance(doc["answer"], str) 155 | else doc["answer"] 156 | ), 157 | } 158 | 159 | def fewshot_examples(self, k, rnd): 160 | # fewshot_examples is not just sampling from train_docs because dev is 161 | # in the same distribution as val/test but auxiliary_train isn't 162 | 163 | if self._fewshot_docs is None: 164 | self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"])) 165 | 166 | return rnd.sample(list(self._fewshot_docs), k) 167 | 168 | def doc_to_text(self, doc): 169 | return doc["query"] 170 | 171 | def should_decontaminate(self): 172 | return True 173 | 174 | def doc_to_decontamination_query(self, doc): 175 | return doc["query"] 176 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 6 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 7 | passages sharing the characteristic that human subjects are able to guess their last 8 | word if they are exposed to the whole passage, but not if they only see the last 9 | sentence preceding the target word. To succeed on LAMBADA, computational models 10 | cannot simply rely on local context, but must be able to keep track of information 11 | in the broader discourse. 
12 | 13 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 14 | """ 15 | from lm_eval.base import Task, rf 16 | from lm_eval.metrics import mean, perplexity 17 | 18 | 19 | _CITATION = """ 20 | @misc{ 21 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 22 | title={The LAMBADA dataset}, 23 | DOI={10.5281/zenodo.2630551}, 24 | publisher={Zenodo}, 25 | year={2016}, 26 | month={Aug} 27 | } 28 | """ 29 | 30 | 31 | class LambadaBase(Task): 32 | VERSION = None 33 | 34 | def training_docs(self): 35 | if self.has_training_docs(): 36 | return self.dataset["train"] 37 | 38 | def validation_docs(self): 39 | if self.has_validation_docs(): 40 | return self.dataset["validation"] 41 | 42 | def test_docs(self): 43 | if self.has_test_docs(): 44 | return self.dataset["test"] 45 | 46 | def doc_to_text(self, doc): 47 | return doc["text"].rsplit(" ", 1)[0] 48 | 49 | def should_decontaminate(self): 50 | return True 51 | 52 | def doc_to_decontamination_query(self, doc): 53 | return doc["text"] 54 | 55 | def doc_to_target(self, doc): 56 | return " " + doc["text"].rsplit(" ", 1)[1] 57 | 58 | def construct_requests(self, doc, ctx): 59 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 60 | 61 | return ll, is_greedy 62 | 63 | def process_results(self, doc, results): 64 | ll, is_greedy = results 65 | 66 | return {"ppl": ll, "acc": int(is_greedy)} 67 | 68 | def aggregation(self): 69 | return {"ppl": perplexity, "acc": mean} 70 | 71 | def higher_is_better(self): 72 | return {"ppl": False, "acc": True} 73 | 74 | 75 | class LambadaStandard(LambadaBase): 76 | """The LAMBADA task using the standard original LAMBADA dataset.""" 77 | 78 | VERSION = 0 79 | DATASET_PATH = "lambada" 80 | 81 | def has_training_docs(self): 82 | return False 83 | 84 | def has_validation_docs(self): 85 | return True 86 | 87 | def has_test_docs(self): 88 | return True 89 | 90 | 91 | class LambadaOpenAI(LambadaBase): 92 | """The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the 93 | original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model. 94 | 95 | Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 96 | """ 97 | 98 | VERSION = 0 99 | DATASET_PATH = "EleutherAI/lambada_openai" 100 | 101 | def has_training_docs(self): 102 | return False 103 | 104 | def has_validation_docs(self): 105 | return False 106 | 107 | def has_test_docs(self): 108 | return True 109 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_cloze.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | Cloze-style LAMBADA dataset. 6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 
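The only change from the base LAMBADA tasks is the prompt: the truncated passage is
suffixed with " ____. ->" before the target word is predicted (see `doc_to_text`
below).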
13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | """ 16 | from lm_eval.tasks.lambada import LambadaOpenAI, LambadaStandard 17 | 18 | 19 | _CITATION = """ 20 | @misc{ 21 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 22 | title={The LAMBADA dataset}, 23 | DOI={10.5281/zenodo.2630551}, 24 | publisher={Zenodo}, 25 | year={2016}, 26 | month={Aug} 27 | } 28 | """ 29 | 30 | 31 | class LambadaStandardCloze(LambadaStandard): 32 | """Cloze-style LambadaStandard.""" 33 | 34 | VERSION = 0 35 | 36 | def doc_to_text(self, doc): 37 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 38 | 39 | def should_decontaminate(self): 40 | return True 41 | 42 | def doc_to_decontamination_query(self, doc): 43 | return doc["text"] 44 | 45 | def doc_to_target(self, doc): 46 | return " " + doc["text"].rsplit(" ", 1)[1] 47 | 48 | 49 | class LambadaOpenAICloze(LambadaOpenAI): 50 | """Cloze-style LambadaOpenAI.""" 51 | 52 | VERSION = 0 53 | 54 | def doc_to_text(self, doc): 55 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["text"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["text"].rsplit(" ", 1)[1] 65 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_multilingual.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | The LAMBADA OpenAI dataset machine-translated to other languages. 6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 
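One task is registered per machine-translated language (en, fr, de, it, es) via
`construct_tasks` at the bottom of this file.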
13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | 16 | Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 17 | """ 18 | from .lambada import LambadaOpenAI 19 | 20 | 21 | _CITATION = """ 22 | @misc{ 23 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 24 | title={The LAMBADA dataset}, 25 | DOI={10.5281/zenodo.2630551}, 26 | publisher={Zenodo}, 27 | year={2016}, 28 | month={Aug} 29 | } 30 | """ 31 | 32 | 33 | class LambadaOpenAIMultilingualEnglish(LambadaOpenAI): 34 | VERSION = 0 35 | DATASET_NAME = "en" 36 | 37 | 38 | class LambadaOpenAIMultilingualFrench(LambadaOpenAI): 39 | VERSION = 0 40 | DATASET_NAME = "fr" 41 | 42 | 43 | class LambadaOpenAIMultilingualGerman(LambadaOpenAI): 44 | VERSION = 0 45 | DATASET_NAME = "de" 46 | 47 | 48 | class LambadaOpenAIMultilingualItalian(LambadaOpenAI): 49 | VERSION = 0 50 | DATASET_NAME = "it" 51 | 52 | 53 | class LambadaOpenAIMultilingualSpanish(LambadaOpenAI): 54 | VERSION = 0 55 | DATASET_NAME = "es" 56 | 57 | 58 | LANG_CLASSES = [ 59 | LambadaOpenAIMultilingualEnglish, 60 | LambadaOpenAIMultilingualFrench, 61 | LambadaOpenAIMultilingualGerman, 62 | LambadaOpenAIMultilingualItalian, 63 | LambadaOpenAIMultilingualSpanish, 64 | ] 65 | 66 | 67 | def construct_tasks(): 68 | tasks = {} 69 | for lang_class in LANG_CLASSES: 70 | tasks[f"lambada_openai_mt_{lang_class.DATASET_NAME}"] = lang_class 71 | return tasks 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/logiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning 3 | https://arxiv.org/pdf/2007.08124.pdf 4 | 5 | LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA 6 | instances, covering multiple types of deductive reasoning. Results show that state- 7 | of-the-art neural models perform by far worse than human ceiling. The dataset can 8 | also serve as a benchmark for reinvestigating logical AI under the deep learning 9 | NLP setting. 
10 | 11 | Homepage: https://github.com/lgw863/LogiQA-dataset 12 | """ 13 | import inspect 14 | import lm_eval.datasets.logiqa.logiqa 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @misc{liu2020logiqa, 20 | title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 21 | author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, 22 | year={2020}, 23 | eprint={2007.08124}, 24 | archivePrefix={arXiv}, 25 | primaryClass={cs.CL} 26 | } 27 | """ 28 | 29 | 30 | class LogiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.logiqa.logiqa) 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | def format_example(doc, choices): 57 | """ 58 | Passage: 59 | Question: 60 | Choices: 61 | A. 62 | B. 63 | C. 64 | D. 65 | Answer: 66 | """ 67 | prompt = "Passage: " + doc["context"] + "\n" 68 | prompt += "Question: " + doc["question"] + "\nChoices:\n" 69 | for choice, option in zip(choices, doc["options"]): 70 | prompt += f"{choice.upper()}. {option}\n" 71 | prompt += "Answer:" 72 | return prompt 73 | 74 | choices = ["a", "b", "c", "d"] 75 | return { 76 | "passage": doc["context"], # Used for decontamination 77 | "query": format_example(doc, choices), 78 | "choices": doc["options"], 79 | "gold": choices.index(doc["label"]), 80 | } 81 | 82 | def doc_to_text(self, doc): 83 | return doc["query"] 84 | 85 | def should_decontaminate(self): 86 | return True 87 | 88 | def doc_to_decontamination_query(self, doc): 89 | return doc["passage"] 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/mathqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms 3 | https://arxiv.org/pdf/1905.13319.pdf 4 | 5 | MathQA is a large-scale dataset of 37k English multiple-choice math word problems 6 | covering multiple math domain categories by modeling operation programs corresponding 7 | to word problems in the AQuA dataset (Ling et al., 2017). 
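Each problem's answer options arrive as a single string, roughly of the form
"a ) 21 , b ) 21.5 , c ) 22 , d ) 22.5 , e ) 23" (values here are illustrative);
`_process_doc` below splits it apart with a regular expression.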
8 | 9 | Homepage: https://math-qa.github.io/math-QA/ 10 | """ 11 | import re 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @misc{amini2019mathqa, 17 | title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms}, 18 | author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi}, 19 | year={2019}, 20 | eprint={1905.13319}, 21 | archivePrefix={arXiv}, 22 | primaryClass={cs.CL} 23 | } 24 | """ 25 | 26 | 27 | class MathQA(MultipleChoiceTask): 28 | VERSION = 0 29 | DATASET_PATH = "math_qa" 30 | DATASET_NAME = None 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"]) 54 | choices = [ 55 | c[4:].rstrip(" ,") 56 | for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"]) 57 | ] 58 | 59 | out_doc = { 60 | "query": "Question: " + doc["Problem"] + "\nAnswer:", 61 | "choices": choices, 62 | "gold": answer_idx, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return doc["query"] 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/mutual.py: -------------------------------------------------------------------------------- 1 | """ 2 | MuTual: A Dataset for Multi-Turn Dialogue Reasoning 3 | https://www.aclweb.org/anthology/2020.acl-main.130/ 4 | 5 | MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is 6 | modified from Chinese high school English listening comprehension test data. 
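Each dialogue is scored by the log-likelihood of its four candidate responses, and
the task reports recall@1, recall@2 and mean reciprocal rank (see `process_results`
below).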
7 | 8 | Homepage: https://github.com/Nealcly/MuTual 9 | """ 10 | import numpy as np 11 | import inspect 12 | import lm_eval.datasets.mutual.mutual 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{mutual, 19 | title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", 20 | author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , 21 | booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", 22 | year = "2020", 23 | publisher = "Association for Computational Linguistics", 24 | } 25 | """ 26 | 27 | 28 | class MuTualBase(Task): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual) 31 | DATASET_NAME = None 32 | CHOICES = ["A", "B", "C", "D"] 33 | 34 | def has_training_docs(self): 35 | return True 36 | 37 | def has_validation_docs(self): 38 | return True 39 | 40 | def has_test_docs(self): 41 | return False 42 | 43 | def training_docs(self): 44 | return self.dataset["train"] 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def test_docs(self): 50 | return NotImplemented 51 | 52 | def doc_to_text(self, doc): 53 | return self.detokenize(doc["article"]) 54 | 55 | def should_decontaminate(self): 56 | return True 57 | 58 | def doc_to_decontamination_query(self, doc): 59 | return doc["article"] 60 | 61 | def doc_to_target(self, doc): 62 | return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])]) 63 | 64 | def construct_requests(self, doc, ctx): 65 | lls = [] 66 | for option in doc["options"]: 67 | lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0]) 68 | return lls 69 | 70 | def detokenize(self, text): 71 | text = text.replace(" '", "'") 72 | text = text.replace(" \n", "\n") 73 | text = text.replace("\n ", "\n") 74 | text = text.replace(" n't", "n't") 75 | text = text.replace("`` ", '"') 76 | text = text.replace("''", '"') 77 | # punctuation 78 | text = text.replace(" :", ":") 79 | text = text.replace(" ;", ";") 80 | text = text.replace(" !", "!") 81 | text = text.replace(" ?", "?") 82 | text = text.replace(" ,", ",") 83 | text = text.replace(" .", ".") 84 | return text 85 | 86 | def process_results(self, doc, results): 87 | gold = self.CHOICES.index(doc["answers"]) 88 | r4_1 = np.argmax(results) == gold # r4_1 = accuracy 89 | ranks = sorted(results, reverse=True) 90 | r4_2 = (ranks.index(results[gold]) == 1) + r4_1 91 | mrr = 1.0 / (ranks.index(results[gold]) + 1) # `+ 1` for index offset 92 | return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr} 93 | 94 | def aggregation(self): 95 | return {"r@1": mean, "r@2": mean, "mrr": mean} 96 | 97 | def higher_is_better(self): 98 | return {"r@1": True, "r@2": True, "mrr": True} 99 | 100 | 101 | class MuTual(MuTualBase): 102 | DATASET_NAME = "mutual" 103 | 104 | 105 | class MuTualPlus(MuTualBase): 106 | DATASET_NAME = "mutual_plus" 107 | -------------------------------------------------------------------------------- /lm_eval/tasks/openbookqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering 3 | https://arxiv.org/pdf/1809.02789.pdf 4 | 5 | OpenBookQA is a question-answering dataset modeled after open book exams for 6 | assessing human understanding of a subject. 
It consists of 5,957 multiple-choice 7 | elementary-level science questions (4,957 train, 500 dev, 500 test), which probe 8 | the understanding of a small “book” of 1,326 core science facts and the application 9 | of these facts to novel situations. For training, the dataset includes a mapping 10 | from each question to the core science fact it was designed to probe. Answering 11 | OpenBookQA questions requires additional broad common knowledge, not contained 12 | in the book. The questions, by design, are answered incorrectly by both a retrieval- 13 | based algorithm and a word co-occurrence algorithm. 14 | 15 | Homepage: https://allenai.org/data/open-book-qa 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{OpenBookQA2018, 22 | title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering}, 23 | author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, 24 | booktitle={EMNLP}, 25 | year={2018} 26 | } 27 | """ 28 | 29 | 30 | class OpenBookQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "openbookqa" 33 | DATASET_NAME = "main" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | out_doc = { 57 | "id": doc["id"], 58 | "query": doc["question_stem"], 59 | "choices": doc["choices"]["text"], 60 | "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), 61 | } 62 | return out_doc 63 | 64 | def doc_to_text(self, doc): 65 | return doc["query"] 66 | 67 | def should_decontaminate(self): 68 | return True 69 | 70 | def doc_to_decontamination_query(self, doc): 71 | return doc["query"] 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/pile.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Pile: An 800GB Dataset of Diverse Text for Language Modeling 3 | https://arxiv.org/pdf/2101.00027.pdf 4 | 5 | The Pile is a 825 GiB diverse, open source language modelling data set that consists 6 | of 22 smaller, high-quality datasets combined together. To score well on Pile 7 | BPB (bits per byte), a model must be able to understand many disparate domains 8 | including books, github repositories, webpages, chat logs, and medical, physics, 9 | math, computer science, and philosophy papers. 
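Each Pile component is exposed as its own perplexity task below (one subclass of
`PilePerplexityTask` per component), evaluated on that component's validation and
test documents.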
10 | 11 | Homepage: https://pile.eleuther.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.pile.pile 15 | from lm_eval.base import PerplexityTask 16 | 17 | 18 | _CITATION = """ 19 | @article{pile, 20 | title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling}, 21 | author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor}, 22 | journal={arXiv preprint arXiv:2101.00027}, 23 | year={2020} 24 | } 25 | """ 26 | 27 | 28 | class PilePerplexityTask(PerplexityTask): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile) 31 | DATASET_NAME = None 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def validation_docs(self): 40 | for doc in self.dataset["validation"]: 41 | yield doc["text"] 42 | 43 | def test_docs(self): 44 | for doc in self.dataset["test"]: 45 | yield doc["text"] 46 | 47 | 48 | class PileArxiv(PilePerplexityTask): 49 | DATASET_NAME = "pile_arxiv" 50 | 51 | 52 | class PileBooks3(PilePerplexityTask): 53 | DATASET_NAME = "pile_books3" 54 | 55 | 56 | class PileBookCorpus2(PilePerplexityTask): 57 | DATASET_NAME = "pile_bookcorpus2" 58 | 59 | 60 | class PileDmMathematics(PilePerplexityTask): 61 | DATASET_NAME = "pile_dm-mathematics" 62 | 63 | 64 | class PileEnron(PilePerplexityTask): 65 | DATASET_NAME = "pile_enron" 66 | 67 | 68 | class PileEuroparl(PilePerplexityTask): 69 | DATASET_NAME = "pile_europarl" 70 | 71 | 72 | class PileFreeLaw(PilePerplexityTask): 73 | DATASET_NAME = "pile_freelaw" 74 | 75 | 76 | class PileGithub(PilePerplexityTask): 77 | DATASET_NAME = "pile_github" 78 | 79 | 80 | class PileGutenberg(PilePerplexityTask): 81 | DATASET_NAME = "pile_gutenberg" 82 | 83 | 84 | class PileHackernews(PilePerplexityTask): 85 | DATASET_NAME = "pile_hackernews" 86 | 87 | 88 | class PileNIHExporter(PilePerplexityTask): 89 | DATASET_NAME = "pile_nih-exporter" 90 | 91 | 92 | class PileOpenSubtitles(PilePerplexityTask): 93 | DATASET_NAME = "pile_opensubtitles" 94 | 95 | 96 | class PileOpenWebText2(PilePerplexityTask): 97 | DATASET_NAME = "pile_openwebtext2" 98 | 99 | 100 | class PilePhilPapers(PilePerplexityTask): 101 | DATASET_NAME = "pile_philpapers" 102 | 103 | 104 | class PilePileCc(PilePerplexityTask): 105 | DATASET_NAME = "pile_pile-cc" 106 | 107 | 108 | class PilePubmedAbstracts(PilePerplexityTask): 109 | DATASET_NAME = "pile_pubmed-abstracts" 110 | 111 | 112 | class PilePubmedCentral(PilePerplexityTask): 113 | DATASET_NAME = "pile_pubmed-central" 114 | 115 | 116 | class PileStackExchange(PilePerplexityTask): 117 | DATASET_NAME = "pile_stackexchange" 118 | 119 | 120 | class PileUspto(PilePerplexityTask): 121 | DATASET_NAME = "pile_upsto" 122 | 123 | 124 | class PileUbuntuIrc(PilePerplexityTask): 125 | DATASET_NAME = "pile_ubuntu-irc" 126 | 127 | 128 | class PileWikipedia(PilePerplexityTask): 129 | DATASET_NAME = "pile_wikipedia" 130 | 131 | 132 | class PileYoutubeSubtitles(PilePerplexityTask): 133 | DATASET_NAME = "pile_youtubesubtitles" 134 | -------------------------------------------------------------------------------- /lm_eval/tasks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PIQA: Reasoning about Physical Commonsense in Natural Language 3 | https://arxiv.org/pdf/1911.11641.pdf 4 | 5 | Physical Interaction: Question Answering (PIQA) is a 
physical commonsense 6 | reasoning and a corresponding benchmark dataset. PIQA was designed to investigate 7 | the physical knowledge of existing models. To what extent are current approaches 8 | actually learning about the world? 9 | 10 | Homepage: https://yonatanbisk.com/piqa/ 11 | """ 12 | 13 | from lm_eval.base import MultipleChoiceTask 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{Bisk2020, 18 | author = {Yonatan Bisk and Rowan Zellers and 19 | Ronan Le Bras and Jianfeng Gao 20 | and Yejin Choi}, 21 | title = {PIQA: Reasoning about Physical Commonsense in 22 | Natural Language}, 23 | booktitle = {Thirty-Fourth AAAI Conference on 24 | Artificial Intelligence}, 25 | year = {2020}, 26 | } 27 | """ 28 | 29 | 30 | class PiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "piqa" 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return False 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "goal": doc["goal"], 55 | "choices": [doc["sol1"], doc["sol2"]], 56 | "gold": doc["label"], 57 | } 58 | return out_doc 59 | 60 | def doc_to_text(self, doc): 61 | return "Question: " + doc["goal"] + "\nAnswer:" 62 | 63 | def should_decontaminate(self): 64 | return True 65 | 66 | def doc_to_decontamination_query(self, doc): 67 | return doc["goal"] 68 | -------------------------------------------------------------------------------- /lm_eval/tasks/prost.py: -------------------------------------------------------------------------------- 1 | """ 2 | PROST: Physical Reasoning about Objects Through Space and Time 3 | https://arxiv.org/pdf/2106.03634.pdf 4 | 5 | PROST, Physical Reasoning about Objects Through Space and Time, is a dataset 6 | consisting of 18,736 multiple-choice questions made from 14 manually curated 7 | templates, covering 10 physical reasoning concepts. All questions are designed 8 | to probe both causal and masked language models in a zero-shot setting. 9 | 10 | NOTE: PROST is limited to the zero-shot setting to adhere to authors' intentions 11 | as discussed in section 7 of the paper: "We hope that the community will use 12 | this dataset in the intended way: in a zero-shot setting to probe models which 13 | have been trained on data not specifically collected to succeed on PROST." 
14 | 15 | Homepage: https://github.com/nala-cub/prost 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{aroca-ouellette-etal-2021-prost, 22 | title = "{PROST}: {P}hysical Reasoning about Objects through Space and Time", 23 | author = "Aroca-Ouellette, St{\'e}phane and 24 | Paik, Cory and 25 | Roncone, Alessandro and 26 | Kann, Katharina", 27 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 28 | month = aug, 29 | year = "2021", 30 | address = "Online", 31 | publisher = "Association for Computational Linguistics", 32 | url = "https://aclanthology.org/2021.findings-acl.404", 33 | pages = "4597--4608", 34 | } 35 | """ 36 | 37 | 38 | class PROST(MultipleChoiceTask): 39 | VERSION = 0 40 | DATASET_PATH = "corypaik/prost" 41 | DATASET_NAME = None 42 | 43 | def has_training_docs(self): 44 | return False 45 | 46 | def has_validation_docs(self): 47 | return False 48 | 49 | def has_test_docs(self): 50 | return True 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def fewshot_context( 56 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 57 | ): 58 | assert ( 59 | num_fewshot == 0 60 | ), "PROST is designed to probe models in a zero-shot fashion only." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def _process_doc(self, doc): 66 | out_doc = { 67 | "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:", 68 | "choices": [doc["A"], doc["B"], doc["C"], doc["D"]], 69 | "gold": doc["label"], 70 | } 71 | return out_doc 72 | 73 | def doc_to_text(self, doc): 74 | return doc["query"] 75 | 76 | def should_decontaminate(self): 77 | return True 78 | 79 | def doc_to_decontamination_query(self, doc): 80 | return doc["query"] 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/pubmedqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PubMedQA: A Dataset for Biomedical Research Question Answering 3 | https://arxiv.org/pdf/1909.06146.pdf 4 | 5 | PubMedQA is a novel biomedical question answering (QA) dataset collected from 6 | PubMed abstracts. The task of PubMedQA is to answer research questions with 7 | yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after 8 | coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA 9 | has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA 10 | instances. Each PubMedQA instance is composed of (1) a question which is either 11 | an existing research article title or derived from one, (2) a context which is 12 | the corresponding abstract without its conclusion, (3) a long answer, which is 13 | the conclusion of the abstract and, presumably, answers the research question, 14 | and (4) a yes/no/maybe answer which summarizes the conclusion. 
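The implementation below treats this as three-way classification: the
log-likelihoods of the continuations " yes", " no" and " maybe" are compared after
the abstract and question (see `construct_requests`).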
15 | 16 | Homepage: https://pubmedqa.github.io/ 17 | """ 18 | import numpy as np 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @inproceedings{jin2019pubmedqa, 25 | title={PubMedQA: A Dataset for Biomedical Research Question Answering}, 26 | author={Jin, Qiao and Dhingra, Bhuwan and Liu, Zhengping and Cohen, William and Lu, Xinghua}, 27 | booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, 28 | pages={2567--2577}, 29 | year={2019} 30 | } 31 | """ 32 | 33 | 34 | class Pubmed_QA(Task): 35 | VERSION = 0 36 | DATASET_PATH = "pubmed_qa" 37 | DATASET_NAME = "pqa_labeled" 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def test_docs(self): 49 | if self.has_test_docs(): 50 | # HF is labelled as train but its really just for testing 51 | return self.dataset["train"] 52 | 53 | def doc_to_text(self, doc): 54 | ctxs = "\n".join(doc["context"]["contexts"]) 55 | return "Abstract: {}\nQuestion: {}\nAnswer:".format( 56 | ctxs, doc["question"], doc["final_decision"] 57 | ) 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] + " " + "\n".join(doc["context"]["contexts"]) 64 | 65 | def doc_to_target(self, doc): 66 | return " {}".format(doc["final_decision"]) 67 | 68 | def construct_requests(self, doc, ctx): 69 | """Uses RequestFactory to construct Requests and returns 70 | an iterable of Requests which will be sent to the LM. 71 | """ 72 | ll_yes, _ = rf.loglikelihood(ctx, " yes") 73 | ll_no, _ = rf.loglikelihood(ctx, " no") 74 | ll_maybe, _ = rf.loglikelihood(ctx, " maybe") 75 | return ll_yes, ll_no, ll_maybe 76 | 77 | def process_results(self, doc, results): 78 | gold = doc["final_decision"] 79 | ll_yes, ll_no, ll_maybe = results 80 | pred = np.argmax(results) 81 | return { 82 | "acc": ["yes", "no", "maybe"][pred] == gold, 83 | } 84 | 85 | def aggregation(self): 86 | return {"acc": mean} 87 | 88 | def higher_is_better(self): 89 | return {"acc": True} 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/qa4mre.py: -------------------------------------------------------------------------------- 1 | """ 2 | QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation 3 | https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf 4 | 5 | The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013. 6 | The main objective of this exercise is to develop a methodology for evaluating 7 | Machine Reading systems through Question Answering and Reading Comprehension 8 | Tests. Systems should be able to extract knowledge from large volumes of text 9 | and use this knowledge to answer questions. Four different tasks have been 10 | organized during these years: Main Task, Processing Modality and Negation for 11 | Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, 12 | and Entrance Exam. 
13 | 14 | Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php 15 | """ 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @inproceedings{Peas2013QA4MRE2O, 21 | title={QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation}, 22 | author={Anselmo Pe{\~n}as and Eduard H. Hovy and Pamela Forner and {\'A}lvaro Rodrigo and Richard F. E. Sutcliffe and Roser Morante}, 23 | booktitle={CLEF}, 24 | year={2013} 25 | } 26 | """ # noqa: W605 27 | 28 | 29 | class QA4MRE(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = "qa4mre" 32 | DATASET_NAME = None 33 | 34 | def has_training_docs(self): 35 | return False 36 | 37 | def has_validation_docs(self): 38 | return False 39 | 40 | def has_test_docs(self): 41 | return True 42 | 43 | def test_docs(self): 44 | # `qa4mre` only has train data so we use it for the test docs. 45 | return map(self._process_doc, self.dataset["train"]) 46 | 47 | def _process_doc(self, doc): 48 | choices = doc["answer_options"]["answer_str"] 49 | out_doc = { 50 | "source": doc["document_str"].strip().replace("'", "'"), 51 | "query": doc["question_str"], 52 | "choices": choices, 53 | "gold": int(doc["correct_answer_id"]) - 1, 54 | } 55 | return out_doc 56 | 57 | def doc_to_text(self, doc): 58 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]) 59 | 60 | def should_decontaminate(self): 61 | return True 62 | 63 | def doc_to_decontamination_query(self, doc): 64 | return doc["source"] + " " + doc["query"] 65 | 66 | 67 | class QA4MRE_2011(QA4MRE): 68 | DATASET_NAME = "2011.main.EN" 69 | 70 | 71 | class QA4MRE_2012(QA4MRE): 72 | DATASET_NAME = "2012.main.EN" 73 | 74 | 75 | class QA4MRE_2013(QA4MRE): 76 | DATASET_NAME = "2013.main.EN" 77 | -------------------------------------------------------------------------------- /lm_eval/tasks/quac.py: -------------------------------------------------------------------------------- 1 | """ 2 | QuAC: Question Answering in Context 3 | https://arxiv.org/abs/1808.07036 4 | 5 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 6 | participating in information seeking dialog. Data instances consist of an interactive 7 | dialog between two crowd workers: (1) a student who poses a sequence of freeform 8 | questions to learn as much as possible about a hidden Wikipedia text, and (2) 9 | a teacher who answers the questions by providing short excerpts (spans) from the text. 
10 | 11 | Homepage: https://quac.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.quac.quac 15 | from lm_eval.base import Task 16 | 17 | 18 | _CITATION = """ 19 | @article{choi2018quac, 20 | title={Quac: Question answering in context}, 21 | author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, 22 | journal={arXiv preprint arXiv:1808.07036}, 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class QuAC(Task): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) 31 | DATASET_NAME = None 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def test_docs(self): 51 | raise NotImplementedError("QuAC has no test docs.") 52 | 53 | def _process_doc(self, doc): 54 | doc["title"] = doc["title"] + " - " + doc["section_title"] 55 | return doc 56 | 57 | def doc_to_text(self, doc): 58 | return ( 59 | "TITLE: " 60 | + doc["title"] 61 | + "\n" 62 | + "PARAGRAPH: " 63 | + doc["paragraph"] 64 | + "\n\n" 65 | + "Q: " 66 | + doc["question"] 67 | + "\n\n" 68 | + "A: " 69 | ) 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["paragraph"] 76 | 77 | def doc_to_target(self, doc): 78 | return doc["answer"] 79 | 80 | def construct_requests(self, doc, ctx): 81 | """Uses RequestFactory to construct Requests and returns an iterable of 82 | Requests which will be sent to the LM. 83 | 84 | :param doc: 85 | The document as returned from training_docs, validation_docs, or test_docs. 86 | :param ctx: str 87 | The context string, generated by fewshot_context. This includes the natural 88 | language description, as well as the few shot examples, and the question 89 | part of the document for `doc`. 90 | """ 91 | # TODO: implement evaluation. 92 | raise NotImplementedError("Evaluation not implemented") 93 | 94 | def process_results(self, doc, results): 95 | """Take a single document and the LM results and evaluates, returning a 96 | dict where keys are the names of submetrics and values are the values of 97 | the metric for that one document 98 | 99 | :param doc: 100 | The document as returned from training_docs, validation_docs, or test_docs. 101 | :param results: 102 | The results of the requests created in construct_requests. 103 | """ 104 | # TODO: implement evaluation. 105 | raise NotImplementedError("Evaluation not implemented") 106 | 107 | def aggregation(self): 108 | """ 109 | :returns: {str: [float] -> float} 110 | A dictionary where keys are the names of submetrics and values are 111 | functions that aggregate a list of metrics 112 | """ 113 | # TODO: implement evaluation. 114 | raise NotImplementedError("Evaluation not implemented") 115 | 116 | def higher_is_better(self): 117 | """ 118 | :returns: {str: bool} 119 | A dictionary where keys are the names of submetrics and values are 120 | whether a higher value of the submetric is better 121 | """ 122 | # TODO: implement evaluation. 
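        # (One plausible, currently unimplemented choice for extractive QA like
        #  QuAC would be span-overlap F1, i.e. returning {"f1": True} here with a
        #  matching F1 aggregation; for now this method, like the other evaluation
        #  hooks in this class, simply raises.)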
123 | raise NotImplementedError("Evaluation not implemented") 124 | -------------------------------------------------------------------------------- /lm_eval/tasks/sat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Similarity of Semantic Relations 3 | https://arxiv.org/pdf/cs/0608100.pdf 4 | 5 | SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 6 | multiple-choice analogy questions; 5 choices per question. 7 | 8 | Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) 9 | """ 10 | import inspect 11 | import lm_eval.datasets.sat_analogies.sat_analogies 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @article{article, 17 | author = {Turney, Peter}, 18 | year = {2006}, 19 | month = {09}, 20 | pages = {379-416}, 21 | title = {Similarity of Semantic Relations}, 22 | volume = {32}, 23 | journal = {Computational Linguistics}, 24 | doi = {10.1162/coli.2006.32.3.379} 25 | } 26 | """ 27 | 28 | 29 | class SATAnalogies(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies) 32 | DATASET_NAME = None 33 | 34 | def __init__(self, data_dir: str): 35 | """ 36 | SAT Analog Questions is not publicly available. You must request the data 37 | by emailing Peter Turney and then download it to a local directory path 38 | which should be passed into the `data_dir` arg. 39 | """ 40 | super().__init__(data_dir=data_dir) 41 | 42 | def has_training_docs(self): 43 | return False 44 | 45 | def has_validation_docs(self): 46 | return True 47 | 48 | def has_test_docs(self): 49 | return False 50 | 51 | def training_docs(self): 52 | return [] 53 | 54 | def validation_docs(self): 55 | return map(self._process_doc, self.dataset["validation"]) 56 | 57 | def test_docs(self): 58 | return [] 59 | 60 | def _process_doc(self, doc): 61 | return { 62 | "source": doc["source"], 63 | "query": doc["stem"].split(" ")[:2], 64 | "choices": [ 65 | "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"] 66 | ], 67 | "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()), 68 | } 69 | 70 | def doc_to_text(self, doc): 71 | return "{} is to {} as".format(*doc["query"]) 72 | 73 | def should_decontaminate(self): 74 | return True 75 | 76 | def doc_to_decontamination_query(self, doc): 77 | return doc["source"] + "\n" + " ".join(doc["query"]) 78 | -------------------------------------------------------------------------------- /lm_eval/tasks/sciq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crowdsourcing Multiple Choice Science Questions 3 | https://aclanthology.org/W17-4413.pdf 4 | 5 | The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, 6 | Chemistry and Biology, among others. The questions are in multiple-choice format 7 | with 4 answer options each. For the majority of the questions, an additional paragraph 8 | with supporting evidence for the correct answer is provided. 9 | 10 | Homepage: https://allenai.org/data/sciq 11 | """ 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @inproceedings{Welbl2017CrowdsourcingMC, 17 | title={Crowdsourcing Multiple Choice Science Questions}, 18 | author={Johannes Welbl and Nelson F. 
Liu and Matt Gardner}, 19 | booktitle={NUT@EMNLP}, 20 | year={2017} 21 | } 22 | """ 23 | 24 | 25 | class SciQ(MultipleChoiceTask): 26 | VERSION = 0 27 | DATASET_PATH = "sciq" 28 | DATASET_NAME = None 29 | 30 | def has_training_docs(self): 31 | return True 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def training_docs(self): 40 | if self._training_docs is None: 41 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 42 | return self._training_docs 43 | 44 | def validation_docs(self): 45 | return map(self._process_doc, self.dataset["validation"]) 46 | 47 | def test_docs(self): 48 | return map(self._process_doc, self.dataset["test"]) 49 | 50 | def _process_doc(self, doc): 51 | choices = [ 52 | doc["distractor1"], 53 | doc["distractor2"], 54 | doc["distractor3"], 55 | doc["correct_answer"], 56 | ] 57 | src = doc["support"] 58 | out_doc = { 59 | "source": src, 60 | "query": doc["question"], 61 | "choices": choices, 62 | "gold": 3, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip() 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["source"] + " " + doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/swag.py: -------------------------------------------------------------------------------- 1 | """ 2 | SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference 3 | https://arxiv.org/pdf/1808.05326.pdf 4 | 5 | SWAG (Situations With Adversarial Generations) is an adversarial dataset 6 | that consists of 113k multiple choice questions about grounded situations. Each 7 | question is a video caption from LSMDC or ActivityNet Captions, with four answer 8 | choices about what might happen next in the scene. The correct answer is the 9 | (real) video caption for the next event in the video; the three incorrect 10 | answers are adversarially generated and human verified, so as to fool machines 11 | but not humans. 
12 | 13 | Homepage: https://rowanzellers.com/swag/ 14 | """ 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @inproceedings{zellers2018swagaf, 20 | title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference}, 21 | author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin}, 22 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)", 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class SWAG(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = "swag" 31 | DATASET_NAME = "regular" 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def _process_doc(self, doc): 51 | out_doc = { 52 | "query": doc["startphrase"], 53 | "choices": [doc["ending0"], doc["ending1"], doc["ending2"], doc["ending3"]], 54 | "gold": int(doc["label"]), 55 | } 56 | return out_doc 57 | 58 | def doc_to_text(self, doc): 59 | return doc["query"] 60 | -------------------------------------------------------------------------------- /lm_eval/tasks/toxigen.py: -------------------------------------------------------------------------------- 1 | """ 2 | ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection 3 | https://arxiv.org/abs/2203.09509 4 | 5 | Classify input text as either hateful or not hateful. 6 | 7 | Homepage: https://github.com/microsoft/TOXIGEN 8 | """ 9 | from lm_eval.base import MultipleChoiceTask 10 | import numpy as np 11 | import pandas as pd 12 | 13 | 14 | _CITATION = """ 15 | @inproceedings{hartvigsen2022toxigen, 16 | title={ToxiGen: A Large-Scale Machine-Generated Dataset for Implicit and Adversarial Hate Speech Detection}, 17 | author={Hartvigsen, Thomas and Gabriel, Saadia and Palangi, Hamid and Sap, Maarten and Ray, Dipankar and Kamar, Ece}, 18 | booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics}, 19 | year={2022} 20 | } 21 | """ 22 | 23 | 24 | class ToxiGen(MultipleChoiceTask): 25 | VERSION = 0 26 | DATASET_PATH = "skg/toxigen-data" 27 | DATASET_NAME = "annotated" 28 | 29 | def has_training_docs(self): 30 | return True 31 | 32 | def has_validation_docs(self): 33 | return False 34 | 35 | def has_test_docs(self): 36 | return True 37 | 38 | def training_docs(self): 39 | if self.has_training_docs(): 40 | # We cache training documents in `self._training_docs` for faster 41 | # few-shot processing. If the data is too large to fit in memory, 42 | # return the training data as a generator instead of a list. 
43 | if self._training_docs is None: 44 | dataset = self._preprocess_dataset("train") 45 | self._training_docs = list(map(self._process_doc, dataset)) 46 | return self._training_docs 47 | 48 | def test_docs(self): 49 | if self.has_test_docs(): 50 | dataset = self._preprocess_dataset("test") 51 | return map(self._process_doc, dataset) 52 | 53 | def _preprocess_dataset(self, split: str): 54 | """Preprocess the dataset into a list of (text, label) tuples.""" 55 | d = pd.DataFrame(self.dataset[split]) 56 | text = d["text"] 57 | labels = np.round(((d["toxicity_ai"] + d["toxicity_human"]) > 5.5), 0).astype( 58 | np.int32 59 | ) 60 | return [[x, y] for x, y in zip(text, labels)] 61 | 62 | def _process_doc(self, doc): 63 | return { 64 | "query": doc[0], 65 | "choices": ["No", "Yes"], 66 | "gold": doc[1], 67 | } 68 | 69 | def doc_to_text(self, doc): 70 | return f"Is the following statement hateful? Respond with either Yes or No. Statement: '{doc['query']}'" 71 | -------------------------------------------------------------------------------- /lm_eval/tasks/triviaqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension 3 | https://arxiv.org/pdf/1705.03551.pdf 4 | 5 | TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence 6 | triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts 7 | and independently gathered evidence documents, six per question on average, that provide 8 | high quality distant supervision for answering the questions. 9 | 10 | Homepage: https://nlp.cs.washington.edu/triviaqa/ 11 | """ 12 | import inspect 13 | import lm_eval.datasets.triviaqa.triviaqa 14 | from lm_eval.base import Task, rf 15 | from lm_eval.metrics import mean 16 | 17 | 18 | _CITATION = """ 19 | @InProceedings{JoshiTriviaQA2017, 20 | author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke}, 21 | title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 22 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, 23 | month = {July}, 24 | year = {2017}, 25 | address = {Vancouver, Canada}, 26 | publisher = {Association for Computational Linguistics}, 27 | } 28 | """ 29 | 30 | 31 | class TriviaQA(Task): 32 | VERSION = 1 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa) 34 | DATASET_NAME = None 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | return self.dataset["train"] 47 | 48 | def validation_docs(self): 49 | return self.dataset["validation"] 50 | 51 | def test_docs(self): 52 | raise NotImplementedError() 53 | 54 | def doc_to_text(self, doc): 55 | return f"Question: {doc['question']}\nAnswer:" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["question"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["answer"]["value"] 65 | 66 | def _remove_prefixes(self, aliases): 67 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 68 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 69 | aliases.sort() 70 | ret = [aliases[0]] 71 | for alias in aliases[1:]: 72 | if not alias.startswith(ret[-1]): 73 | ret.append(alias) 74 | return ret 75 | 76 | def construct_requests(self, doc, ctx): 77 | ret = [] 78 | for alias in self._remove_prefixes(doc["answer"]["aliases"]): 79 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 80 | ret.append(is_prediction) 81 | return ret 82 | 83 | def process_results(self, doc, results): 84 | return {"acc": float(any(results))} 85 | 86 | def aggregation(self): 87 | return { 88 | "acc": mean, 89 | } 90 | 91 | def higher_is_better(self): 92 | return {"acc": True} 93 | -------------------------------------------------------------------------------- /lm_eval/tasks/unscramble.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 6 | involves giving the model a word distorted by some combination of scrambling, 7 | addition, or deletion of characters, and asking it to recover the original word. 
8 | 9 | Homepage: https://github.com/openai/gpt-3/tree/master/data 10 | """ 11 | import inspect 12 | import lm_eval.datasets.unscramble.unscramble 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{NEURIPS2020_1457c0d6, 19 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 20 | booktitle = {Advances in Neural Information Processing Systems}, 21 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 22 | pages = {1877--1901}, 23 | publisher = {Curran Associates, Inc.}, 24 | title = {Language Models are Few-Shot Learners}, 25 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 26 | volume = {33}, 27 | year = {2020} 28 | } 29 | """ 30 | 31 | 32 | class WordUnscrambleTask(Task): 33 | VERSION = 0 34 | DATASET_PATH = inspect.getfile(lm_eval.datasets.unscramble.unscramble) 35 | DATASET_NAME = None 36 | 37 | def has_training_docs(self): 38 | return False 39 | 40 | def has_validation_docs(self): 41 | return True 42 | 43 | def has_test_docs(self): 44 | return False 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def doc_to_text(self, doc): 50 | return doc["context"] 51 | 52 | def should_decontaminate(self): 53 | return True 54 | 55 | def doc_to_decontamination_query(self, doc): 56 | return doc["context"] 57 | 58 | def doc_to_target(self, doc): 59 | return doc["completion"] 60 | 61 | def construct_requests(self, doc, ctx): 62 | completion = rf.greedy_until(ctx, ["\n"]) 63 | return completion 64 | 65 | def process_results(self, doc, results): 66 | pred = results[0] 67 | gold = doc["completion"] 68 | return {"acc": int(pred == gold)} 69 | 70 | def aggregation(self): 71 | return {"acc": mean} 72 | 73 | def higher_is_better(self): 74 | return {"acc": True} 75 | 76 | 77 | class Anagrams1(WordUnscrambleTask): 78 | DATASET_NAME = "mid_word_1_anagrams" 79 | 80 | 81 | class Anagrams2(WordUnscrambleTask): 82 | DATASET_NAME = "mid_word_2_anagrams" 83 | 84 | 85 | class CycleLetters(WordUnscrambleTask): 86 | DATASET_NAME = "cycle_letters_in_word" 87 | 88 | 89 | class RandomInsertion(WordUnscrambleTask): 90 | DATASET_NAME = "random_insertion_in_word" 91 | 92 | 93 | class ReversedWords(WordUnscrambleTask): 94 | DATASET_NAME = "reversed_words" 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/webqs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Semantic Parsing on Freebase from Question-Answer Pairs 3 | https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf 4 | 5 | WebQuestions is a benchmark for question answering. The dataset consists of 6,642 6 | question/answer pairs. The questions are supposed to be answerable by Freebase, a 7 | large knowledge graph. The questions are mostly centered around a single named entity. 
8 | The questions are popular ones asked on the web (at least in 2013). 9 | 10 | Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a 11 | """ 12 | from lm_eval.base import rf, Task 13 | from lm_eval.metrics import mean 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{berant-etal-2013-semantic, 18 | title = "Semantic Parsing on {F}reebase from Question-Answer Pairs", 19 | author = "Berant, Jonathan and 20 | Chou, Andrew and 21 | Frostig, Roy and 22 | Liang, Percy", 23 | booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing", 24 | month = oct, 25 | year = "2013", 26 | address = "Seattle, Washington, USA", 27 | publisher = "Association for Computational Linguistics", 28 | url = "https://aclanthology.org/D13-1160", 29 | pages = "1533--1544", 30 | } 31 | """ 32 | 33 | 34 | class WebQs(Task): 35 | VERSION = 0 36 | DATASET_PATH = "web_questions" 37 | DATASET_NAME = None 38 | 39 | def has_training_docs(self): 40 | return True 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def training_docs(self): 49 | if self._training_docs is None: 50 | self._training_docs = list(self.dataset["train"]) 51 | return self._training_docs 52 | 53 | def test_docs(self): 54 | return self.dataset["test"] 55 | 56 | def doc_to_text(self, doc): 57 | return "Question: " + doc["question"] + "\nAnswer:" 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] 64 | 65 | def doc_to_target(self, doc): 66 | # this picks one answer to be the "correct" one, despite sometimes 67 | # multiple correct answers being possible. 68 | # TODO: make sure we're actually handling multi-answer correctly 69 | return " " + doc["answers"][0] 70 | 71 | def _remove_prefixes(self, aliases): 72 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 73 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 74 | aliases.sort() 75 | ret = [aliases[0]] 76 | for alias in aliases[1:]: 77 | if not alias.startswith(ret[-1]): 78 | ret.append(alias) 79 | 80 | return ret 81 | 82 | def construct_requests(self, doc, ctx): 83 | ret = [] 84 | for alias in self._remove_prefixes(doc["answers"]): 85 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 86 | ret.append(is_prediction) 87 | return ret 88 | 89 | def process_results(self, doc, results): 90 | return {"acc": float(any(results))} 91 | 92 | def aggregation(self): 93 | return { 94 | "acc": mean, 95 | } 96 | 97 | def higher_is_better(self): 98 | return {"acc": True} 99 | -------------------------------------------------------------------------------- /lm_eval/tasks/wikitext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pointer Sentinel Mixture Models 3 | https://arxiv.org/pdf/1609.07843.pdf 4 | 5 | The WikiText language modeling dataset is a collection of over 100 million tokens 6 | extracted from the set of verified Good and Featured articles on Wikipedia. 7 | 8 | NOTE: This `Task` is based on WikiText-2. 
9 | 10 | Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ 11 | """ 12 | import re 13 | from lm_eval.base import PerplexityTask 14 | 15 | 16 | _CITATION = """ 17 | @misc{merity2016pointer, 18 | title={Pointer Sentinel Mixture Models}, 19 | author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher}, 20 | year={2016}, 21 | eprint={1609.07843}, 22 | archivePrefix={arXiv}, 23 | primaryClass={cs.CL} 24 | } 25 | """ 26 | 27 | 28 | def wikitext_detokenizer(string): 29 | # contractions 30 | string = string.replace("s '", "s'") 31 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 32 | # number separators 33 | string = string.replace(" @-@ ", "-") 34 | string = string.replace(" @,@ ", ",") 35 | string = string.replace(" @.@ ", ".") 36 | # punctuation 37 | string = string.replace(" : ", ": ") 38 | string = string.replace(" ; ", "; ") 39 | string = string.replace(" . ", ". ") 40 | string = string.replace(" ! ", "! ") 41 | string = string.replace(" ? ", "? ") 42 | string = string.replace(" , ", ", ") 43 | # double brackets 44 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 45 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 46 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 47 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 48 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 49 | # miscellaneous 50 | string = string.replace("= = = =", "====") 51 | string = string.replace("= = =", "===") 52 | string = string.replace("= =", "==") 53 | string = string.replace(" " + chr(176) + " ", chr(176)) 54 | string = string.replace(" \n", "\n") 55 | string = string.replace("\n ", "\n") 56 | string = string.replace(" N ", " 1 ") 57 | string = string.replace(" 's", "'s") 58 | 59 | return string 60 | 61 | 62 | class WikiText(PerplexityTask): 63 | VERSION = 1 64 | DATASET_PATH = "EleutherAI/wikitext_document_level" 65 | DATASET_NAME = "wikitext-2-raw-v1" 66 | 67 | def has_training_docs(self): 68 | return True 69 | 70 | def has_validation_docs(self): 71 | return True 72 | 73 | def has_test_docs(self): 74 | return True 75 | 76 | def training_docs(self): 77 | return map(self._process_doc, self.dataset["train"]) 78 | 79 | def validation_docs(self): 80 | return map(self._process_doc, self.dataset["validation"]) 81 | 82 | def test_docs(self): 83 | return map(self._process_doc, self.dataset["test"]) 84 | 85 | def _process_doc(self, doc): 86 | return doc["page"] 87 | 88 | def doc_to_target(self, doc): 89 | return wikitext_detokenizer(doc) 90 | 91 | def should_decontaminate(self): 92 | return True 93 | 94 | def count_words(self, doc): 95 | # count number of words in *original doc before detokenization* 96 | return len(re.split(r"\s+", doc)) 97 | -------------------------------------------------------------------------------- /lm_eval/tasks/winogrande.py: -------------------------------------------------------------------------------- 1 | """ 2 | WinoGrande: An Adversarial Winograd Schema Challenge at Scale 3 | https://arxiv.org/pdf/1907.10641.pdf 4 | 5 | WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge 6 | (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and 7 | robustness against the dataset-specific bias. Formulated as a fill-in-a-blank 8 | task with binary options, the goal is to choose the right option for a given 9 | sentence which requires commonsense reasoning. 
10 | 11 | NOTE: This evaluation of Winogrande uses partial evaluation as described by 12 | Trinh & Le in Simple Method for Commonsense Reasoning (2018). 13 | See: https://arxiv.org/abs/1806.02847 14 | 15 | Homepage: https://leaderboard.allenai.org/winogrande/submissions/public 16 | """ 17 | 18 | import numpy as np 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @article{sakaguchi2019winogrande, 25 | title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale}, 26 | author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, 27 | journal={arXiv preprint arXiv:1907.10641}, 28 | year={2019} 29 | } 30 | """ 31 | 32 | 33 | class Winogrande(Task): 34 | VERSION = 0 35 | DATASET_PATH = "winogrande" 36 | DATASET_NAME = "winogrande_xl" 37 | 38 | answer_to_num = {"1": 0, "2": 1} 39 | 40 | def has_training_docs(self): 41 | return True 42 | 43 | def has_validation_docs(self): 44 | return True 45 | 46 | def has_test_docs(self): 47 | return False 48 | 49 | def training_docs(self): 50 | if self._training_docs is None: 51 | self._training_docs = list(self.dataset["train"]) 52 | return self._training_docs 53 | 54 | def validation_docs(self): 55 | return self.dataset["validation"] 56 | 57 | def doc_to_text(self, doc): 58 | return self.partial_context(doc, doc["option" + doc["answer"]]) 59 | 60 | def should_decontaminate(self): 61 | return True 62 | 63 | def doc_to_decontamination_query(self, doc): 64 | return doc["sentence"] 65 | 66 | @classmethod 67 | def partial_context(cls, doc, option): 68 | # Substitute the pronoun in the sentence with the specified option 69 | # and ignore everything after. 70 | pronoun_loc = doc["sentence"].index("_") 71 | return doc["sentence"][:pronoun_loc] + option 72 | 73 | def doc_to_target(self, doc): 74 | return self.partial_target(doc) 75 | 76 | @classmethod 77 | def partial_target(cls, doc): 78 | # The target is everything after the document specified pronoun. 79 | pronoun_loc = doc["sentence"].index("_") + 1 80 | return " " + doc["sentence"][pronoun_loc:].strip() 81 | 82 | def construct_requests(self, doc, ctx): 83 | """Uses RequestFactory to construct Requests and returns an iterable of 84 | Requests which will be sent to the LM. 85 | 86 | :param doc: 87 | The document as returned from training_docs, validation_docs, or test_docs. 88 | :param ctx: str 89 | The context string, generated by fewshot_context. This includes the natural 90 | language description, as well as the few shot examples, and the question 91 | part of the document for `doc`. 92 | """ 93 | target = self.partial_target(doc) 94 | lls = [] 95 | for option in [doc["option1"], doc["option2"]]: 96 | partial_ctx = self.partial_context(doc, option) 97 | full_ctx = self.append_context(ctx, partial_ctx) 98 | lls.append(rf.loglikelihood(full_ctx, target)[0]) 99 | return lls 100 | 101 | @classmethod 102 | def append_context(cls, ctx, partial_ctx): 103 | ctx = ctx.split("\n\n") # Each fewshot context is on its own new line. 104 | ctx.pop() # Remove the correct context put in by `doc_to_text`. 105 | return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx 106 | 107 | def process_results(self, doc, results): 108 | """Take a single document and the LM results and evaluates, returning a 109 | dict where keys are the names of submetrics and values are the values of 110 | the metric for that one document 111 | 112 | :param doc: 113 | The document as returned from training_docs, validation_docs, or test_docs. 
114 | :param results: 115 | The results of the requests created in construct_requests. 116 | """ 117 | return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]} 118 | 119 | def aggregation(self): 120 | """ 121 | :returns: {str: [float] -> float} 122 | A dictionary where keys are the names of submetrics and values are 123 | functions that aggregate a list of metrics 124 | """ 125 | return {"acc": mean} 126 | 127 | def higher_is_better(self): 128 | """ 129 | :returns: {str: bool} 130 | A dictionary where keys are the names of submetrics and values are 131 | whether a higher value of the submetric is better 132 | """ 133 | return {"acc": True} 134 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "qllm" 7 | version = "0.1.0" 8 | description = "An accurate and efficient low-bitwidth PTQ method designed for LLMs (W6A6, W4A8, W4A4)." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "datasets>=2.0.0","einops","jsonlines","numexpr", 17 | "openai>=0.6.4","omegaconf>=2.2","peft>=0.2.0", 18 | "pybind11>=2.6.2","pycountry","pytablewriter", 19 | "rouge-score>=0.0.4","sacrebleu==1.5.0", 20 | "scikit-learn>=0.24.1","sqlitedict", 21 | "tqdm-multiprocess","zstandard", 22 | "accelerate", "sentencepiece", "tokenizers>=0.12.1", 23 | "torch>=2.0.0", "torchvision", 24 | "transformers==4.37.2", 25 | "texttable", 26 | "toml", "attributedict", 27 | "protobuf", 28 | "numpy", 29 | "matplotlib" 30 | ] 31 | 32 | [tool.setuptools.packages.find] 33 | exclude = ["results*", "scripts*", "examples*"] 34 | 35 | [tool.wheel] 36 | exclude = ["results*", "scripts*", "examples*"] -------------------------------------------------------------------------------- /quantize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/quantize/__init__.py -------------------------------------------------------------------------------- /quantize/int_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from quantize.quantizer import UniformAffineQuantizer 6 | 7 | 8 | class QuantLinear(nn.Module): 9 | """ 10 | Quantized module that wraps an nn.Linear layer and can run it either in full precision or with fake-quantized weights/activations. 11 | To activate quantization, use the set_quant_state function. 
12 | """ 13 | 14 | def __init__( 15 | self, 16 | org_module: nn.Linear, 17 | weight_quant_params: dict = {}, 18 | act_quant_params: dict = {}, 19 | disable_input_quant=False, 20 | ): 21 | super().__init__() 22 | self.fwd_kwargs = dict() 23 | self.fwd_func = F.linear 24 | self.weight = org_module.weight 25 | if org_module.bias is not None: 26 | self.bias = org_module.bias 27 | else: 28 | self.bias = None 29 | # de-activate the quantized forward default 30 | self.use_weight_quant = False 31 | self.use_act_quant = False 32 | self.replace_weight_with_quantized = False 33 | self.is_weight_packed = False 34 | self.mem_packer = None 35 | # initialize quantizer 36 | self.weight_quantizer = UniformAffineQuantizer( 37 | **weight_quant_params, shape=org_module.weight.shape 38 | ) 39 | if not disable_input_quant: 40 | self.act_quantizer = UniformAffineQuantizer(**act_quant_params) 41 | else: 42 | self.act_quantizer = None 43 | 44 | self.disable_input_quant = disable_input_quant 45 | self.use_temporary_parameter = False 46 | 47 | def forward(self, input: torch.Tensor): 48 | if self.use_temporary_parameter: 49 | weight = self.temp_weight 50 | bias = self.temp_bias 51 | elif self.use_weight_quant: 52 | weight = self.weight_quantizer(self.weight) 53 | bias = self.bias 54 | else: 55 | weight = self.weight 56 | bias = self.bias 57 | 58 | if self.use_act_quant and not self.disable_input_quant: 59 | input = self.act_quantizer(input) 60 | 61 | out = self.fwd_func(input, weight, bias, **self.fwd_kwargs) 62 | 63 | return out 64 | 65 | def set_quant_state(self, weight_quant: bool = False, act_quant: bool = False): 66 | self.use_weight_quant = weight_quant 67 | self.use_act_quant = act_quant 68 | 69 | def extra_repr(self): 70 | s = super().extra_repr() 71 | s += ", use_act_quant={}".format(self.use_act_quant) 72 | s += ", use_weight_quant={}".format(self.use_weight_quant) 73 | s += ", disable_input_quant={}".format(self.disable_input_quant) 74 | s += ", quant" 75 | return s 76 | -------------------------------------------------------------------------------- /quantize/int_linear_lora.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from quantize.int_linear import QuantLinear 7 | 8 | 9 | class LoRALayer: 10 | def __init__( 11 | self, 12 | r: int, 13 | lora_alpha: int, 14 | lora_dropout: float, 15 | merge_weights: bool, 16 | ): 17 | self.r = r 18 | self.lora_alpha = lora_alpha 19 | # Optional dropout 20 | if lora_dropout > 0.0: 21 | self.lora_dropout = nn.Dropout(p=lora_dropout) 22 | else: 23 | self.lora_dropout = lambda x: x 24 | # Mark the weight as unmerged 25 | self.merged = False 26 | self.merge_weights = merge_weights 27 | 28 | 29 | class LoRAQuantLinear(QuantLinear, LoRALayer): 30 | """ 31 | Quantized Module that can perform quantized convolution or normal convolution. 32 | To activate quantization, please use set_quant_state function. 
33 | """ 34 | 35 | def __init__( 36 | self, 37 | org_module: nn.Linear, 38 | weight_quant_params: dict = {}, 39 | act_quant_params: dict = {}, 40 | disable_input_quant=False, 41 | r=0, 42 | lora_alpha=1, 43 | lora_dropout=0.0, 44 | merge_weights=True, 45 | ): 46 | super().__init__( 47 | org_module, weight_quant_params, act_quant_params, disable_input_quant 48 | ) 49 | LoRALayer.__init__( 50 | self, 51 | r=r, 52 | lora_alpha=lora_alpha, 53 | lora_dropout=lora_dropout, 54 | merge_weights=merge_weights, 55 | ) 56 | 57 | if r > 0: 58 | out_features, in_features = self.weight.shape 59 | self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features))) 60 | self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r))) 61 | self.scaling = self.lora_alpha / r 62 | # Freezing the pre-trained weight matrix 63 | self.weight.requires_grad = False 64 | 65 | self.reset_lora_parameters() 66 | 67 | def reset_lora_parameters(self): 68 | if hasattr(self, "lora_A"): 69 | # initialize A the same way as the default for nn.Linear and B to zero 70 | nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) 71 | nn.init.zeros_(self.lora_B) 72 | 73 | def forward(self, input: torch.Tensor): 74 | if self.use_temporary_parameter: 75 | weight = self.temp_weight 76 | bias = self.temp_bias 77 | elif self.use_weight_quant: 78 | weight = self.weight_quantizer(self.weight) 79 | bias = self.bias 80 | else: 81 | weight = self.weight 82 | bias = self.bias 83 | 84 | if self.use_act_quant and not self.disable_input_quant: 85 | input = self.act_quantizer(input) 86 | 87 | if self.r > 0 and not self.merged and self.use_weight_quant: 88 | out = self.fwd_func( 89 | input, 90 | weight + self.lora_B @ self.lora_A * self.scaling, 91 | bias, 92 | **self.fwd_kwargs 93 | ) 94 | else: 95 | out = self.fwd_func(input, weight, bias, **self.fwd_kwargs) 96 | 97 | return out 98 | 99 | def extra_repr(self): 100 | s = super().extra_repr() 101 | s += ", use_act_quant={}".format(self.use_act_quant) 102 | s += ", use_weight_quant={}".format(self.use_weight_quant) 103 | s += ", disable_input_quant={}".format(self.disable_input_quant) 104 | s += ", lora_quant" 105 | return s 106 | -------------------------------------------------------------------------------- /quantize/int_matmul.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from quantize.quantizer import UniformAffineQuantizer 5 | 6 | 7 | class QuantMatMul(nn.Module): 8 | def __init__( 9 | self, 10 | x1_quant_params: dict = {}, 11 | x2_quant_params: dict = {}, 12 | disable_act_quant=False, 13 | matmul_func=torch.bmm, 14 | ): 15 | super().__init__() 16 | # de-activate the quantized forward default 17 | self.use_act_quant = False 18 | # initialize quantizer 19 | self.i_cluster_counts = None 20 | self.x1_quantizer = UniformAffineQuantizer(**x1_quant_params) 21 | self.x2_quantizer = UniformAffineQuantizer(**x2_quant_params) 22 | self.matmul_func = matmul_func 23 | 24 | self.disable_act_quant = disable_act_quant 25 | 26 | def set_quant_state(self, weight_quant: bool = False, act_quant: bool = False): 27 | self.use_weight_quant = weight_quant 28 | self.use_act_quant = act_quant 29 | 30 | def quant_x1(self, x1): 31 | if self.use_act_quant: 32 | x1 = self.x1_quantizer(x1) 33 | return x1 34 | 35 | def quant_x2(self, x2): 36 | if self.use_act_quant: 37 | x2 = self.x2_quantizer(x2) 38 | return x2 39 | 40 | def forward(self, x1, x2): 41 | out = self.matmul_func(x1, x2) 42 | return out 43 | 
-------------------------------------------------------------------------------- /quantize/learnable_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | """ 5 | Modify normalization layer to adapt the training of learnable equivalent transformation 6 | """ 7 | 8 | 9 | class LearnableLlamaRMSNorm(nn.Module): 10 | def __init__(self, ori_norm, eps=1e-6): 11 | """ 12 | LlamaRMSNorm is equivalent to T5LayerNorm 13 | """ 14 | super().__init__() 15 | self.ori_norm = ori_norm 16 | self.bias = torch.nn.Parameter( 17 | torch.zeros(ori_norm.weight.shape, device=ori_norm.weight.device) 18 | ) 19 | self.variance_epsilon = eps 20 | self.use_temporary_parameter = False 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) 25 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 26 | weight = self.ori_norm.weight 27 | bias = self.bias 28 | 29 | return ( 30 | (weight * hidden_states + bias).to(input_dtype) 31 | if bias is not None 32 | else (weight * hidden_states).to(input_dtype) 33 | ) 34 | -------------------------------------------------------------------------------- /quantize/quantizer.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | CLIPMIN = 1e-5 7 | 8 | 9 | def round_ste(x: torch.Tensor): 10 | """ 11 | Implement Straight-Through Estimator for rounding operation. 12 | """ 13 | return (x.round() - x).detach() + x 14 | 15 | 16 | class UniformAffineQuantizer(nn.Module): 17 | def __init__( 18 | self, 19 | n_bits: int = 8, 20 | symmetric: bool = False, 21 | per_channel_axes=[], 22 | metric="minmax", 23 | dynamic=False, 24 | dynamic_method="per_cluster", 25 | group_size=None, 26 | shape=None, 27 | use_learnable_step_size=False, 28 | **kwargs 29 | ): 30 | """ 31 | support cluster quantize 32 | dynamic_method support per_token and per_cluster 33 | """ 34 | super().__init__() 35 | self.symmetric = symmetric 36 | assert 2 <= n_bits <= 16, "bitwidth not supported" 37 | self.n_bits = n_bits 38 | self.qmin = 0 39 | self.qmax = 2 ** (n_bits) - 1 40 | self.per_channel_axes = per_channel_axes 41 | self.metric = metric 42 | self.cluster_counts = None 43 | self.cluster_dim = None 44 | 45 | self.scale = None 46 | self.zero_point = None 47 | self.round_zero_point = None 48 | 49 | self.cached_xmin = None 50 | self.cached_xmax = None 51 | self.dynamic = dynamic 52 | self.dynamic_method = dynamic_method 53 | 54 | self.deficiency = 0 55 | self.use_learnable_step_size = use_learnable_step_size 56 | 57 | if use_learnable_step_size: 58 | if group_size: 59 | dim1 = int(shape[0] * math.ceil(shape[1] / group_size)) 60 | self.deficiency = shape[-1] % group_size 61 | if self.deficiency > 0: 62 | self.deficiency = group_size - self.deficiency 63 | assert self.symmetric # support for mlc-llm quantization 64 | else: 65 | dim1 = shape[0] 66 | 67 | self.enable = True 68 | self.group_size = group_size 69 | self.is_init = False 70 | 71 | def change_n_bits(self, n_bits): 72 | self.n_bits = n_bits 73 | self.qmin = 0 74 | self.qmax = 2 ** (n_bits) - 1 75 | 76 | def fake_quant(self, x, scale, round_zero_point): 77 | if self.deficiency > 0: 78 | pad_zeros = torch.zeros( 79 | (x.shape[0], self.deficiency), dtype=x.dtype, device=x.device 80 | ) 81 | x = torch.cat((x, pad_zeros), dim=1) 82 | 83 | if self.group_size: 84 | 
assert len(x.shape) == 2, "only support linear layer now" 85 | dim1, dim2 = x.shape 86 | x = x.reshape(-1, self.group_size) 87 | x_int = round_ste(x / scale) 88 | if round_zero_point is not None: 89 | x_int = x_int.add(round_zero_point) 90 | x_int = x_int.clamp(self.qmin, self.qmax) 91 | x_dequant = x_int 92 | if round_zero_point is not None: 93 | x_dequant = x_dequant.sub(round_zero_point) 94 | x_dequant = x_dequant.mul(scale) 95 | if self.group_size: 96 | x_dequant = x_dequant.reshape(dim1, dim2) 97 | if self.deficiency > 0: 98 | x_dequant = x_dequant[:, : -self.deficiency] 99 | return x_dequant 100 | 101 | def forward(self, x: torch.Tensor): 102 | if self.n_bits >= 16 or not self.enable: 103 | return x 104 | if self.metric == "fix0to1": 105 | return x.mul_(2**self.n_bits - 1).round_().div_(2**self.n_bits - 1) 106 | 107 | if self.dynamic_method == "per_token" or self.dynamic_method == "per_channel": 108 | self.per_token_dynamic_calibration(x) 109 | else: 110 | raise NotImplementedError() 111 | 112 | x_dequant = self.fake_quant( 113 | x, self.scale.abs().clamp(min=CLIPMIN, max=1e4), self.round_zero_point 114 | ) 115 | return x_dequant 116 | 117 | def per_token_dynamic_calibration(self, x): 118 | if self.group_size: 119 | if self.deficiency == 0: 120 | x = x.reshape(-1, self.group_size) 121 | else: 122 | pad_zeros = torch.zeros( 123 | (x.shape[0], self.deficiency), dtype=x.dtype, device=x.device 124 | ) 125 | x = torch.cat((x, pad_zeros), dim=1) 126 | x = x.reshape(-1, self.group_size) 127 | reduce_shape = [-1] 128 | xmin = x.amin(reduce_shape, keepdim=True) 129 | xmax = x.amax(reduce_shape, keepdim=True) 130 | if self.symmetric: 131 | abs_max = torch.max(xmax.abs(), xmin.abs()) 132 | scale = abs_max / (2 ** (self.n_bits - 1) - 1) 133 | # scale = scale.clamp(min=CLIPMIN, max=1e4) 134 | if self.use_learnable_step_size: 135 | if not self.is_init: 136 | self.register_parameter("scale", torch.nn.Parameter(scale)) 137 | self.is_init = True 138 | else: 139 | self.scale = scale 140 | zero_point = (2 ** (self.n_bits - 1) - 1) * torch.ones_like(self.scale) 141 | else: 142 | range = xmax - xmin 143 | scale = range / (2**self.n_bits - 1) 144 | # self.scale = scale.clamp(min=CLIPMIN, max=1e4) 145 | if self.use_learnable_step_size: 146 | if not self.is_init: 147 | del self.scale 148 | self.register_parameter("scale", torch.nn.Parameter(scale)) 149 | self.is_init = True 150 | else: 151 | self.scale = scale 152 | zero_point = -(xmin) / (self.scale) 153 | self.round_zero_point = zero_point.clamp(min=-1e4, max=1e4).round() 154 | -------------------------------------------------------------------------------- /scripts/llama-13b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-13b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir 
${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-13b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-13b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-13b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-13b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-70b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 
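The w4a4/w4a8/w6a6 scripts only change the --wbits/--abits pair (and, for some model sizes, the learning rate) passed to main.py. As a rough numerical reference for what those activation bit-widths imply, the standalone sketch below mirrors the asymmetric per-token min-max fake quantization performed by UniformAffineQuantizer, simplified (no STE, grouping, or learnable step size); it is an illustration, not repository code.

import torch

def fake_quant_per_token(x: torch.Tensor, n_bits: int) -> torch.Tensor:
    # Asymmetric per-token uniform quantization, as in the minmax path of
    # UniformAffineQuantizer (simplified sketch).
    qmax = 2 ** n_bits - 1
    xmin = x.amin(-1, keepdim=True)
    xmax = x.amax(-1, keepdim=True)
    scale = (xmax - xmin).clamp(min=1e-5) / qmax
    zero_point = (-xmin / scale).round()
    x_int = (x / scale).round().add(zero_point).clamp(0, qmax)
    return (x_int - zero_point) * scale

x = torch.randn(4, 4096)
for bits in (8, 6, 4):  # the A8 / A6 / A4 settings swept by these scripts
    err = (x - fake_quant_per_token(x, bits)).abs().mean().item()
    print(f"a{bits}: mean abs quantization error {err:.4f}")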
-------------------------------------------------------------------------------- /scripts/llama-2-70b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-70b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-7b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-7b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-7b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-30b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | 
--channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-30b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-30b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-65b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-65b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-65b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-7b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 
\ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-7b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-7b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 --------------------------------------------------------------------------------
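All of the per-model scripts share the same flag set and differ only in --wbits/--abits, the learning rate, and whether --tasks is passed for zero-shot evaluation (the 7B and 13B scripts evaluate piqa, arc_easy, arc_challenge, boolq, hellaswag and winogrande, while the 30B/65B/70B scripts report perplexity only). A small launcher along the following lines could sweep the same configurations from Python; it is an illustrative sketch rather than part of the repository, and model_path / save_path are placeholders exactly as in the scripts.

import subprocess

MODEL_PATH = "model_path"  # placeholder, as in the shipped scripts
SAVE_ROOT = "save_path"    # placeholder, as in the shipped scripts

# (wbits, abits, lr) triples; the shipped scripts use 5e-4 or 1e-4 depending on model size.
CONFIGS = [(4, 4, "5e-4"), (4, 8, "5e-4"), (6, 6, "5e-4")]

for wbits, abits, lr in CONFIGS:
    cmd = [
        "python", "main.py",
        "--model", MODEL_PATH,
        "--wbits", str(wbits),
        "--abits", str(abits),
        "--eval_ppl",
        "--use_lora",
        "--output_dir", f"{SAVE_ROOT}/w{wbits}a{abits}",
        "--lr", lr,
        "--num_layer", "4",
        "--epochs", "10",
        "--plot_act_max",
        "--channel_ratio", "0.2",
        "--plot_num_additional_channels",
        "--calibrate_bs", "1",
        "--num_gpu", "1",
        "--nsamples", "128",
        "--batch_size", "1",
    ]
    subprocess.run(cmd, check=True)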