├── .gitignore ├── LICENSE ├── README.md ├── assembly └── ca_module.py ├── categories.py ├── datautils.py ├── disassembly └── cd_module.py ├── eval.py ├── imgs ├── llama_1_results.png ├── llama_2_results.png └── qllm.png ├── lm_eval ├── __init__.py ├── base.py ├── datasets │ ├── README.md │ ├── __init__.py │ ├── asdiv │ │ ├── __init__.py │ │ ├── asdiv.py │ │ └── dataset_infos.json │ ├── coqa │ │ ├── __init__.py │ │ ├── coqa.py │ │ └── dataset_infos.json │ ├── drop │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── drop.py │ ├── headqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── headqa.py │ ├── hendrycks_ethics │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_ethics.py │ ├── hendrycks_math │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_math.py │ ├── logiqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── logiqa.py │ ├── mutual │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── mutual.py │ ├── pile │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── pile.py │ ├── quac │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── quac.py │ ├── sat_analogies │ │ ├── __init__.py │ │ └── sat_analogies.py │ ├── triviaqa │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── triviaqa.py │ └── unscramble │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── unscramble.py ├── decontamination │ ├── __init__.py │ ├── archiver.py │ ├── decontaminate.py │ └── janitor.py ├── evaluator copy.py ├── evaluator.py ├── metrics.py ├── models │ ├── __init__.py │ ├── dummy.py │ ├── gpt2.py │ ├── gpt3.py │ ├── huggingface.py │ └── textsynth.py ├── tasks │ ├── __init__.py │ ├── anli.py │ ├── arc.py │ ├── arithmetic.py │ ├── asdiv.py │ ├── blimp.py │ ├── cbt.py │ ├── coqa.py │ ├── crowspairs.py │ ├── drop.py │ ├── glue.py │ ├── gsm8k.py │ ├── headqa.py │ ├── hellaswag.py │ ├── hendrycks_ethics.py │ ├── hendrycks_math.py │ ├── hendrycks_test.py │ ├── lambada.py │ ├── lambada_cloze.py │ ├── lambada_multilingual.py │ ├── logiqa.py │ ├── mathqa.py │ ├── mc_taco.py │ ├── mutual.py │ ├── naturalqs.py │ ├── openbookqa.py │ ├── pile.py │ ├── piqa.py │ ├── prost.py │ ├── pubmedqa.py │ ├── qa4mre.py │ ├── qasper.py │ ├── quac.py │ ├── race.py │ ├── sat.py │ ├── sciq.py │ ├── squad.py │ ├── storycloze.py │ ├── superglue.py │ ├── swag.py │ ├── toxigen.py │ ├── translation.py │ ├── triviaqa.py │ ├── truthfulqa.py │ ├── unscramble.py │ ├── webqs.py │ ├── wikitext.py │ ├── winogrande.py │ └── wsc273.py └── utils.py ├── main.py ├── models ├── LMClass.py ├── int_llama_layer.py ├── int_opt_layer.py ├── int_qllm_llama_layer.py ├── models_utils.py └── transformation.py ├── parallel_utils.py ├── pyproject.toml ├── quantize ├── __init__.py ├── int_linear.py ├── int_linear_lora.py ├── int_matmul.py ├── learnable_norm.py ├── qllm.py └── quantizer.py ├── reassembly └── cr_module.py ├── scripts ├── llama-13b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-2-13b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-2-70b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-2-7b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-30b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── llama-65b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh └── llama-7b │ ├── w4a4.sh │ ├── w4a8.sh │ └── w6a6.sh ├── train_utils.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | build 3 | dist 4 | *.txt 5 | *.pt 6 | *egg-info* 7 | tmp 8 | output 9 | *.pyc 10 | .idea 11 | *.zip 12 | cache/ 13 | temp/ 14 | checkpoints/ 15 | 
huggingface/ 16 | log/ 17 | act_scales/ 18 | act_shifts/ 19 | temp.sh 20 | output/ 21 | .vscode/ 22 | plot/ 23 | wandb/ -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models (ICLR 2024) 2 | 3 | [![License](https://img.shields.io/badge/License-Apache_2.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 4 | [![arXiv](https://img.shields.io/badge/QLLM-2310.08041-b31b1b.svg)](https://arxiv.org/abs/2310.08041) 5 | 6 | This is the official PyTorch implementation of [QLLM: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models](https://arxiv.org/abs/2310.08041). 7 | 8 | By [Jing Liu](https://jing-liu.com/), [Ruihao Gong](https://xhplus.github.io/), [Xiuying Wei](https://wimh966.github.io/), [Zhiwei Dong](https://zwdong.com.cn/), [Jianfei Cai](https://jianfei-cai.github.io/), and [Bohan Zhuang](https://bohanzhuang.github.io/). 9 | 10 | ![qllm](imgs/qllm.png) 11 | 12 | We propose QLLM, an accurate and efficient low-bitwidth post-training quantization method designed for LLMs. 13 | 14 | ## 📰 News 15 | - [10-03-2024] Released the code! 🌟 16 | - [17-01-2024] QLLM is accepted at ICLR 2024! 👏 17 | 18 | ## 📖 Contents 19 | - [Install](#🛠-install) 20 | - [Usage](#⚙️-usage) 21 | - [Results](#📋-results) 22 | - [Citation](#📝-citation) 23 | - [License](#🧾-license) 24 | - [Acknowledgement](#🙏-acknowledgement) 25 | 26 | ## 🛠 Install 27 | ``` 28 | conda create -n qllm python=3.10 -y 29 | conda activate qllm 30 | git clone https://github.com/ModelTC/QLLM 31 | cd QLLM 32 | pip install --upgrade pip 33 | pip install -e . 34 | ``` 35 | 36 | ## ⚙️ Usage 37 | We provide the training scripts in the `scripts` folder. For example, to perform W4A8 quantization for LLaMA-7B, run 38 | ``` 39 | sh scripts/llama-7b/w4a8.sh 40 | ``` 41 | Remember to change the model path `model` and the output path `output_dir` in the script. 42 | 43 | ## 📋 Results 44 | * QLLM achieves SoTA performance in weight-activation quantization 45 | 46 | ![weight_activation_llama_1](imgs/llama_1_results.png) 47 | ![weight_activation_llama_2](imgs/llama_2_results.png) 48 | 49 | ## 📝 Citation 50 | If you find our `QLLM` useful in your research, please consider citing the following paper: 51 | ``` 52 | @inproceedings{liu2024qllm, 53 | title = {{QLLM}: Accurate and Efficient Low-Bitwidth Quantization for Large Language Models}, 54 | author = {Liu, Jing and Gong, Ruihao and Wei, Xiuying and Dong, Zhiwei and Cai, Jianfei and Zhuang, Bohan}, 55 | booktitle = {International Conference on Learning Representations (ICLR)}, 56 | year = {2024}, 57 | } 58 | ``` 59 | 60 | ## 🧾 License 61 | This repository is released under the Apache 2.0 license as found in the [LICENSE](./LICENSE) file. 62 | 63 | ## 🙏 Acknowledgement 64 | This repository is built upon [OmniQuant](https://github.com/OpenGVLab/OmniQuant). We thank the authors for open-sourcing their code.
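As an illustrative aside (not part of the repository files), the channel disassembly used by QLLM and implemented in `disassembly/cd_module.py` can be sketched in a few lines of PyTorch: an outlier input channel is split into k copies scaled by 1/k, and the matching weight column is duplicated, so the layer output is preserved exactly. The tensor sizes, the outlier index, and the split factor k below are arbitrary toy values chosen for the sketch.

```
import torch

torch.manual_seed(0)
x = torch.randn(4, 8)    # toy activations: 4 tokens, 8 input channels
w = torch.randn(16, 8)   # toy linear weight: 16 output channels, 8 input channels

outlier, k = 3, 4        # assume channel 3 is an outlier; split it into 4 sub-channels

# Disassemble the activation channel into k copies, each scaled by 1/k.
x_split = torch.cat(
    [x[:, :outlier], x[:, outlier:outlier + 1].repeat(1, k) / k, x[:, outlier + 1:]], dim=1
)
# Duplicate the matching weight column so the matrix product is unchanged.
w_split = torch.cat(
    [w[:, :outlier], w[:, outlier:outlier + 1].repeat(1, k), w[:, outlier + 1:]], dim=1
)

print(torch.allclose(x @ w.t(), x_split @ w_split.t(), atol=1e-6))  # -> True
```

`CDModule.forward` realizes the same idea with `torch.repeat_interleave` and precomputed per-channel scaling factors, after `find_outlier_channels` has selected a threshold that keeps the number of additional channels within the budget set by `channel_ratio`.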
-------------------------------------------------------------------------------- /categories.py: -------------------------------------------------------------------------------- 1 | subcategories = { 2 | "abstract_algebra": ["math"], 3 | "anatomy": ["health"], 4 | "astronomy": ["physics"], 5 | "business_ethics": ["business"], 6 | "clinical_knowledge": ["health"], 7 | "college_biology": ["biology"], 8 | "college_chemistry": ["chemistry"], 9 | "college_computer_science": ["computer science"], 10 | "college_mathematics": ["math"], 11 | "college_medicine": ["health"], 12 | "college_physics": ["physics"], 13 | "computer_security": ["computer science"], 14 | "conceptual_physics": ["physics"], 15 | "econometrics": ["economics"], 16 | "electrical_engineering": ["engineering"], 17 | "elementary_mathematics": ["math"], 18 | "formal_logic": ["philosophy"], 19 | "global_facts": ["other"], 20 | "high_school_biology": ["biology"], 21 | "high_school_chemistry": ["chemistry"], 22 | "high_school_computer_science": ["computer science"], 23 | "high_school_european_history": ["history"], 24 | "high_school_geography": ["geography"], 25 | "high_school_government_and_politics": ["politics"], 26 | "high_school_macroeconomics": ["economics"], 27 | "high_school_mathematics": ["math"], 28 | "high_school_microeconomics": ["economics"], 29 | "high_school_physics": ["physics"], 30 | "high_school_psychology": ["psychology"], 31 | "high_school_statistics": ["math"], 32 | "high_school_us_history": ["history"], 33 | "high_school_world_history": ["history"], 34 | "human_aging": ["health"], 35 | "human_sexuality": ["culture"], 36 | "international_law": ["law"], 37 | "jurisprudence": ["law"], 38 | "logical_fallacies": ["philosophy"], 39 | "machine_learning": ["computer science"], 40 | "management": ["business"], 41 | "marketing": ["business"], 42 | "medical_genetics": ["health"], 43 | "miscellaneous": ["other"], 44 | "moral_disputes": ["philosophy"], 45 | "moral_scenarios": ["philosophy"], 46 | "nutrition": ["health"], 47 | "philosophy": ["philosophy"], 48 | "prehistory": ["history"], 49 | "professional_accounting": ["other"], 50 | "professional_law": ["law"], 51 | "professional_medicine": ["health"], 52 | "professional_psychology": ["psychology"], 53 | "public_relations": ["politics"], 54 | "security_studies": ["politics"], 55 | "sociology": ["culture"], 56 | "us_foreign_policy": ["politics"], 57 | "virology": ["health"], 58 | "world_religions": ["philosophy"], 59 | } 60 | 61 | categories = { 62 | "STEM": [ 63 | "physics", 64 | "chemistry", 65 | "biology", 66 | "computer science", 67 | "math", 68 | "engineering", 69 | ], 70 | "humanities": ["history", "philosophy", "law"], 71 | "social sciences": ["politics", "culture", "economics", "geography", "psychology"], 72 | "other (business, health, misc.)": ["other", "business", "health"], 73 | } 74 | -------------------------------------------------------------------------------- /disassembly/cd_module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class CDModule(nn.Module): 6 | def __init__(self, channel_ratio): 7 | super().__init__() 8 | self.channel_ratio = channel_ratio 9 | self.outlier_channel_idx = None 10 | self.num_disassembly = None 11 | self.scaling_factors = None 12 | self.num_additional_channels = 0 13 | 14 | def find_threshold_uniform(self, x_max): 15 | x_max = x_max.float() 16 | num_channels = x_max.numel() 17 | channel_constraint = int(num_channels * self.channel_ratio) 18 | 
channelmax_max = x_max.max() 19 | channelmax_min = x_max.min() 20 | 21 | th = channelmax_max 22 | step_num = max(100, int(channelmax_max / 0.5)) 23 | step = (channelmax_max - channelmax_min) / step_num 24 | while th >= channelmax_min: 25 | num_disassembly = torch.ceil(x_max / th) 26 | num_disassembly = torch.clamp(num_disassembly, min=1.0) 27 | num_additional_channels = num_disassembly.int().sum().item() - num_channels 28 | if num_additional_channels > channel_constraint: 29 | th += step 30 | break 31 | else: 32 | th -= step 33 | print("Find threshold {} using uniform method".format(th)) 34 | return th 35 | 36 | def find_outlier_channels(self, x_min, x_max): 37 | with torch.no_grad(): 38 | x_max = torch.maximum(x_min.abs(), x_max) 39 | th = self.find_threshold_uniform(x_max) 40 | outlier_channel_idx = (x_max > th).nonzero().view(-1) 41 | num_disassembly = torch.ceil(x_max / th) 42 | num_disassembly = torch.clamp(num_disassembly, min=1.0) 43 | scaling_factors = (1.0 / num_disassembly).repeat_interleave(num_disassembly.int()) 44 | if len(outlier_channel_idx) != 0: 45 | del self.outlier_channel_idx 46 | del self.num_disassembly 47 | del self.scaling_factors 48 | self.register_buffer("outlier_channel_idx", outlier_channel_idx) 49 | self.register_buffer("num_disassembly", num_disassembly) 50 | self.register_buffer("scaling_factors", scaling_factors) 51 | 52 | def forward(self, x): 53 | if self.outlier_channel_idx is not None: 54 | if x.ndim == 2: 55 | x = x.unsqueeze(0) 56 | B, N, C = x.shape 57 | x = x.view(B * N, C) 58 | x = torch.repeat_interleave(x, self.num_disassembly.int(), dim=1) 59 | x = x * self.scaling_factors.unsqueeze(0) 60 | C = x.shape[1] 61 | x = x.view(B, N, C) 62 | return x 63 | -------------------------------------------------------------------------------- /imgs/llama_1_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/imgs/llama_1_results.png -------------------------------------------------------------------------------- /imgs/llama_2_results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/imgs/llama_2_results.png -------------------------------------------------------------------------------- /imgs/qllm.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/imgs/qllm.png -------------------------------------------------------------------------------- /lm_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/README.md: -------------------------------------------------------------------------------- 1 | # datasets 2 | 3 | This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. 
For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not. 4 | 5 | __NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path. 6 | 7 | 8 | __WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed. 9 | -------------------------------------------------------------------------------- /lm_eval/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/asdiv/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/asdiv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ASDIV dataset.""" 15 | 16 | 17 | import os 18 | import xml.etree.ElementTree as ET 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | _DESCRIPTION = """\ 35 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 36 | patterns and problem types) English math word problem (MWP) corpus for evaluating 37 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 38 | remain limited either in language usage patterns or in problem types. We thus present 39 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 40 | types taught in elementary school. Each MWP is annotated with its problem type and grade 41 | level (for indicating the level of difficulty). 
42 | """ 43 | 44 | _HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset" 45 | 46 | # TODO: Add the licence for the dataset here if you can find it 47 | _LICENSE = "" 48 | 49 | _URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip" 50 | 51 | 52 | class ASDiv(datasets.GeneratorBasedBuilder): 53 | """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers""" 54 | 55 | VERSION = datasets.Version("0.0.1") 56 | 57 | BUILDER_CONFIGS = [ 58 | datasets.BuilderConfig( 59 | name="asdiv", 60 | version=VERSION, 61 | description="A diverse corpus for evaluating and developing english math word problem solvers", 62 | ) 63 | ] 64 | 65 | def _info(self): 66 | features = datasets.Features( 67 | { 68 | "body": datasets.Value("string"), 69 | "question": datasets.Value("string"), 70 | "solution_type": datasets.Value("string"), 71 | "answer": datasets.Value("string"), 72 | "formula": datasets.Value("string"), 73 | } 74 | ) 75 | return datasets.DatasetInfo( 76 | description=_DESCRIPTION, 77 | features=features, 78 | homepage=_HOMEPAGE, 79 | license=_LICENSE, 80 | citation=_CITATION, 81 | ) 82 | 83 | def _split_generators(self, dl_manager): 84 | urls = _URLS 85 | data_dir = dl_manager.download_and_extract(urls) 86 | base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50" 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.VALIDATION, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "filepath": os.path.join( 93 | data_dir, base_filepath, "dataset", "ASDiv.xml" 94 | ), 95 | "split": datasets.Split.VALIDATION, 96 | }, 97 | ), 98 | ] 99 | 100 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 101 | def _generate_examples(self, filepath, split): 102 | tree = ET.parse(filepath) 103 | root = tree.getroot() 104 | for key, problem in enumerate(root.iter("Problem")): 105 | yield key, { 106 | "body": problem.find("Body").text, 107 | "question": problem.find("Question").text, 108 | "solution_type": problem.find("Solution-Type").text, 109 | "answer": problem.find("Answer").text, 110 | "formula": problem.find("Formula").text, 111 | } 112 | -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"asdiv": {"description": "ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. 
Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/coqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. 
Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/drop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/drop/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/drop/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"drop": 
{"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/headqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/headqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/headqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"es": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. 
The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "es", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1196021, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1169819, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 556924, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2922764, "size_in_bytes": 82288266}, "en": {"description": "HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to access a specialized position in the\nSpanish healthcare system, and are challenging even for highly specialized humans. 
They are designed by the Ministerio\nde Sanidad, Consumo y Bienestar Social.\nThe dataset contains questions about the following topics: medicine, nursing, psychology, chemistry, pharmacology and biology.\n", "citation": "@inproceedings{vilares-gomez-rodriguez-2019-head,\n title = \"{HEAD}-{QA}: A Healthcare Dataset for Complex Reasoning\",\n author = \"Vilares, David and\n G{'o}mez-Rodr{'i}guez, Carlos\",\n booktitle = \"Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics\",\n month = jul,\n year = \"2019\",\n address = \"Florence, Italy\",\n publisher = \"Association for Computational Linguistics\",\n url = \"https://www.aclweb.org/anthology/P19-1092\",\n doi = \"10.18653/v1/P19-1092\",\n pages = \"960--966\",\n abstract = \"We present HEAD-QA, a multi-choice question answering testbed to encourage research on complex reasoning. The questions come from exams to access a specialized position in the Spanish healthcare system, and are challenging even for highly specialized humans. We then consider monolingual (Spanish) and cross-lingual (to English) experiments with information retrieval and neural techniques. We show that: (i) HEAD-QA challenges current methods, and (ii) the results lag well behind human performance, demonstrating its usefulness as a benchmark for future work.\",\n}\n", "homepage": "https://aghie.github.io/head-qa/", "license": "MIT License", "features": {"name": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}, "category": {"dtype": "string", "id": null, "_type": "Value"}, "qid": {"dtype": "int32", "id": null, "_type": "Value"}, "qtext": {"dtype": "string", "id": null, "_type": "Value"}, "ra": {"dtype": "int32", "id": null, "_type": "Value"}, "answers": [{"aid": {"dtype": "int32", "id": null, "_type": "Value"}, "atext": {"dtype": "string", "id": null, "_type": "Value"}}]}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "head_qa", "config_name": "en", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 1123151, "num_examples": 2657, "dataset_name": "head_qa"}, "test": {"name": "test", "num_bytes": 1097349, "num_examples": 2742, "dataset_name": "head_qa"}, "validation": {"name": "validation", "num_bytes": 523462, "num_examples": 1366, "dataset_name": "head_qa"}}, "download_checksums": {"https://drive.google.com/uc?export=download&confirm=t&id=1a_95N5zQQoUCq8IBNVZgziHbeM-QxG2t": {"num_bytes": 79365502, "checksum": "6ec29a3f55153d167f0bdf05395558919ba0b1df9c63e79ffceda2a09884ad8b"}}, "download_size": 79365502, "post_processing_size": null, "dataset_size": 2743962, "size_in_bytes": 82109464}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_ethics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/hendrycks_ethics/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_math/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/hendrycks_math/__init__.py -------------------------------------------------------------------------------- 
/lm_eval/datasets/hendrycks_math/hendrycks_math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """MATH dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | import pathlib 20 | 21 | import datasets 22 | 23 | 24 | _CITATION = """\ 25 | @article{hendrycksmath2021, 26 | title={Measuring Mathematical Problem Solving With the Math Dataset}, 27 | author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, 28 | journal={NeurIPS}, 29 | year={2021} 30 | } 31 | """ 32 | 33 | _DESCRIPTION = """\ 34 | MATH is a dataset of 12,500 challenging competition mathematics problems. Each 35 | problem in Math has a full step-by-step solution which can be used to teach 36 | models to generate answer derivations and explanations. 37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/hendrycks/math" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" 45 | 46 | _NAMES = [ 47 | "algebra", 48 | "counting_and_probability", 49 | "geometry", 50 | "intermediate_algebra", 51 | "number_theory", 52 | "prealgebra", 53 | "precalculus", 54 | ] 55 | 56 | 57 | class HendrycksMath(datasets.GeneratorBasedBuilder): 58 | """MATH is a dataset of 12,500 challenging competition mathematics problems.""" 59 | 60 | VERSION = datasets.Version("0.0.1") 61 | 62 | BUILDER_CONFIGS = [ 63 | datasets.BuilderConfig(name=name, version=version, description=name) 64 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 65 | ] 66 | 67 | def _info(self): 68 | features = datasets.Features( 69 | { 70 | "problem": datasets.Value("string"), 71 | "level": datasets.Value("string"), 72 | "type": datasets.Value("string"), 73 | "solution": datasets.Value("string"), 74 | } 75 | ) 76 | return datasets.DatasetInfo( 77 | description=_DESCRIPTION, 78 | features=features, 79 | homepage=_HOMEPAGE, 80 | license=_LICENSE, 81 | citation=_CITATION, 82 | ) 83 | 84 | def _split_generators(self, dl_manager): 85 | urls = _URLS 86 | data_dir = dl_manager.download_and_extract(urls) 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.TRAIN, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "basepath": os.path.join( 93 | data_dir, "MATH", "train", self.config.name 94 | ), 95 | "split": "train", 96 | }, 97 | ), 98 | datasets.SplitGenerator( 99 | name=datasets.Split.TEST, 100 | # These kwargs will be passed to _generate_examples 101 | gen_kwargs={ 102 | "basepath": os.path.join( 103 | data_dir, "MATH", "test", self.config.name 104 | ), 105 | "split": "test", 106 | }, 107 | ), 108 | ] 109 | 110 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 111 | def _generate_examples(self, 
basepath, split): 112 | key = 0 113 | for file in sorted(pathlib.Path(basepath).iterdir()): 114 | with open(file, "r", encoding="utf-8") as f: 115 | data = json.load(f) 116 | yield key, { 117 | "problem": data["problem"], 118 | "level": data["level"], 119 | "type": data["type"], 120 | "solution": data["solution"], 121 | } 122 | key += 1 123 | -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/logiqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/logiqa.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """LogiQA dataset.""" 15 | 16 | 17 | import datasets 18 | 19 | 20 | _CITATION = """\ 21 | @misc{liu2020logiqa, 22 | title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 23 | author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, 24 | year={2020}, 25 | eprint={2007.08124}, 26 | archivePrefix={arXiv}, 27 | primaryClass={cs.CL} 28 | } 29 | """ 30 | 31 | _DESCRIPTION = """\ 32 | LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA 33 | instances, covering multiple types of deductive reasoning. Results show that state- 34 | of-the-art neural models perform by far worse than human ceiling. The dataset can 35 | also serve as a benchmark for reinvestigating logical AI under the deep learning 36 | NLP setting. 37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/lgw863/LogiQA-dataset" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = { 45 | "train": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt", 46 | "validation": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt", 47 | "test": "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt", 48 | } 49 | 50 | 51 | class Logiqa(datasets.GeneratorBasedBuilder): 52 | """LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning""" 53 | 54 | VERSION = datasets.Version("0.0.1") 55 | 56 | BUILDER_CONFIGS = [ 57 | datasets.BuilderConfig( 58 | name="logiqa", version=VERSION, description="The LogiQA dataset." 
59 | ), 60 | ] 61 | 62 | def _info(self): 63 | features = datasets.Features( 64 | { 65 | "label": datasets.Value("string"), 66 | "context": datasets.Value("string"), 67 | "question": datasets.Value("string"), 68 | "options": datasets.features.Sequence(datasets.Value("string")), 69 | } 70 | ) 71 | return datasets.DatasetInfo( 72 | description=_DESCRIPTION, 73 | features=features, 74 | homepage=_HOMEPAGE, 75 | license=_LICENSE, 76 | citation=_CITATION, 77 | ) 78 | 79 | def _split_generators(self, dl_manager): 80 | urls = { 81 | "train": _URLS["train"], 82 | "test": _URLS["test"], 83 | "validation": _URLS["validation"], 84 | } 85 | data_dir = dl_manager.download_and_extract(urls) 86 | return [ 87 | datasets.SplitGenerator( 88 | name=datasets.Split.TRAIN, 89 | # These kwargs will be passed to _generate_examples 90 | gen_kwargs={ 91 | "filepath": data_dir["train"], 92 | "split": "train", 93 | }, 94 | ), 95 | datasets.SplitGenerator( 96 | name=datasets.Split.TEST, 97 | # These kwargs will be passed to _generate_examples 98 | gen_kwargs={"filepath": data_dir["test"], "split": "test"}, 99 | ), 100 | datasets.SplitGenerator( 101 | name=datasets.Split.VALIDATION, 102 | # These kwargs will be passed to _generate_examples 103 | gen_kwargs={ 104 | "filepath": data_dir["validation"], 105 | "split": "validation", 106 | }, 107 | ), 108 | ] 109 | 110 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 111 | def _generate_examples(self, filepath, split): 112 | def normalize(text): 113 | return text.replace(".", ". ").strip() 114 | 115 | with open(filepath, encoding="utf-8") as f: 116 | data = f.read().strip().split("\n\n") 117 | for key, row in enumerate(data): 118 | example = row.split("\n") 119 | yield key, { 120 | "label": example[0].strip(), 121 | "context": normalize(example[1]), 122 | "question": normalize(example[2]), 123 | "options": [normalize(option[2:]) for option in example[3:]], 124 | } 125 | -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/mutual/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": 
"mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/mutual.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """MuTual dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | from pathlib import Path 20 | 21 | import datasets 22 | 23 | 24 | _CITATION = """\ 25 | @inproceedings{mutual, 26 | title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", 27 | author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , 28 | booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", 29 | year = "2020", 30 | publisher = "Association for Computational Linguistics", 31 | } 32 | """ 33 | 34 | _DESCRIPTION = """\ 35 | MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is 36 | modified from Chinese high school English listening comprehension test data. 37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/Nealcly/MuTual" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = "https://github.com/Nealcly/MuTual/archive/master.zip" 45 | 46 | 47 | class Mutual(datasets.GeneratorBasedBuilder): 48 | """MuTual: A Dataset for Multi-Turn Dialogue Reasoning""" 49 | 50 | VERSION = datasets.Version("0.0.1") 51 | 52 | BUILDER_CONFIGS = [ 53 | datasets.BuilderConfig( 54 | name="mutual", version=VERSION, description="The MuTual dataset." 55 | ), 56 | datasets.BuilderConfig( 57 | name="mutual_plus", 58 | version=VERSION, 59 | description="MuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", 60 | ), 61 | ] 62 | 63 | def _info(self): 64 | features = datasets.Features( 65 | { 66 | "answers": datasets.Value("string"), 67 | "options": datasets.features.Sequence(datasets.Value("string")), 68 | "article": datasets.Value("string"), 69 | "id": datasets.Value("string"), 70 | } 71 | ) 72 | return datasets.DatasetInfo( 73 | description=f"{_DESCRIPTION}\n{self.config.description}", 74 | features=features, 75 | homepage=_HOMEPAGE, 76 | license=_LICENSE, 77 | citation=_CITATION, 78 | ) 79 | 80 | def _split_generators(self, dl_manager): 81 | urls = _URLS 82 | data_dir = dl_manager.download_and_extract(urls) 83 | return [ 84 | datasets.SplitGenerator( 85 | name=datasets.Split.TRAIN, 86 | # These kwargs will be passed to _generate_examples 87 | gen_kwargs={ 88 | "basepath": os.path.join( 89 | data_dir, "MuTual-master", "data", self.config.name, "train" 90 | ), 91 | "split": "train", 92 | }, 93 | ), 94 | datasets.SplitGenerator( 95 | name=datasets.Split.TEST, 96 | # These kwargs will be passed to _generate_examples 97 | gen_kwargs={ 98 | "basepath": os.path.join( 99 | data_dir, "MuTual-master", "data", self.config.name, "test" 100 | ), 101 | "split": "test", 102 | }, 103 | ), 104 | datasets.SplitGenerator( 105 | name=datasets.Split.VALIDATION, 106 | # These kwargs will be passed to _generate_examples 107 | gen_kwargs={ 108 | "basepath": os.path.join( 109 | data_dir, "MuTual-master", "data", self.config.name, "dev" 110 | ), 111 | "split": "dev", 112 | }, 113 | ), 114 | ] 115 | 116 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 117 | def _generate_examples(self, basepath, split): 118 | # TODO: This method handles input defined in _split_generators to yield (key, example) tuples from the dataset. 119 | # The `key` is for legacy reasons (tfds) and is not important in itself, but must be unique for each example. 
120 | key = 0 121 | for file in sorted(Path(basepath).iterdir()): 122 | if file.suffix != ".txt": 123 | continue 124 | with open(file, "r", encoding="utf-8") as f: 125 | data_str = f.read() 126 | # Ignore the occasional empty file. 127 | if not data_str: 128 | continue 129 | data = json.loads(data_str) 130 | yield key, { 131 | "answers": data["answers"], 132 | "options": data["options"], 133 | "article": data["article"], 134 | "id": data["id"], 135 | } 136 | key += 1 137 | -------------------------------------------------------------------------------- /lm_eval/datasets/pile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/pile/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/pile/pile.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """Pile dataset.""" 15 | 16 | 17 | import json 18 | 19 | import datasets 20 | 21 | 22 | _CITATION = """\ 23 | @article{pile, 24 | title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling}, 25 | author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor}, 26 | journal={arXiv preprint arXiv:2101.00027}, 27 | year={2020} 28 | } 29 | """ 30 | 31 | _DESCRIPTION = """\ 32 | The Pile is a 825 GiB diverse, open source language modeling data set that consists 33 | of 22 smaller, high-quality datasets combined together. To score well on Pile 34 | BPB (bits per byte), a model must be able to understand many disparate domains 35 | including books, github repositories, webpages, chat logs, and medical, physics, 36 | math, computer science, and philosophy papers. 
37 | """ 38 | 39 | _HOMEPAGE = "https://pile.eleuther.ai/" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = { 45 | "validation": "https://the-eye.eu/public/AI/pile/val.jsonl.zst", 46 | "test": "https://the-eye.eu/public/AI/pile/test.jsonl.zst", 47 | } 48 | 49 | _NAMES = { 50 | "pile_arxiv": "ArXiv", 51 | "pile_books3": "Books3", 52 | "pile_bookcorpus2": "BookCorpus2", 53 | "pile_dm-mathematics": "DM Mathematics", 54 | "pile_enron": "Enron Emails", 55 | "pile_europarl": "EuroParl", 56 | "pile_freelaw": "FreeLaw", 57 | "pile_github": "Github", 58 | "pile_gutenberg": "Gutenberg (PG-19)", 59 | "pile_hackernews": "HackerNews", 60 | "pile_nih-exporter": "NIH ExPorter", 61 | "pile_opensubtitles": "OpenSubtitles", 62 | "pile_openwebtext2": "OpenWebText2", 63 | "pile_philpapers": "PhilPapers", 64 | "pile_pile-cc": "Pile-CC", 65 | "pile_pubmed-abstracts": "PubMed Abstracts", 66 | "pile_pubmed-central": "PubMed Central", 67 | "pile_stackexchange": "StackExchange", 68 | "pile_upsto": "USPTO Backgrounds", 69 | "pile_ubuntu-irc": "Ubuntu IRC", 70 | "pile_wikipedia": "Wikipedia (en)", 71 | "pile_youtubesubtitles": "YoutubeSubtitles", 72 | } 73 | 74 | 75 | class Pile(datasets.GeneratorBasedBuilder): 76 | """The Pile is a 825 GiB diverse, open source language modeling dataset.""" 77 | 78 | VERSION = datasets.Version("0.0.1") 79 | 80 | BUILDER_CONFIGS = [ 81 | datasets.BuilderConfig(name=name, version=version, description=_NAMES[name]) 82 | for name, version in zip(_NAMES.keys(), [VERSION] * len(_NAMES)) 83 | ] 84 | 85 | def _info(self): 86 | features = datasets.Features( 87 | { 88 | "text": datasets.Value("string"), 89 | } 90 | ) 91 | return datasets.DatasetInfo( 92 | description=f"{_DESCRIPTION}\n{self.config.description}", 93 | features=features, 94 | homepage=_HOMEPAGE, 95 | license=_LICENSE, 96 | citation=_CITATION, 97 | ) 98 | 99 | def _split_generators(self, dl_manager): 100 | urls = {"validation": _URLS["validation"], "test": _URLS["test"]} 101 | data_dir = dl_manager.download_and_extract(urls) 102 | return [ 103 | datasets.SplitGenerator( 104 | name=datasets.Split.TEST, 105 | # These kwargs will be passed to _generate_examples 106 | gen_kwargs={"filepath": data_dir["test"], "split": "test"}, 107 | ), 108 | datasets.SplitGenerator( 109 | name=datasets.Split.VALIDATION, 110 | # These kwargs will be passed to _generate_examples 111 | gen_kwargs={ 112 | "filepath": data_dir["validation"], 113 | "split": "validation", 114 | }, 115 | ), 116 | ] 117 | 118 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 119 | def _generate_examples(self, filepath, split): 120 | with open(filepath, encoding="utf-8") as f: 121 | for key, row in enumerate(f): 122 | data = json.loads(row) 123 | if data["meta"]["pile_set_name"] == _NAMES[self.config.name]: 124 | yield key, { 125 | "text": data["text"], 126 | } 127 | -------------------------------------------------------------------------------- /lm_eval/datasets/quac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/quac/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/quac/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, 
understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/quac/quac.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # TODO: Address all TODOs and remove all explanatory comments 15 | """QuAC dataset.""" 16 | 17 | 18 | import json 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @article{choi2018quac, 25 | title={Quac: Question answering in context}, 26 | author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, 27 | journal={arXiv preprint arXiv:1808.07036}, 28 | year={2018} 29 | } 30 | """ 31 | 32 | _DESCRIPTION = """\ 33 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 34 | participating in information seeking dialog. 
Data instances consist of an interactive 35 | dialog between two crowd workers: (1) a student who poses a sequence of freeform 36 | questions to learn as much as possible about a hidden Wikipedia text, and (2) 37 | a teacher who answers the questions by providing short excerpts (spans) from the text. 38 | """ 39 | 40 | _HOMEPAGE = "https://quac.ai/" 41 | 42 | # TODO: Add the licence for the dataset here if you can find it 43 | _LICENSE = "" 44 | 45 | _URLS = { 46 | "train": "https://s3.amazonaws.com/my89public/quac/train_v0.2.json", 47 | "validation": "https://s3.amazonaws.com/my89public/quac/val_v0.2.json", 48 | } 49 | 50 | 51 | class Quac(datasets.GeneratorBasedBuilder): 52 | """Question Answering in Context (QuAC) is a dataset for modeling, understanding, and participating in information seeking dialog.""" 53 | 54 | VERSION = datasets.Version("1.1.0") 55 | 56 | BUILDER_CONFIGS = [ 57 | datasets.BuilderConfig( 58 | name="quac", version=VERSION, description="The QuAC dataset" 59 | ), 60 | ] 61 | 62 | def _info(self): 63 | features = datasets.Features( 64 | { 65 | "title": datasets.Value("string"), 66 | "section_title": datasets.Value("string"), 67 | "paragraph": datasets.Value("string"), 68 | "question": datasets.Value("string"), 69 | "answer": datasets.Value("string"), 70 | } 71 | ) 72 | return datasets.DatasetInfo( 73 | description=_DESCRIPTION, 74 | features=features, 75 | homepage=_HOMEPAGE, 76 | license=_LICENSE, 77 | citation=_CITATION, 78 | ) 79 | 80 | def _split_generators(self, dl_manager): 81 | urls = {"train": _URLS["train"], "validation": _URLS["validation"]} 82 | data_dir = dl_manager.download_and_extract(urls) 83 | return [ 84 | datasets.SplitGenerator( 85 | name=datasets.Split.TRAIN, 86 | # These kwargs will be passed to _generate_examples 87 | gen_kwargs={ 88 | "filepath": data_dir["train"], 89 | "split": "train", 90 | }, 91 | ), 92 | datasets.SplitGenerator( 93 | name=datasets.Split.VALIDATION, 94 | # These kwargs will be passed to _generate_examples 95 | gen_kwargs={"filepath": data_dir["validation"], "split": "validation"}, 96 | ), 97 | ] 98 | 99 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 100 | def _generate_examples(self, filepath, split): 101 | with open(filepath, encoding="utf-8") as f: 102 | data = json.load(f)["data"] 103 | key = 0 104 | for row in data: 105 | paragraph = row["paragraphs"][0]["context"].replace("CANNOTANSWER", "") 106 | qas = row["paragraphs"][0]["qas"] 107 | qa_pairs = [(qa["question"], qa["answers"][0]["text"]) for qa in qas] 108 | for (question, answer) in qa_pairs: 109 | # Yields examples as (key, example) tuples 110 | yield key, { 111 | "title": row["title"], 112 | "section_title": row["section_title"], 113 | "paragraph": paragraph, 114 | "question": question, 115 | "answer": answer, 116 | } 117 | key += 1 118 | -------------------------------------------------------------------------------- /lm_eval/datasets/sat_analogies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/sat_analogies/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/sat_analogies/sat_analogies.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """SAT Analogy Questions dataset.""" 15 | 16 | 17 | import os 18 | 19 | import datasets 20 | 21 | 22 | _CITATION = """\ 23 | @article{article, 24 | author = {Turney, Peter}, 25 | year = {2006}, 26 | month = {09}, 27 | pages = {379-416}, 28 | title = {Similarity of Semantic Relations}, 29 | volume = {32}, 30 | journal = {Computational Linguistics}, 31 | doi = {10.1162/coli.2006.32.3.379} 32 | } 33 | """ 34 | 35 | _DESCRIPTION = """\ 36 | SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 37 | multiple-choice analogy questions; 5 choices per question. 38 | """ 39 | 40 | _HOMEPAGE = "https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art)" 41 | 42 | # TODO: Add the licence for the dataset here if you can find it 43 | _LICENSE = "" 44 | 45 | 46 | class SatAnalogies(datasets.GeneratorBasedBuilder): 47 | """SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 multiple-choice analogy questions.""" 48 | 49 | VERSION = datasets.Version("0.0.1") 50 | 51 | BUILDER_CONFIGS = [ 52 | datasets.BuilderConfig( 53 | name="sat_analogies", 54 | version=VERSION, 55 | description="The SAT Analogy Questions dataset", 56 | ), 57 | ] 58 | 59 | @property 60 | def manual_download_instructions(self): 61 | return ( 62 | "To use SAT Analogy Questions you have to download it manually. Please " 63 | "email Peter Turney to request the data (https://www.apperceptual.com). " 64 | "Once you receive a download link for the dataset, supply the local path " 65 | "as the `data_dir` arg: " 66 | "`datasets.load_dataset('sat_analogies', data_dir='path/to/folder/folder_name')`" 67 | ) 68 | 69 | def _info(self): 70 | features = datasets.Features( 71 | { 72 | "source": datasets.Value("string"), 73 | "stem": datasets.Value("string"), 74 | "choices": datasets.features.Sequence(datasets.Value("string")), 75 | "solution": datasets.Value("string"), 76 | } 77 | ) 78 | return datasets.DatasetInfo( 79 | description=_DESCRIPTION, 80 | features=features, 81 | homepage=_HOMEPAGE, 82 | license=_LICENSE, 83 | citation=_CITATION, 84 | ) 85 | 86 | def _split_generators(self, dl_manager): 87 | data_dir = os.path.abspath(os.path.expanduser(dl_manager.manual_dir)) 88 | if not os.path.exists(data_dir): 89 | raise FileNotFoundError( 90 | f"{data_dir} does not exist. Make sure you insert a manual dir via `datasets.load_dataset('matinf', data_dir=...)` that includes SAT-package-V3.txt. 
Manual download instructions: {self.manual_download_instructions}" 91 | ) 92 | return [ 93 | datasets.SplitGenerator( 94 | name=datasets.Split.VALIDATION, 95 | # These kwargs will be passed to _generate_examples 96 | gen_kwargs={ 97 | "filepath": os.path.join(data_dir, "SAT-package-V3.txt"), 98 | }, 99 | ) 100 | ] 101 | 102 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 103 | def _generate_examples(self, filepath): 104 | data = [] 105 | with open(filepath, "r", encoding="utf-8") as f: 106 | record = [] 107 | for line in f: 108 | line = line.strip() 109 | if len(line) == 0 and record: 110 | data.append(record) 111 | record = [] 112 | elif len(line) > 0 and line[0] == "#": 113 | # Skip comments. 114 | continue 115 | else: 116 | record.append(line) 117 | data.append(record) 118 | for key, record in enumerate(data): 119 | source = record[-8] 120 | stem = record[-7] 121 | choices = record[-6:-1] 122 | solution = record[-1] 123 | yield key, { 124 | "source": source, 125 | "stem": stem, 126 | "choices": choices, 127 | "solution": solution, 128 | } 129 | -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | dataset_info: 3 | features: 4 | - name: question_id 5 | dtype: string 6 | - name: question_source 7 | dtype: string 8 | - name: question 9 | dtype: string 10 | - name: answer 11 | struct: 12 | - name: aliases 13 | sequence: string 14 | - name: value 15 | dtype: string 16 | - name: search_results 17 | sequence: 18 | - name: description 19 | dtype: string 20 | - name: filename 21 | dtype: string 22 | - name: rank 23 | dtype: int32 24 | - name: title 25 | dtype: string 26 | - name: url 27 | dtype: string 28 | - name: search_context 29 | dtype: string 30 | config_name: triviaqa 31 | splits: 32 | - name: train 33 | num_bytes: 1270894387 34 | num_examples: 87622 35 | - name: validation 36 | num_bytes: 163755044 37 | num_examples: 11313 38 | download_size: 632549060 39 | dataset_size: 1434649431 40 | --- 41 | -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/triviaqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/datasets/unscramble/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/unscramble.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
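# Each config downloads a single <config_name>.jsonl.gz file from the GPT-3 data
# repository; every line is a JSON object with "context" and "completion" fields,
# and all examples are exposed under a single validation split.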
14 | """Unscramble dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{NEURIPS2020_1457c0d6, 25 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 26 | booktitle = {Advances in Neural Information Processing Systems}, 27 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 28 | pages = {1877--1901}, 29 | publisher = {Curran Associates, Inc.}, 30 | title = {Language Models are Few-Shot Learners}, 31 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 32 | volume = {33}, 33 | year = {2020} 34 | } 35 | """ 36 | 37 | _DESCRIPTION = """\ 38 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 39 | involves giving the model a word distorted by some combination of scrambling, 40 | addition, or deletion of characters, and asking it to recover the original word. 41 | """ 42 | 43 | _HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data" 44 | 45 | # TODO: Add the licence for the dataset here if you can find it 46 | _LICENSE = "" 47 | 48 | _BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data" 49 | 50 | 51 | _DESCRIPTIONS = { 52 | "mid_word_1_anagrams": "Anagrams of all but the first and last letter.", 53 | "mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.", 54 | "cycle_letters_in_word": "Cycle letters in the word.", 55 | "random_insertion_in_word": "Random insertions in the word that must be removed.", 56 | "reversed_words": "Words spelled backwards that must be reversed.", 57 | } 58 | _NAMES = _DESCRIPTIONS.keys() 59 | 60 | 61 | class Unscramble(datasets.GeneratorBasedBuilder): 62 | """Unscramble is a small battery of 5 “character manipulation” tasks.""" 63 | 64 | VERSION = datasets.Version("0.0.1") 65 | 66 | BUILDER_CONFIGS = [ 67 | datasets.BuilderConfig( 68 | name=name, version=version, description=_DESCRIPTIONS[name] 69 | ) 70 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 71 | ] 72 | 73 | def _info(self): 74 | features = datasets.Features( 75 | { 76 | "context": datasets.Value("string"), 77 | "completion": datasets.Value("string"), 78 | } 79 | ) 80 | return datasets.DatasetInfo( 81 | description=_DESCRIPTION, 82 | features=features, 83 | homepage=_HOMEPAGE, 84 | license=_LICENSE, 85 | citation=_CITATION, 86 | ) 87 | 88 | def _split_generators(self, dl_manager): 89 | urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz") 90 | data_dir = dl_manager.download_and_extract(urls) 91 | return [ 92 | datasets.SplitGenerator( 93 | name=datasets.Split.VALIDATION, 94 | # These kwargs will be passed to _generate_examples 95 | gen_kwargs={ 96 | "filepath": data_dir, 97 | "split": "validation", 98 | }, 99 | ), 100 | ] 101 | 102 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 103 | def _generate_examples(self, filepath, split): 104 | with open(filepath, encoding="utf-8") as f: 105 
| for key, row in enumerate(f): 106 | data = json.loads(row) 107 | yield key, { 108 | "context": data["context"], 109 | "completion": data["completion"], 110 | } 111 | -------------------------------------------------------------------------------- /lm_eval/decontamination/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/lm_eval/decontamination/__init__.py -------------------------------------------------------------------------------- /lm_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import gpt2 2 | from . import gpt3 3 | from . import huggingface 4 | from . import textsynth 5 | from . import dummy 6 | 7 | MODEL_REGISTRY = { 8 | "hf": gpt2.HFLM, 9 | "hf-causal": huggingface.AutoCausalLM, 10 | "hf-seq2seq": huggingface.AutoSeq2SeqLM, 11 | "gpt2": gpt2.GPT2LM, 12 | "gpt3": gpt3.GPT3LM, 13 | "textsynth": textsynth.TextSynthLM, 14 | "dummy": dummy.DummyLM, 15 | } 16 | 17 | 18 | def get_model(model_name): 19 | return MODEL_REGISTRY[model_name] 20 | -------------------------------------------------------------------------------- /lm_eval/models/dummy.py: -------------------------------------------------------------------------------- 1 | import random 2 | from lm_eval.base import LM 3 | 4 | 5 | class DummyLM(LM): 6 | def __init__(self): 7 | pass 8 | 9 | @classmethod 10 | def create_from_arg_string(cls, arg_string, additional_config=None): 11 | return cls() 12 | 13 | def loglikelihood(self, requests): 14 | res = [] 15 | 16 | for _ in requests: 17 | res.append((-random.random(), False)) 18 | 19 | return res 20 | 21 | def greedy_until(self, requests): 22 | res = [] 23 | 24 | for ctx, _ in requests: 25 | res.append("lol") 26 | assert ctx.strip() != "" 27 | 28 | return res 29 | 30 | def loglikelihood_rolling(self, requests): 31 | res = [] 32 | 33 | for _ in requests: 34 | res.append(-random.random()) 35 | 36 | return res 37 | -------------------------------------------------------------------------------- /lm_eval/models/gpt2.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from lm_eval.base import BaseLM 4 | 5 | 6 | class HFLM(BaseLM): 7 | def __init__( 8 | self, 9 | device="cuda", 10 | pretrained="gpt2", 11 | revision="main", 12 | low_cpu_mem_usage=None, 13 | subfolder=None, 14 | tokenizer=None, 15 | batch_size=1, 16 | ): 17 | super().__init__() 18 | 19 | assert isinstance(device, str) 20 | assert isinstance(pretrained, str) 21 | assert isinstance(batch_size, int) 22 | 23 | if device: 24 | if device not in ["cuda", "cpu"]: 25 | device = int(device) 26 | self._device = torch.device(device) 27 | print(f"Using device '{device}'") 28 | else: 29 | print("Device not specified") 30 | print(f"Cuda Available? 
{torch.cuda.is_available()}") 31 | self._device = ( 32 | torch.device("cuda") 33 | if torch.cuda.is_available() 34 | else torch.device("cpu") 35 | ) 36 | 37 | # TODO: update this to be less of a hack once subfolder is fixed in HF 38 | revision = revision + ("/" + subfolder if subfolder is not None else "") 39 | 40 | self.gpt2 = transformers.AutoModelForCausalLM.from_pretrained( 41 | pretrained, revision=revision, low_cpu_mem_usage=low_cpu_mem_usage 42 | ).to(self.device) 43 | self.gpt2.eval() 44 | 45 | self.tokenizer = transformers.AutoTokenizer.from_pretrained( 46 | pretrained if tokenizer is None else tokenizer, 47 | revision=revision, 48 | ) 49 | 50 | assert isinstance( 51 | self.tokenizer, 52 | ( 53 | transformers.GPT2Tokenizer, 54 | transformers.GPT2TokenizerFast, 55 | transformers.T5Tokenizer, 56 | transformers.T5TokenizerFast, 57 | ), 58 | ), "this tokenizer has not been checked for compatibility yet!" 59 | 60 | self.vocab_size = self.tokenizer.vocab_size 61 | 62 | if isinstance( 63 | self.tokenizer, (transformers.GPT2Tokenizer, transformers.GPT2TokenizerFast) 64 | ): 65 | assert self.tokenizer.encode("hello\n\nhello") == [ 66 | 31373, 67 | 198, 68 | 198, 69 | 31373, 70 | ], self.tokenizer.encode("hello\n\nhello") 71 | 72 | # multithreading and batching 73 | self.batch_size_per_gpu = batch_size # todo: adaptive batch size 74 | 75 | # TODO: fix multi-gpu 76 | # gpus = torch.cuda.device_count() 77 | # if gpus > 1: 78 | # self.gpt2 = nn.DataParallel(self.gpt2) 79 | 80 | @property 81 | def eot_token_id(self): 82 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 83 | return self.tokenizer.eos_token_id 84 | 85 | @property 86 | def max_length(self): 87 | try: 88 | return self.gpt2.config.n_ctx 89 | except AttributeError: 90 | # gptneoconfig doesn't have n_ctx apparently 91 | return self.gpt2.config.max_position_embeddings 92 | 93 | @property 94 | def max_gen_toks(self): 95 | return 256 96 | 97 | @property 98 | def batch_size(self): 99 | # TODO: fix multi-gpu 100 | return self.batch_size_per_gpu # * gpus 101 | 102 | @property 103 | def device(self): 104 | # TODO: fix multi-gpu 105 | return self._device 106 | 107 | def tok_encode(self, string: str): 108 | return self.tokenizer.encode(string, add_special_tokens=False) 109 | 110 | def tok_decode(self, tokens): 111 | return self.tokenizer.decode(tokens) 112 | 113 | def _model_call(self, inps): 114 | """ 115 | inps: a torch tensor of shape [batch, sequence] 116 | the size of sequence may vary from call to call 117 | 118 | returns: a torch tensor of shape [batch, sequence, vocab] with the 119 | logits returned from the model 120 | """ 121 | with torch.no_grad(): 122 | return self.gpt2(inps)[0] 123 | 124 | def _model_generate(self, context, max_length, eos_token_id): 125 | return self.gpt2.generate( 126 | context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False 127 | ) 128 | 129 | 130 | # for backwards compatibility 131 | GPT2LM = HFLM 132 | -------------------------------------------------------------------------------- /lm_eval/models/textsynth.py: -------------------------------------------------------------------------------- 1 | """ TextSynth API 2 | Implementation provided by Fabrice Bellard: 3 | https://github.com/EleutherAI/lm-evaluation-harness/issues/295 4 | 5 | In order to use the API, you must have a valid TextSynth account and 6 | enough credits. 
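The API secret key is read from the TEXTSYNTH_API_SECRET_KEY environment variable.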
7 | 8 | Example usage: 9 | 10 | python main.py --model textsynth --model_args engine=gptj_6B --no_cache --tasks piqa 11 | 12 | Homepage: https://textsynth.com/index.html 13 | """ 14 | import logging 15 | import os 16 | import requests as _requests 17 | import time 18 | from tqdm import tqdm 19 | from lm_eval.base import BaseLM 20 | 21 | 22 | logger = logging.getLogger(__name__) 23 | 24 | 25 | def textsynth_completion(**kwargs): 26 | """Query TextSynth API for completion. 27 | Retry with back-off until they respond. 28 | """ 29 | backoff_time = 3 30 | while True: 31 | try: 32 | return _requests.post(**kwargs) 33 | except _requests.exceptions.RequestException: 34 | import traceback 35 | 36 | traceback.print_exc() 37 | time.sleep(backoff_time) 38 | backoff_time *= 1.5 39 | 40 | 41 | class TextSynthLM(BaseLM): 42 | def __init__(self, engine, truncate=False): 43 | """ 44 | :param engine: str 45 | TextSynth API engine (e.g. `gptj_6B`) 46 | :param truncate: bool 47 | Truncate input if too long (if False and input is too long, throw error) 48 | """ 49 | super().__init__() 50 | 51 | self.engine = engine 52 | self.truncate = truncate 53 | self.api_url = "https://api.textsynth.com" 54 | # Read from environment variable TEXTSYNTH_API_SECRET_KEY 55 | self.api_key = os.environ["TEXTSYNTH_API_SECRET_KEY"] 56 | 57 | @property 58 | def eot_token_id(self): 59 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 60 | raise NotImplementedError() 61 | 62 | @property 63 | def max_length(self): 64 | # NOTE: Turn on truncation to avoid errors on long inputs. 65 | return 2048 66 | 67 | @property 68 | def max_gen_toks(self): 69 | return 256 70 | 71 | @property 72 | def batch_size(self): 73 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 74 | raise NotImplementedError() 75 | 76 | @property 77 | def device(self): 78 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 79 | raise NotImplementedError() 80 | 81 | def tok_encode(self, string: str): 82 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 83 | raise NotImplementedError() 84 | 85 | def tok_decode(self, tokens): 86 | # Isn't used because we override loglikelihood, loglikelihood_rolling and greedy_until 87 | raise NotImplementedError() 88 | 89 | def loglikelihood(self, requests): 90 | res = [] 91 | for context, continuation in tqdm(requests): 92 | response = textsynth_completion( 93 | url=self.api_url + "/v1/engines/" + self.engine + "/logprob", 94 | headers={"Authorization": "Bearer " + self.api_key}, 95 | json={"context": context, "continuation": continuation}, 96 | ) 97 | resp = response.json() 98 | if "logprob" in resp: 99 | logprob = resp["logprob"] 100 | is_greedy = resp["is_greedy"] 101 | res.append((logprob, is_greedy)) 102 | else: 103 | logger.error( 104 | f"The following response does not contain `logprobs`. Got:\n{resp}" 105 | ) 106 | assert False 107 | return res 108 | 109 | def loglikelihood_rolling(self, requests): 110 | # TODO: The TextSynth API does not support tokenized inputs so we cannot 111 | # manually partition long contexts into smaller rolling windows as 112 | # done for other models derived from `BaseLM`. Override this method 113 | # with a windowing scheme that works for direct string inputs. 114 | raise NotImplementedError( 115 | "`loglikelihood_rolling` is currently not supported due to lack of " 116 | "input tokenization support from TextSynth." 
117 | ) 118 | 119 | def greedy_until(self, requests): 120 | if not requests: 121 | return [] 122 | 123 | res = [] 124 | for request in tqdm(requests): 125 | inp = request[0] 126 | until = request[1] 127 | response = textsynth_completion( 128 | url=self.api_url + "/v1/engines/" + self.engine + "/completions", 129 | headers={"Authorization": "Bearer " + self.api_key}, 130 | json={ 131 | "prompt": inp, 132 | "max_tokens": self.max_gen_toks, 133 | "top_k": 1, 134 | "stop": until, 135 | }, 136 | ) 137 | resp = response.json() 138 | if "text" in resp: 139 | s = resp["text"] 140 | res.append(s) 141 | else: 142 | logger.error( 143 | f"The following response does not contain generated `text`. " 144 | "Got:\n{resp}" 145 | ) 146 | assert False 147 | return res 148 | 149 | def _model_call(self, inps): 150 | # Isn't used because we override _loglikelihood_tokens 151 | raise NotImplementedError() 152 | 153 | def _model_generate(self, context, max_length, eos_token_id): 154 | # Isn't used because we override greedy_until 155 | raise NotImplementedError() 156 | -------------------------------------------------------------------------------- /lm_eval/tasks/anli.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adversarial NLI: A New Benchmark for Natural Language Understanding 3 | https://arxiv.org/pdf/1910.14599.pdf 4 | 5 | Adversarial NLI (ANLI) is a dataset collected via an iterative, adversarial 6 | human-and-model-in-the-loop procedure. It consists of three rounds that progressively 7 | increase in difficulty and complexity, and each question-answer includes annotator- 8 | provided explanations. 9 | 10 | Homepage: "https://github.com/facebookresearch/anli" 11 | """ 12 | import numpy as np 13 | from lm_eval.base import rf, Task 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{nie-etal-2020-adversarial, 19 | title = "Adversarial {NLI}: A New Benchmark for Natural Language Understanding", 20 | author = "Nie, Yixin and 21 | Williams, Adina and 22 | Dinan, Emily and 23 | Bansal, Mohit and 24 | Weston, Jason and 25 | Kiela, Douwe", 26 | booktitle = "Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics", 27 | year = "2020", 28 | publisher = "Association for Computational Linguistics", 29 | } 30 | """ 31 | 32 | 33 | class ANLIBase(Task): 34 | VERSION = 0 35 | DATASET_PATH = "anli" 36 | DATASET_NAME = None 37 | SPLIT = None 38 | 39 | def has_training_docs(self): 40 | return True 41 | 42 | def has_validation_docs(self): 43 | return True 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def training_docs(self): 49 | if self.has_training_docs(): 50 | if self._training_docs is None: 51 | self._training_docs = list(self.dataset["train_r" + str(self.SPLIT)]) 52 | return self._training_docs 53 | 54 | def validation_docs(self): 55 | if self.has_validation_docs(): 56 | return self.dataset["dev_r" + str(self.SPLIT)] 57 | 58 | def test_docs(self): 59 | if self.has_test_docs(): 60 | return self.dataset["test_r" + str(self.SPLIT)] 61 | 62 | def doc_to_text(self, doc): 63 | # OA does this a bit weirdly: they prepend "anli 1: anli 1: " to the beginning 64 | # of the prompt (yes, repeating it!). also, " True, False, or Neither?" is directly 65 | # appended onto the question, with no "Answer:" or even a newline. Do we *really* 66 | # want to do it exactly as OA did? 
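# A purely hypothetical rendering of the prompt built below:
# "It rained all night.\nQuestion: The streets are wet. True, False, or Neither?\nAnswer:"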
67 | return ( 68 | doc["premise"] 69 | + "\nQuestion: " 70 | + doc["hypothesis"] 71 | + " True, False, or Neither?\nAnswer:" 72 | ) 73 | 74 | def should_decontaminate(self): 75 | return True 76 | 77 | def doc_to_decontamination_query(self, doc): 78 | return doc["premise"] 79 | 80 | def doc_to_target(self, doc): 81 | # True = entailment 82 | # False = contradiction 83 | # Neither = neutral 84 | return " " + ["True", "Neither", "False"][doc["label"]] 85 | 86 | def construct_requests(self, doc, ctx): 87 | """Uses RequestFactory to construct Requests and returns an iterable of 88 | Requests which will be sent to the LM. 89 | 90 | :param doc: 91 | The document as returned from training_docs, validation_docs, or test_docs. 92 | :param ctx: str 93 | The context string, generated by fewshot_context. This includes the natural 94 | language description, as well as the few shot examples, and the question 95 | part of the document for `doc`. 96 | """ 97 | ll_true, _ = rf.loglikelihood(ctx, " True") 98 | ll_neither, _ = rf.loglikelihood(ctx, " Neither") 99 | ll_false, _ = rf.loglikelihood(ctx, " False") 100 | return ll_true, ll_neither, ll_false 101 | 102 | def process_results(self, doc, results): 103 | """Take a single document and the LM results and evaluates, returning a 104 | dict where keys are the names of submetrics and values are the values of 105 | the metric for that one document 106 | 107 | :param doc: 108 | The document as returned from training_docs, validation_docs, or test_docs. 109 | :param results: 110 | The results of the requests created in construct_requests. 111 | """ 112 | gold = doc["label"] 113 | pred = np.argmax(results) 114 | return {"acc": pred == gold} 115 | 116 | def aggregation(self): 117 | """ 118 | :returns: {str: [float] -> float} 119 | A dictionary where keys are the names of submetrics and values are 120 | functions that aggregate a list of metrics 121 | """ 122 | return {"acc": mean} 123 | 124 | def higher_is_better(self): 125 | """ 126 | :returns: {str: bool} 127 | A dictionary where keys are the names of submetrics and values are 128 | whether a higher value of the submetric is better 129 | """ 130 | return {"acc": True} 131 | 132 | 133 | class ANLIRound1(ANLIBase): 134 | SPLIT = 1 135 | 136 | 137 | class ANLIRound2(ANLIBase): 138 | SPLIT = 2 139 | 140 | 141 | class ANLIRound3(ANLIBase): 142 | SPLIT = 3 143 | -------------------------------------------------------------------------------- /lm_eval/tasks/arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge 3 | https://arxiv.org/pdf/1803.05457.pdf 4 | 5 | The ARC dataset consists of 7,787 science exam questions drawn from a variety 6 | of sources, including science questions provided under license by a research 7 | partner affiliated with AI2. These are text-only, English language exam questions 8 | that span several grade levels as indicated in the files. Each question has a 9 | multiple choice structure (typically 4 answer options). The questions are sorted 10 | into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and 11 | a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions. 12 | 13 | Homepage: https://allenai.org/data/arc 14 | """ 15 | 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @article{Clark2018ThinkYH, 21 | title={Think you have Solved Question Answering? 
Try ARC, the AI2 Reasoning Challenge}, 22 | author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord}, 23 | journal={ArXiv}, 24 | year={2018}, 25 | volume={abs/1803.05457} 26 | } 27 | """ 28 | 29 | 30 | class ARCEasy(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "ai2_arc" 33 | DATASET_NAME = "ARC-Easy" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | # NOTE: Some `doc["answerKey"]`s are in numeric string format being one 57 | # of {'1', '2', '3', '4', '5'}. We map them back to letters. 58 | num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"} 59 | doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"]) 60 | out_doc = { 61 | "id": doc["id"], 62 | "query": "Question: " + doc["question"] + "\nAnswer:", 63 | "choices": doc["choices"]["text"], 64 | "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]), 65 | } 66 | return out_doc 67 | 68 | def doc_to_text(self, doc): 69 | return doc["query"] 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["query"] 76 | 77 | 78 | class ARCChallenge(ARCEasy): 79 | DATASET_PATH = "ai2_arc" 80 | DATASET_NAME = "ARC-Challenge" 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/arithmetic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | A small battery of 10 tests that involve asking language models a simple arithmetic 6 | problem in natural language. 7 | 8 | Homepage: https://github.com/openai/gpt-3/tree/master/data 9 | """ 10 | from lm_eval.base import Task, rf 11 | from lm_eval.metrics import mean 12 | 13 | 14 | _CITATION = """ 15 | @inproceedings{NEURIPS2020_1457c0d6, 16 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 17 | booktitle = {Advances in Neural Information Processing Systems}, 18 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}, 19 | pages = {1877--1901}, 20 | publisher = {Curran Associates, Inc.}, 21 | title = {Language Models are Few-Shot Learners}, 22 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 23 | volume = {33}, 24 | year = {2020} 25 | } 26 | """ 27 | 28 | 29 | class Arithmetic(Task): 30 | VERSION = 0 31 | DATASET_PATH = "EleutherAI/arithmetic" 32 | 33 | def has_training_docs(self): 34 | return False 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | return NotImplemented 44 | 45 | def validation_docs(self): 46 | return self.dataset["validation"] 47 | 48 | def test_docs(self): 49 | return NotImplemented 50 | 51 | def doc_to_text(self, doc): 52 | return doc["context"] 53 | 54 | def should_decontaminate(self): 55 | return True 56 | 57 | def doc_to_decontamination_query(self, doc): 58 | return doc["context"] 59 | 60 | def doc_to_target(self, doc): 61 | return doc["completion"] 62 | 63 | def construct_requests(self, doc, ctx): 64 | ll, is_prediction = rf.loglikelihood(ctx, doc["completion"]) 65 | return is_prediction 66 | 67 | def process_results(self, doc, results): 68 | (is_prediction,) = results 69 | return {"acc": is_prediction} 70 | 71 | def aggregation(self): 72 | return { 73 | "acc": mean, 74 | } 75 | 76 | def higher_is_better(self): 77 | return {"acc": True} 78 | 79 | 80 | class Arithmetic2DPlus(Arithmetic): 81 | DATASET_NAME = "arithmetic_2da" 82 | 83 | 84 | class Arithmetic2DMinus(Arithmetic): 85 | DATASET_NAME = "arithmetic_2ds" 86 | 87 | 88 | class Arithmetic3DPlus(Arithmetic): 89 | DATASET_NAME = "arithmetic_3da" 90 | 91 | 92 | class Arithmetic3DMinus(Arithmetic): 93 | DATASET_NAME = "arithmetic_3ds" 94 | 95 | 96 | class Arithmetic4DPlus(Arithmetic): 97 | DATASET_NAME = "arithmetic_4da" 98 | 99 | 100 | class Arithmetic4DMinus(Arithmetic): 101 | DATASET_NAME = "arithmetic_4ds" 102 | 103 | 104 | class Arithmetic5DPlus(Arithmetic): 105 | DATASET_NAME = "arithmetic_5da" 106 | 107 | 108 | class Arithmetic5DMinus(Arithmetic): 109 | DATASET_NAME = "arithmetic_5ds" 110 | 111 | 112 | class Arithmetic2DMultiplication(Arithmetic): 113 | DATASET_NAME = "arithmetic_2dm" 114 | 115 | 116 | class Arithmetic1DComposite(Arithmetic): 117 | DATASET_NAME = "arithmetic_1dc" 118 | -------------------------------------------------------------------------------- /lm_eval/tasks/asdiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers 3 | https://arxiv.org/abs/2106.15772 4 | 5 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 6 | patterns and problem types) English math word problem (MWP) corpus for evaluating 7 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 8 | remain limited either in language usage patterns or in problem types. We thus present 9 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 10 | types taught in elementary school. Each MWP is annotated with its problem type and grade 11 | level (for indicating the level of difficulty). 12 | 13 | NOTE: We currently ignore formulas for answer generation. 
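Only the numeric value that precedes the parenthesized formula in each answer field is
used as the target, and accuracy records whether that target is the model's greedy
continuation.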
14 | 15 | Homepage: https://github.com/chaochun/nlu-asdiv-dataset 16 | """ 17 | import inspect 18 | import lm_eval.datasets.asdiv.asdiv 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | 35 | class Asdiv(Task): 36 | VERSION = 0 37 | DATASET_PATH = inspect.getfile(lm_eval.datasets.asdiv.asdiv) 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return True 44 | 45 | def has_test_docs(self): 46 | return False 47 | 48 | def training_docs(self): 49 | raise NotImplementedError("This dataset has no training docs") 50 | 51 | def validation_docs(self): 52 | return self.dataset["validation"] 53 | 54 | def test_docs(self): 55 | raise NotImplementedError("This dataset has no test docs") 56 | 57 | def fewshot_context( 58 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 59 | ): 60 | assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def doc_to_text(self, doc): 66 | # TODO: add solution-type 67 | return doc["body"] + "\n" + "Question:" + doc["question"] + "\n" + "Answer:" 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["body"] + " " + doc["question"] 74 | 75 | def doc_to_target(self, doc): 76 | # TODO: add formula 77 | 78 | answer = doc["answer"].split(" (")[0] 79 | return " " + answer 80 | 81 | def construct_requests(self, doc, ctx): 82 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 83 | return ll, is_greedy 84 | 85 | def process_results(self, doc, results): 86 | ll, is_greedy = results 87 | 88 | return {"acc": int(is_greedy)} 89 | 90 | def aggregation(self): 91 | return {"acc": mean} 92 | 93 | def higher_is_better(self): 94 | return {"acc": True} 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/cbt.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Children’s Book Test (CBT) from the paper: 3 | https://research.fb.com/wp-content/uploads/2016/11/the_goldilocks_principle_reading_children_s_books_with_explicit_memory_representations.pdf 4 | 5 | The Children's Book Test (CBT) is test of how well language models capture 6 | meaning in children's books. Unlike standard language modelling benchmarks, 7 | it distinguishes the task of predicting syntactic function words from that 8 | of predicting lower-frequency words, which carry greater semantic content. 9 | 10 | NOTE: This evaluation is based on the (context + query) question-answering variant 11 | used by the Recurrent Language Models described in the paper. See section 4.4. 
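Each candidate answer is scored by the log-likelihood of the full passage and query with
the candidate substituted for the XXXXX placeholder; the highest-scoring option is taken
as the prediction.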
12 | 13 | Homepage: https://github.com/facebookresearch/ParlAI/tree/main/parlai/tasks/cbt 14 | """ 15 | import numpy as np 16 | from lm_eval.base import rf, Task 17 | from lm_eval.metrics import mean 18 | 19 | 20 | _CITATION = """ 21 | @misc{hill2016goldilocks, 22 | title={The Goldilocks Principle: Reading Children's Books with Explicit Memory Representations}, 23 | author={Felix Hill and Antoine Bordes and Sumit Chopra and Jason Weston}, 24 | year={2016}, 25 | eprint={1511.02301}, 26 | archivePrefix={arXiv}, 27 | primaryClass={cs.CL} 28 | } 29 | """ 30 | 31 | 32 | class CBTBase(Task): 33 | VERSION = 0 34 | DATASET_PATH = "cbt" 35 | DATASET_NAME = None 36 | 37 | def has_training_docs(self): 38 | return True 39 | 40 | def has_validation_docs(self): 41 | return True 42 | 43 | def has_test_docs(self): 44 | return True 45 | 46 | def training_docs(self): 47 | if self._training_docs is None: 48 | self._training_docs = list(self.dataset["train"]) 49 | return self._training_docs 50 | 51 | def validation_docs(self): 52 | return self.dataset["validation"] 53 | 54 | def test_docs(self): 55 | return self.dataset["test"] 56 | 57 | def detokenize(self, text): 58 | text = text.replace(" '", "'") 59 | text = text.replace(" \n", "\n") 60 | text = text.replace("\n ", "\n") 61 | text = text.replace(" n't", "n't") 62 | text = text.replace("`` ", '"') 63 | text = text.replace("''", '"') 64 | # punctuation 65 | text = text.replace(" :", ":") 66 | text = text.replace(" ;", ";") 67 | text = text.replace(" !", "!") 68 | text = text.replace(" ?", "?") 69 | text = text.replace(" ,", ",") 70 | text = text.replace(" .", ".") 71 | return text 72 | 73 | def doc_to_text(self, doc): 74 | passage = " ".join(doc["sentences"]) 75 | text = "Passage: " + passage + "\nQuestion: " + doc["question"] 76 | return self.detokenize(text) 77 | 78 | def should_decontaminate(self): 79 | return True 80 | 81 | def doc_to_decontamination_query(self, doc): 82 | passage = " ".join(doc["sentences"]) 83 | return passage 84 | 85 | def doc_to_target(self, doc): 86 | return "" 87 | 88 | def fewshot_examples(self, k, rnd): 89 | assert ( 90 | k == 0 91 | ), f"CBT is only implemented for the zero-shot setting. Given k={k}." 92 | return super().fewshot_examples(k, rnd) 93 | 94 | def construct_requests(self, doc, ctx): 95 | """Uses RequestFactory to construct Requests and returns an iterable of 96 | Requests which will be sent to the LM. 97 | 98 | :param doc: 99 | The document as returned from training_docs, validation_docs, or test_docs. 100 | :param ctx: str 101 | The context string, generated by fewshot_context. This includes the natural 102 | language description, as well as the few shot examples, and the question 103 | part of the document for `doc`. 104 | """ 105 | lls = [] 106 | for option in doc["options"]: 107 | # Following Section 4.4 "Recurrent Language Models" in the CBT paper: 108 | # "we rank candidate [option] c based on p(q1 . . . qk−1, c, qk+1 . . . ql) 109 | # rather than simply p(q1 . . . qk−1, c)." 110 | lls.append(rf.loglikelihood("", ctx.replace("XXXXX", option))[0]) 111 | return lls 112 | 113 | def process_results(self, doc, results): 114 | """Take a single document and the LM results and evaluates, returning a 115 | dict where keys are the names of submetrics and values are the values of 116 | the metric for that one document 117 | 118 | :param doc: 119 | The document as returned from training_docs, validation_docs, or test_docs. 120 | :param results: 121 | The results of the requests created in construct_requests. 
122 | """ 123 | gold = doc["options"].index(doc["answer"]) 124 | pred = np.argmax(results) 125 | return {"acc": pred == gold} 126 | 127 | def aggregation(self): 128 | """ 129 | :returns: {str: [float] -> float} 130 | A dictionary where keys are the names of submetrics and values are 131 | functions that aggregate a list of metrics 132 | """ 133 | return {"acc": mean} 134 | 135 | def higher_is_better(self): 136 | """ 137 | :returns: {str: bool} 138 | A dictionary where keys are the names of submetrics and values are 139 | whether a higher value of the submetric is better 140 | """ 141 | return {"acc": True} 142 | 143 | 144 | class CBTCN(CBTBase): 145 | DATASET_NAME = "CN" 146 | 147 | 148 | class CBTNE(CBTBase): 149 | DATASET_NAME = "NE" 150 | -------------------------------------------------------------------------------- /lm_eval/tasks/gsm8k.py: -------------------------------------------------------------------------------- 1 | """ 2 | "Training Verifiers to Solve Math Word Problems" 3 | https://arxiv.org/abs/2110.14168 4 | 5 | State-of-the-art language models can match human performance on many tasks, but 6 | they still struggle to robustly perform multi-step mathematical reasoning. To 7 | diagnose the failures of current models and support research, we introduce GSM8K, 8 | a dataset of 8.5K high quality linguistically diverse grade school math word problems. 9 | We find that even the largest transformer models fail to achieve high test performance, 10 | despite the conceptual simplicity of this problem distribution. 11 | 12 | NOTE: See the official implementation of the task: 13 | https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 14 | for how to make use of the dataset's calculator annotations in your language 15 | model's sample/generation function. 16 | 17 | Homepage: https://github.com/openai/grade-school-math 18 | """ 19 | import re 20 | from lm_eval.base import Task, rf 21 | from lm_eval.metrics import mean 22 | 23 | 24 | _CITATION = """ 25 | @misc{cobbe2021training, 26 | title={Training Verifiers to Solve Math Word Problems}, 27 | author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, 28 | year={2021}, 29 | eprint={2110.14168}, 30 | archivePrefix={arXiv}, 31 | primaryClass={cs.LG} 32 | } 33 | """ 34 | 35 | 36 | ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 37 | INVALID_ANS = "[invalid]" 38 | 39 | 40 | class GradeSchoolMath8K(Task): 41 | VERSION = 0 42 | DATASET_PATH = "gsm8k" 43 | DATASET_NAME = "main" 44 | 45 | def has_training_docs(self): 46 | return True 47 | 48 | def has_validation_docs(self): 49 | return False 50 | 51 | def has_test_docs(self): 52 | return True 53 | 54 | def training_docs(self): 55 | return self.dataset["train"] 56 | 57 | def validation_docs(self): 58 | raise NotImplementedError 59 | 60 | def test_docs(self): 61 | return self.dataset["test"] 62 | 63 | def doc_to_text(self, doc): 64 | return "Question: " + doc["question"] + "\nAnswer:" 65 | 66 | def doc_to_target(self, doc): 67 | return " " + doc["answer"] 68 | 69 | def construct_requests(self, doc, ctx): 70 | """Uses RequestFactory to construct Requests and returns an iterable of 71 | Requests which will be sent to the LM. 72 | 73 | :param doc: 74 | The document as returned from training_docs, validation_docs, or test_docs. 75 | :param ctx: str 76 | The context string, generated by fewshot_context. 
This includes the natural 77 | language description, as well as the few shot examples, and the question 78 | part of the document for `doc`. 79 | """ 80 | # NOTE: The paper implements "verifiers" that assign a score to multiple 81 | # solutions and output the highest ranked solution. 82 | completion = rf.greedy_until(ctx, ["\n"]) 83 | return completion 84 | 85 | def _extract_answer(self, completion): 86 | match = ANS_RE.search(completion) 87 | if match: 88 | match_str = match.group(1).strip() 89 | match_str = match_str.replace(",", "") 90 | return match_str 91 | else: 92 | return INVALID_ANS 93 | 94 | def _is_correct(self, completion, answer): 95 | gold = self._extract_answer(answer) 96 | assert gold != INVALID_ANS, "No ground truth answer found in the document." 97 | return self._extract_answer(completion) == gold 98 | 99 | def process_results(self, doc, results): 100 | """Take a single document and the LM results and evaluates, returning a 101 | dict where keys are the names of submetrics and values are the values of 102 | the metric for that one document 103 | 104 | :param doc: 105 | The document as returned from training_docs, validation_docs, or test_docs. 106 | :param results: 107 | The results of the requests created in construct_requests. 108 | """ 109 | completion = results[0] 110 | answer = doc["answer"] 111 | return {"acc": self._is_correct(completion, answer)} 112 | 113 | def aggregation(self): 114 | """ 115 | :returns: {str: [float] -> float} 116 | A dictionary where keys are the names of submetrics and values are 117 | functions that aggregate a list of metrics 118 | """ 119 | return {"acc": mean} 120 | 121 | def higher_is_better(self): 122 | """ 123 | :returns: {str: bool} 124 | A dictionary where keys are the names of submetrics and values are 125 | whether a higher value of the submetric is better 126 | """ 127 | return {"acc": True} 128 | -------------------------------------------------------------------------------- /lm_eval/tasks/headqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering 3 | https://aclanthology.org/P19-1092.pdf 4 | 5 | HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to 6 | access a specialized position in the Spanish healthcare system, and are challenging 7 | even for highly specialized humans. 8 | 9 | Homepage: https://aghie.github.io/head-qa/ 10 | """ 11 | import inspect 12 | import lm_eval.datasets.headqa.headqa 13 | from lm_eval.base import MultipleChoiceTask 14 | 15 | 16 | _CITATION = """ 17 | @misc{liu2020interpretable, 18 | title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, 19 | author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. 
Yu}, 20 | year={2020}, 21 | eprint={2008.02434}, 22 | archivePrefix={arXiv}, 23 | primaryClass={cs.AI} 24 | } 25 | """ 26 | 27 | 28 | class HeadQABase(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.headqa.headqa) 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "id": doc["qid"], 55 | "query": "Question: " + doc["qtext"] + "\nAnswer:", 56 | "choices": [answer["atext"] for answer in doc["answers"]], 57 | "gold": int(doc["ra"]) - 1, 58 | } 59 | return out_doc 60 | 61 | def doc_to_text(self, doc): 62 | return doc["query"] 63 | 64 | def should_decontaminate(self): 65 | return True 66 | 67 | def doc_to_decontamination_query(self, doc): 68 | return doc["query"] 69 | 70 | 71 | class HeadQAEn(HeadQABase): 72 | DATASET_NAME = "en" 73 | 74 | 75 | class HeadQAEs(HeadQABase): 76 | DATASET_NAME = "es" 77 | 78 | 79 | # for backwards compatibility 80 | class HeadQAEsDeprecated(HeadQABase): 81 | DATASET_NAME = "es" 82 | 83 | def __init__(self): 84 | super().__init__() 85 | print( 86 | "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info." 87 | ) 88 | -------------------------------------------------------------------------------- /lm_eval/tasks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | HellaSwag: Can a Machine Really Finish Your Sentence? 3 | https://arxiv.org/pdf/1905.07830.pdf 4 | 5 | Hellaswag is a commonsense inference challenge dataset. Though its questions are 6 | trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is 7 | achieved via Adversarial Filtering (AF), a data collection paradigm wherein a 8 | series of discriminators iteratively select an adversarial set of machine-generated 9 | wrong answers. AF proves to be surprisingly robust. The key insight is to scale up 10 | the length and complexity of the dataset examples towards a critical 'Goldilocks' 11 | zone wherein generated text is ridiculous to humans, yet often misclassified by 12 | state-of-the-art models. 
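In this harness the task is framed as multiple choice: the four candidate endings
are ranked by log-likelihood after `preprocess` strips WikiHow artifacts such as
" [title]" from the text.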
13 | 14 | Homepage: https://rowanzellers.com/hellaswag/ 15 | """ 16 | 17 | import re 18 | from lm_eval.base import MultipleChoiceTask 19 | 20 | 21 | _CITATION = """ 22 | @inproceedings{zellers2019hellaswag, 23 | title={HellaSwag: Can a Machine Really Finish Your Sentence?}, 24 | author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, 25 | booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 26 | year={2019} 27 | } 28 | """ 29 | 30 | 31 | class HellaSwag(MultipleChoiceTask): 32 | VERSION = 0 33 | DATASET_PATH = "hellaswag" 34 | DATASET_NAME = None 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | if self._training_docs is None: 47 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 48 | return self._training_docs 49 | 50 | def validation_docs(self): 51 | return map(self._process_doc, self.dataset["validation"]) 52 | 53 | def _process_doc(self, doc): 54 | ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() 55 | out_doc = { 56 | "query": self.preprocess(doc["activity_label"] + ": " + ctx), 57 | "choices": [self.preprocess(ending) for ending in doc["endings"]], 58 | "gold": int(doc["label"]), 59 | } 60 | return out_doc 61 | 62 | @classmethod 63 | def preprocess(cls, text): 64 | text = text.strip() 65 | # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. 66 | text = text.replace(" [title]", ". ") 67 | text = re.sub("\\[.*?\\]", "", text) 68 | text = text.replace(" ", " ") 69 | return text 70 | 71 | def doc_to_text(self, doc): 72 | return doc["query"] 73 | 74 | def should_decontaminate(self): 75 | return True 76 | 77 | def doc_to_decontamination_query(self, doc): 78 | return doc["query"] 79 | -------------------------------------------------------------------------------- /lm_eval/tasks/hendrycks_test.py: -------------------------------------------------------------------------------- 1 | """ 2 | Measuring Massive Multitask Language Understanding 3 | https://arxiv.org/pdf/2009.03300.pdf 4 | 5 | The Hendryck's Test is a benchmark that measured a text model’s multitask accuracy. 6 | The test covers 57 tasks including elementary mathematics, US history, computer 7 | science, law, and more. To attain high accuracy on this test, models must possess 8 | extensive world knowledge and problem solving ability. By comprehensively evaluating 9 | the breadth and depth of a model’s academic and professional understanding, 10 | Hendryck's Test can be used to analyze models across many tasks and to identify 11 | important shortcomings. 
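In the implementation below, each of the 57 subjects becomes its own task named
`hendrycksTest-<subject>` (see `create_all_tasks`), and few-shot examples are drawn
from the dedicated dev split rather than from training data.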
12 | 13 | Homepage: https://github.com/hendrycks/test 14 | """ 15 | 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @article{hendryckstest2021, 21 | title={Measuring Massive Multitask Language Understanding}, 22 | author={Dan Hendrycks and Collin Burns and Steven Basart and Andy Zou and Mantas Mazeika and Dawn Song and Jacob Steinhardt}, 23 | journal={Proceedings of the International Conference on Learning Representations (ICLR)}, 24 | year={2021} 25 | } 26 | """ 27 | 28 | 29 | SUBJECTS = [ 30 | "abstract_algebra", 31 | "anatomy", 32 | "astronomy", 33 | "business_ethics", 34 | "clinical_knowledge", 35 | "college_biology", 36 | "college_chemistry", 37 | "college_computer_science", 38 | "college_mathematics", 39 | "college_medicine", 40 | "college_physics", 41 | "computer_security", 42 | "conceptual_physics", 43 | "econometrics", 44 | "electrical_engineering", 45 | "elementary_mathematics", 46 | "formal_logic", 47 | "global_facts", 48 | "high_school_biology", 49 | "high_school_chemistry", 50 | "high_school_computer_science", 51 | "high_school_european_history", 52 | "high_school_geography", 53 | "high_school_government_and_politics", 54 | "high_school_macroeconomics", 55 | "high_school_mathematics", 56 | "high_school_microeconomics", 57 | "high_school_physics", 58 | "high_school_psychology", 59 | "high_school_statistics", 60 | "high_school_us_history", 61 | "high_school_world_history", 62 | "human_aging", 63 | "human_sexuality", 64 | "international_law", 65 | "jurisprudence", 66 | "logical_fallacies", 67 | "machine_learning", 68 | "management", 69 | "marketing", 70 | "medical_genetics", 71 | "miscellaneous", 72 | "moral_disputes", 73 | "moral_scenarios", 74 | "nutrition", 75 | "philosophy", 76 | "prehistory", 77 | "professional_accounting", 78 | "professional_law", 79 | "professional_medicine", 80 | "professional_psychology", 81 | "public_relations", 82 | "security_studies", 83 | "sociology", 84 | "us_foreign_policy", 85 | "virology", 86 | "world_religions", 87 | ] 88 | 89 | 90 | def create_all_tasks(): 91 | """Creates a dictionary of tasks from a list of subjects 92 | :return: {task_name: task} 93 | e.g. {hendrycksTest-abstract_algebra: Task, hendrycksTest-anatomy: Task} 94 | """ 95 | return {f"hendrycksTest-{sub}": create_task(sub) for sub in SUBJECTS} 96 | 97 | 98 | def create_task(subject): 99 | class HendrycksTest(GeneralHendrycksTest): 100 | def __init__(self): 101 | super().__init__(subject) 102 | 103 | return HendrycksTest 104 | 105 | 106 | class GeneralHendrycksTest(MultipleChoiceTask): 107 | VERSION = 0 108 | DATASET_PATH = "hendrycks_test" 109 | DATASET_NAME = None 110 | 111 | def __init__(self, subject): 112 | self.DATASET_NAME = subject 113 | super().__init__() 114 | 115 | def has_training_docs(self): 116 | return False 117 | 118 | def has_validation_docs(self): 119 | return True 120 | 121 | def has_test_docs(self): 122 | return True 123 | 124 | def validation_docs(self): 125 | return map(self._process_doc, self.dataset["validation"]) 126 | 127 | def test_docs(self): 128 | return map(self._process_doc, self.dataset["test"]) 129 | 130 | def _process_doc(self, doc): 131 | def format_example(doc, keys): 132 | """ 133 | Question: 134 | Choices: 135 | A. 136 | B. 137 | C. 138 | D. 139 | Answer: 140 | """ 141 | prompt = "Question: " + doc["question"] + "\nChoices:\n" 142 | prompt += "".join( 143 | [f"{key}. 
{choice}\n" for key, choice in zip(keys, doc["choices"])] 144 | ) 145 | prompt += "Answer:" 146 | return prompt 147 | 148 | keys = ["A", "B", "C", "D"] 149 | return { 150 | "query": format_example(doc, keys), 151 | "choices": doc["choices"], 152 | "gold": ( 153 | keys.index(doc["answer"]) 154 | if isinstance(doc["answer"], str) 155 | else doc["answer"] 156 | ), 157 | } 158 | 159 | def fewshot_examples(self, k, rnd): 160 | # fewshot_examples is not just sampling from train_docs because dev is 161 | # in the same distribution as val/test but auxiliary_train isn't 162 | 163 | if self._fewshot_docs is None: 164 | self._fewshot_docs = list(map(self._process_doc, self.dataset["dev"])) 165 | 166 | return rnd.sample(list(self._fewshot_docs), k) 167 | 168 | def doc_to_text(self, doc): 169 | return doc["query"] 170 | 171 | def should_decontaminate(self): 172 | return True 173 | 174 | def doc_to_decontamination_query(self, doc): 175 | return doc["query"] 176 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 6 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 7 | passages sharing the characteristic that human subjects are able to guess their last 8 | word if they are exposed to the whole passage, but not if they only see the last 9 | sentence preceding the target word. To succeed on LAMBADA, computational models 10 | cannot simply rely on local context, but must be able to keep track of information 11 | in the broader discourse. 
12 | 13 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 14 | """ 15 | from lm_eval.base import Task, rf 16 | from lm_eval.metrics import mean, perplexity 17 | 18 | 19 | _CITATION = """ 20 | @misc{ 21 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 22 | title={The LAMBADA dataset}, 23 | DOI={10.5281/zenodo.2630551}, 24 | publisher={Zenodo}, 25 | year={2016}, 26 | month={Aug} 27 | } 28 | """ 29 | 30 | 31 | class LambadaBase(Task): 32 | VERSION = None 33 | 34 | def training_docs(self): 35 | if self.has_training_docs(): 36 | return self.dataset["train"] 37 | 38 | def validation_docs(self): 39 | if self.has_validation_docs(): 40 | return self.dataset["validation"] 41 | 42 | def test_docs(self): 43 | if self.has_test_docs(): 44 | return self.dataset["test"] 45 | 46 | def doc_to_text(self, doc): 47 | return doc["text"].rsplit(" ", 1)[0] 48 | 49 | def should_decontaminate(self): 50 | return True 51 | 52 | def doc_to_decontamination_query(self, doc): 53 | return doc["text"] 54 | 55 | def doc_to_target(self, doc): 56 | return " " + doc["text"].rsplit(" ", 1)[1] 57 | 58 | def construct_requests(self, doc, ctx): 59 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 60 | 61 | return ll, is_greedy 62 | 63 | def process_results(self, doc, results): 64 | ll, is_greedy = results 65 | 66 | return {"ppl": ll, "acc": int(is_greedy)} 67 | 68 | def aggregation(self): 69 | return {"ppl": perplexity, "acc": mean} 70 | 71 | def higher_is_better(self): 72 | return {"ppl": False, "acc": True} 73 | 74 | 75 | class LambadaStandard(LambadaBase): 76 | """The LAMBADA task using the standard original LAMBADA dataset.""" 77 | 78 | VERSION = 0 79 | DATASET_PATH = "lambada" 80 | 81 | def has_training_docs(self): 82 | return False 83 | 84 | def has_validation_docs(self): 85 | return True 86 | 87 | def has_test_docs(self): 88 | return True 89 | 90 | 91 | class LambadaOpenAI(LambadaBase): 92 | """The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the 93 | original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model. 94 | 95 | Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 96 | """ 97 | 98 | VERSION = 0 99 | DATASET_PATH = "EleutherAI/lambada_openai" 100 | 101 | def has_training_docs(self): 102 | return False 103 | 104 | def has_validation_docs(self): 105 | return False 106 | 107 | def has_test_docs(self): 108 | return True 109 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_cloze.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | Cloze-style LAMBADA dataset. 6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 
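The only change from the base LAMBADA tasks is the prompt: the truncated passage is
suffixed with " ____. ->" before the target word is predicted (see `doc_to_text`
below).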
13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | """ 16 | from lm_eval.tasks.lambada import LambadaOpenAI, LambadaStandard 17 | 18 | 19 | _CITATION = """ 20 | @misc{ 21 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 22 | title={The LAMBADA dataset}, 23 | DOI={10.5281/zenodo.2630551}, 24 | publisher={Zenodo}, 25 | year={2016}, 26 | month={Aug} 27 | } 28 | """ 29 | 30 | 31 | class LambadaStandardCloze(LambadaStandard): 32 | """Cloze-style LambadaStandard.""" 33 | 34 | VERSION = 0 35 | 36 | def doc_to_text(self, doc): 37 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 38 | 39 | def should_decontaminate(self): 40 | return True 41 | 42 | def doc_to_decontamination_query(self, doc): 43 | return doc["text"] 44 | 45 | def doc_to_target(self, doc): 46 | return " " + doc["text"].rsplit(" ", 1)[1] 47 | 48 | 49 | class LambadaOpenAICloze(LambadaOpenAI): 50 | """Cloze-style LambadaOpenAI.""" 51 | 52 | VERSION = 0 53 | 54 | def doc_to_text(self, doc): 55 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["text"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["text"].rsplit(" ", 1)[1] 65 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_multilingual.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | The LAMBADA OpenAI dataset machine-translated to other languages. 6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 
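One task is registered per machine-translated language (en, fr, de, it, es) via
`construct_tasks` at the bottom of this file.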
13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | 16 | Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 17 | """ 18 | from .lambada import LambadaOpenAI 19 | 20 | 21 | _CITATION = """ 22 | @misc{ 23 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 24 | title={The LAMBADA dataset}, 25 | DOI={10.5281/zenodo.2630551}, 26 | publisher={Zenodo}, 27 | year={2016}, 28 | month={Aug} 29 | } 30 | """ 31 | 32 | 33 | class LambadaOpenAIMultilingualEnglish(LambadaOpenAI): 34 | VERSION = 0 35 | DATASET_NAME = "en" 36 | 37 | 38 | class LambadaOpenAIMultilingualFrench(LambadaOpenAI): 39 | VERSION = 0 40 | DATASET_NAME = "fr" 41 | 42 | 43 | class LambadaOpenAIMultilingualGerman(LambadaOpenAI): 44 | VERSION = 0 45 | DATASET_NAME = "de" 46 | 47 | 48 | class LambadaOpenAIMultilingualItalian(LambadaOpenAI): 49 | VERSION = 0 50 | DATASET_NAME = "it" 51 | 52 | 53 | class LambadaOpenAIMultilingualSpanish(LambadaOpenAI): 54 | VERSION = 0 55 | DATASET_NAME = "es" 56 | 57 | 58 | LANG_CLASSES = [ 59 | LambadaOpenAIMultilingualEnglish, 60 | LambadaOpenAIMultilingualFrench, 61 | LambadaOpenAIMultilingualGerman, 62 | LambadaOpenAIMultilingualItalian, 63 | LambadaOpenAIMultilingualSpanish, 64 | ] 65 | 66 | 67 | def construct_tasks(): 68 | tasks = {} 69 | for lang_class in LANG_CLASSES: 70 | tasks[f"lambada_openai_mt_{lang_class.DATASET_NAME}"] = lang_class 71 | return tasks 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/logiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning 3 | https://arxiv.org/pdf/2007.08124.pdf 4 | 5 | LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA 6 | instances, covering multiple types of deductive reasoning. Results show that state- 7 | of-the-art neural models perform by far worse than human ceiling. The dataset can 8 | also serve as a benchmark for reinvestigating logical AI under the deep learning 9 | NLP setting. 
10 | 11 | Homepage: https://github.com/lgw863/LogiQA-dataset 12 | """ 13 | import inspect 14 | import lm_eval.datasets.logiqa.logiqa 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @misc{liu2020logiqa, 20 | title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 21 | author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, 22 | year={2020}, 23 | eprint={2007.08124}, 24 | archivePrefix={arXiv}, 25 | primaryClass={cs.CL} 26 | } 27 | """ 28 | 29 | 30 | class LogiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.logiqa.logiqa) 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | def format_example(doc, choices): 57 | """ 58 | Passage: 59 | Question: 60 | Choices: 61 | A. 62 | B. 63 | C. 64 | D. 65 | Answer: 66 | """ 67 | prompt = "Passage: " + doc["context"] + "\n" 68 | prompt += "Question: " + doc["question"] + "\nChoices:\n" 69 | for choice, option in zip(choices, doc["options"]): 70 | prompt += f"{choice.upper()}. {option}\n" 71 | prompt += "Answer:" 72 | return prompt 73 | 74 | choices = ["a", "b", "c", "d"] 75 | return { 76 | "passage": doc["context"], # Used for decontamination 77 | "query": format_example(doc, choices), 78 | "choices": doc["options"], 79 | "gold": choices.index(doc["label"]), 80 | } 81 | 82 | def doc_to_text(self, doc): 83 | return doc["query"] 84 | 85 | def should_decontaminate(self): 86 | return True 87 | 88 | def doc_to_decontamination_query(self, doc): 89 | return doc["passage"] 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/mathqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms 3 | https://arxiv.org/pdf/1905.13319.pdf 4 | 5 | MathQA is a large-scale dataset of 37k English multiple-choice math word problems 6 | covering multiple math domain categories by modeling operation programs corresponding 7 | to word problems in the AQuA dataset (Ling et al., 2017). 
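Each problem's answer options arrive as a single string, roughly of the form
"a ) 21 , b ) 21.5 , c ) 22 , d ) 22.5 , e ) 23" (values here are illustrative);
`_process_doc` below splits it apart with a regular expression.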
8 | 9 | Homepage: https://math-qa.github.io/math-QA/ 10 | """ 11 | import re 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @misc{amini2019mathqa, 17 | title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms}, 18 | author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi}, 19 | year={2019}, 20 | eprint={1905.13319}, 21 | archivePrefix={arXiv}, 22 | primaryClass={cs.CL} 23 | } 24 | """ 25 | 26 | 27 | class MathQA(MultipleChoiceTask): 28 | VERSION = 0 29 | DATASET_PATH = "math_qa" 30 | DATASET_NAME = None 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"]) 54 | choices = [ 55 | c[4:].rstrip(" ,") 56 | for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"]) 57 | ] 58 | 59 | out_doc = { 60 | "query": "Question: " + doc["Problem"] + "\nAnswer:", 61 | "choices": choices, 62 | "gold": answer_idx, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return doc["query"] 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/mutual.py: -------------------------------------------------------------------------------- 1 | """ 2 | MuTual: A Dataset for Multi-Turn Dialogue Reasoning 3 | https://www.aclweb.org/anthology/2020.acl-main.130/ 4 | 5 | MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is 6 | modified from Chinese high school English listening comprehension test data. 
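Each dialogue is scored by the log-likelihood of its four candidate responses, and
the task reports recall@1, recall@2 and mean reciprocal rank (see `process_results`
below).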
7 | 8 | Homepage: https://github.com/Nealcly/MuTual 9 | """ 10 | import numpy as np 11 | import inspect 12 | import lm_eval.datasets.mutual.mutual 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{mutual, 19 | title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", 20 | author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , 21 | booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", 22 | year = "2020", 23 | publisher = "Association for Computational Linguistics", 24 | } 25 | """ 26 | 27 | 28 | class MuTualBase(Task): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual) 31 | DATASET_NAME = None 32 | CHOICES = ["A", "B", "C", "D"] 33 | 34 | def has_training_docs(self): 35 | return True 36 | 37 | def has_validation_docs(self): 38 | return True 39 | 40 | def has_test_docs(self): 41 | return False 42 | 43 | def training_docs(self): 44 | return self.dataset["train"] 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def test_docs(self): 50 | return NotImplemented 51 | 52 | def doc_to_text(self, doc): 53 | return self.detokenize(doc["article"]) 54 | 55 | def should_decontaminate(self): 56 | return True 57 | 58 | def doc_to_decontamination_query(self, doc): 59 | return doc["article"] 60 | 61 | def doc_to_target(self, doc): 62 | return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])]) 63 | 64 | def construct_requests(self, doc, ctx): 65 | lls = [] 66 | for option in doc["options"]: 67 | lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0]) 68 | return lls 69 | 70 | def detokenize(self, text): 71 | text = text.replace(" '", "'") 72 | text = text.replace(" \n", "\n") 73 | text = text.replace("\n ", "\n") 74 | text = text.replace(" n't", "n't") 75 | text = text.replace("`` ", '"') 76 | text = text.replace("''", '"') 77 | # punctuation 78 | text = text.replace(" :", ":") 79 | text = text.replace(" ;", ";") 80 | text = text.replace(" !", "!") 81 | text = text.replace(" ?", "?") 82 | text = text.replace(" ,", ",") 83 | text = text.replace(" .", ".") 84 | return text 85 | 86 | def process_results(self, doc, results): 87 | gold = self.CHOICES.index(doc["answers"]) 88 | r4_1 = np.argmax(results) == gold # r4_1 = accuracy 89 | ranks = sorted(results, reverse=True) 90 | r4_2 = (ranks.index(results[gold]) == 1) + r4_1 91 | mrr = 1.0 / (ranks.index(results[gold]) + 1) # `+ 1` for index offset 92 | return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr} 93 | 94 | def aggregation(self): 95 | return {"r@1": mean, "r@2": mean, "mrr": mean} 96 | 97 | def higher_is_better(self): 98 | return {"r@1": True, "r@2": True, "mrr": True} 99 | 100 | 101 | class MuTual(MuTualBase): 102 | DATASET_NAME = "mutual" 103 | 104 | 105 | class MuTualPlus(MuTualBase): 106 | DATASET_NAME = "mutual_plus" 107 | -------------------------------------------------------------------------------- /lm_eval/tasks/openbookqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering 3 | https://arxiv.org/pdf/1809.02789.pdf 4 | 5 | OpenBookQA is a question-answering dataset modeled after open book exams for 6 | assessing human understanding of a subject. 
It consists of 5,957 multiple-choice 7 | elementary-level science questions (4,957 train, 500 dev, 500 test), which probe 8 | the understanding of a small “book” of 1,326 core science facts and the application 9 | of these facts to novel situations. For training, the dataset includes a mapping 10 | from each question to the core science fact it was designed to probe. Answering 11 | OpenBookQA questions requires additional broad common knowledge, not contained 12 | in the book. The questions, by design, are answered incorrectly by both a retrieval- 13 | based algorithm and a word co-occurrence algorithm. 14 | 15 | Homepage: https://allenai.org/data/open-book-qa 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{OpenBookQA2018, 22 | title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering}, 23 | author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, 24 | booktitle={EMNLP}, 25 | year={2018} 26 | } 27 | """ 28 | 29 | 30 | class OpenBookQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "openbookqa" 33 | DATASET_NAME = "main" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | out_doc = { 57 | "id": doc["id"], 58 | "query": doc["question_stem"], 59 | "choices": doc["choices"]["text"], 60 | "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), 61 | } 62 | return out_doc 63 | 64 | def doc_to_text(self, doc): 65 | return doc["query"] 66 | 67 | def should_decontaminate(self): 68 | return True 69 | 70 | def doc_to_decontamination_query(self, doc): 71 | return doc["query"] 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/pile.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Pile: An 800GB Dataset of Diverse Text for Language Modeling 3 | https://arxiv.org/pdf/2101.00027.pdf 4 | 5 | The Pile is a 825 GiB diverse, open source language modelling data set that consists 6 | of 22 smaller, high-quality datasets combined together. To score well on Pile 7 | BPB (bits per byte), a model must be able to understand many disparate domains 8 | including books, github repositories, webpages, chat logs, and medical, physics, 9 | math, computer science, and philosophy papers. 
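Each Pile component is exposed as its own perplexity task below (one subclass of
`PilePerplexityTask` per component), evaluated on that component's validation and
test documents.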
10 | 11 | Homepage: https://pile.eleuther.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.pile.pile 15 | from lm_eval.base import PerplexityTask 16 | 17 | 18 | _CITATION = """ 19 | @article{pile, 20 | title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling}, 21 | author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor}, 22 | journal={arXiv preprint arXiv:2101.00027}, 23 | year={2020} 24 | } 25 | """ 26 | 27 | 28 | class PilePerplexityTask(PerplexityTask): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile) 31 | DATASET_NAME = None 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def validation_docs(self): 40 | for doc in self.dataset["validation"]: 41 | yield doc["text"] 42 | 43 | def test_docs(self): 44 | for doc in self.dataset["test"]: 45 | yield doc["text"] 46 | 47 | 48 | class PileArxiv(PilePerplexityTask): 49 | DATASET_NAME = "pile_arxiv" 50 | 51 | 52 | class PileBooks3(PilePerplexityTask): 53 | DATASET_NAME = "pile_books3" 54 | 55 | 56 | class PileBookCorpus2(PilePerplexityTask): 57 | DATASET_NAME = "pile_bookcorpus2" 58 | 59 | 60 | class PileDmMathematics(PilePerplexityTask): 61 | DATASET_NAME = "pile_dm-mathematics" 62 | 63 | 64 | class PileEnron(PilePerplexityTask): 65 | DATASET_NAME = "pile_enron" 66 | 67 | 68 | class PileEuroparl(PilePerplexityTask): 69 | DATASET_NAME = "pile_europarl" 70 | 71 | 72 | class PileFreeLaw(PilePerplexityTask): 73 | DATASET_NAME = "pile_freelaw" 74 | 75 | 76 | class PileGithub(PilePerplexityTask): 77 | DATASET_NAME = "pile_github" 78 | 79 | 80 | class PileGutenberg(PilePerplexityTask): 81 | DATASET_NAME = "pile_gutenberg" 82 | 83 | 84 | class PileHackernews(PilePerplexityTask): 85 | DATASET_NAME = "pile_hackernews" 86 | 87 | 88 | class PileNIHExporter(PilePerplexityTask): 89 | DATASET_NAME = "pile_nih-exporter" 90 | 91 | 92 | class PileOpenSubtitles(PilePerplexityTask): 93 | DATASET_NAME = "pile_opensubtitles" 94 | 95 | 96 | class PileOpenWebText2(PilePerplexityTask): 97 | DATASET_NAME = "pile_openwebtext2" 98 | 99 | 100 | class PilePhilPapers(PilePerplexityTask): 101 | DATASET_NAME = "pile_philpapers" 102 | 103 | 104 | class PilePileCc(PilePerplexityTask): 105 | DATASET_NAME = "pile_pile-cc" 106 | 107 | 108 | class PilePubmedAbstracts(PilePerplexityTask): 109 | DATASET_NAME = "pile_pubmed-abstracts" 110 | 111 | 112 | class PilePubmedCentral(PilePerplexityTask): 113 | DATASET_NAME = "pile_pubmed-central" 114 | 115 | 116 | class PileStackExchange(PilePerplexityTask): 117 | DATASET_NAME = "pile_stackexchange" 118 | 119 | 120 | class PileUspto(PilePerplexityTask): 121 | DATASET_NAME = "pile_upsto" 122 | 123 | 124 | class PileUbuntuIrc(PilePerplexityTask): 125 | DATASET_NAME = "pile_ubuntu-irc" 126 | 127 | 128 | class PileWikipedia(PilePerplexityTask): 129 | DATASET_NAME = "pile_wikipedia" 130 | 131 | 132 | class PileYoutubeSubtitles(PilePerplexityTask): 133 | DATASET_NAME = "pile_youtubesubtitles" 134 | -------------------------------------------------------------------------------- /lm_eval/tasks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PIQA: Reasoning about Physical Commonsense in Natural Language 3 | https://arxiv.org/pdf/1911.11641.pdf 4 | 5 | Physical Interaction: Question Answering (PIQA) is a 
physical commonsense 6 | reasoning and a corresponding benchmark dataset. PIQA was designed to investigate 7 | the physical knowledge of existing models. To what extent are current approaches 8 | actually learning about the world? 9 | 10 | Homepage: https://yonatanbisk.com/piqa/ 11 | """ 12 | 13 | from lm_eval.base import MultipleChoiceTask 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{Bisk2020, 18 | author = {Yonatan Bisk and Rowan Zellers and 19 | Ronan Le Bras and Jianfeng Gao 20 | and Yejin Choi}, 21 | title = {PIQA: Reasoning about Physical Commonsense in 22 | Natural Language}, 23 | booktitle = {Thirty-Fourth AAAI Conference on 24 | Artificial Intelligence}, 25 | year = {2020}, 26 | } 27 | """ 28 | 29 | 30 | class PiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "piqa" 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return False 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "goal": doc["goal"], 55 | "choices": [doc["sol1"], doc["sol2"]], 56 | "gold": doc["label"], 57 | } 58 | return out_doc 59 | 60 | def doc_to_text(self, doc): 61 | return "Question: " + doc["goal"] + "\nAnswer:" 62 | 63 | def should_decontaminate(self): 64 | return True 65 | 66 | def doc_to_decontamination_query(self, doc): 67 | return doc["goal"] 68 | -------------------------------------------------------------------------------- /lm_eval/tasks/prost.py: -------------------------------------------------------------------------------- 1 | """ 2 | PROST: Physical Reasoning about Objects Through Space and Time 3 | https://arxiv.org/pdf/2106.03634.pdf 4 | 5 | PROST, Physical Reasoning about Objects Through Space and Time, is a dataset 6 | consisting of 18,736 multiple-choice questions made from 14 manually curated 7 | templates, covering 10 physical reasoning concepts. All questions are designed 8 | to probe both causal and masked language models in a zero-shot setting. 9 | 10 | NOTE: PROST is limited to the zero-shot setting to adhere to authors' intentions 11 | as discussed in section 7 of the paper: "We hope that the community will use 12 | this dataset in the intended way: in a zero-shot setting to probe models which 13 | have been trained on data not specifically collected to succeed on PROST." 
14 | 15 | Homepage: https://github.com/nala-cub/prost 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{aroca-ouellette-etal-2021-prost, 22 | title = "{PROST}: {P}hysical Reasoning about Objects through Space and Time", 23 | author = "Aroca-Ouellette, St{\'e}phane and 24 | Paik, Cory and 25 | Roncone, Alessandro and 26 | Kann, Katharina", 27 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 28 | month = aug, 29 | year = "2021", 30 | address = "Online", 31 | publisher = "Association for Computational Linguistics", 32 | url = "https://aclanthology.org/2021.findings-acl.404", 33 | pages = "4597--4608", 34 | } 35 | """ 36 | 37 | 38 | class PROST(MultipleChoiceTask): 39 | VERSION = 0 40 | DATASET_PATH = "corypaik/prost" 41 | DATASET_NAME = None 42 | 43 | def has_training_docs(self): 44 | return False 45 | 46 | def has_validation_docs(self): 47 | return False 48 | 49 | def has_test_docs(self): 50 | return True 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def fewshot_context( 56 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 57 | ): 58 | assert ( 59 | num_fewshot == 0 60 | ), "PROST is designed to probe models in a zero-shot fashion only." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def _process_doc(self, doc): 66 | out_doc = { 67 | "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:", 68 | "choices": [doc["A"], doc["B"], doc["C"], doc["D"]], 69 | "gold": doc["label"], 70 | } 71 | return out_doc 72 | 73 | def doc_to_text(self, doc): 74 | return doc["query"] 75 | 76 | def should_decontaminate(self): 77 | return True 78 | 79 | def doc_to_decontamination_query(self, doc): 80 | return doc["query"] 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/pubmedqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PubMedQA: A Dataset for Biomedical Research Question Answering 3 | https://arxiv.org/pdf/1909.06146.pdf 4 | 5 | PubMedQA is a novel biomedical question answering (QA) dataset collected from 6 | PubMed abstracts. The task of PubMedQA is to answer research questions with 7 | yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after 8 | coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA 9 | has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA 10 | instances. Each PubMedQA instance is composed of (1) a question which is either 11 | an existing research article title or derived from one, (2) a context which is 12 | the corresponding abstract without its conclusion, (3) a long answer, which is 13 | the conclusion of the abstract and, presumably, answers the research question, 14 | and (4) a yes/no/maybe answer which summarizes the conclusion. 
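The implementation below treats this as three-way classification: the
log-likelihoods of the continuations " yes", " no" and " maybe" are compared after
the abstract and question (see `construct_requests`).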
15 | 16 | Homepage: https://pubmedqa.github.io/ 17 | """ 18 | import numpy as np 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @inproceedings{jin2019pubmedqa, 25 | title={PubMedQA: A Dataset for Biomedical Research Question Answering}, 26 | author={Jin, Qiao and Dhingra, Bhuwan and Liu, Zhengping and Cohen, William and Lu, Xinghua}, 27 | booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, 28 | pages={2567--2577}, 29 | year={2019} 30 | } 31 | """ 32 | 33 | 34 | class Pubmed_QA(Task): 35 | VERSION = 0 36 | DATASET_PATH = "pubmed_qa" 37 | DATASET_NAME = "pqa_labeled" 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def test_docs(self): 49 | if self.has_test_docs(): 50 | # HF is labelled as train but its really just for testing 51 | return self.dataset["train"] 52 | 53 | def doc_to_text(self, doc): 54 | ctxs = "\n".join(doc["context"]["contexts"]) 55 | return "Abstract: {}\nQuestion: {}\nAnswer:".format( 56 | ctxs, doc["question"], doc["final_decision"] 57 | ) 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] + " " + "\n".join(doc["context"]["contexts"]) 64 | 65 | def doc_to_target(self, doc): 66 | return " {}".format(doc["final_decision"]) 67 | 68 | def construct_requests(self, doc, ctx): 69 | """Uses RequestFactory to construct Requests and returns 70 | an iterable of Requests which will be sent to the LM. 71 | """ 72 | ll_yes, _ = rf.loglikelihood(ctx, " yes") 73 | ll_no, _ = rf.loglikelihood(ctx, " no") 74 | ll_maybe, _ = rf.loglikelihood(ctx, " maybe") 75 | return ll_yes, ll_no, ll_maybe 76 | 77 | def process_results(self, doc, results): 78 | gold = doc["final_decision"] 79 | ll_yes, ll_no, ll_maybe = results 80 | pred = np.argmax(results) 81 | return { 82 | "acc": ["yes", "no", "maybe"][pred] == gold, 83 | } 84 | 85 | def aggregation(self): 86 | return {"acc": mean} 87 | 88 | def higher_is_better(self): 89 | return {"acc": True} 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/qa4mre.py: -------------------------------------------------------------------------------- 1 | """ 2 | QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation 3 | https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf 4 | 5 | The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013. 6 | The main objective of this exercise is to develop a methodology for evaluating 7 | Machine Reading systems through Question Answering and Reading Comprehension 8 | Tests. Systems should be able to extract knowledge from large volumes of text 9 | and use this knowledge to answer questions. Four different tasks have been 10 | organized during these years: Main Task, Processing Modality and Negation for 11 | Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, 12 | and Entrance Exam. 
13 | 14 | Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php 15 | """ 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @inproceedings{Peas2013QA4MRE2O, 21 | title={QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation}, 22 | author={Anselmo Pe{\~n}as and Eduard H. Hovy and Pamela Forner and {\'A}lvaro Rodrigo and Richard F. E. Sutcliffe and Roser Morante}, 23 | booktitle={CLEF}, 24 | year={2013} 25 | } 26 | """ # noqa: W605 27 | 28 | 29 | class QA4MRE(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = "qa4mre" 32 | DATASET_NAME = None 33 | 34 | def has_training_docs(self): 35 | return False 36 | 37 | def has_validation_docs(self): 38 | return False 39 | 40 | def has_test_docs(self): 41 | return True 42 | 43 | def test_docs(self): 44 | # `qa4mre` only has train data so we use it for the test docs. 45 | return map(self._process_doc, self.dataset["train"]) 46 | 47 | def _process_doc(self, doc): 48 | choices = doc["answer_options"]["answer_str"] 49 | out_doc = { 50 | "source": doc["document_str"].strip().replace("'", "'"), 51 | "query": doc["question_str"], 52 | "choices": choices, 53 | "gold": int(doc["correct_answer_id"]) - 1, 54 | } 55 | return out_doc 56 | 57 | def doc_to_text(self, doc): 58 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]) 59 | 60 | def should_decontaminate(self): 61 | return True 62 | 63 | def doc_to_decontamination_query(self, doc): 64 | return doc["source"] + " " + doc["query"] 65 | 66 | 67 | class QA4MRE_2011(QA4MRE): 68 | DATASET_NAME = "2011.main.EN" 69 | 70 | 71 | class QA4MRE_2012(QA4MRE): 72 | DATASET_NAME = "2012.main.EN" 73 | 74 | 75 | class QA4MRE_2013(QA4MRE): 76 | DATASET_NAME = "2013.main.EN" 77 | -------------------------------------------------------------------------------- /lm_eval/tasks/quac.py: -------------------------------------------------------------------------------- 1 | """ 2 | QuAC: Question Answering in Context 3 | https://arxiv.org/abs/1808.07036 4 | 5 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 6 | participating in information seeking dialog. Data instances consist of an interactive 7 | dialog between two crowd workers: (1) a student who poses a sequence of freeform 8 | questions to learn as much as possible about a hidden Wikipedia text, and (2) 9 | a teacher who answers the questions by providing short excerpts (spans) from the text. 
10 | 11 | Homepage: https://quac.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.quac.quac 15 | from lm_eval.base import Task 16 | 17 | 18 | _CITATION = """ 19 | @article{choi2018quac, 20 | title={Quac: Question answering in context}, 21 | author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, 22 | journal={arXiv preprint arXiv:1808.07036}, 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class QuAC(Task): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) 31 | DATASET_NAME = None 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def test_docs(self): 51 | raise NotImplementedError("QuAC has no test docs.") 52 | 53 | def _process_doc(self, doc): 54 | doc["title"] = doc["title"] + " - " + doc["section_title"] 55 | return doc 56 | 57 | def doc_to_text(self, doc): 58 | return ( 59 | "TITLE: " 60 | + doc["title"] 61 | + "\n" 62 | + "PARAGRAPH: " 63 | + doc["paragraph"] 64 | + "\n\n" 65 | + "Q: " 66 | + doc["question"] 67 | + "\n\n" 68 | + "A: " 69 | ) 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["paragraph"] 76 | 77 | def doc_to_target(self, doc): 78 | return doc["answer"] 79 | 80 | def construct_requests(self, doc, ctx): 81 | """Uses RequestFactory to construct Requests and returns an iterable of 82 | Requests which will be sent to the LM. 83 | 84 | :param doc: 85 | The document as returned from training_docs, validation_docs, or test_docs. 86 | :param ctx: str 87 | The context string, generated by fewshot_context. This includes the natural 88 | language description, as well as the few shot examples, and the question 89 | part of the document for `doc`. 90 | """ 91 | # TODO: implement evaluation. 92 | raise NotImplementedError("Evaluation not implemented") 93 | 94 | def process_results(self, doc, results): 95 | """Take a single document and the LM results and evaluates, returning a 96 | dict where keys are the names of submetrics and values are the values of 97 | the metric for that one document 98 | 99 | :param doc: 100 | The document as returned from training_docs, validation_docs, or test_docs. 101 | :param results: 102 | The results of the requests created in construct_requests. 103 | """ 104 | # TODO: implement evaluation. 105 | raise NotImplementedError("Evaluation not implemented") 106 | 107 | def aggregation(self): 108 | """ 109 | :returns: {str: [float] -> float} 110 | A dictionary where keys are the names of submetrics and values are 111 | functions that aggregate a list of metrics 112 | """ 113 | # TODO: implement evaluation. 114 | raise NotImplementedError("Evaluation not implemented") 115 | 116 | def higher_is_better(self): 117 | """ 118 | :returns: {str: bool} 119 | A dictionary where keys are the names of submetrics and values are 120 | whether a higher value of the submetric is better 121 | """ 122 | # TODO: implement evaluation. 
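        # (One plausible, currently unimplemented choice for extractive QA like
        #  QuAC would be span-overlap F1, i.e. returning {"f1": True} here with a
        #  matching F1 aggregation; for now this method, like the other evaluation
        #  hooks in this class, simply raises.)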
123 | raise NotImplementedError("Evaluation not implemented") 124 | -------------------------------------------------------------------------------- /lm_eval/tasks/sat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Similarity of Semantic Relations 3 | https://arxiv.org/pdf/cs/0608100.pdf 4 | 5 | SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 6 | multiple-choice analogy questions; 5 choices per question. 7 | 8 | Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) 9 | """ 10 | import inspect 11 | import lm_eval.datasets.sat_analogies.sat_analogies 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @article{article, 17 | author = {Turney, Peter}, 18 | year = {2006}, 19 | month = {09}, 20 | pages = {379-416}, 21 | title = {Similarity of Semantic Relations}, 22 | volume = {32}, 23 | journal = {Computational Linguistics}, 24 | doi = {10.1162/coli.2006.32.3.379} 25 | } 26 | """ 27 | 28 | 29 | class SATAnalogies(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies) 32 | DATASET_NAME = None 33 | 34 | def __init__(self, data_dir: str): 35 | """ 36 | SAT Analog Questions is not publicly available. You must request the data 37 | by emailing Peter Turney and then download it to a local directory path 38 | which should be passed into the `data_dir` arg. 39 | """ 40 | super().__init__(data_dir=data_dir) 41 | 42 | def has_training_docs(self): 43 | return False 44 | 45 | def has_validation_docs(self): 46 | return True 47 | 48 | def has_test_docs(self): 49 | return False 50 | 51 | def training_docs(self): 52 | return [] 53 | 54 | def validation_docs(self): 55 | return map(self._process_doc, self.dataset["validation"]) 56 | 57 | def test_docs(self): 58 | return [] 59 | 60 | def _process_doc(self, doc): 61 | return { 62 | "source": doc["source"], 63 | "query": doc["stem"].split(" ")[:2], 64 | "choices": [ 65 | "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"] 66 | ], 67 | "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()), 68 | } 69 | 70 | def doc_to_text(self, doc): 71 | return "{} is to {} as".format(*doc["query"]) 72 | 73 | def should_decontaminate(self): 74 | return True 75 | 76 | def doc_to_decontamination_query(self, doc): 77 | return doc["source"] + "\n" + " ".join(doc["query"]) 78 | -------------------------------------------------------------------------------- /lm_eval/tasks/sciq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crowdsourcing Multiple Choice Science Questions 3 | https://aclanthology.org/W17-4413.pdf 4 | 5 | The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, 6 | Chemistry and Biology, among others. The questions are in multiple-choice format 7 | with 4 answer options each. For the majority of the questions, an additional paragraph 8 | with supporting evidence for the correct answer is provided. 9 | 10 | Homepage: https://allenai.org/data/sciq 11 | """ 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @inproceedings{Welbl2017CrowdsourcingMC, 17 | title={Crowdsourcing Multiple Choice Science Questions}, 18 | author={Johannes Welbl and Nelson F. 
Liu and Matt Gardner}, 19 | booktitle={NUT@EMNLP}, 20 | year={2017} 21 | } 22 | """ 23 | 24 | 25 | class SciQ(MultipleChoiceTask): 26 | VERSION = 0 27 | DATASET_PATH = "sciq" 28 | DATASET_NAME = None 29 | 30 | def has_training_docs(self): 31 | return True 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def training_docs(self): 40 | if self._training_docs is None: 41 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 42 | return self._training_docs 43 | 44 | def validation_docs(self): 45 | return map(self._process_doc, self.dataset["validation"]) 46 | 47 | def test_docs(self): 48 | return map(self._process_doc, self.dataset["test"]) 49 | 50 | def _process_doc(self, doc): 51 | choices = [ 52 | doc["distractor1"], 53 | doc["distractor2"], 54 | doc["distractor3"], 55 | doc["correct_answer"], 56 | ] 57 | src = doc["support"] 58 | out_doc = { 59 | "source": src, 60 | "query": doc["question"], 61 | "choices": choices, 62 | "gold": 3, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip() 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["source"] + " " + doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/swag.py: -------------------------------------------------------------------------------- 1 | """ 2 | SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference 3 | https://arxiv.org/pdf/1808.05326.pdf 4 | 5 | SWAG (Situations With Adversarial Generations) is an adversarial dataset 6 | that consists of 113k multiple choice questions about grounded situations. Each 7 | question is a video caption from LSMDC or ActivityNet Captions, with four answer 8 | choices about what might happen next in the scene. The correct answer is the 9 | (real) video caption for the next event in the video; the three incorrect 10 | answers are adversarially generated and human verified, so as to fool machines 11 | but not humans. 
12 | 13 | Homepage: https://rowanzellers.com/swag/ 14 | """ 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @inproceedings{zellers2018swagaf, 20 | title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference}, 21 | author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin}, 22 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)", 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class SWAG(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = "swag" 31 | DATASET_NAME = "regular" 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def _process_doc(self, doc): 51 | out_doc = { 52 | "query": doc["startphrase"], 53 | "choices": [doc["ending0"], doc["ending1"], doc["ending2"], doc["ending3"]], 54 | "gold": int(doc["label"]), 55 | } 56 | return out_doc 57 | 58 | def doc_to_text(self, doc): 59 | return doc["query"] 60 | -------------------------------------------------------------------------------- /lm_eval/tasks/toxigen.py: -------------------------------------------------------------------------------- 1 | """ 2 | ToxiGen: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection 3 | https://arxiv.org/abs/2203.09509 4 | 5 | Classify input text as either hateful or not hateful. 6 | 7 | Homepage: https://github.com/microsoft/TOXIGEN 8 | """ 9 | from lm_eval.base import MultipleChoiceTask 10 | import numpy as np 11 | import pandas as pd 12 | 13 | 14 | _CITATION = """ 15 | @inproceedings{hartvigsen2022toxigen, 16 | title={ToxiGen: A Large-Scale Machine-Generated Dataset for Implicit and Adversarial Hate Speech Detection}, 17 | author={Hartvigsen, Thomas and Gabriel, Saadia and Palangi, Hamid and Sap, Maarten and Ray, Dipankar and Kamar, Ece}, 18 | booktitle={Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics}, 19 | year={2022} 20 | } 21 | """ 22 | 23 | 24 | class ToxiGen(MultipleChoiceTask): 25 | VERSION = 0 26 | DATASET_PATH = "skg/toxigen-data" 27 | DATASET_NAME = "annotated" 28 | 29 | def has_training_docs(self): 30 | return True 31 | 32 | def has_validation_docs(self): 33 | return False 34 | 35 | def has_test_docs(self): 36 | return True 37 | 38 | def training_docs(self): 39 | if self.has_training_docs(): 40 | # We cache training documents in `self._training_docs` for faster 41 | # few-shot processing. If the data is too large to fit in memory, 42 | # return the training data as a generator instead of a list. 
43 | if self._training_docs is None: 44 | dataset = self._preprocess_dataset("train") 45 | self._training_docs = list(map(self._process_doc, dataset)) 46 | return self._training_docs 47 | 48 | def test_docs(self): 49 | if self.has_test_docs(): 50 | dataset = self._preprocess_dataset("test") 51 | return map(self._process_doc, dataset) 52 | 53 | def _preprocess_dataset(self, split: str): 54 | """Preprocess the dataset into a list of (text, label) tuples.""" 55 | d = pd.DataFrame(self.dataset[split]) 56 | text = d["text"] 57 | labels = np.round(((d["toxicity_ai"] + d["toxicity_human"]) > 5.5), 0).astype( 58 | np.int32 59 | ) 60 | return [[x, y] for x, y in zip(text, labels)] 61 | 62 | def _process_doc(self, doc): 63 | return { 64 | "query": doc[0], 65 | "choices": ["No", "Yes"], 66 | "gold": doc[1], 67 | } 68 | 69 | def doc_to_text(self, doc): 70 | return f"Is the following statement hateful? Respond with either Yes or No. Statement: '{doc['query']}'" 71 | -------------------------------------------------------------------------------- /lm_eval/tasks/triviaqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension 3 | https://arxiv.org/pdf/1705.03551.pdf 4 | 5 | TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence 6 | triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts 7 | and independently gathered evidence documents, six per question on average, that provide 8 | high quality distant supervision for answering the questions. 9 | 10 | Homepage: https://nlp.cs.washington.edu/triviaqa/ 11 | """ 12 | import inspect 13 | import lm_eval.datasets.triviaqa.triviaqa 14 | from lm_eval.base import Task, rf 15 | from lm_eval.metrics import mean 16 | 17 | 18 | _CITATION = """ 19 | @InProceedings{JoshiTriviaQA2017, 20 | author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke}, 21 | title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 22 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, 23 | month = {July}, 24 | year = {2017}, 25 | address = {Vancouver, Canada}, 26 | publisher = {Association for Computational Linguistics}, 27 | } 28 | """ 29 | 30 | 31 | class TriviaQA(Task): 32 | VERSION = 1 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa) 34 | DATASET_NAME = None 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | return self.dataset["train"] 47 | 48 | def validation_docs(self): 49 | return self.dataset["validation"] 50 | 51 | def test_docs(self): 52 | raise NotImplementedError() 53 | 54 | def doc_to_text(self, doc): 55 | return f"Question: {doc['question']}\nAnswer:" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["question"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["answer"]["value"] 65 | 66 | def _remove_prefixes(self, aliases): 67 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 68 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 69 | aliases.sort() 70 | ret = [aliases[0]] 71 | for alias in aliases[1:]: 72 | if not alias.startswith(ret[-1]): 73 | ret.append(alias) 74 | return ret 75 | 76 | def construct_requests(self, doc, ctx): 77 | ret = [] 78 | for alias in self._remove_prefixes(doc["answer"]["aliases"]): 79 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 80 | ret.append(is_prediction) 81 | return ret 82 | 83 | def process_results(self, doc, results): 84 | return {"acc": float(any(results))} 85 | 86 | def aggregation(self): 87 | return { 88 | "acc": mean, 89 | } 90 | 91 | def higher_is_better(self): 92 | return {"acc": True} 93 | -------------------------------------------------------------------------------- /lm_eval/tasks/unscramble.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 6 | involves giving the model a word distorted by some combination of scrambling, 7 | addition, or deletion of characters, and asking it to recover the original word. 
8 | 9 | Homepage: https://github.com/openai/gpt-3/tree/master/data 10 | """ 11 | import inspect 12 | import lm_eval.datasets.unscramble.unscramble 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{NEURIPS2020_1457c0d6, 19 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 20 | booktitle = {Advances in Neural Information Processing Systems}, 21 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 22 | pages = {1877--1901}, 23 | publisher = {Curran Associates, Inc.}, 24 | title = {Language Models are Few-Shot Learners}, 25 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 26 | volume = {33}, 27 | year = {2020} 28 | } 29 | """ 30 | 31 | 32 | class WordUnscrambleTask(Task): 33 | VERSION = 0 34 | DATASET_PATH = inspect.getfile(lm_eval.datasets.unscramble.unscramble) 35 | DATASET_NAME = None 36 | 37 | def has_training_docs(self): 38 | return False 39 | 40 | def has_validation_docs(self): 41 | return True 42 | 43 | def has_test_docs(self): 44 | return False 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def doc_to_text(self, doc): 50 | return doc["context"] 51 | 52 | def should_decontaminate(self): 53 | return True 54 | 55 | def doc_to_decontamination_query(self, doc): 56 | return doc["context"] 57 | 58 | def doc_to_target(self, doc): 59 | return doc["completion"] 60 | 61 | def construct_requests(self, doc, ctx): 62 | completion = rf.greedy_until(ctx, ["\n"]) 63 | return completion 64 | 65 | def process_results(self, doc, results): 66 | pred = results[0] 67 | gold = doc["completion"] 68 | return {"acc": int(pred == gold)} 69 | 70 | def aggregation(self): 71 | return {"acc": mean} 72 | 73 | def higher_is_better(self): 74 | return {"acc": True} 75 | 76 | 77 | class Anagrams1(WordUnscrambleTask): 78 | DATASET_NAME = "mid_word_1_anagrams" 79 | 80 | 81 | class Anagrams2(WordUnscrambleTask): 82 | DATASET_NAME = "mid_word_2_anagrams" 83 | 84 | 85 | class CycleLetters(WordUnscrambleTask): 86 | DATASET_NAME = "cycle_letters_in_word" 87 | 88 | 89 | class RandomInsertion(WordUnscrambleTask): 90 | DATASET_NAME = "random_insertion_in_word" 91 | 92 | 93 | class ReversedWords(WordUnscrambleTask): 94 | DATASET_NAME = "reversed_words" 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/webqs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Semantic Parsing on Freebase from Question-Answer Pairs 3 | https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf 4 | 5 | WebQuestions is a benchmark for question answering. The dataset consists of 6,642 6 | question/answer pairs. The questions are supposed to be answerable by Freebase, a 7 | large knowledge graph. The questions are mostly centered around a single named entity. 
8 | The questions are popular ones asked on the web (at least in 2013). 9 | 10 | Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a 11 | """ 12 | from lm_eval.base import rf, Task 13 | from lm_eval.metrics import mean 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{berant-etal-2013-semantic, 18 | title = "Semantic Parsing on {F}reebase from Question-Answer Pairs", 19 | author = "Berant, Jonathan and 20 | Chou, Andrew and 21 | Frostig, Roy and 22 | Liang, Percy", 23 | booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing", 24 | month = oct, 25 | year = "2013", 26 | address = "Seattle, Washington, USA", 27 | publisher = "Association for Computational Linguistics", 28 | url = "https://aclanthology.org/D13-1160", 29 | pages = "1533--1544", 30 | } 31 | """ 32 | 33 | 34 | class WebQs(Task): 35 | VERSION = 0 36 | DATASET_PATH = "web_questions" 37 | DATASET_NAME = None 38 | 39 | def has_training_docs(self): 40 | return True 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def training_docs(self): 49 | if self._training_docs is None: 50 | self._training_docs = list(self.dataset["train"]) 51 | return self._training_docs 52 | 53 | def test_docs(self): 54 | return self.dataset["test"] 55 | 56 | def doc_to_text(self, doc): 57 | return "Question: " + doc["question"] + "\nAnswer:" 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] 64 | 65 | def doc_to_target(self, doc): 66 | # this picks one answer to be the "correct" one, despite sometimes 67 | # multiple correct answers being possible. 68 | # TODO: make sure we're actually handling multi-answer correctly 69 | return " " + doc["answers"][0] 70 | 71 | def _remove_prefixes(self, aliases): 72 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 73 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 74 | aliases.sort() 75 | ret = [aliases[0]] 76 | for alias in aliases[1:]: 77 | if not alias.startswith(ret[-1]): 78 | ret.append(alias) 79 | 80 | return ret 81 | 82 | def construct_requests(self, doc, ctx): 83 | ret = [] 84 | for alias in self._remove_prefixes(doc["answers"]): 85 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 86 | ret.append(is_prediction) 87 | return ret 88 | 89 | def process_results(self, doc, results): 90 | return {"acc": float(any(results))} 91 | 92 | def aggregation(self): 93 | return { 94 | "acc": mean, 95 | } 96 | 97 | def higher_is_better(self): 98 | return {"acc": True} 99 | -------------------------------------------------------------------------------- /lm_eval/tasks/wikitext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pointer Sentinel Mixture Models 3 | https://arxiv.org/pdf/1609.07843.pdf 4 | 5 | The WikiText language modeling dataset is a collection of over 100 million tokens 6 | extracted from the set of verified Good and Featured articles on Wikipedia. 7 | 8 | NOTE: This `Task` is based on WikiText-2. 
9 | 10 | Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ 11 | """ 12 | import re 13 | from lm_eval.base import PerplexityTask 14 | 15 | 16 | _CITATION = """ 17 | @misc{merity2016pointer, 18 | title={Pointer Sentinel Mixture Models}, 19 | author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher}, 20 | year={2016}, 21 | eprint={1609.07843}, 22 | archivePrefix={arXiv}, 23 | primaryClass={cs.CL} 24 | } 25 | """ 26 | 27 | 28 | def wikitext_detokenizer(string): 29 | # contractions 30 | string = string.replace("s '", "s'") 31 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 32 | # number separators 33 | string = string.replace(" @-@ ", "-") 34 | string = string.replace(" @,@ ", ",") 35 | string = string.replace(" @.@ ", ".") 36 | # punctuation 37 | string = string.replace(" : ", ": ") 38 | string = string.replace(" ; ", "; ") 39 | string = string.replace(" . ", ". ") 40 | string = string.replace(" ! ", "! ") 41 | string = string.replace(" ? ", "? ") 42 | string = string.replace(" , ", ", ") 43 | # double brackets 44 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 45 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 46 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 47 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 48 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 49 | # miscellaneous 50 | string = string.replace("= = = =", "====") 51 | string = string.replace("= = =", "===") 52 | string = string.replace("= =", "==") 53 | string = string.replace(" " + chr(176) + " ", chr(176)) 54 | string = string.replace(" \n", "\n") 55 | string = string.replace("\n ", "\n") 56 | string = string.replace(" N ", " 1 ") 57 | string = string.replace(" 's", "'s") 58 | 59 | return string 60 | 61 | 62 | class WikiText(PerplexityTask): 63 | VERSION = 1 64 | DATASET_PATH = "EleutherAI/wikitext_document_level" 65 | DATASET_NAME = "wikitext-2-raw-v1" 66 | 67 | def has_training_docs(self): 68 | return True 69 | 70 | def has_validation_docs(self): 71 | return True 72 | 73 | def has_test_docs(self): 74 | return True 75 | 76 | def training_docs(self): 77 | return map(self._process_doc, self.dataset["train"]) 78 | 79 | def validation_docs(self): 80 | return map(self._process_doc, self.dataset["validation"]) 81 | 82 | def test_docs(self): 83 | return map(self._process_doc, self.dataset["test"]) 84 | 85 | def _process_doc(self, doc): 86 | return doc["page"] 87 | 88 | def doc_to_target(self, doc): 89 | return wikitext_detokenizer(doc) 90 | 91 | def should_decontaminate(self): 92 | return True 93 | 94 | def count_words(self, doc): 95 | # count number of words in *original doc before detokenization* 96 | return len(re.split(r"\s+", doc)) 97 | -------------------------------------------------------------------------------- /lm_eval/tasks/winogrande.py: -------------------------------------------------------------------------------- 1 | """ 2 | WinoGrande: An Adversarial Winograd Schema Challenge at Scale 3 | https://arxiv.org/pdf/1907.10641.pdf 4 | 5 | WinoGrande is a collection of 44k problems, inspired by Winograd Schema Challenge 6 | (Levesque, Davis, and Morgenstern 2011), but adjusted to improve the scale and 7 | robustness against the dataset-specific bias. Formulated as a fill-in-a-blank 8 | task with binary options, the goal is to choose the right option for a given 9 | sentence which requires commonsense reasoning. 
10 | 11 | NOTE: This evaluation of Winogrande uses partial evaluation as described by 12 | Trinh & Le in Simple Method for Commonsense Reasoning (2018). 13 | See: https://arxiv.org/abs/1806.02847 14 | 15 | Homepage: https://leaderboard.allenai.org/winogrande/submissions/public 16 | """ 17 | 18 | import numpy as np 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @article{sakaguchi2019winogrande, 25 | title={WinoGrande: An Adversarial Winograd Schema Challenge at Scale}, 26 | author={Sakaguchi, Keisuke and Bras, Ronan Le and Bhagavatula, Chandra and Choi, Yejin}, 27 | journal={arXiv preprint arXiv:1907.10641}, 28 | year={2019} 29 | } 30 | """ 31 | 32 | 33 | class Winogrande(Task): 34 | VERSION = 0 35 | DATASET_PATH = "winogrande" 36 | DATASET_NAME = "winogrande_xl" 37 | 38 | answer_to_num = {"1": 0, "2": 1} 39 | 40 | def has_training_docs(self): 41 | return True 42 | 43 | def has_validation_docs(self): 44 | return True 45 | 46 | def has_test_docs(self): 47 | return False 48 | 49 | def training_docs(self): 50 | if self._training_docs is None: 51 | self._training_docs = list(self.dataset["train"]) 52 | return self._training_docs 53 | 54 | def validation_docs(self): 55 | return self.dataset["validation"] 56 | 57 | def doc_to_text(self, doc): 58 | return self.partial_context(doc, doc["option" + doc["answer"]]) 59 | 60 | def should_decontaminate(self): 61 | return True 62 | 63 | def doc_to_decontamination_query(self, doc): 64 | return doc["sentence"] 65 | 66 | @classmethod 67 | def partial_context(cls, doc, option): 68 | # Substitute the pronoun in the sentence with the specified option 69 | # and ignore everything after. 70 | pronoun_loc = doc["sentence"].index("_") 71 | return doc["sentence"][:pronoun_loc] + option 72 | 73 | def doc_to_target(self, doc): 74 | return self.partial_target(doc) 75 | 76 | @classmethod 77 | def partial_target(cls, doc): 78 | # The target is everything after the document specified pronoun. 79 | pronoun_loc = doc["sentence"].index("_") + 1 80 | return " " + doc["sentence"][pronoun_loc:].strip() 81 | 82 | def construct_requests(self, doc, ctx): 83 | """Uses RequestFactory to construct Requests and returns an iterable of 84 | Requests which will be sent to the LM. 85 | 86 | :param doc: 87 | The document as returned from training_docs, validation_docs, or test_docs. 88 | :param ctx: str 89 | The context string, generated by fewshot_context. This includes the natural 90 | language description, as well as the few shot examples, and the question 91 | part of the document for `doc`. 92 | """ 93 | target = self.partial_target(doc) 94 | lls = [] 95 | for option in [doc["option1"], doc["option2"]]: 96 | partial_ctx = self.partial_context(doc, option) 97 | full_ctx = self.append_context(ctx, partial_ctx) 98 | lls.append(rf.loglikelihood(full_ctx, target)[0]) 99 | return lls 100 | 101 | @classmethod 102 | def append_context(cls, ctx, partial_ctx): 103 | ctx = ctx.split("\n\n") # Each fewshot context is on its own new line. 104 | ctx.pop() # Remove the correct context put in by `doc_to_text`. 105 | return "\n\n".join([*ctx, partial_ctx]) if ctx else partial_ctx 106 | 107 | def process_results(self, doc, results): 108 | """Take a single document and the LM results and evaluates, returning a 109 | dict where keys are the names of submetrics and values are the values of 110 | the metric for that one document 111 | 112 | :param doc: 113 | The document as returned from training_docs, validation_docs, or test_docs. 
114 | :param results: 115 | The results of the requests created in construct_requests. 116 | """ 117 | return {"acc": np.argmax(results) == self.answer_to_num[doc["answer"]]} 118 | 119 | def aggregation(self): 120 | """ 121 | :returns: {str: [float] -> float} 122 | A dictionary where keys are the names of submetrics and values are 123 | functions that aggregate a list of metrics 124 | """ 125 | return {"acc": mean} 126 | 127 | def higher_is_better(self): 128 | """ 129 | :returns: {str: bool} 130 | A dictionary where keys are the names of submetrics and values are 131 | whether a higher value of the submetric is better 132 | """ 133 | return {"acc": True} 134 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "qllm" 7 | version = "0.1.0" 8 | description = "An accurate and efficient low-bitwidth PTQ method designed for LLMs (W6A6, W4A8, W4A4)." 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "datasets>=2.0.0","einops","jsonlines","numexpr", 17 | "openai>=0.6.4","omegaconf>=2.2","peft>=0.2.0", 18 | "pybind11>=2.6.2","pycountry","pytablewriter", 19 | "rouge-score>=0.0.4","sacrebleu==1.5.0", 20 | "scikit-learn>=0.24.1","sqlitedict", 21 | "tqdm-multiprocess","zstandard", 22 | "accelerate", "sentencepiece", "tokenizers>=0.12.1", 23 | "torch>=2.0.0", "torchvision", 24 | "transformers==4.37.2", 25 | "texttable", 26 | "toml", "attributedict", 27 | "protobuf", 28 | "numpy", 29 | "matplotlib" 30 | ] 31 | 32 | [tool.setuptools.packages.find] 33 | exclude = ["results*", "scripts*", "examples*"] 34 | 35 | [tool.wheel] 36 | exclude = ["results*", "scripts*", "examples*"] -------------------------------------------------------------------------------- /quantize/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/QLLM/653a329e4a5bf17b9296854617e093b7d45643b9/quantize/__init__.py -------------------------------------------------------------------------------- /quantize/int_linear.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | 5 | from quantize.quantizer import UniformAffineQuantizer 6 | 7 | 8 | class QuantLinear(nn.Module): 9 | """ 10 | Quantized module that wraps an nn.Linear layer and can run it either in full precision or with fake-quantized weights/activations. 11 | To activate quantization, use the set_quant_state function. 
12 | """ 13 | 14 | def __init__( 15 | self, 16 | org_module: nn.Linear, 17 | weight_quant_params: dict = {}, 18 | act_quant_params: dict = {}, 19 | disable_input_quant=False, 20 | ): 21 | super().__init__() 22 | self.fwd_kwargs = dict() 23 | self.fwd_func = F.linear 24 | self.weight = org_module.weight 25 | if org_module.bias is not None: 26 | self.bias = org_module.bias 27 | else: 28 | self.bias = None 29 | # de-activate the quantized forward default 30 | self.use_weight_quant = False 31 | self.use_act_quant = False 32 | self.replace_weight_with_quantized = False 33 | self.is_weight_packed = False 34 | self.mem_packer = None 35 | # initialize quantizer 36 | self.weight_quantizer = UniformAffineQuantizer( 37 | **weight_quant_params, shape=org_module.weight.shape 38 | ) 39 | if not disable_input_quant: 40 | self.act_quantizer = UniformAffineQuantizer(**act_quant_params) 41 | else: 42 | self.act_quantizer = None 43 | 44 | self.disable_input_quant = disable_input_quant 45 | self.use_temporary_parameter = False 46 | 47 | def forward(self, input: torch.Tensor): 48 | if self.use_temporary_parameter: 49 | weight = self.temp_weight 50 | bias = self.temp_bias 51 | elif self.use_weight_quant: 52 | weight = self.weight_quantizer(self.weight) 53 | bias = self.bias 54 | else: 55 | weight = self.weight 56 | bias = self.bias 57 | 58 | if self.use_act_quant and not self.disable_input_quant: 59 | input = self.act_quantizer(input) 60 | 61 | out = self.fwd_func(input, weight, bias, **self.fwd_kwargs) 62 | 63 | return out 64 | 65 | def set_quant_state(self, weight_quant: bool = False, act_quant: bool = False): 66 | self.use_weight_quant = weight_quant 67 | self.use_act_quant = act_quant 68 | 69 | def extra_repr(self): 70 | s = super().extra_repr() 71 | s += ", use_act_quant={}".format(self.use_act_quant) 72 | s += ", use_weight_quant={}".format(self.use_weight_quant) 73 | s += ", disable_input_quant={}".format(self.disable_input_quant) 74 | s += ", quant" 75 | return s 76 | -------------------------------------------------------------------------------- /quantize/int_linear_lora.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | from quantize.int_linear import QuantLinear 7 | 8 | 9 | class LoRALayer: 10 | def __init__( 11 | self, 12 | r: int, 13 | lora_alpha: int, 14 | lora_dropout: float, 15 | merge_weights: bool, 16 | ): 17 | self.r = r 18 | self.lora_alpha = lora_alpha 19 | # Optional dropout 20 | if lora_dropout > 0.0: 21 | self.lora_dropout = nn.Dropout(p=lora_dropout) 22 | else: 23 | self.lora_dropout = lambda x: x 24 | # Mark the weight as unmerged 25 | self.merged = False 26 | self.merge_weights = merge_weights 27 | 28 | 29 | class LoRAQuantLinear(QuantLinear, LoRALayer): 30 | """ 31 | Quantized Module that can perform quantized convolution or normal convolution. 32 | To activate quantization, please use set_quant_state function. 
33 | """ 34 | 35 | def __init__( 36 | self, 37 | org_module: nn.Linear, 38 | weight_quant_params: dict = {}, 39 | act_quant_params: dict = {}, 40 | disable_input_quant=False, 41 | r=0, 42 | lora_alpha=1, 43 | lora_dropout=0.0, 44 | merge_weights=True, 45 | ): 46 | super().__init__( 47 | org_module, weight_quant_params, act_quant_params, disable_input_quant 48 | ) 49 | LoRALayer.__init__( 50 | self, 51 | r=r, 52 | lora_alpha=lora_alpha, 53 | lora_dropout=lora_dropout, 54 | merge_weights=merge_weights, 55 | ) 56 | 57 | if r > 0: 58 | out_features, in_features = self.weight.shape 59 | self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features))) 60 | self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r))) 61 | self.scaling = self.lora_alpha / r 62 | # Freezing the pre-trained weight matrix 63 | self.weight.requires_grad = False 64 | 65 | self.reset_lora_parameters() 66 | 67 | def reset_lora_parameters(self): 68 | if hasattr(self, "lora_A"): 69 | # initialize A the same way as the default for nn.Linear and B to zero 70 | nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5)) 71 | nn.init.zeros_(self.lora_B) 72 | 73 | def forward(self, input: torch.Tensor): 74 | if self.use_temporary_parameter: 75 | weight = self.temp_weight 76 | bias = self.temp_bias 77 | elif self.use_weight_quant: 78 | weight = self.weight_quantizer(self.weight) 79 | bias = self.bias 80 | else: 81 | weight = self.weight 82 | bias = self.bias 83 | 84 | if self.use_act_quant and not self.disable_input_quant: 85 | input = self.act_quantizer(input) 86 | 87 | if self.r > 0 and not self.merged and self.use_weight_quant: 88 | out = self.fwd_func( 89 | input, 90 | weight + self.lora_B @ self.lora_A * self.scaling, 91 | bias, 92 | **self.fwd_kwargs 93 | ) 94 | else: 95 | out = self.fwd_func(input, weight, bias, **self.fwd_kwargs) 96 | 97 | return out 98 | 99 | def extra_repr(self): 100 | s = super().extra_repr() 101 | s += ", use_act_quant={}".format(self.use_act_quant) 102 | s += ", use_weight_quant={}".format(self.use_weight_quant) 103 | s += ", disable_input_quant={}".format(self.disable_input_quant) 104 | s += ", lora_quant" 105 | return s 106 | -------------------------------------------------------------------------------- /quantize/int_matmul.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from quantize.quantizer import UniformAffineQuantizer 5 | 6 | 7 | class QuantMatMul(nn.Module): 8 | def __init__( 9 | self, 10 | x1_quant_params: dict = {}, 11 | x2_quant_params: dict = {}, 12 | disable_act_quant=False, 13 | matmul_func=torch.bmm, 14 | ): 15 | super().__init__() 16 | # de-activate the quantized forward default 17 | self.use_act_quant = False 18 | # initialize quantizer 19 | self.i_cluster_counts = None 20 | self.x1_quantizer = UniformAffineQuantizer(**x1_quant_params) 21 | self.x2_quantizer = UniformAffineQuantizer(**x2_quant_params) 22 | self.matmul_func = matmul_func 23 | 24 | self.disable_act_quant = disable_act_quant 25 | 26 | def set_quant_state(self, weight_quant: bool = False, act_quant: bool = False): 27 | self.use_weight_quant = weight_quant 28 | self.use_act_quant = act_quant 29 | 30 | def quant_x1(self, x1): 31 | if self.use_act_quant: 32 | x1 = self.x1_quantizer(x1) 33 | return x1 34 | 35 | def quant_x2(self, x2): 36 | if self.use_act_quant: 37 | x2 = self.x2_quantizer(x2) 38 | return x2 39 | 40 | def forward(self, x1, x2): 41 | out = self.matmul_func(x1, x2) 42 | return out 43 | 
-------------------------------------------------------------------------------- /quantize/learnable_norm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | """ 5 | Modify normalization layer to adapt the training of learnable equivalent transformation 6 | """ 7 | 8 | 9 | class LearnableLlamaRMSNorm(nn.Module): 10 | def __init__(self, ori_norm, eps=1e-6): 11 | """ 12 | LlamaRMSNorm is equivalent to T5LayerNorm 13 | """ 14 | super().__init__() 15 | self.ori_norm = ori_norm 16 | self.bias = torch.nn.Parameter( 17 | torch.zeros(ori_norm.weight.shape, device=ori_norm.weight.device) 18 | ) 19 | self.variance_epsilon = eps 20 | self.use_temporary_parameter = False 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | variance = hidden_states.to(torch.float32).pow(2).mean(-1, keepdim=True) 25 | hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) 26 | weight = self.ori_norm.weight 27 | bias = self.bias 28 | 29 | return ( 30 | (weight * hidden_states + bias).to(input_dtype) 31 | if bias is not None 32 | else (weight * hidden_states).to(input_dtype) 33 | ) 34 | -------------------------------------------------------------------------------- /quantize/quantizer.py: -------------------------------------------------------------------------------- 1 | import math 2 | 3 | import torch 4 | import torch.nn as nn 5 | 6 | CLIPMIN = 1e-5 7 | 8 | 9 | def round_ste(x: torch.Tensor): 10 | """ 11 | Implement Straight-Through Estimator for rounding operation. 12 | """ 13 | return (x.round() - x).detach() + x 14 | 15 | 16 | class UniformAffineQuantizer(nn.Module): 17 | def __init__( 18 | self, 19 | n_bits: int = 8, 20 | symmetric: bool = False, 21 | per_channel_axes=[], 22 | metric="minmax", 23 | dynamic=False, 24 | dynamic_method="per_cluster", 25 | group_size=None, 26 | shape=None, 27 | use_learnable_step_size=False, 28 | **kwargs 29 | ): 30 | """ 31 | support cluster quantize 32 | dynamic_method support per_token and per_cluster 33 | """ 34 | super().__init__() 35 | self.symmetric = symmetric 36 | assert 2 <= n_bits <= 16, "bitwidth not supported" 37 | self.n_bits = n_bits 38 | self.qmin = 0 39 | self.qmax = 2 ** (n_bits) - 1 40 | self.per_channel_axes = per_channel_axes 41 | self.metric = metric 42 | self.cluster_counts = None 43 | self.cluster_dim = None 44 | 45 | self.scale = None 46 | self.zero_point = None 47 | self.round_zero_point = None 48 | 49 | self.cached_xmin = None 50 | self.cached_xmax = None 51 | self.dynamic = dynamic 52 | self.dynamic_method = dynamic_method 53 | 54 | self.deficiency = 0 55 | self.use_learnable_step_size = use_learnable_step_size 56 | 57 | if use_learnable_step_size: 58 | if group_size: 59 | dim1 = int(shape[0] * math.ceil(shape[1] / group_size)) 60 | self.deficiency = shape[-1] % group_size 61 | if self.deficiency > 0: 62 | self.deficiency = group_size - self.deficiency 63 | assert self.symmetric # support for mlc-llm quantization 64 | else: 65 | dim1 = shape[0] 66 | 67 | self.enable = True 68 | self.group_size = group_size 69 | self.is_init = False 70 | 71 | def change_n_bits(self, n_bits): 72 | self.n_bits = n_bits 73 | self.qmin = 0 74 | self.qmax = 2 ** (n_bits) - 1 75 | 76 | def fake_quant(self, x, scale, round_zero_point): 77 | if self.deficiency > 0: 78 | pad_zeros = torch.zeros( 79 | (x.shape[0], self.deficiency), dtype=x.dtype, device=x.device 80 | ) 81 | x = torch.cat((x, pad_zeros), dim=1) 82 | 83 | if self.group_size: 84 | 
assert len(x.shape) == 2, "only support linear layer now" 85 | dim1, dim2 = x.shape 86 | x = x.reshape(-1, self.group_size) 87 | x_int = round_ste(x / scale) 88 | if round_zero_point is not None: 89 | x_int = x_int.add(round_zero_point) 90 | x_int = x_int.clamp(self.qmin, self.qmax) 91 | x_dequant = x_int 92 | if round_zero_point is not None: 93 | x_dequant = x_dequant.sub(round_zero_point) 94 | x_dequant = x_dequant.mul(scale) 95 | if self.group_size: 96 | x_dequant = x_dequant.reshape(dim1, dim2) 97 | if self.deficiency > 0: 98 | x_dequant = x_dequant[:, : -self.deficiency] 99 | return x_dequant 100 | 101 | def forward(self, x: torch.Tensor): 102 | if self.n_bits >= 16 or not self.enable: 103 | return x 104 | if self.metric == "fix0to1": 105 | return x.mul_(2**self.n_bits - 1).round_().div_(2**self.n_bits - 1) 106 | 107 | if self.dynamic_method == "per_token" or self.dynamic_method == "per_channel": 108 | self.per_token_dynamic_calibration(x) 109 | else: 110 | raise NotImplementedError() 111 | 112 | x_dequant = self.fake_quant( 113 | x, self.scale.abs().clamp(min=CLIPMIN, max=1e4), self.round_zero_point 114 | ) 115 | return x_dequant 116 | 117 | def per_token_dynamic_calibration(self, x): 118 | if self.group_size: 119 | if self.deficiency == 0: 120 | x = x.reshape(-1, self.group_size) 121 | else: 122 | pad_zeros = torch.zeros( 123 | (x.shape[0], self.deficiency), dtype=x.dtype, device=x.device 124 | ) 125 | x = torch.cat((x, pad_zeros), dim=1) 126 | x = x.reshape(-1, self.group_size) 127 | reduce_shape = [-1] 128 | xmin = x.amin(reduce_shape, keepdim=True) 129 | xmax = x.amax(reduce_shape, keepdim=True) 130 | if self.symmetric: 131 | abs_max = torch.max(xmax.abs(), xmin.abs()) 132 | scale = abs_max / (2 ** (self.n_bits - 1) - 1) 133 | # scale = scale.clamp(min=CLIPMIN, max=1e4) 134 | if self.use_learnable_step_size: 135 | if not self.is_init: 136 | self.register_parameter("scale", torch.nn.Parameter(scale)) 137 | self.is_init = True 138 | else: 139 | self.scale = scale 140 | zero_point = (2 ** (self.n_bits - 1) - 1) * torch.ones_like(self.scale) 141 | else: 142 | range = xmax - xmin 143 | scale = range / (2**self.n_bits - 1) 144 | # self.scale = scale.clamp(min=CLIPMIN, max=1e4) 145 | if self.use_learnable_step_size: 146 | if not self.is_init: 147 | del self.scale 148 | self.register_parameter("scale", torch.nn.Parameter(scale)) 149 | self.is_init = True 150 | else: 151 | self.scale = scale 152 | zero_point = -(xmin) / (self.scale) 153 | self.round_zero_point = zero_point.clamp(min=-1e4, max=1e4).round() 154 | -------------------------------------------------------------------------------- /scripts/llama-13b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-13b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir 
${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-13b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-13b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-13b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-13b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-70b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 
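The w4a4/w4a8/w6a6 scripts only change the --wbits/--abits pair (and, for some model sizes, the learning rate) passed to main.py. As a rough numerical reference for what those activation bit-widths imply, the standalone sketch below mirrors the asymmetric per-token min-max fake quantization performed by UniformAffineQuantizer, simplified (no STE, grouping, or learnable step size); it is an illustration, not repository code.

import torch

def fake_quant_per_token(x: torch.Tensor, n_bits: int) -> torch.Tensor:
    # Asymmetric per-token uniform quantization, as in the minmax path of
    # UniformAffineQuantizer (simplified sketch).
    qmax = 2 ** n_bits - 1
    xmin = x.amin(-1, keepdim=True)
    xmax = x.amax(-1, keepdim=True)
    scale = (xmax - xmin).clamp(min=1e-5) / qmax
    zero_point = (-xmin / scale).round()
    x_int = (x / scale).round().add(zero_point).clamp(0, qmax)
    return (x_int - zero_point) * scale

x = torch.randn(4, 4096)
for bits in (8, 6, 4):  # the A8 / A6 / A4 settings swept by these scripts
    err = (x - fake_quant_per_token(x, bits)).abs().mean().item()
    print(f"a{bits}: mean abs quantization error {err:.4f}")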
-------------------------------------------------------------------------------- /scripts/llama-2-70b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-70b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-7b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-7b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 1e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-2-7b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-30b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | 
--channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-30b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-30b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-65b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-65b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-65b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-7b/w4a4.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 4 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 
\ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-7b/w4a8.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 4 \ 4 | --abits 8 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 -------------------------------------------------------------------------------- /scripts/llama-7b/w6a6.sh: -------------------------------------------------------------------------------- 1 | SAVE_PATH=save_path 2 | python main.py --model model_path \ 3 | --wbits 6 \ 4 | --abits 6 \ 5 | --eval_ppl \ 6 | --use_lora \ 7 | --output_dir ${SAVE_PATH} \ 8 | --lr 5e-4 \ 9 | --num_layer 4 \ 10 | --epochs 10 \ 11 | --plot_act_max \ 12 | --channel_ratio 0.2 \ 13 | --plot_num_additional_channels \ 14 | --calibrate_bs 1 \ 15 | --num_gpu 1 \ 16 | --nsamples 128 \ 17 | --tasks piqa,arc_easy,arc_challenge,boolq,hellaswag,winogrande \ 18 | --batch_size 1 --------------------------------------------------------------------------------
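All of the per-model scripts share the same flag set and differ only in --wbits/--abits, the learning rate, and whether --tasks is passed for zero-shot evaluation (the 7B and 13B scripts evaluate piqa, arc_easy, arc_challenge, boolq, hellaswag and winogrande, while the 30B/65B/70B scripts report perplexity only). A small launcher along the following lines could sweep the same configurations from Python; it is an illustrative sketch rather than part of the repository, and model_path / save_path are placeholders exactly as in the scripts.

import subprocess

MODEL_PATH = "model_path"  # placeholder, as in the shipped scripts
SAVE_ROOT = "save_path"    # placeholder, as in the shipped scripts

# (wbits, abits, lr) triples; the shipped scripts use 5e-4 or 1e-4 depending on model size.
CONFIGS = [(4, 4, "5e-4"), (4, 8, "5e-4"), (6, 6, "5e-4")]

for wbits, abits, lr in CONFIGS:
    cmd = [
        "python", "main.py",
        "--model", MODEL_PATH,
        "--wbits", str(wbits),
        "--abits", str(abits),
        "--eval_ppl",
        "--use_lora",
        "--output_dir", f"{SAVE_ROOT}/w{wbits}a{abits}",
        "--lr", lr,
        "--num_layer", "4",
        "--epochs", "10",
        "--plot_act_max",
        "--channel_ratio", "0.2",
        "--plot_num_additional_channels",
        "--calibrate_bs", "1",
        "--num_gpu", "1",
        "--nsamples", "128",
        "--batch_size", "1",
    ]
    subprocess.run(cmd, check=True)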