├── .coveragerc ├── .flake8 ├── .github └── workflows │ ├── pull_request.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE.md ├── README.md ├── docs ├── decontamination.md ├── description_guide.md ├── img │ └── fewshot_example_gpt3.png ├── task_guide.md └── task_table.md ├── exp ├── bloom_176b │ ├── int6.yaml │ └── int8.yaml ├── bloomz_176b │ ├── int6.yaml │ └── int8.yaml ├── llama │ ├── int4_token.yaml │ ├── int4_token_disable.yaml │ ├── int6_token.yaml │ └── int6_token_disable.yaml └── opt │ ├── int4_group.yaml │ ├── int5_token.yaml │ ├── int6.yaml │ └── int8.yaml ├── figure ├── outlier_phenomenon.png └── outlier_suppression_plus.png ├── ignore.txt ├── lm_eval ├── __init__.py ├── base.py ├── datasets │ ├── ai2_arc │ │ ├── README.md │ │ ├── ai2_arc.py │ │ └── dataset_infos.json │ ├── arithmetic │ │ ├── __init__.py │ │ ├── arithmetic.py │ │ └── dataset_infos.json │ ├── asdiv │ │ ├── __init__.py │ │ ├── asdiv.py │ │ └── dataset_infos.json │ ├── coqa │ │ ├── __init__.py │ │ ├── coqa.py │ │ └── dataset_infos.json │ ├── drop │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── drop.py │ ├── headqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── headqa.py │ ├── hellaswag │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hellaswag.py │ ├── hendrycks_ethics │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_ethics.py │ ├── hendrycks_math │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_math.py │ ├── lambada_openai │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── lambada_openai.py │ ├── logiqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── logiqa.py │ ├── mutual │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── mutual.py │ ├── pile │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── pile.py │ ├── piqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── piqa.py │ ├── quac │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── quac.py │ ├── sat_analogies │ │ ├── __init__.py │ │ └── sat_analogies.py │ ├── story_cloze │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── story_cloze.py │ ├── super_glue │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── super_glue.py │ ├── triviaqa │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── triviaqa.py │ ├── unscramble │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── unscramble.py │ ├── wikitext │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── wikitext.py │ └── winogrande │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── winogrande.py ├── decontamination │ ├── __init__.py │ ├── archiver.py │ ├── decontaminate.py │ └── janitor.py ├── evaluator.py ├── metrics.py ├── models │ ├── __init__.py │ ├── bloom.py │ ├── dummy.py │ ├── gpt2.py │ ├── gpt3.py │ ├── llama.py │ ├── opt.py │ └── textsynth.py ├── tasks │ ├── __init__.py │ ├── anli.py │ ├── arc.py │ ├── arithmetic.py │ ├── asdiv.py │ ├── blimp.py │ ├── cbt.py │ ├── coqa.py │ ├── drop.py │ ├── glue.py │ ├── gsm8k.py │ ├── headqa.py │ ├── hellaswag.py │ ├── hendrycks_ethics.py │ ├── hendrycks_math.py │ ├── hendrycks_test.py │ ├── lambada.py │ ├── lambada_cloze.py │ ├── lambada_multilingual.py │ ├── logiqa.py │ ├── mathqa.py │ ├── mc_taco.py │ ├── mutual.py │ ├── naturalqs.py │ ├── openbookqa.py │ ├── pile.py │ ├── piqa.py │ ├── prost.py │ ├── pubmedqa.py │ ├── qa4mre.py │ ├── qasper.py │ ├── quac.py │ ├── race.py │ ├── sat.py │ ├── sciq.py │ ├── squad.py │ ├── storycloze.py │ ├── superglue.py │ ├── swag.py │ ├── translation.py │ ├── triviaqa.py │ ├── 
truthfulqa.py │ ├── unscramble.py │ ├── webqs.py │ ├── wikitext.py │ ├── winogrande.py │ └── wsc273.py └── utils.py ├── main.py ├── outlier_analysis.md ├── pile_statistics.json ├── quant_transformer ├── __init__.py ├── model │ ├── __init__.py │ ├── quant_bloom.py │ ├── quant_llama.py │ ├── quant_model.py │ ├── quant_opt.py │ └── util_layernorm.py ├── quantization │ ├── __init__.py │ ├── fake_quant.py │ ├── migration.py │ ├── migration_bloom.py │ ├── migration_llama.py │ ├── observer.py │ ├── quantized_module.py │ ├── state.py │ └── util_quant.py └── solver │ ├── __init__.py │ ├── calibrate.py │ ├── export.py │ └── token_wise_clipping.py ├── scripts ├── __init__.py ├── clean_training_data │ ├── README.md │ ├── __init__.py │ ├── compress_and_package.py │ ├── generate_13_grams.py │ ├── investigate_pile.py │ ├── janitor_util.cpp │ ├── process_sorted_buckets.py │ └── sort_13_gram_buckets.py ├── cost_estimate.py ├── get_prompts.py ├── make_gpt2_test_cases.py ├── make_table_tasks.py └── write_out.py └── templates ├── new_multiple_choice_task.py └── new_task.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | 3 | # tasks that aren't wired up. 4 | omit = 5 | lm_eval/tasks/quac.py 6 | lm_eval/tasks/storycloze.py 7 | lm_eval/tasks/cbt.py 8 | lm_eval/tasks/sat.py 9 | lm_eval/tasks/triviaqa.py 10 | lm_eval/tasks/naturalqs.py 11 | lm_eval/models/dummy.py 12 | 13 | [report] 14 | exclude_lines = 15 | # Skip any pass lines such as may be used for @abstractmethod 16 | pass 17 | 18 | # Have to re-enable the standard pragma 19 | pragma: no cover 20 | 21 | # Don't complain about missing debug-only code: 22 | def __repr__ 23 | if self\.debug 24 | 25 | # Don't complain if tests don't hit defensive assertion code: 26 | raise AssertionError 27 | raise NotImplementedError 28 | return NotImplemented 29 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401, C901 3 | max-line-length = 127 4 | max-complexity = 10 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: actions/setup-python@v4 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Cache 20 | uses: actions/cache@v2.1.3 21 | with: 22 | # A list of files, directories, and wildcard patterns to cache and restore 23 | path: | 24 | ~/.cache 25 | # An explicit key for restoring and saving the cache 26 | key: 
evaldata-cache-4 27 | - name: Set up Python 3.9 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: 3.9 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install flake8 pytest pytest-cov 35 | pip install -e .[dev,multilingual] 36 | # Install optional git dependencies 37 | pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt 38 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 39 | - name: Lint with flake8 40 | run: | 41 | # stop the build if there are Python syntax errors or undefined names 42 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 43 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 44 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 45 | - name: Test with pytest 46 | run: | 47 | pytest -vv --cov=lm_eval/ tests/ 48 | - name: Upload to codecov 49 | run: | 50 | bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | *.pyc 3 | data/ 4 | lm_cache 5 | .idea 6 | tests 7 | experiment 8 | lj_exp 9 | lj_experiment 10 | lm_eval.egg-info/ 11 | 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Ignore test linting to avoid conflicting changes to version stability. 2 | exclude: ^tests/testdata/ 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.1.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: check-ast 9 | - id: check-byte-order-marker 10 | - id: check-case-conflict 11 | - id: check-json 12 | - id: check-merge-conflict 13 | - id: check-symlinks 14 | - id: check-yaml 15 | - id: destroyed-symlinks 16 | - id: detect-private-key 17 | - id: end-of-file-fixer 18 | - id: no-commit-to-branch 19 | - id: requirements-txt-fixer 20 | - id: trailing-whitespace 21 | - id: fix-byte-order-marker 22 | exclude: docs/CNAME 23 | - id: fix-encoding-pragma 24 | args: [--remove] 25 | - id: mixed-line-ending 26 | args: [--fix=lf] 27 | - repo: https://github.com/pycqa/flake8 28 | rev: 3.7.9 29 | hooks: 30 | - id: flake8 31 | - repo: https://github.com/psf/black 32 | rev: 22.3.0 33 | hooks: 34 | - id: black 35 | language_version: python3.8 36 | - repo: https://github.com/codespell-project/codespell 37 | rev: v2.1.0 38 | hooks: 39 | - id: codespell 40 | exclude: > 41 | (?x)^( 42 | .*\.json|ignore.txt 43 | )$ 44 | args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt] 45 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 EleutherAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and 
this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/decontamination.md: -------------------------------------------------------------------------------- 1 | # Decontamination 2 | 3 | ## Usage 4 | 5 | Simply add a "--decontamination_ngrams_path" argument when running main.py. The provided directory should contain 6 | the ngram files and info.json produced in "Pile Ngram Generation" further down. 7 | 8 | ```bash 9 | python main.py \ 10 | --model gpt2 \ 11 | --device 0 \ 12 | --tasks sciq \ 13 | --decontamination_ngrams_path path/containing/training/set/ngrams 14 | ``` 15 | 16 | ## Background 17 | Downstream evaluations test model generalization, and are less useful when test set data also exists in the training set, referred to as leakage or contamination. 18 | 19 | Filtering your training set against the test set is a good first step; however, this isn't always possible, as in the case of a new benchmark or one that wasn't considered prior to model training. When training set filtering isn't possible, it is useful to measure the impact of test set leakage by detecting the contaminated test examples and producing a clean version of the benchmark. 20 | 21 | The basis for our decontamination procedure can be found in Appendix C of "Language Models are Few-Shot Learners". OpenAI defined a test document as contaminated if any N-gram overlap existed with any training document. They used a range of N values between 8 and 13 depending on the dataset, while we just used 13 for simplicity. 22 | 23 | ## Implementation 24 | Contamination detection can be found in `lm_eval/decontamination/decontaminate.py` with supporting code in `lm_eval/decontamination/`. 25 | 26 | decontaminate.py does the following: 27 | 1. Build dictionaries of all ngrams and their corresponding evaluation/document ids. 28 | 2. Scan through sorted files containing training set n-grams. 29 | 3. If a match is found, the corresponding evaluation/document combinations are marked as contaminated. 30 | 31 | `lm_eval/evaluator.py` can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix. 32 | 33 | This is disabled by default for new tasks; to support decontamination on a task, override the "should_decontaminate" and "doc_to_decontamination_query" methods. For more details see the [task guide](task_guide.md). 34 | 35 | ## Pile Ngram Generation 36 | The relevant scripts can be found in `scripts/clean_training_data`, which also import from 37 | `lm_eval/decontamination/`. 38 | 39 | 1. git clone https://github.com/EleutherAI/lm-evaluation-harness.git 40 | 2. pip install -r requirements.txt 41 | 3. Download The Pile from [The Eye](https://the-eye.eu/public/AI/pile/train/) 42 | 4. Place the Pile files in a "pile" directory under "lm-evaluation-harness" (or create a symlink) 43 | 5. Run generate_13_grams. 
44 | 45 | ```bash 46 | export PYTHONHASHSEED=0 47 | python -m scripts.clean_training_data.generate_13_grams \ 48 | -dir path/to/working/directory \ 49 | -n 13 \ 50 | -buckets 500 51 | ``` 52 | 53 | This took approximately 4 days for us. We had the time to wait, but this could be scaled out by doing partial pile scans on multiple instances of this script and merging the relevant buckets. We fixed PYTHONHASHSEED to ensure reproducibility of bucket hashing in case you need to stop and start. 54 | 55 | 6. Sort the generated 13-grams. 56 | ```bash 57 | python -m scripts.clean_training_data.sort_13_gram_buckets \ 58 | -dir path/to/working/directory/output 59 | ``` 60 | 61 | This took approximately 5 days for us. You could speed this up by spreading the files around to different machines and running the sort script before gathering them together. 62 | 63 | 7. Compress the sorted 13-gram files and place them together with info.json. 64 | 65 | This step only takes a few hours. 66 | 67 | ```bash 68 | python -m scripts.clean_training_data.compress_and_package \ 69 | -dir path/to/working/directory \ 70 | -output path/to/final/directory \ 71 | -procs 8 72 | ``` 73 | 74 | Congratulations, the final directory can now be passed to lm-evaluation-harness with the "--decontamination_ngrams_path" argument. 75 | -------------------------------------------------------------------------------- /docs/description_guide.md: -------------------------------------------------------------------------------- 1 | # Description Guide 2 | 3 | ![fewshot-example](./img/fewshot_example_gpt3.png) 4 | (Figure from [Brown et al., 2020](https://arxiv.org/pdf/2005.14165.pdf)) 5 | 6 | Task descriptions provide in-context task instruction for your language model. If you'd like to prepend a natural language description to your few-shot examples and prompt, you can do so on a per-task basis via the `description_dict` arg of [`evaluator.evaluate`](../lm_eval/evaluator.py). This `description_dict` must adhere to the following key-value structure: 7 | 8 | - **key**: the task name (`str`) as specified in the lm-eval-harness [task registry](../lm_eval/tasks/__init__.py). 9 | - **value**: the corresponding (`str`) description/prompt for the task identified by **key**. 10 | 11 | ```python 12 | description_dict = { 13 | "task_name_1": "description", 14 | "task_name_2": "description", 15 | ... 16 | } 17 | ``` 18 | 19 | Note that a task's description will be separated from its following few-shot examples and prompt by a new line as such: 20 | 21 | ```python 22 | """ 23 | <description> 24 | 25 | <few-shot examples> 26 | 27 | <prompt> 28 | """ 29 | ``` 30 | 31 | ## Descriptions in File 32 | 33 | One can also interface with the aforementioned [`evaluator.evaluate`](../lm_eval/evaluator.py) (or `evaluator.simple_evaluate`) method from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface (CLI) program, `main.py`. The JSON file pointed to should be structured the same as the `description_dict`. E.g. for some file at `/your/path/descriptions.json` you may have: 34 | 35 | ```json 36 | { 37 | "cycle_letters": "Please unscramble the letters into a word, and write that word:", 38 | "copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative" 39 | } 40 | ``` 41 | 42 | which can then be supplied to the CLI as: 43 | 44 | ```bash 45 | python main.py \ 46 | --tasks cycle_letters,copa \ 47 | --description_dict_path /your/path/descriptions.json \ 48 | ... 
49 | ``` 50 | -------------------------------------------------------------------------------- /docs/img/fewshot_example_gpt3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/docs/img/fewshot_example_gpt3.png -------------------------------------------------------------------------------- /exp/bloom_176b/int6.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgTokenQuantileObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: -1 8 | token_quantile: 0.985 9 | w_qconfig: 10 | quantizer: FixedFakeQuantize 11 | observer: MinMaxObserver 12 | bit: 6 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 17 | is_remove_padding: True 18 | migrate: True 19 | model: 20 | max_length: 512 -------------------------------------------------------------------------------- /exp/bloom_176b/int8.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgTokenQuantileObserver 5 | bit: 8 6 | symmetric: True 7 | ch_axis: -1 8 | token_quantile: 0.995 9 | w_qconfig: 10 | quantizer: FixedFakeQuantize 11 | observer: MinMaxObserver 12 | bit: 8 13 | symmetric: True 14 | ch_axis: -1 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 17 | is_remove_padding: True 18 | migrate: True 19 | model: 20 | max_length: 512 -------------------------------------------------------------------------------- /exp/bloomz_176b/int6.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgTokenQuantileObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: -1 8 | token_quantile: 0.995 9 | w_qconfig: 10 | quantizer: FixedFakeQuantize 11 | observer: MinMaxObserver 12 | bit: 6 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 17 | is_remove_padding: True 18 | migrate: True 19 | model: 20 | max_length: 512 -------------------------------------------------------------------------------- /exp/bloomz_176b/int8.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgMinMaxObserver 5 | bit: 8 6 | symmetric: True 7 | ch_axis: -1 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 8 12 | symmetric: True 13 | ch_axis: -1 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | migrate: True 18 | model: 19 | max_length: 512 -------------------------------------------------------------------------------- /exp/llama/int4_token.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 4 6 | symmetric: False 7 | ch_axis: 0 8 | w_qconfig: 9 | 
quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 4 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 16 | is_remove_padding: True 17 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 18 | migrate: True 19 | model: 20 | max_length: 2048 21 | -------------------------------------------------------------------------------- /exp/llama/int4_token_disable.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 4 6 | symmetric: False 7 | ch_axis: 0 8 | disable_down_proj: True 9 | w_qconfig: 10 | quantizer: FixedQuantize 11 | observer: MinMaxObserver 12 | bit: 4 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 17 | is_remove_padding: True 18 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 19 | migrate: True 20 | model: 21 | max_length: 2048 22 | -------------------------------------------------------------------------------- /exp/llama/int6_token.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 6 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 16 | is_remove_padding: True 17 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 18 | migrate: True 19 | model: 20 | max_length: 2048 21 | -------------------------------------------------------------------------------- /exp/llama/int6_token_disable.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: 0 8 | disable_down_proj: True 9 | w_qconfig: 10 | quantizer: FixedQuantize 11 | observer: MinMaxObserver 12 | bit: 6 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 17 | is_remove_padding: True 18 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 19 | migrate: True 20 | model: 21 | max_length: 2048 22 | -------------------------------------------------------------------------------- /exp/opt/int4_group.yaml: -------------------------------------------------------------------------------- 1 | 
quant: 2 | a_qconfig: 3 | quantizer: GroupFixedFakeQuantize 4 | group_size: 1024 5 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 6 | bit: 4 7 | symmetric: False 8 | ch_axis: 0 9 | w_qconfig: 10 | quantizer: GroupFixedQuantize 11 | group_size: 1024 12 | observer: MinMaxObserver 13 | bit: 4 14 | symmetric: False 15 | ch_axis: 0 # perchannel 0 perlayer -1 16 | calibrate: 128 17 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 18 | is_remove_padding: True 19 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 20 | migrate: True 21 | model: 22 | max_length: 2048 23 | -------------------------------------------------------------------------------- /exp/opt/int5_token.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 5 6 | symmetric: False 7 | ch_axis: 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 5 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 18 | migrate: True 19 | model: 20 | max_length: 2048 21 | -------------------------------------------------------------------------------- /exp/opt/int6.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgMinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: -1 # perlayer -1 perchannel 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 6 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | migrate: True 18 | model: 19 | max_length: 512 -------------------------------------------------------------------------------- /exp/opt/int8.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgMinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 8 6 | symmetric: True 7 | ch_axis: -1 # perlayer -1 perchannel 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 8 12 | symmetric: True 13 | ch_axis: -1 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | migrate: True 18 | model: 19 | max_length: 512 -------------------------------------------------------------------------------- /figure/outlier_phenomenon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/figure/outlier_phenomenon.png -------------------------------------------------------------------------------- /figure/outlier_suppression_plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/figure/outlier_suppression_plus.png -------------------------------------------------------------------------------- /ignore.txt: -------------------------------------------------------------------------------- 1 | ROUGE 2 | rouge 3 | nin 4 | -------------------------------------------------------------------------------- /lm_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/ai2_arc/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"ARC-Challenge": {"description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "citation": "@article{allenai:arc,\n author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and\n Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},\n title = {Think you have Solved Question Answering? 
Try ARC, the AI2 Reasoning Challenge},\n journal = {arXiv:1803.05457v1},\n year = {2018},\n}\n", "homepage": "https://allenai.org/data/arc", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerKey": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "ai2_arc", "config_name": "ARC-Challenge", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 377740, "num_examples": 1172, "dataset_name": "ai2_arc"}, "train": {"name": "train", "num_bytes": 351888, "num_examples": 1119, "dataset_name": "ai2_arc"}, "validation": {"name": "validation", "num_bytes": 97254, "num_examples": 299, "dataset_name": "ai2_arc"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/arc/ARC-V1-Feb2018.zip": {"num_bytes": 680841265, "checksum": "6d2d5ab50b2ceec6ba5f79c921be77cf2de712ea25a2b3f4fff3acc101cecfa0"}}, "download_size": 680841265, "dataset_size": 826882, "size_in_bytes": 681668147}, "ARC-Easy": {"description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "citation": "@article{allenai:arc,\n author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and\n Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},\n title = {Think you have Solved Question Answering? 
Try ARC, the AI2 Reasoning Challenge},\n journal = {arXiv:1803.05457v1},\n year = {2018},\n}\n", "homepage": "https://allenai.org/data/arc", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerKey": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "ai2_arc", "config_name": "ARC-Easy", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 661997, "num_examples": 2376, "dataset_name": "ai2_arc"}, "train": {"name": "train", "num_bytes": 623254, "num_examples": 2251, "dataset_name": "ai2_arc"}, "validation": {"name": "validation", "num_bytes": 158498, "num_examples": 570, "dataset_name": "ai2_arc"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/arc/ARC-V1-Feb2018.zip": {"num_bytes": 680841265, "checksum": "6d2d5ab50b2ceec6ba5f79c921be77cf2de712ea25a2b3f4fff3acc101cecfa0"}}, "download_size": 680841265, "dataset_size": 1443749, "size_in_bytes": 682285014}} -------------------------------------------------------------------------------- /lm_eval/datasets/arithmetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/arithmetic/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/asdiv/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/asdiv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """ASDIV dataset.""" 15 | 16 | 17 | import os 18 | import xml.etree.ElementTree as ET 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | _DESCRIPTION = """\ 35 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 36 | patterns and problem types) English math word problem (MWP) corpus for evaluating 37 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 38 | remain limited either in language usage patterns or in problem types. We thus present 39 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 40 | types taught in elementary school. Each MWP is annotated with its problem type and grade 41 | level (for indicating the level of difficulty). 42 | """ 43 | 44 | _HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset" 45 | 46 | # TODO: Add the licence for the dataset here if you can find it 47 | _LICENSE = "" 48 | 49 | _URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip" 50 | 51 | 52 | class ASDiv(datasets.GeneratorBasedBuilder): 53 | """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers""" 54 | 55 | VERSION = datasets.Version("0.0.1") 56 | 57 | BUILDER_CONFIGS = [ 58 | datasets.BuilderConfig( 59 | name="asdiv", 60 | version=VERSION, 61 | description="A diverse corpus for evaluating and developing english math word problem solvers", 62 | ) 63 | ] 64 | 65 | def _info(self): 66 | features = datasets.Features( 67 | { 68 | "body": datasets.Value("string"), 69 | "question": datasets.Value("string"), 70 | "solution_type": datasets.Value("string"), 71 | "answer": datasets.Value("string"), 72 | "formula": datasets.Value("string"), 73 | } 74 | ) 75 | return datasets.DatasetInfo( 76 | description=_DESCRIPTION, 77 | features=features, 78 | homepage=_HOMEPAGE, 79 | license=_LICENSE, 80 | citation=_CITATION, 81 | ) 82 | 83 | def _split_generators(self, dl_manager): 84 | urls = _URLS 85 | data_dir = dl_manager.download_and_extract(urls) 86 | base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50" 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.VALIDATION, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "filepath": os.path.join( 93 | data_dir, base_filepath, "dataset", "ASDiv.xml" 94 | ), 95 | "split": datasets.Split.VALIDATION, 96 | }, 97 | ), 98 | ] 99 | 100 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 101 | def _generate_examples(self, filepath, split): 102 | tree = ET.parse(filepath) 103 | root = tree.getroot() 104 | for key, problem in enumerate(root.iter("Problem")): 105 | yield key, { 106 | "body": problem.find("Body").text, 107 | "question": problem.find("Question").text, 108 | "solution_type": problem.find("Solution-Type").text, 109 | "answer": problem.find("Answer").text, 110 | "formula": problem.find("Formula").text, 111 | } 112 | -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"asdiv": {"description": "ASDiv 
(Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/coqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. 
Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/drop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/drop/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/drop/dataset_infos.json: 
-------------------------------------------------------------------------------- 1 | {"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/headqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/headqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hellaswag/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/hellaswag/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hellaswag/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"default": {"description": "\n", "citation": "@inproceedings{zellers2019hellaswag,\n title={HellaSwag: Can a Machine Really Finish Your Sentence?},\n author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},\n booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},\n year={2019}\n}\n", "homepage": "https://rowanzellers.com/hellaswag/", "license": "", "features": {"ind": {"dtype": "int32", "id": null, "_type": "Value"}, "activity_label": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_a": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_b": {"dtype": "string", "id": null, "_type": "Value"}, "ctx": {"dtype": "string", "id": null, "_type": "Value"}, "endings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_id": {"dtype": "string", "id": null, "_type": "Value"}, "split": {"dtype": "string", "id": null, "_type": "Value"}, "split_type": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "hellaswag", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43307616, "num_examples": 39905, "dataset_name": "hellaswag"}, "test": {"name": "test", "num_bytes": 10810696, "num_examples": 10003, "dataset_name": "hellaswag"}, "validation": {"name": "validation", "num_bytes": 11194634, "num_examples": 10042, "dataset_name": "hellaswag"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/hellaswag_train.jsonl": {"num_bytes": 47496131, "checksum": "dae5e69249868cb9fe4e23ff925c60b66169564cfb7072d793cd7356a2b69f8d"}, "/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/hellaswag_test.jsonl": {"num_bytes": 11752147, "checksum": "da082b00543e422b8d25394614d102944586986def4de5cd1bd36d86bcb76261"}, "/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/hellaswag_val.jsonl": {"num_bytes": 12246618, "checksum": "0aa3b88843990f3f10a97b9575c94d7b71fb2205240ba04ae4884d9e9c992588"}}, "download_size": 71494896, "dataset_size": 65312946, "size_in_bytes": 136807842}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/hellaswag/hellaswag.py: -------------------------------------------------------------------------------- 1 | """TODO(hellaswag): Add a description here.""" 2 | 3 | 4 | import json 5 | 6 | import datasets 7 | 8 | 9 | # TODO(hellaswag): BibTeX citation 10 | _CITATION = """\ 11 | @inproceedings{zellers2019hellaswag, 12 | title={HellaSwag: Can a Machine Really Finish Your Sentence?}, 13 | author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, 14 | booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 15 | year={2019} 16 | } 17 | """ 18 | 19 | # TODO(hellaswag): 20 | _DESCRIPTION = """ 21 | """ 22 | _URL = 
"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/" 23 | _URLS = { 24 | "train": _URL + "hellaswag_train.jsonl", 25 | "test": _URL + "hellaswag_test.jsonl", 26 | "dev": _URL + "hellaswag_val.jsonl", 27 | } 28 | 29 | 30 | class Hellaswag(datasets.GeneratorBasedBuilder): 31 | """TODO(hellaswag): Short description of my dataset.""" 32 | 33 | # TODO(hellaswag): Set up version. 34 | VERSION = datasets.Version("0.1.0") 35 | 36 | def _info(self): 37 | # TODO(hellaswag): Specifies the datasets.DatasetInfo object 38 | return datasets.DatasetInfo( 39 | # This is the description that will appear on the datasets page. 40 | description=_DESCRIPTION, 41 | # datasets.features.FeatureConnectors 42 | features=datasets.Features( 43 | { 44 | # These are the features of your dataset like images, labels ... 45 | "ind": datasets.Value("int32"), 46 | "activity_label": datasets.Value("string"), 47 | "ctx_a": datasets.Value("string"), 48 | "ctx_b": datasets.Value("string"), 49 | "ctx": datasets.Value("string"), 50 | "endings": datasets.features.Sequence(datasets.Value("string")), 51 | "source_id": datasets.Value("string"), 52 | "split": datasets.Value("string"), 53 | "split_type": datasets.Value("string"), 54 | "label": datasets.Value("string"), 55 | } 56 | ), 57 | # If there's a common (input, target) tuple from the features, 58 | # specify them here. They'll be used if as_supervised=True in 59 | # builder.as_dataset. 60 | supervised_keys=None, 61 | # Homepage of the dataset for documentation 62 | homepage="https://rowanzellers.com/hellaswag/", 63 | citation=_CITATION, 64 | ) 65 | 66 | def _split_generators(self, dl_manager): 67 | """Returns SplitGenerators.""" 68 | # TODO(hellaswag): Downloads the data and defines the splits 69 | # dl_manager is a datasets.download.DownloadManager that can be used to 70 | # download and extract URLs 71 | urls_to_download = _URLS 72 | dl_dir = dl_manager.download_and_extract(urls_to_download) 73 | return [ 74 | datasets.SplitGenerator( 75 | name=datasets.Split.TRAIN, 76 | # These kwargs will be passed to _generate_examples 77 | gen_kwargs={"filepath": dl_dir["train"]}, 78 | ), 79 | datasets.SplitGenerator( 80 | name=datasets.Split.TEST, 81 | # These kwargs will be passed to _generate_examples 82 | gen_kwargs={"filepath": dl_dir["test"]}, 83 | ), 84 | datasets.SplitGenerator( 85 | name=datasets.Split.VALIDATION, 86 | # These kwargs will be passed to _generate_examples 87 | gen_kwargs={"filepath": dl_dir["dev"]}, 88 | ), 89 | ] 90 | 91 | def _generate_examples(self, filepath): 92 | """Yields examples.""" 93 | # TODO(hellaswag): Yields (key, example) tuples from the dataset 94 | with open(filepath, encoding="utf-8") as f: 95 | for id_, row in enumerate(f): 96 | data = json.loads(row) 97 | yield id_, { 98 | "ind": int(data["ind"]), 99 | "activity_label": data["activity_label"], 100 | "ctx_a": data.get("ctx_a", ""), 101 | "ctx_b": data.get("ctx_b", ""), 102 | "ctx": data["ctx"], 103 | "endings": data.get("endings", []), 104 | "source_id": data["source_id"], 105 | "split": data["split"], 106 | "split_type": data["split_type"], 107 | "label": str(data.get("label", "")), 108 | } 109 | -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_ethics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/hendrycks_ethics/__init__.py 
-------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_math/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/hendrycks_math/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_math/hendrycks_math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """MATH dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | import pathlib 20 | 21 | import datasets 22 | 23 | 24 | _CITATION = """\ 25 | @article{hendrycksmath2021, 26 | title={Measuring Mathematical Problem Solving With the Math Dataset}, 27 | author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, 28 | journal={NeurIPS}, 29 | year={2021} 30 | } 31 | """ 32 | 33 | _DESCRIPTION = """\ 34 | MATH is a dataset of 12,500 challenging competition mathematics problems. Each 35 | problem in Math has a full step-by-step solution which can be used to teach 36 | models to generate answer derivations and explanations. 
37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/hendrycks/math" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" 45 | 46 | _NAMES = [ 47 | "algebra", 48 | "counting_and_probability", 49 | "geometry", 50 | "intermediate_algebra", 51 | "number_theory", 52 | "prealgebra", 53 | "precalculus", 54 | ] 55 | 56 | 57 | class HendrycksMath(datasets.GeneratorBasedBuilder): 58 | """MATH is a dataset of 12,500 challenging competition mathematics problems.""" 59 | 60 | VERSION = datasets.Version("0.0.1") 61 | 62 | BUILDER_CONFIGS = [ 63 | datasets.BuilderConfig(name=name, version=version, description=name) 64 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 65 | ] 66 | 67 | def _info(self): 68 | features = datasets.Features( 69 | { 70 | "problem": datasets.Value("string"), 71 | "level": datasets.Value("string"), 72 | "type": datasets.Value("string"), 73 | "solution": datasets.Value("string"), 74 | } 75 | ) 76 | return datasets.DatasetInfo( 77 | description=_DESCRIPTION, 78 | features=features, 79 | homepage=_HOMEPAGE, 80 | license=_LICENSE, 81 | citation=_CITATION, 82 | ) 83 | 84 | def _split_generators(self, dl_manager): 85 | urls = _URLS 86 | data_dir = dl_manager.download_and_extract(urls) 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.TRAIN, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "basepath": os.path.join( 93 | data_dir, "MATH", "train", self.config.name 94 | ), 95 | "split": "train", 96 | }, 97 | ), 98 | datasets.SplitGenerator( 99 | name=datasets.Split.TEST, 100 | # These kwargs will be passed to _generate_examples 101 | gen_kwargs={ 102 | "basepath": os.path.join( 103 | data_dir, "MATH", "test", self.config.name 104 | ), 105 | "split": "test", 106 | }, 107 | ), 108 | ] 109 | 110 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 111 | def _generate_examples(self, basepath, split): 112 | key = 0 113 | for file in sorted(pathlib.Path(basepath).iterdir()): 114 | with open(file, "r", encoding="utf-8") as f: 115 | data = json.load(f) 116 | yield key, { 117 | "problem": data["problem"], 118 | "level": data["level"], 119 | "type": data["type"], 120 | "solution": data["solution"], 121 | } 122 | key += 1 123 | -------------------------------------------------------------------------------- /lm_eval/datasets/lambada_openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/lambada_openai/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/logiqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. 
The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/mutual/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": 
"Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/pile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/pile/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/piqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/piqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/piqa/dataset_infos.json: 
-------------------------------------------------------------------------------- 1 | {"plain_text": {"description": "To apply eyeshadow without a brush, should I use a cotton swab or a toothpick?\nQuestions requiring this kind of physical commonsense pose a challenge to state-of-the-art\nnatural language understanding systems. The PIQA dataset introduces the task of physical commonsense reasoning\nand a corresponding benchmark dataset Physical Interaction: Question Answering or PIQA.\n\nPhysical commonsense knowledge is a major challenge on the road to true AI-completeness,\nincluding robots that interact with the world and understand natural language.\n\nThe dataset focuses on everyday situations with a preference for atypical solutions.\nThe dataset is inspired by instructables.com, which provides users with instructions on how to build, craft,\nbake, or manipulate objects using everyday materials.\n\nThe underlying task is formualted as multiple choice question answering:\ngiven a question `q` and two possible solutions `s1`, `s2`, a model or\na human must choose the most appropriate solution, of which exactly one is correct.\nThe dataset is further cleaned of basic artifacts using the AFLite algorithm which is an improvement of\nadversarial filtering. The dataset contains 16,000 examples for training, 2,000 for development and 3,000 for testing.\n", "citation": "@inproceedings{Bisk2020,\n author = {Yonatan Bisk and Rowan Zellers and\n Ronan Le Bras and Jianfeng Gao\n and Yejin Choi},\n title = {PIQA: Reasoning about Physical Commonsense in\n Natural Language},\n booktitle = {Thirty-Fourth AAAI Conference on\n Artificial Intelligence},\n year = {2020},\n}\n", "homepage": "https://yonatanbisk.com/piqa/", "license": "", "features": {"goal": {"dtype": "string", "id": null, "_type": "Value"}, "sol1": {"dtype": "string", "id": null, "_type": "Value"}, "sol2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "piqa", "config_name": "plain_text", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4104026, "num_examples": 16113, "dataset_name": "piqa"}, "test": {"name": "test", "num_bytes": 761521, "num_examples": 3084, "dataset_name": "piqa"}, "validation": {"name": "validation", "num_bytes": 464321, "num_examples": 1838, "dataset_name": "piqa"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/piqa/physicaliqa-train-dev.zip": {"num_bytes": 1824009, "checksum": "54d32a04f59a7e354396f321723c8d7ec35cc6b08506563d8d1ffcc15ce98ddd"}, "/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/piqa/tests.jsonl": {"num_bytes": 814616, "checksum": "402f1e2e61347db773e6e5e0a6b24f97396b59f6fd046dcdcbc12f483ac8553b"}}, "download_size": 2638625, "post_processing_size": null, "dataset_size": 5329868, "size_in_bytes": 7968493}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/quac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/quac/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/quac/dataset_infos.json: 
-------------------------------------------------------------------------------- 1 | {"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/sat_analogies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/sat_analogies/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/story_cloze/README.md: -------------------------------------------------------------------------------- 1 | # datasets 2 | 3 | This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not. 4 | 5 | __NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). 
You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path. 6 | 7 | 8 | __WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed. 9 | -------------------------------------------------------------------------------- /lm_eval/datasets/story_cloze/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/story_cloze/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/story_cloze/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"2016": {"description": "\nStory Cloze Test' is a commonsense reasoning framework for evaluating story understanding,\nstory generation, and script learning.This test requires a system to choose the correct ending\nto a four-sentence story.\n", "citation": "@inproceedings{mostafazadeh2017lsdsem,\n title={Lsdsem 2017 shared task: The story cloze test},\n author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},\n booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},\n pages={46--51},\n year={2017}\n}\n", "homepage": "https://cs.rochester.edu/nlp/rocstories/", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2016", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 614084, "num_examples": 1871, "dataset_name": "story_cloze"}, "test": {"name": "test", "num_bytes": 613184, "num_examples": 1871, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 1227268, "size_in_bytes": 1227268}, "2018": {"description": "\nStory Cloze Test' is a commonsense reasoning framework for evaluating story understanding,\nstory generation, and script learning.This test requires a system to choose the correct ending\nto a four-sentence story.\n", "citation": "@inproceedings{mostafazadeh2017lsdsem,\n title={Lsdsem 2017 shared task: The story cloze test},\n author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},\n booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},\n pages={46--51},\n year={2017}\n}\n", "homepage": "https://cs.rochester.edu/nlp/rocstories/", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, 
"_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2018", "version": "0.0.0", "splits": {"validation": {"name": "validation", "num_bytes": 515439, "num_examples": 1571, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 515439, "size_in_bytes": 515439}} -------------------------------------------------------------------------------- /lm_eval/datasets/super_glue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/super_glue/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | dataset_info: 3 | features: 4 | - name: question_id 5 | dtype: string 6 | - name: question_source 7 | dtype: string 8 | - name: question 9 | dtype: string 10 | - name: answer 11 | struct: 12 | - name: aliases 13 | sequence: string 14 | - name: value 15 | dtype: string 16 | - name: search_results 17 | sequence: 18 | - name: description 19 | dtype: string 20 | - name: filename 21 | dtype: string 22 | - name: rank 23 | dtype: int32 24 | - name: title 25 | dtype: string 26 | - name: url 27 | dtype: string 28 | - name: search_context 29 | dtype: string 30 | config_name: triviaqa 31 | splits: 32 | - name: train 33 | num_bytes: 1270894387 34 | num_examples: 87622 35 | - name: validation 36 | num_bytes: 163755044 37 | num_examples: 11313 38 | download_size: 632549060 39 | dataset_size: 1434649431 40 | --- 41 | -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/triviaqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/unscramble/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/unscramble.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Unscramble dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{NEURIPS2020_1457c0d6, 25 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 26 | booktitle = {Advances in Neural Information Processing Systems}, 27 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 28 | pages = {1877--1901}, 29 | publisher = {Curran Associates, Inc.}, 30 | title = {Language Models are Few-Shot Learners}, 31 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 32 | volume = {33}, 33 | year = {2020} 34 | } 35 | """ 36 | 37 | _DESCRIPTION = """\ 38 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 39 | involves giving the model a word distorted by some combination of scrambling, 40 | addition, or deletion of characters, and asking it to recover the original word. 41 | """ 42 | 43 | _HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data" 44 | 45 | # TODO: Add the licence for the dataset here if you can find it 46 | _LICENSE = "" 47 | 48 | _BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data" 49 | 50 | 51 | _DESCRIPTIONS = { 52 | "mid_word_1_anagrams": "Anagrams of all but the first and last letter.", 53 | "mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.", 54 | "cycle_letters_in_word": "Cycle letters in the word.", 55 | "random_insertion_in_word": "Random insertions in the word that must be removed.", 56 | "reversed_words": "Words spelled backwards that must be reversed.", 57 | } 58 | _NAMES = _DESCRIPTIONS.keys() 59 | 60 | 61 | class Unscramble(datasets.GeneratorBasedBuilder): 62 | """Unscramble is a small battery of 5 “character manipulation” tasks.""" 63 | 64 | VERSION = datasets.Version("0.0.1") 65 | 66 | BUILDER_CONFIGS = [ 67 | datasets.BuilderConfig( 68 | name=name, version=version, description=_DESCRIPTIONS[name] 69 | ) 70 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 71 | ] 72 | 73 | def _info(self): 74 | features = datasets.Features( 75 | { 76 | "context": datasets.Value("string"), 77 | "completion": datasets.Value("string"), 78 | } 79 | ) 80 | return datasets.DatasetInfo( 81 | description=_DESCRIPTION, 82 | features=features, 83 | homepage=_HOMEPAGE, 84 | license=_LICENSE, 85 | citation=_CITATION, 86 | ) 87 | 88 | def _split_generators(self, dl_manager): 89 | urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz") 90 | data_dir = dl_manager.download_and_extract(urls) 91 | return [ 92 | datasets.SplitGenerator( 93 | name=datasets.Split.VALIDATION, 94 | # These kwargs will be passed to _generate_examples 95 | gen_kwargs={ 96 | "filepath": data_dir, 97 | "split": "validation", 98 | }, 99 | ), 100 | ] 101 | 102 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 103 | def _generate_examples(self, filepath, split): 104 | with open(filepath, encoding="utf-8") as f: 105 
| for key, row in enumerate(f): 106 | data = json.loads(row) 107 | yield key, { 108 | "context": data["context"], 109 | "completion": data["completion"], 110 | } 111 | -------------------------------------------------------------------------------- /lm_eval/datasets/wikitext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/wikitext/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/winogrande/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/winogrande/__init__.py -------------------------------------------------------------------------------- /lm_eval/decontamination/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/decontamination/__init__.py -------------------------------------------------------------------------------- /lm_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import gpt2 2 | from . import gpt3 3 | from . import textsynth 4 | from . import dummy 5 | from . import opt 6 | from . import bloom 7 | from . import llama 8 | 9 | 10 | MODEL_REGISTRY = { 11 | "hf": gpt2.HFLM, 12 | "gpt2": gpt2.GPT2LM, 13 | "gpt3": gpt3.GPT3LM, 14 | "textsynth": textsynth.TextSynthLM, 15 | "dummy": dummy.DummyLM, 16 | 'opt': opt.OPTLM, 17 | 'bloom': bloom.BLOOMLM, 18 | 'llama': llama.LLAMALM, 19 | } 20 | 21 | 22 | def get_model(model_name): 23 | return MODEL_REGISTRY[model_name] 24 | -------------------------------------------------------------------------------- /lm_eval/models/bloom.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | from lm_eval.base import BaseLM 4 | from accelerate.big_modeling import dispatch_model, infer_auto_device_map, get_balanced_memory 5 | 6 | 7 | class BLOOMLM(BaseLM): 8 | 9 | def __init__( 10 | self, 11 | device="cuda", 12 | pretrained="bloom", 13 | revision="main", 14 | subfolder=None, 15 | tokenizer=None, 16 | batch_size=1, 17 | dtype=torch.float32, 18 | max_length=-1 19 | ): 20 | super().__init__() 21 | 22 | assert isinstance(device, str) 23 | assert isinstance(pretrained, str) 24 | assert isinstance(batch_size, int) 25 | 26 | if device: 27 | if device not in ["cuda", "cpu"]: 28 | device = int(device) 29 | self._device = torch.device(device) 30 | print(f"Using device '{device}'") 31 | else: 32 | print("Device not specified") 33 | print(f"Cuda Available? 
{torch.cuda.is_available()}") 34 | self._device = ( 35 | torch.device("cuda") 36 | if torch.cuda.is_available() 37 | else torch.device("cpu") 38 | ) 39 | self.dtype = dtype 40 | self.model = transformers.AutoModelForCausalLM.from_pretrained( 41 | pretrained, 42 | revision=revision + ("/" + subfolder if subfolder is not None else ""), 43 | torch_dtype=self.dtype 44 | ) 45 | if max_length != -1: 46 | self.model.config.n_ctx = max_length 47 | else: 48 | self.model.config.n_ctx = 512 49 | self.pretrained = pretrained 50 | self.no_split_modules = self.model._no_split_modules 51 | self.model.eval() 52 | # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2 53 | self.tokenizer = transformers.AutoTokenizer.from_pretrained( 54 | pretrained if tokenizer is None else tokenizer, 55 | revision=revision, 56 | # subfolder=subfolder, 57 | use_fast=True, 58 | ) 59 | self.vocab_size = self.tokenizer.vocab_size 60 | self.batch_size_per_gpu = batch_size # todo: adaptive batch size 61 | 62 | def prepare_for_inference(self): 63 | self.no_split_modules = self.model._no_split_modules 64 | self.model.to(self.dtype) 65 | max_memory = get_balanced_memory( 66 | self.model, 67 | no_split_module_classes=self.no_split_modules, 68 | dtype=self.dtype 69 | ) 70 | device_map = infer_auto_device_map( 71 | self.model, 72 | no_split_module_classes=self.no_split_modules, 73 | dtype=self.dtype, 74 | max_memory=max_memory, 75 | ) 76 | print(device_map) 77 | dispatch_model(self.model, device_map=device_map) 78 | self.model.eval() 79 | 80 | @property 81 | def eot_token_id(self): 82 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 83 | return self.tokenizer.eos_token_id 84 | 85 | @property 86 | def max_length(self,): 87 | return self.model.config.n_ctx 88 | 89 | @property 90 | def max_gen_toks(self): 91 | return 256 92 | 93 | @property 94 | def batch_size(self): 95 | # TODO: fix multi-gpu 96 | return self.batch_size_per_gpu # * gpus 97 | 98 | @property 99 | def device(self): 100 | # TODO: fix multi-gpu 101 | return self._device 102 | 103 | def tok_encode(self, string: str): 104 | return self.tokenizer.encode(string, add_special_tokens=False) 105 | 106 | def tok_decode(self, tokens): 107 | return self.tokenizer.decode(tokens) 108 | 109 | def _model_call(self, inps, attention_mask=None): 110 | """ 111 | inps: a torch tensor of shape [batch, sequence] 112 | the size of sequence may vary from call to call 113 | 114 | returns: a torch tensor of shape [batch, sequence, vocab] with the 115 | logits returned from the model 116 | """ 117 | with torch.no_grad(): 118 | return self.model(inps, attention_mask=attention_mask)[0][:, :, :250680] 119 | 120 | def _model_generate(self, context, max_length, eos_token_id): 121 | return self.model.generate( 122 | context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False 123 | ) 124 | -------------------------------------------------------------------------------- /lm_eval/models/dummy.py: -------------------------------------------------------------------------------- 1 | import random 2 | from lm_eval.base import LM 3 | 4 | 5 | class DummyLM(LM): 6 | def __init__(self): 7 | pass 8 | 9 | @classmethod 10 | def create_from_arg_string(cls, arg_string, additional_config=None): 11 | return cls() 12 | 13 | def loglikelihood(self, requests): 14 | res = [] 15 | 16 | for _ in requests: 17 | res.append((-random.random(), False)) 18 | 19 | return res 20 | 21 | def greedy_until(self, requests): 22 | res = [] 23 | 24 | for 
ctx, _ in requests: 25 | res.append("lol") 26 | assert ctx.strip() != "" 27 | 28 | return res 29 | 30 | def loglikelihood_rolling(self, requests): 31 | res = [] 32 | 33 | for _ in requests: 34 | res.append(-random.random()) 35 | 36 | return res 37 | -------------------------------------------------------------------------------- /lm_eval/models/llama.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from lm_eval.base import BaseLM 3 | from transformers import LlamaForCausalLM, LlamaTokenizer 4 | from accelerate.big_modeling import dispatch_model, infer_auto_device_map, get_balanced_memory 5 | 6 | 7 | class LLAMALM(BaseLM): 8 | 9 | def __init__( 10 | self, 11 | device="cuda", 12 | pretrained="llama", 13 | revision="main", 14 | subfolder=None, 15 | tokenizer=None, 16 | batch_size=1, 17 | dtype=torch.float32, 18 | max_length=-1, 19 | ): 20 | super().__init__() 21 | 22 | assert isinstance(device, str) 23 | assert isinstance(pretrained, str) 24 | assert isinstance(batch_size, int) 25 | 26 | if device: 27 | if device not in ["cuda", "cpu"]: 28 | device = int(device) 29 | self._device = torch.device(device) 30 | print(f"Using device '{device}'") 31 | else: 32 | print("Device not specified") 33 | print(f"Cuda Available? {torch.cuda.is_available()}") 34 | self._device = ( 35 | torch.device("cuda") 36 | if torch.cuda.is_available() 37 | else torch.device("cpu") 38 | ) 39 | self.dtype = dtype 40 | self.model = LlamaForCausalLM.from_pretrained( 41 | pretrained, 42 | revision=revision + ("/" + subfolder if subfolder is not None else ""), 43 | torch_dtype=self.dtype 44 | ) 45 | if max_length != -1: 46 | self.model.config.max_sequence_length = max_length 47 | self.pretrained = pretrained 48 | self.no_split_modules = self.model._no_split_modules 49 | self.model.eval() 50 | self.tokenizer = LlamaTokenizer.from_pretrained( 51 | pretrained if tokenizer is None else tokenizer, 52 | revision=revision, 53 | # subfolder=subfolder, 54 | ) 55 | if self.tokenizer.pad_token_id is None: 56 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 57 | self.vocab_size = self.tokenizer.vocab_size 58 | self.batch_size_per_gpu = batch_size # todo: adaptive batch size 59 | 60 | def prepare_for_inference(self): 61 | self.no_split_modules = self.model._no_split_modules 62 | self.model.to(self.dtype) 63 | max_memory = get_balanced_memory( 64 | self.model, 65 | no_split_module_classes=self.no_split_modules, 66 | dtype=self.dtype, 67 | ) 68 | device_map = infer_auto_device_map( 69 | self.model, 70 | no_split_module_classes=self.no_split_modules, 71 | dtype=self.dtype, 72 | max_memory=max_memory, 73 | ) 74 | print(device_map) 75 | dispatch_model(self.model, device_map=device_map) 76 | self.model.eval() 77 | 78 | @property 79 | def eot_token_id(self): 80 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 81 | return self.tokenizer.eos_token_id 82 | 83 | @property 84 | def max_length(self): 85 | return self.model.config.max_sequence_length 86 | 87 | @property 88 | def max_gen_toks(self): 89 | return 256 90 | 91 | @property 92 | def batch_size(self): 93 | # TODO: fix multi-gpu 94 | return self.batch_size_per_gpu # * gpus 95 | 96 | @property 97 | def device(self): 98 | # TODO: fix multi-gpu 99 | return self._device 100 | 101 | def tok_encode(self, string: str): 102 | return self.tokenizer.encode(string, add_special_tokens=False) 103 | 104 | def tok_decode(self, tokens): 105 | return self.tokenizer.decode(tokens) 106 | 107 | def 
_model_call(self, inps, attention_mask=None): 108 | """ 109 | inps: a torch tensor of shape [batch, sequence] 110 | the size of sequence may vary from call to call 111 | 112 | returns: a torch tensor of shape [batch, sequence, vocab] with the 113 | logits returned from the model 114 | """ 115 | with torch.no_grad(): 116 | return self.model(inps, attention_mask=attention_mask)[0][:, :, :len(self.tokenizer)] 117 | 118 | def _model_generate(self, context, max_length, eos_token_id): 119 | return self.model.generate( 120 | context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False 121 | ) 122 | -------------------------------------------------------------------------------- /lm_eval/tasks/arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge 3 | https://arxiv.org/pdf/1803.05457.pdf 4 | 5 | The ARC dataset consists of 7,787 science exam questions drawn from a variety 6 | of sources, including science questions provided under license by a research 7 | partner affiliated with AI2. These are text-only, English language exam questions 8 | that span several grade levels as indicated in the files. Each question has a 9 | multiple choice structure (typically 4 answer options). The questions are sorted 10 | into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and 11 | a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions. 12 | 13 | Homepage: https://allenai.org/data/arc 14 | """ 15 | import inspect 16 | from lm_eval.base import MultipleChoiceTask 17 | import lm_eval.datasets.ai2_arc.ai2_arc 18 | 19 | _CITATION = """ 20 | @article{Clark2018ThinkYH, 21 | title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge}, 22 | author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord}, 23 | journal={ArXiv}, 24 | year={2018}, 25 | volume={abs/1803.05457} 26 | } 27 | """ 28 | 29 | 30 | class ARCEasy(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.ai2_arc.ai2_arc) 33 | DATASET_NAME = "ARC-Easy" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | # NOTE: Some `doc["answerKey"]`s are in numeric string format being one 57 | # of {'1', '2', '3', '4', '5'}. We map them back to letters. 
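# (Hedged illustration, not taken from the dataset itself: a doc whose answerKey is "3"
# is remapped to "C" below, so it can be looked up in the ["A", "B", "C", "D", "E"] index.)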
58 | num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"} 59 | doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"]) 60 | out_doc = { 61 | "id": doc["id"], 62 | "query": "Question: " + doc["question"] + "\nAnswer:", 63 | "choices": doc["choices"]["text"], 64 | "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]), 65 | } 66 | return out_doc 67 | 68 | def doc_to_text(self, doc): 69 | return doc["query"] 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["query"] 76 | 77 | 78 | class ARCChallenge(ARCEasy): 79 | DATASET_PATH = inspect.getfile(lm_eval.datasets.ai2_arc.ai2_arc) 80 | DATASET_NAME = "ARC-Challenge" 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/arithmetic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | A small battery of 10 tests that involve asking language models a simple arithmetic 6 | problem in natural language. 7 | 8 | Homepage: https://github.com/openai/gpt-3/tree/master/data 9 | """ 10 | import inspect 11 | import lm_eval.datasets.arithmetic.arithmetic 12 | from lm_eval.base import Task, rf 13 | from lm_eval.metrics import mean 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{NEURIPS2020_1457c0d6, 18 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 19 | booktitle = {Advances in Neural Information Processing Systems}, 20 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}, 21 | pages = {1877--1901}, 22 | publisher = {Curran Associates, Inc.}, 23 | title = {Language Models are Few-Shot Learners}, 24 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 25 | volume = {33}, 26 | year = {2020} 27 | } 28 | """ 29 | 30 | 31 | class Arithmetic(Task): 32 | VERSION = 0 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.arithmetic.arithmetic) 34 | 35 | def has_training_docs(self): 36 | return False 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return False 43 | 44 | def training_docs(self): 45 | return NotImplemented 46 | 47 | def validation_docs(self): 48 | return self.dataset["validation"] 49 | 50 | def test_docs(self): 51 | return NotImplemented 52 | 53 | def doc_to_text(self, doc): 54 | return doc["context"] 55 | 56 | def should_decontaminate(self): 57 | return True 58 | 59 | def doc_to_decontamination_query(self, doc): 60 | return doc["context"] 61 | 62 | def doc_to_target(self, doc): 63 | return doc["completion"] 64 | 65 | def construct_requests(self, doc, ctx): 66 | ll, is_prediction = rf.loglikelihood(ctx, doc["completion"]) 67 | return is_prediction 68 | 69 | def process_results(self, doc, results): 70 | (is_prediction,) = results 71 | return {"acc": is_prediction} 72 | 73 | def aggregation(self): 74 | return { 75 | "acc": mean, 76 | } 77 | 78 | def higher_is_better(self): 79 | return {"acc": True} 80 | 81 | 82 | class Arithmetic2DPlus(Arithmetic): 83 | DATASET_NAME = "arithmetic_2da" 84 | 85 | 86 | class Arithmetic2DMinus(Arithmetic): 87 | DATASET_NAME = "arithmetic_2ds" 88 | 89 | 90 | class Arithmetic3DPlus(Arithmetic): 91 | DATASET_NAME = "arithmetic_3da" 92 | 93 | 94 | class Arithmetic3DMinus(Arithmetic): 95 | DATASET_NAME = "arithmetic_3ds" 96 | 97 | 98 | class Arithmetic4DPlus(Arithmetic): 99 | DATASET_NAME = "arithmetic_4da" 100 | 101 | 102 | class Arithmetic4DMinus(Arithmetic): 103 | DATASET_NAME = "arithmetic_4ds" 104 | 105 | 106 | class Arithmetic5DPlus(Arithmetic): 107 | DATASET_NAME = "arithmetic_5da" 108 | 109 | 110 | class Arithmetic5DMinus(Arithmetic): 111 | DATASET_NAME = "arithmetic_5ds" 112 | 113 | 114 | class Arithmetic2DMultiplication(Arithmetic): 115 | DATASET_NAME = "arithmetic_2dm" 116 | 117 | 118 | class Arithmetic1DComposite(Arithmetic): 119 | DATASET_NAME = "arithmetic_1dc" 120 | -------------------------------------------------------------------------------- /lm_eval/tasks/asdiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers 3 | https://arxiv.org/abs/2106.15772 4 | 5 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 6 | patterns and problem types) English math word problem (MWP) corpus for evaluating 7 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 8 | remain limited either in language usage patterns or in problem types. We thus present 9 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 10 | types taught in elementary school. Each MWP is annotated with its problem type and grade 11 | level (for indicating the level of difficulty). 12 | 13 | NOTE: We currently ignore formulas for answer generation. 
14 | 15 | Homepage: https://github.com/chaochun/nlu-asdiv-dataset 16 | """ 17 | import inspect 18 | import lm_eval.datasets.asdiv.asdiv 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | 35 | class Asdiv(Task): 36 | VERSION = 0 37 | DATASET_PATH = inspect.getfile(lm_eval.datasets.asdiv.asdiv) 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return True 44 | 45 | def has_test_docs(self): 46 | return False 47 | 48 | def training_docs(self): 49 | raise NotImplementedError("This dataset has no training docs") 50 | 51 | def validation_docs(self): 52 | return self.dataset["validation"] 53 | 54 | def test_docs(self): 55 | raise NotImplementedError("This dataset has no test docs") 56 | 57 | def fewshot_context( 58 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 59 | ): 60 | assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def doc_to_text(self, doc): 66 | # TODO: add solution-type 67 | return doc["body"] + "\n" + "Question:" + doc["question"] + "\n" + "Answer:" 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["body"] + " " + doc["question"] 74 | 75 | def doc_to_target(self, doc): 76 | # TODO: add formula 77 | 78 | answer = doc["answer"].split(" (")[0] 79 | return " " + answer 80 | 81 | def construct_requests(self, doc, ctx): 82 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 83 | return ll, is_greedy 84 | 85 | def process_results(self, doc, results): 86 | ll, is_greedy = results 87 | 88 | return {"acc": int(is_greedy)} 89 | 90 | def aggregation(self): 91 | return {"acc": mean} 92 | 93 | def higher_is_better(self): 94 | return {"acc": True} 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/gsm8k.py: -------------------------------------------------------------------------------- 1 | """ 2 | "Training Verifiers to Solve Math Word Problems" 3 | https://arxiv.org/abs/2110.14168 4 | 5 | State-of-the-art language models can match human performance on many tasks, but 6 | they still struggle to robustly perform multi-step mathematical reasoning. To 7 | diagnose the failures of current models and support research, we introduce GSM8K, 8 | a dataset of 8.5K high quality linguistically diverse grade school math word problems. 9 | We find that even the largest transformer models fail to achieve high test performance, 10 | despite the conceptual simplicity of this problem distribution. 11 | 12 | NOTE: See the official implementation of the task: 13 | https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 14 | for how to make use of the dataset's calculator annotations in your language 15 | model's sample/generation function. 
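As a hedged illustration only (the example text below is made up, not quoted from the
dataset): solutions embed calculator annotations of the form <<expression=result>>,
e.g. "Tom pays 6 * 12 = <<6*12=72>>72 dollars. #### 72". A simple preprocessing hook
can strip the markers while keeping the human-readable text:

    import re

    CALC_RE = re.compile(r"<<([^=>]+)=([^>]*)>>")  # matches <<expr=result>> markers

    def strip_calculator_annotations(solution: str) -> str:
        # "... 6 * 12 = <<6*12=72>>72 dollars." -> "... 6 * 12 = 72 dollars."
        return CALC_RE.sub("", solution)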
16 | 17 | Homepage: https://github.com/openai/grade-school-math 18 | """ 19 | import re 20 | from lm_eval.base import Task, rf 21 | from lm_eval.metrics import mean 22 | 23 | 24 | _CITATION = """ 25 | @misc{cobbe2021training, 26 | title={Training Verifiers to Solve Math Word Problems}, 27 | author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, 28 | year={2021}, 29 | eprint={2110.14168}, 30 | archivePrefix={arXiv}, 31 | primaryClass={cs.LG} 32 | } 33 | """ 34 | 35 | 36 | ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 37 | INVALID_ANS = "[invalid]" 38 | 39 | 40 | class GradeSchoolMath8K(Task): 41 | VERSION = 0 42 | DATASET_PATH = "gsm8k" 43 | DATASET_NAME = "main" 44 | 45 | def has_training_docs(self): 46 | return True 47 | 48 | def has_validation_docs(self): 49 | return False 50 | 51 | def has_test_docs(self): 52 | return True 53 | 54 | def training_docs(self): 55 | return self.dataset["train"] 56 | 57 | def validation_docs(self): 58 | raise NotImplementedError 59 | 60 | def test_docs(self): 61 | return self.dataset["test"] 62 | 63 | def doc_to_text(self, doc): 64 | return "Question: " + doc["question"] + "\nAnswer:" 65 | 66 | def doc_to_target(self, doc): 67 | return " " + doc["answer"] 68 | 69 | def construct_requests(self, doc, ctx): 70 | """Uses RequestFactory to construct Requests and returns an iterable of 71 | Requests which will be sent to the LM. 72 | 73 | :param doc: 74 | The document as returned from training_docs, validation_docs, or test_docs. 75 | :param ctx: str 76 | The context string, generated by fewshot_context. This includes the natural 77 | language description, as well as the few shot examples, and the question 78 | part of the document for `doc`. 79 | """ 80 | # NOTE: The paper implements "verifiers" that assign a score to multiple 81 | # solutions and output the highest ranked solution. 82 | completion = rf.greedy_until(ctx, ["\n"]) 83 | return completion 84 | 85 | def _extract_answer(self, completion): 86 | match = ANS_RE.search(completion) 87 | if match: 88 | match_str = match.group(1).strip() 89 | match_str = match_str.replace(",", "") 90 | return match_str 91 | else: 92 | return INVALID_ANS 93 | 94 | def _is_correct(self, completion, answer): 95 | gold = self._extract_answer(answer) 96 | assert gold != INVALID_ANS, "No ground truth answer found in the document." 97 | return self._extract_answer(completion) == gold 98 | 99 | def process_results(self, doc, results): 100 | """Take a single document and the LM results and evaluates, returning a 101 | dict where keys are the names of submetrics and values are the values of 102 | the metric for that one document 103 | 104 | :param doc: 105 | The document as returned from training_docs, validation_docs, or test_docs. 106 | :param results: 107 | The results of the requests created in construct_requests. 
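Illustrative values only (not drawn from the dataset): for this task `results` holds
the single greedy completion, e.g. ["He pays 6 * 12 = 72 dollars. #### 72"], and it is
scored correct when the number extracted after "#### " matches the one in doc["answer"].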
108 | """ 109 | completion = results[0] 110 | answer = doc["answer"] 111 | return {"acc": self._is_correct(completion, answer)} 112 | 113 | def aggregation(self): 114 | """ 115 | :returns: {str: [float] -> float} 116 | A dictionary where keys are the names of submetrics and values are 117 | functions that aggregate a list of metrics 118 | """ 119 | return {"acc": mean} 120 | 121 | def higher_is_better(self): 122 | """ 123 | :returns: {str: bool} 124 | A dictionary where keys are the names of submetrics and values are 125 | whether a higher value of the submetric is better 126 | """ 127 | return {"acc": True} 128 | -------------------------------------------------------------------------------- /lm_eval/tasks/headqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering 3 | https://aclanthology.org/P19-1092.pdf 4 | 5 | HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to 6 | access a specialized position in the Spanish healthcare system, and are challenging 7 | even for highly specialized humans. 8 | 9 | Homepage: https://aghie.github.io/head-qa/ 10 | """ 11 | import inspect 12 | import lm_eval.datasets.headqa.headqa 13 | from lm_eval.base import MultipleChoiceTask 14 | 15 | 16 | _CITATION = """ 17 | @misc{liu2020interpretable, 18 | title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, 19 | author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu}, 20 | year={2020}, 21 | eprint={2008.02434}, 22 | archivePrefix={arXiv}, 23 | primaryClass={cs.AI} 24 | } 25 | """ 26 | 27 | 28 | class HeadQABase(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.headqa.headqa) 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "id": doc["qid"], 55 | "query": "Question: " + doc["qtext"] + "\nAnswer:", 56 | "choices": [answer["atext"] for answer in doc["answers"]], 57 | "gold": int(doc["ra"]) - 1, 58 | } 59 | return out_doc 60 | 61 | def doc_to_text(self, doc): 62 | return doc["query"] 63 | 64 | def should_decontaminate(self): 65 | return True 66 | 67 | def doc_to_decontamination_query(self, doc): 68 | return doc["query"] 69 | 70 | 71 | class HeadQAEn(HeadQABase): 72 | DATASET_NAME = "en" 73 | 74 | 75 | class HeadQAEs(HeadQABase): 76 | DATASET_NAME = "es" 77 | 78 | 79 | # for backwards compatibility 80 | class HeadQAEsDeprecated(HeadQABase): 81 | DATASET_NAME = "es" 82 | 83 | def __init__(self): 84 | super().__init__() 85 | print( 86 | "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info." 
87 | ) 88 | -------------------------------------------------------------------------------- /lm_eval/tasks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | HellaSwag: Can a Machine Really Finish Your Sentence? 3 | https://arxiv.org/pdf/1905.07830.pdf 4 | 5 | Hellaswag is a commonsense inference challenge dataset. Though its questions are 6 | trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is 7 | achieved via Adversarial Filtering (AF), a data collection paradigm wherein a 8 | series of discriminators iteratively select an adversarial set of machine-generated 9 | wrong answers. AF proves to be surprisingly robust. The key insight is to scale up 10 | the length and complexity of the dataset examples towards a critical 'Goldilocks' 11 | zone wherein generated text is ridiculous to humans, yet often misclassified by 12 | state-of-the-art models. 13 | 14 | Homepage: https://rowanzellers.com/hellaswag/ 15 | """ 16 | import re 17 | import inspect 18 | from lm_eval.base import MultipleChoiceTask 19 | import lm_eval.datasets.hellaswag.hellaswag 20 | 21 | _CITATION = """ 22 | @inproceedings{zellers2019hellaswag, 23 | title={HellaSwag: Can a Machine Really Finish Your Sentence?}, 24 | author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, 25 | booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 26 | year={2019} 27 | } 28 | """ 29 | 30 | 31 | class HellaSwag(MultipleChoiceTask): 32 | VERSION = 0 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.hellaswag.hellaswag) 34 | DATASET_NAME = "hellaswag" 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | if self._training_docs is None: 47 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 48 | return self._training_docs 49 | 50 | def validation_docs(self): 51 | return map(self._process_doc, self.dataset["validation"]) 52 | 53 | def _process_doc(self, doc): 54 | ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() 55 | out_doc = { 56 | "query": self.preprocess(doc["activity_label"] + ": " + ctx), 57 | "choices": [self.preprocess(ending) for ending in doc["endings"]], 58 | "gold": int(doc["label"]), 59 | } 60 | return out_doc 61 | 62 | @classmethod 63 | def preprocess(cls, text): 64 | text = text.strip() 65 | # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. 66 | text = text.replace(" [title]", ". ") 67 | text = re.sub("\\[.*?\\]", "", text) 68 | text = text.replace(" ", " ") 69 | return text 70 | 71 | def doc_to_text(self, doc): 72 | return doc["query"] 73 | 74 | def should_decontaminate(self): 75 | return True 76 | 77 | def doc_to_decontamination_query(self, doc): 78 | return doc["query"] 79 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 6 | understanding by means of a word prediction task. 
LAMBADA is a collection of narrative 7 | passages sharing the characteristic that human subjects are able to guess their last 8 | word if they are exposed to the whole passage, but not if they only see the last 9 | sentence preceding the target word. To succeed on LAMBADA, computational models 10 | cannot simply rely on local context, but must be able to keep track of information 11 | in the broader discourse. 12 | 13 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 14 | """ 15 | import inspect 16 | import lm_eval.datasets.lambada_openai.lambada_openai 17 | from lm_eval.base import Task, rf 18 | from lm_eval.metrics import mean, perplexity 19 | 20 | 21 | _CITATION = """ 22 | @misc{ 23 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 24 | title={The LAMBADA dataset}, 25 | DOI={10.5281/zenodo.2630551}, 26 | publisher={Zenodo}, 27 | year={2016}, 28 | month={Aug} 29 | } 30 | """ 31 | 32 | 33 | class LambadaBase(Task): 34 | VERSION = None 35 | 36 | def training_docs(self): 37 | if self.has_training_docs(): 38 | return self.dataset["train"] 39 | 40 | def validation_docs(self): 41 | if self.has_validation_docs(): 42 | return self.dataset["validation"] 43 | 44 | def test_docs(self): 45 | if self.has_test_docs(): 46 | return self.dataset["test"] 47 | 48 | def doc_to_text(self, doc): 49 | return doc["text"].rsplit(" ", 1)[0] 50 | 51 | def should_decontaminate(self): 52 | return True 53 | 54 | def doc_to_decontamination_query(self, doc): 55 | return doc["text"] 56 | 57 | def doc_to_target(self, doc): 58 | return " " + doc["text"].rsplit(" ", 1)[1] 59 | 60 | def construct_requests(self, doc, ctx): 61 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 62 | 63 | return ll, is_greedy 64 | 65 | def process_results(self, doc, results): 66 | ll, is_greedy = results 67 | 68 | return {"ppl": ll, "acc": int(is_greedy)} 69 | 70 | def aggregation(self): 71 | return {"ppl": perplexity, "acc": mean} 72 | 73 | def higher_is_better(self): 74 | return {"ppl": False, "acc": True} 75 | 76 | 77 | class LambadaStandard(LambadaBase): 78 | """The LAMBADA task using the standard original LAMBADA dataset.""" 79 | 80 | VERSION = 0 81 | DATASET_PATH = "lambada" 82 | 83 | def has_training_docs(self): 84 | return False 85 | 86 | def has_validation_docs(self): 87 | return True 88 | 89 | def has_test_docs(self): 90 | return True 91 | 92 | 93 | class LambadaOpenAI(LambadaBase): 94 | """The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the 95 | original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model. 96 | 97 | Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 98 | """ 99 | 100 | VERSION = 0 101 | DATASET_PATH = inspect.getfile(lm_eval.datasets.lambada_openai.lambada_openai) 102 | 103 | def has_training_docs(self): 104 | return False 105 | 106 | def has_validation_docs(self): 107 | return True 108 | 109 | def has_test_docs(self): 110 | return False 111 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_cloze.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | Cloze-style LAMBADA dataset. 
6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | """ 16 | from lm_eval.tasks.lambada import LambadaOpenAI, LambadaStandard 17 | 18 | 19 | _CITATION = """ 20 | @misc{ 21 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 22 | title={The LAMBADA dataset}, 23 | DOI={10.5281/zenodo.2630551}, 24 | publisher={Zenodo}, 25 | year={2016}, 26 | month={Aug} 27 | } 28 | """ 29 | 30 | 31 | class LambadaStandardCloze(LambadaStandard): 32 | """Cloze-style LambadaStandard.""" 33 | 34 | VERSION = 0 35 | 36 | def doc_to_text(self, doc): 37 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 38 | 39 | def should_decontaminate(self): 40 | return True 41 | 42 | def doc_to_decontamination_query(self, doc): 43 | return doc["text"] 44 | 45 | def doc_to_target(self, doc): 46 | return " " + doc["text"].rsplit(" ", 1)[1] 47 | 48 | 49 | class LambadaOpenAICloze(LambadaOpenAI): 50 | """Cloze-style LambadaOpenAI.""" 51 | 52 | VERSION = 0 53 | 54 | def doc_to_text(self, doc): 55 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["text"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["text"].rsplit(" ", 1)[1] 65 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_multilingual.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | The LAMBADA OpenAI dataset machine-translated to other languages. 6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 
13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | 16 | Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 17 | """ 18 | from .lambada import LambadaOpenAI 19 | 20 | 21 | _CITATION = """ 22 | @misc{ 23 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 24 | title={The LAMBADA dataset}, 25 | DOI={10.5281/zenodo.2630551}, 26 | publisher={Zenodo}, 27 | year={2016}, 28 | month={Aug} 29 | } 30 | """ 31 | 32 | 33 | class LambadaOpenAIMultilingualEnglish(LambadaOpenAI): 34 | VERSION = 0 35 | DATASET_NAME = "en" 36 | 37 | 38 | class LambadaOpenAIMultilingualFrench(LambadaOpenAI): 39 | VERSION = 0 40 | DATASET_NAME = "fr" 41 | 42 | 43 | class LambadaOpenAIMultilingualGerman(LambadaOpenAI): 44 | VERSION = 0 45 | DATASET_NAME = "de" 46 | 47 | 48 | class LambadaOpenAIMultilingualItalian(LambadaOpenAI): 49 | VERSION = 0 50 | DATASET_NAME = "it" 51 | 52 | 53 | class LambadaOpenAIMultilingualSpanish(LambadaOpenAI): 54 | VERSION = 0 55 | DATASET_NAME = "es" 56 | 57 | 58 | LANG_CLASSES = [ 59 | LambadaOpenAIMultilingualEnglish, 60 | LambadaOpenAIMultilingualFrench, 61 | LambadaOpenAIMultilingualGerman, 62 | LambadaOpenAIMultilingualItalian, 63 | LambadaOpenAIMultilingualSpanish, 64 | ] 65 | 66 | 67 | def construct_tasks(): 68 | tasks = {} 69 | for lang_class in LANG_CLASSES: 70 | tasks[f"lambada_openai_mt_{lang_class.DATASET_NAME}"] = lang_class 71 | return tasks 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/logiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning 3 | https://arxiv.org/pdf/2007.08124.pdf 4 | 5 | LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA 6 | instances, covering multiple types of deductive reasoning. Results show that state- 7 | of-the-art neural models perform by far worse than human ceiling. The dataset can 8 | also serve as a benchmark for reinvestigating logical AI under the deep learning 9 | NLP setting. 
10 | 11 | Homepage: https://github.com/lgw863/LogiQA-dataset 12 | """ 13 | import inspect 14 | import lm_eval.datasets.logiqa.logiqa 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @misc{liu2020logiqa, 20 | title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 21 | author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, 22 | year={2020}, 23 | eprint={2007.08124}, 24 | archivePrefix={arXiv}, 25 | primaryClass={cs.CL} 26 | } 27 | """ 28 | 29 | 30 | class LogiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.logiqa.logiqa) 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | def format_example(doc, choices): 57 | """ 58 | Passage: 59 | Question: 60 | Choices: 61 | A. 62 | B. 63 | C. 64 | D. 65 | Answer: 66 | """ 67 | prompt = "Passage: " + doc["context"] + "\n" 68 | prompt += "Question: " + doc["question"] + "\nChoices:\n" 69 | for choice, option in zip(choices, doc["options"]): 70 | prompt += f"{choice.upper()}. {option}\n" 71 | prompt += "Answer:" 72 | return prompt 73 | 74 | choices = ["a", "b", "c", "d"] 75 | return { 76 | "passage": doc["context"], # Used for decontamination 77 | "query": format_example(doc, choices), 78 | "choices": doc["options"], 79 | "gold": choices.index(doc["label"]), 80 | } 81 | 82 | def doc_to_text(self, doc): 83 | return doc["query"] 84 | 85 | def should_decontaminate(self): 86 | return True 87 | 88 | def doc_to_decontamination_query(self, doc): 89 | return doc["passage"] 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/mathqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms 3 | https://arxiv.org/pdf/1905.13319.pdf 4 | 5 | MathQA is a large-scale dataset of 37k English multiple-choice math word problems 6 | covering multiple math domain categories by modeling operation programs corresponding 7 | to word problems in the AQuA dataset (Ling et al., 2017). 
8 | 9 | Homepage: https://math-qa.github.io/math-QA/ 10 | """ 11 | import re 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @misc{amini2019mathqa, 17 | title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms}, 18 | author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi}, 19 | year={2019}, 20 | eprint={1905.13319}, 21 | archivePrefix={arXiv}, 22 | primaryClass={cs.CL} 23 | } 24 | """ 25 | 26 | 27 | class MathQA(MultipleChoiceTask): 28 | VERSION = 0 29 | DATASET_PATH = "math_qa" 30 | DATASET_NAME = None 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"]) 54 | choices = [ 55 | c[4:].rstrip(" ,") 56 | for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"]) 57 | ] 58 | 59 | out_doc = { 60 | "query": "Question: " + doc["Problem"] + "\nAnswer:", 61 | "choices": choices, 62 | "gold": answer_idx, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return doc["query"] 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/mutual.py: -------------------------------------------------------------------------------- 1 | """ 2 | MuTual: A Dataset for Multi-Turn Dialogue Reasoning 3 | https://www.aclweb.org/anthology/2020.acl-main.130/ 4 | 5 | MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is 6 | modified from Chinese high school English listening comprehension test data. 
7 | 8 | Homepage: https://github.com/Nealcly/MuTual 9 | """ 10 | import numpy as np 11 | import inspect 12 | import lm_eval.datasets.mutual.mutual 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{mutual, 19 | title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", 20 | author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , 21 | booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", 22 | year = "2020", 23 | publisher = "Association for Computational Linguistics", 24 | } 25 | """ 26 | 27 | 28 | class MuTualBase(Task): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual) 31 | DATASET_NAME = None 32 | CHOICES = ["A", "B", "C", "D"] 33 | 34 | def has_training_docs(self): 35 | return True 36 | 37 | def has_validation_docs(self): 38 | return True 39 | 40 | def has_test_docs(self): 41 | return False 42 | 43 | def training_docs(self): 44 | return self.dataset["train"] 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def test_docs(self): 50 | return NotImplemented 51 | 52 | def doc_to_text(self, doc): 53 | return self.detokenize(doc["article"]) 54 | 55 | def should_decontaminate(self): 56 | return True 57 | 58 | def doc_to_decontamination_query(self, doc): 59 | return doc["article"] 60 | 61 | def doc_to_target(self, doc): 62 | return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])]) 63 | 64 | def construct_requests(self, doc, ctx): 65 | lls = [] 66 | for option in doc["options"]: 67 | lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0]) 68 | return lls 69 | 70 | def detokenize(self, text): 71 | text = text.replace(" '", "'") 72 | text = text.replace(" \n", "\n") 73 | text = text.replace("\n ", "\n") 74 | text = text.replace(" n't", "n't") 75 | text = text.replace("`` ", '"') 76 | text = text.replace("''", '"') 77 | # punctuation 78 | text = text.replace(" :", ":") 79 | text = text.replace(" ;", ";") 80 | text = text.replace(" !", "!") 81 | text = text.replace(" ?", "?") 82 | text = text.replace(" ,", ",") 83 | text = text.replace(" .", ".") 84 | return text 85 | 86 | def process_results(self, doc, results): 87 | gold = self.CHOICES.index(doc["answers"]) 88 | r4_1 = np.argmax(results) == gold # r4_1 = accuracy 89 | ranks = sorted(results, reverse=True) 90 | r4_2 = (ranks.index(results[gold]) == 1) + r4_1 91 | mrr = 1.0 / (ranks.index(results[gold]) + 1) # `+ 1` for index offset 92 | return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr} 93 | 94 | def aggregation(self): 95 | return {"r@1": mean, "r@2": mean, "mrr": mean} 96 | 97 | def higher_is_better(self): 98 | return {"r@1": True, "r@2": True, "mrr": True} 99 | 100 | 101 | class MuTual(MuTualBase): 102 | DATASET_NAME = "mutual" 103 | 104 | 105 | class MuTualPlus(MuTualBase): 106 | DATASET_NAME = "mutual_plus" 107 | -------------------------------------------------------------------------------- /lm_eval/tasks/openbookqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering 3 | https://arxiv.org/pdf/1809.02789.pdf 4 | 5 | OpenBookQA is a question-answering dataset modeled after open book exams for 6 | assessing human understanding of a subject. 
It consists of 5,957 multiple-choice 7 | elementary-level science questions (4,957 train, 500 dev, 500 test), which probe 8 | the understanding of a small “book” of 1,326 core science facts and the application 9 | of these facts to novel situations. For training, the dataset includes a mapping 10 | from each question to the core science fact it was designed to probe. Answering 11 | OpenBookQA questions requires additional broad common knowledge, not contained 12 | in the book. The questions, by design, are answered incorrectly by both a retrieval- 13 | based algorithm and a word co-occurrence algorithm. 14 | 15 | Homepage: https://allenai.org/data/open-book-qa 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{OpenBookQA2018, 22 | title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering}, 23 | author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, 24 | booktitle={EMNLP}, 25 | year={2018} 26 | } 27 | """ 28 | 29 | 30 | class OpenBookQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "openbookqa" 33 | DATASET_NAME = "main" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | out_doc = { 57 | "id": doc["id"], 58 | "query": doc["question_stem"], 59 | "choices": doc["choices"]["text"], 60 | "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), 61 | } 62 | return out_doc 63 | 64 | def doc_to_text(self, doc): 65 | return doc["query"] 66 | 67 | def should_decontaminate(self): 68 | return True 69 | 70 | def doc_to_decontamination_query(self, doc): 71 | return doc["query"] 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/pile.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Pile: An 800GB Dataset of Diverse Text for Language Modeling 3 | https://arxiv.org/pdf/2101.00027.pdf 4 | 5 | The Pile is a 825 GiB diverse, open source language modelling data set that consists 6 | of 22 smaller, high-quality datasets combined together. To score well on Pile 7 | BPB (bits per byte), a model must be able to understand many disparate domains 8 | including books, github repositories, webpages, chat logs, and medical, physics, 9 | math, computer science, and philosophy papers. 
10 | 11 | Homepage: https://pile.eleuther.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.pile.pile 15 | from lm_eval.base import PerplexityTask 16 | 17 | 18 | _CITATION = """ 19 | @article{pile, 20 | title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling}, 21 | author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor}, 22 | journal={arXiv preprint arXiv:2101.00027}, 23 | year={2020} 24 | } 25 | """ 26 | 27 | 28 | class PilePerplexityTask(PerplexityTask): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile) 31 | DATASET_NAME = None 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def validation_docs(self): 40 | for doc in self.dataset["validation"]: 41 | yield doc["text"] 42 | 43 | def test_docs(self): 44 | for doc in self.dataset["test"]: 45 | yield doc["text"] 46 | 47 | 48 | class PileArxiv(PilePerplexityTask): 49 | DATASET_NAME = "pile_arxiv" 50 | 51 | 52 | class PileBooks3(PilePerplexityTask): 53 | DATASET_NAME = "pile_books3" 54 | 55 | 56 | class PileBookCorpus2(PilePerplexityTask): 57 | DATASET_NAME = "pile_bookcorpus2" 58 | 59 | 60 | class PileDmMathematics(PilePerplexityTask): 61 | DATASET_NAME = "pile_dm-mathematics" 62 | 63 | 64 | class PileEnron(PilePerplexityTask): 65 | DATASET_NAME = "pile_enron" 66 | 67 | 68 | class PileEuroparl(PilePerplexityTask): 69 | DATASET_NAME = "pile_europarl" 70 | 71 | 72 | class PileFreeLaw(PilePerplexityTask): 73 | DATASET_NAME = "pile_freelaw" 74 | 75 | 76 | class PileGithub(PilePerplexityTask): 77 | DATASET_NAME = "pile_github" 78 | 79 | 80 | class PileGutenberg(PilePerplexityTask): 81 | DATASET_NAME = "pile_gutenberg" 82 | 83 | 84 | class PileHackernews(PilePerplexityTask): 85 | DATASET_NAME = "pile_hackernews" 86 | 87 | 88 | class PileNIHExporter(PilePerplexityTask): 89 | DATASET_NAME = "pile_nih-exporter" 90 | 91 | 92 | class PileOpenSubtitles(PilePerplexityTask): 93 | DATASET_NAME = "pile_opensubtitles" 94 | 95 | 96 | class PileOpenWebText2(PilePerplexityTask): 97 | DATASET_NAME = "pile_openwebtext2" 98 | 99 | 100 | class PilePhilPapers(PilePerplexityTask): 101 | DATASET_NAME = "pile_philpapers" 102 | 103 | 104 | class PilePileCc(PilePerplexityTask): 105 | DATASET_NAME = "pile_pile-cc" 106 | 107 | 108 | class PilePubmedAbstracts(PilePerplexityTask): 109 | DATASET_NAME = "pile_pubmed-abstracts" 110 | 111 | 112 | class PilePubmedCentral(PilePerplexityTask): 113 | DATASET_NAME = "pile_pubmed-central" 114 | 115 | 116 | class PileStackExchange(PilePerplexityTask): 117 | DATASET_NAME = "pile_stackexchange" 118 | 119 | 120 | class PileUspto(PilePerplexityTask): 121 | DATASET_NAME = "pile_upsto" 122 | 123 | 124 | class PileUbuntuIrc(PilePerplexityTask): 125 | DATASET_NAME = "pile_ubuntu-irc" 126 | 127 | 128 | class PileWikipedia(PilePerplexityTask): 129 | DATASET_NAME = "pile_wikipedia" 130 | 131 | 132 | class PileYoutubeSubtitles(PilePerplexityTask): 133 | DATASET_NAME = "pile_youtubesubtitles" 134 | -------------------------------------------------------------------------------- /lm_eval/tasks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PIQA: Reasoning about Physical Commonsense in Natural Language 3 | https://arxiv.org/pdf/1911.11641.pdf 4 | 5 | Physical Interaction: Question Answering (PIQA) is a 
physical commonsense 6 | reasoning and a corresponding benchmark dataset. PIQA was designed to investigate 7 | the physical knowledge of existing models. To what extent are current approaches 8 | actually learning about the world? 9 | 10 | Homepage: https://yonatanbisk.com/piqa/ 11 | """ 12 | import inspect 13 | from lm_eval.base import MultipleChoiceTask 14 | import lm_eval.datasets.piqa.piqa 15 | 16 | _CITATION = """ 17 | @inproceedings{Bisk2020, 18 | author = {Yonatan Bisk and Rowan Zellers and 19 | Ronan Le Bras and Jianfeng Gao 20 | and Yejin Choi}, 21 | title = {PIQA: Reasoning about Physical Commonsense in 22 | Natural Language}, 23 | booktitle = {Thirty-Fourth AAAI Conference on 24 | Artificial Intelligence}, 25 | year = {2020}, 26 | } 27 | """ 28 | 29 | 30 | class PiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.piqa.piqa) 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return False 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "goal": doc["goal"], 55 | "choices": [doc["sol1"], doc["sol2"]], 56 | "gold": doc["label"], 57 | } 58 | return out_doc 59 | 60 | def doc_to_text(self, doc): 61 | return "Question: " + doc["goal"] + "\nAnswer:" 62 | 63 | def should_decontaminate(self): 64 | return True 65 | 66 | def doc_to_decontamination_query(self, doc): 67 | return doc["goal"] 68 | -------------------------------------------------------------------------------- /lm_eval/tasks/prost.py: -------------------------------------------------------------------------------- 1 | """ 2 | PROST: Physical Reasoning about Objects Through Space and Time 3 | https://arxiv.org/pdf/2106.03634.pdf 4 | 5 | PROST, Physical Reasoning about Objects Through Space and Time, is a dataset 6 | consisting of 18,736 multiple-choice questions made from 14 manually curated 7 | templates, covering 10 physical reasoning concepts. All questions are designed 8 | to probe both causal and masked language models in a zero-shot setting. 9 | 10 | NOTE: PROST is limited to the zero-shot setting to adhere to authors' intentions 11 | as discussed in section 7 of the paper: "We hope that the community will use 12 | this dataset in the intended way: in a zero-shot setting to probe models which 13 | have been trained on data not specifically collected to succeed on PROST." 
14 | 15 | Homepage: https://github.com/nala-cub/prost 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{aroca-ouellette-etal-2021-prost, 22 | title = "{PROST}: {P}hysical Reasoning about Objects through Space and Time", 23 | author = "Aroca-Ouellette, St{\'e}phane and 24 | Paik, Cory and 25 | Roncone, Alessandro and 26 | Kann, Katharina", 27 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 28 | month = aug, 29 | year = "2021", 30 | address = "Online", 31 | publisher = "Association for Computational Linguistics", 32 | url = "https://aclanthology.org/2021.findings-acl.404", 33 | pages = "4597--4608", 34 | } 35 | """ 36 | 37 | 38 | class PROST(MultipleChoiceTask): 39 | VERSION = 0 40 | DATASET_PATH = "corypaik/prost" 41 | DATASET_NAME = None 42 | 43 | def has_training_docs(self): 44 | return False 45 | 46 | def has_validation_docs(self): 47 | return False 48 | 49 | def has_test_docs(self): 50 | return True 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def fewshot_context( 56 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 57 | ): 58 | assert ( 59 | num_fewshot == 0 60 | ), "PROST is designed to probe models in a zero-shot fashion only." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def _process_doc(self, doc): 66 | out_doc = { 67 | "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:", 68 | "choices": [doc["A"], doc["B"], doc["C"], doc["D"]], 69 | "gold": doc["label"], 70 | } 71 | return out_doc 72 | 73 | def doc_to_text(self, doc): 74 | return doc["query"] 75 | 76 | def should_decontaminate(self): 77 | return True 78 | 79 | def doc_to_decontamination_query(self, doc): 80 | return doc["query"] 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/pubmedqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PubMedQA: A Dataset for Biomedical Research Question Answering 3 | https://arxiv.org/pdf/1909.06146.pdf 4 | 5 | PubMedQA is a novel biomedical question answering (QA) dataset collected from 6 | PubMed abstracts. The task of PubMedQA is to answer research questions with 7 | yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after 8 | coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA 9 | has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA 10 | instances. Each PubMedQA instance is composed of (1) a question which is either 11 | an existing research article title or derived from one, (2) a context which is 12 | the corresponding abstract without its conclusion, (3) a long answer, which is 13 | the conclusion of the abstract and, presumably, answers the research question, 14 | and (4) a yes/no/maybe answer which summarizes the conclusion. 
15 | 16 | Homepage: https://pubmedqa.github.io/ 17 | """ 18 | import numpy as np 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @inproceedings{jin2019pubmedqa, 25 | title={PubMedQA: A Dataset for Biomedical Research Question Answering}, 26 | author={Jin, Qiao and Dhingra, Bhuwan and Liu, Zhengping and Cohen, William and Lu, Xinghua}, 27 | booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, 28 | pages={2567--2577}, 29 | year={2019} 30 | } 31 | """ 32 | 33 | 34 | class Pubmed_QA(Task): 35 | VERSION = 0 36 | DATASET_PATH = "pubmed_qa" 37 | DATASET_NAME = "pqa_labeled" 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def test_docs(self): 49 | if self.has_test_docs(): 50 | # HF is labelled as train but its really just for testing 51 | return self.dataset["train"] 52 | 53 | def doc_to_text(self, doc): 54 | ctxs = "\n".join(doc["context"]["contexts"]) 55 | return "Abstract: {}\nQuestion: {}\nAnswer:".format( 56 | ctxs, doc["question"], doc["final_decision"] 57 | ) 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] + " " + "\n".join(doc["context"]["contexts"]) 64 | 65 | def doc_to_target(self, doc): 66 | return " {}".format(doc["final_decision"]) 67 | 68 | def construct_requests(self, doc, ctx): 69 | """Uses RequestFactory to construct Requests and returns 70 | an iterable of Requests which will be sent to the LM. 71 | """ 72 | ll_yes, _ = rf.loglikelihood(ctx, " yes") 73 | ll_no, _ = rf.loglikelihood(ctx, " no") 74 | ll_maybe, _ = rf.loglikelihood(ctx, " maybe") 75 | return ll_yes, ll_no, ll_maybe 76 | 77 | def process_results(self, doc, results): 78 | gold = doc["final_decision"] 79 | ll_yes, ll_no, ll_maybe = results 80 | pred = np.argmax(results) 81 | return { 82 | "acc": ["yes", "no", "maybe"][pred] == gold, 83 | } 84 | 85 | def aggregation(self): 86 | return {"acc": mean} 87 | 88 | def higher_is_better(self): 89 | return {"acc": True} 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/qa4mre.py: -------------------------------------------------------------------------------- 1 | """ 2 | QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation 3 | https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf 4 | 5 | The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013. 6 | The main objective of this exercise is to develop a methodology for evaluating 7 | Machine Reading systems through Question Answering and Reading Comprehension 8 | Tests. Systems should be able to extract knowledge from large volumes of text 9 | and use this knowledge to answer questions. Four different tasks have been 10 | organized during these years: Main Task, Processing Modality and Negation for 11 | Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, 12 | and Entrance Exam. 
13 | 14 | Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php 15 | """ 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @inproceedings{Peas2013QA4MRE2O, 21 | title={QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation}, 22 | author={Anselmo Pe{\~n}as and Eduard H. Hovy and Pamela Forner and {\'A}lvaro Rodrigo and Richard F. E. Sutcliffe and Roser Morante}, 23 | booktitle={CLEF}, 24 | year={2013} 25 | } 26 | """ # noqa: W605 27 | 28 | 29 | class QA4MRE(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = "qa4mre" 32 | DATASET_NAME = None 33 | 34 | def has_training_docs(self): 35 | return False 36 | 37 | def has_validation_docs(self): 38 | return False 39 | 40 | def has_test_docs(self): 41 | return True 42 | 43 | def test_docs(self): 44 | # `qa4mre` only has train data so we use it for the test docs. 45 | return map(self._process_doc, self.dataset["train"]) 46 | 47 | def _process_doc(self, doc): 48 | choices = doc["answer_options"]["answer_str"] 49 | out_doc = { 50 | "source": doc["document_str"].strip().replace("'", "'"), 51 | "query": doc["question_str"], 52 | "choices": choices, 53 | "gold": int(doc["correct_answer_id"]) - 1, 54 | } 55 | return out_doc 56 | 57 | def doc_to_text(self, doc): 58 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]) 59 | 60 | def should_decontaminate(self): 61 | return True 62 | 63 | def doc_to_decontamination_query(self, doc): 64 | return doc["source"] + " " + doc["query"] 65 | 66 | 67 | class QA4MRE_2011(QA4MRE): 68 | DATASET_NAME = "2011.main.EN" 69 | 70 | 71 | class QA4MRE_2012(QA4MRE): 72 | DATASET_NAME = "2012.main.EN" 73 | 74 | 75 | class QA4MRE_2013(QA4MRE): 76 | DATASET_NAME = "2013.main.EN" 77 | -------------------------------------------------------------------------------- /lm_eval/tasks/quac.py: -------------------------------------------------------------------------------- 1 | """ 2 | QuAC: Question Answering in Context 3 | https://arxiv.org/abs/1808.07036 4 | 5 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 6 | participating in information seeking dialog. Data instances consist of an interactive 7 | dialog between two crowd workers: (1) a student who poses a sequence of freeform 8 | questions to learn as much as possible about a hidden Wikipedia text, and (2) 9 | a teacher who answers the questions by providing short excerpts (spans) from the text. 
10 | 11 | Homepage: https://quac.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.quac.quac 15 | from lm_eval.base import Task 16 | 17 | 18 | _CITATION = """ 19 | @article{choi2018quac, 20 | title={Quac: Question answering in context}, 21 | author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, 22 | journal={arXiv preprint arXiv:1808.07036}, 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class QuAC(Task): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) 31 | DATASET_NAME = None 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def test_docs(self): 51 | raise NotImplementedError("QuAC has no test docs.") 52 | 53 | def _process_doc(self, doc): 54 | doc["title"] = doc["title"] + " - " + doc["section_title"] 55 | return doc 56 | 57 | def doc_to_text(self, doc): 58 | return ( 59 | "TITLE: " 60 | + doc["title"] 61 | + "\n" 62 | + "PARAGRAPH: " 63 | + doc["paragraph"] 64 | + "\n\n" 65 | + "Q: " 66 | + doc["question"] 67 | + "\n\n" 68 | + "A: " 69 | ) 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["paragraph"] 76 | 77 | def doc_to_target(self, doc): 78 | return doc["answer"] 79 | 80 | def construct_requests(self, doc, ctx): 81 | """Uses RequestFactory to construct Requests and returns an iterable of 82 | Requests which will be sent to the LM. 83 | 84 | :param doc: 85 | The document as returned from training_docs, validation_docs, or test_docs. 86 | :param ctx: str 87 | The context string, generated by fewshot_context. This includes the natural 88 | language description, as well as the few shot examples, and the question 89 | part of the document for `doc`. 90 | """ 91 | # TODO: implement evaluation. 92 | raise NotImplementedError("Evaluation not implemented") 93 | 94 | def process_results(self, doc, results): 95 | """Take a single document and the LM results and evaluates, returning a 96 | dict where keys are the names of submetrics and values are the values of 97 | the metric for that one document 98 | 99 | :param doc: 100 | The document as returned from training_docs, validation_docs, or test_docs. 101 | :param results: 102 | The results of the requests created in construct_requests. 103 | """ 104 | # TODO: implement evaluation. 105 | raise NotImplementedError("Evaluation not implemented") 106 | 107 | def aggregation(self): 108 | """ 109 | :returns: {str: [float] -> float} 110 | A dictionary where keys are the names of submetrics and values are 111 | functions that aggregate a list of metrics 112 | """ 113 | # TODO: implement evaluation. 114 | raise NotImplementedError("Evaluation not implemented") 115 | 116 | def higher_is_better(self): 117 | """ 118 | :returns: {str: bool} 119 | A dictionary where keys are the names of submetrics and values are 120 | whether a higher value of the submetric is better 121 | """ 122 | # TODO: implement evaluation. 
123 | raise NotImplementedError("Evaluation not implemented") 124 | -------------------------------------------------------------------------------- /lm_eval/tasks/sat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Similarity of Semantic Relations 3 | https://arxiv.org/pdf/cs/0608100.pdf 4 | 5 | SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 6 | multiple-choice analogy questions; 5 choices per question. 7 | 8 | Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) 9 | """ 10 | import inspect 11 | import lm_eval.datasets.sat_analogies.sat_analogies 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @article{article, 17 | author = {Turney, Peter}, 18 | year = {2006}, 19 | month = {09}, 20 | pages = {379-416}, 21 | title = {Similarity of Semantic Relations}, 22 | volume = {32}, 23 | journal = {Computational Linguistics}, 24 | doi = {10.1162/coli.2006.32.3.379} 25 | } 26 | """ 27 | 28 | 29 | class SATAnalogies(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies) 32 | DATASET_NAME = None 33 | 34 | def __init__(self, data_dir: str): 35 | """ 36 | SAT Analog Questions is not publicly available. You must request the data 37 | by emailing Peter Turney and then download it to a local directory path 38 | which should be passed into the `data_dir` arg. 39 | """ 40 | super().__init__(data_dir=data_dir) 41 | 42 | def has_training_docs(self): 43 | return False 44 | 45 | def has_validation_docs(self): 46 | return True 47 | 48 | def has_test_docs(self): 49 | return False 50 | 51 | def training_docs(self): 52 | return [] 53 | 54 | def validation_docs(self): 55 | return map(self._process_doc, self.dataset["validation"]) 56 | 57 | def test_docs(self): 58 | return [] 59 | 60 | def _process_doc(self, doc): 61 | return { 62 | "source": doc["source"], 63 | "query": doc["stem"].split(" ")[:2], 64 | "choices": [ 65 | "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"] 66 | ], 67 | "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()), 68 | } 69 | 70 | def doc_to_text(self, doc): 71 | return "{} is to {} as".format(*doc["query"]) 72 | 73 | def should_decontaminate(self): 74 | return True 75 | 76 | def doc_to_decontamination_query(self, doc): 77 | return doc["source"] + "\n" + " ".join(doc["query"]) 78 | -------------------------------------------------------------------------------- /lm_eval/tasks/sciq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crowdsourcing Multiple Choice Science Questions 3 | https://aclanthology.org/W17-4413.pdf 4 | 5 | The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, 6 | Chemistry and Biology, among others. The questions are in multiple-choice format 7 | with 4 answer options each. For the majority of the questions, an additional paragraph 8 | with supporting evidence for the correct answer is provided. 9 | 10 | Homepage: https://allenai.org/data/sciq 11 | """ 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @inproceedings{Welbl2017CrowdsourcingMC, 17 | title={Crowdsourcing Multiple Choice Science Questions}, 18 | author={Johannes Welbl and Nelson F. 
Liu and Matt Gardner}, 19 | booktitle={NUT@EMNLP}, 20 | year={2017} 21 | } 22 | """ 23 | 24 | 25 | class SciQ(MultipleChoiceTask): 26 | VERSION = 0 27 | DATASET_PATH = "sciq" 28 | DATASET_NAME = None 29 | 30 | def has_training_docs(self): 31 | return True 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def training_docs(self): 40 | if self._training_docs is None: 41 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 42 | return self._training_docs 43 | 44 | def validation_docs(self): 45 | return map(self._process_doc, self.dataset["validation"]) 46 | 47 | def test_docs(self): 48 | return map(self._process_doc, self.dataset["test"]) 49 | 50 | def _process_doc(self, doc): 51 | choices = [ 52 | doc["distractor1"], 53 | doc["distractor2"], 54 | doc["distractor3"], 55 | doc["correct_answer"], 56 | ] 57 | src = doc["support"] 58 | out_doc = { 59 | "source": src, 60 | "query": doc["question"], 61 | "choices": choices, 62 | "gold": 3, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip() 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["source"] + " " + doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/swag.py: -------------------------------------------------------------------------------- 1 | """ 2 | SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference 3 | https://arxiv.org/pdf/1808.05326.pdf 4 | 5 | SWAG (Situations With Adversarial Generations) is an adversarial dataset 6 | that consists of 113k multiple choice questions about grounded situations. Each 7 | question is a video caption from LSMDC or ActivityNet Captions, with four answer 8 | choices about what might happen next in the scene. The correct answer is the 9 | (real) video caption for the next event in the video; the three incorrect 10 | answers are adversarially generated and human verified, so as to fool machines 11 | but not humans. 
12 | 13 | Homepage: https://rowanzellers.com/swag/ 14 | """ 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @inproceedings{zellers2018swagaf, 20 | title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference}, 21 | author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin}, 22 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)", 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class SWAG(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = "swag" 31 | DATASET_NAME = "regular" 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def _process_doc(self, doc): 51 | out_doc = { 52 | "query": doc["startphrase"], 53 | "choices": [doc["ending0"], doc["ending1"], doc["ending2"], doc["ending3"]], 54 | "gold": int(doc["label"]), 55 | } 56 | return out_doc 57 | 58 | def doc_to_text(self, doc): 59 | return doc["query"] 60 | -------------------------------------------------------------------------------- /lm_eval/tasks/triviaqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension 3 | https://arxiv.org/pdf/1705.03551.pdf 4 | 5 | TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence 6 | triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts 7 | and independently gathered evidence documents, six per question on average, that provide 8 | high quality distant supervision for answering the questions. 9 | 10 | Homepage: https://nlp.cs.washington.edu/triviaqa/ 11 | """ 12 | import inspect 13 | import lm_eval.datasets.triviaqa.triviaqa 14 | from lm_eval.base import Task, rf 15 | from lm_eval.metrics import mean 16 | 17 | 18 | _CITATION = """ 19 | @InProceedings{JoshiTriviaQA2017, 20 | author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke}, 21 | title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 22 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, 23 | month = {July}, 24 | year = {2017}, 25 | address = {Vancouver, Canada}, 26 | publisher = {Association for Computational Linguistics}, 27 | } 28 | """ 29 | 30 | 31 | class TriviaQA(Task): 32 | VERSION = 1 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa) 34 | DATASET_NAME = None 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | return self.dataset["train"] 47 | 48 | def validation_docs(self): 49 | return self.dataset["validation"] 50 | 51 | def test_docs(self): 52 | raise NotImplementedError() 53 | 54 | def doc_to_text(self, doc): 55 | return f"Question: {doc['question']}\nAnswer:" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["question"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["answer"]["value"] 65 | 66 | def _remove_prefixes(self, aliases): 67 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 68 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 69 | aliases.sort() 70 | ret = [aliases[0]] 71 | for alias in aliases[1:]: 72 | if not alias.startswith(ret[-1]): 73 | ret.append(alias) 74 | return ret 75 | 76 | def construct_requests(self, doc, ctx): 77 | ret = [] 78 | for alias in self._remove_prefixes(doc["answer"]["aliases"]): 79 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 80 | ret.append(is_prediction) 81 | return ret 82 | 83 | def process_results(self, doc, results): 84 | return {"acc": float(any(results))} 85 | 86 | def aggregation(self): 87 | return { 88 | "acc": mean, 89 | } 90 | 91 | def higher_is_better(self): 92 | return {"acc": True} 93 | -------------------------------------------------------------------------------- /lm_eval/tasks/unscramble.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 6 | involves giving the model a word distorted by some combination of scrambling, 7 | addition, or deletion of characters, and asking it to recover the original word. 
8 | 9 | Homepage: https://github.com/openai/gpt-3/tree/master/data 10 | """ 11 | import inspect 12 | import lm_eval.datasets.unscramble.unscramble 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{NEURIPS2020_1457c0d6, 19 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 20 | booktitle = {Advances in Neural Information Processing Systems}, 21 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 22 | pages = {1877--1901}, 23 | publisher = {Curran Associates, Inc.}, 24 | title = {Language Models are Few-Shot Learners}, 25 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 26 | volume = {33}, 27 | year = {2020} 28 | } 29 | """ 30 | 31 | 32 | class WordUnscrambleTask(Task): 33 | VERSION = 0 34 | DATASET_PATH = inspect.getfile(lm_eval.datasets.unscramble.unscramble) 35 | DATASET_NAME = None 36 | 37 | def has_training_docs(self): 38 | return False 39 | 40 | def has_validation_docs(self): 41 | return True 42 | 43 | def has_test_docs(self): 44 | return False 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def doc_to_text(self, doc): 50 | return doc["context"] 51 | 52 | def should_decontaminate(self): 53 | return True 54 | 55 | def doc_to_decontamination_query(self, doc): 56 | return doc["context"] 57 | 58 | def doc_to_target(self, doc): 59 | return doc["completion"] 60 | 61 | def construct_requests(self, doc, ctx): 62 | completion = rf.greedy_until(ctx, ["\n"]) 63 | return completion 64 | 65 | def process_results(self, doc, results): 66 | pred = results[0] 67 | gold = doc["completion"] 68 | return {"acc": int(pred == gold)} 69 | 70 | def aggregation(self): 71 | return {"acc": mean} 72 | 73 | def higher_is_better(self): 74 | return {"acc": True} 75 | 76 | 77 | class Anagrams1(WordUnscrambleTask): 78 | DATASET_NAME = "mid_word_1_anagrams" 79 | 80 | 81 | class Anagrams2(WordUnscrambleTask): 82 | DATASET_NAME = "mid_word_2_anagrams" 83 | 84 | 85 | class CycleLetters(WordUnscrambleTask): 86 | DATASET_NAME = "cycle_letters_in_word" 87 | 88 | 89 | class RandomInsertion(WordUnscrambleTask): 90 | DATASET_NAME = "random_insertion_in_word" 91 | 92 | 93 | class ReversedWords(WordUnscrambleTask): 94 | DATASET_NAME = "reversed_words" 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/webqs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Semantic Parsing on Freebase from Question-Answer Pairs 3 | https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf 4 | 5 | WebQuestions is a benchmark for question answering. The dataset consists of 6,642 6 | question/answer pairs. The questions are supposed to be answerable by Freebase, a 7 | large knowledge graph. The questions are mostly centered around a single named entity. 
8 | The questions are popular ones asked on the web (at least in 2013). 9 | 10 | Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a 11 | """ 12 | from lm_eval.base import rf, Task 13 | from lm_eval.metrics import mean 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{berant-etal-2013-semantic, 18 | title = "Semantic Parsing on {F}reebase from Question-Answer Pairs", 19 | author = "Berant, Jonathan and 20 | Chou, Andrew and 21 | Frostig, Roy and 22 | Liang, Percy", 23 | booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing", 24 | month = oct, 25 | year = "2013", 26 | address = "Seattle, Washington, USA", 27 | publisher = "Association for Computational Linguistics", 28 | url = "https://aclanthology.org/D13-1160", 29 | pages = "1533--1544", 30 | } 31 | """ 32 | 33 | 34 | class WebQs(Task): 35 | VERSION = 0 36 | DATASET_PATH = "web_questions" 37 | DATASET_NAME = None 38 | 39 | def has_training_docs(self): 40 | return True 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def training_docs(self): 49 | if self._training_docs is None: 50 | self._training_docs = list(self.dataset["train"]) 51 | return self._training_docs 52 | 53 | def test_docs(self): 54 | return self.dataset["test"] 55 | 56 | def doc_to_text(self, doc): 57 | return "Question: " + doc["question"] + "\nAnswer:" 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] 64 | 65 | def doc_to_target(self, doc): 66 | # this picks one answer to be the "correct" one, despite sometimes 67 | # multiple correct answers being possible. 68 | # TODO: make sure we're actually handling multi-answer correctly 69 | return " " + doc["answers"][0] 70 | 71 | def _remove_prefixes(self, aliases): 72 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 73 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 74 | aliases.sort() 75 | ret = [aliases[0]] 76 | for alias in aliases[1:]: 77 | if not alias.startswith(ret[-1]): 78 | ret.append(alias) 79 | 80 | return ret 81 | 82 | def construct_requests(self, doc, ctx): 83 | ret = [] 84 | for alias in self._remove_prefixes(doc["answers"]): 85 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 86 | ret.append(is_prediction) 87 | return ret 88 | 89 | def process_results(self, doc, results): 90 | return {"acc": float(any(results))} 91 | 92 | def aggregation(self): 93 | return { 94 | "acc": mean, 95 | } 96 | 97 | def higher_is_better(self): 98 | return {"acc": True} 99 | -------------------------------------------------------------------------------- /lm_eval/tasks/wikitext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pointer Sentinel Mixture Models 3 | https://arxiv.org/pdf/1609.07843.pdf 4 | 5 | The WikiText language modeling dataset is a collection of over 100 million tokens 6 | extracted from the set of verified Good and Featured articles on Wikipedia. 7 | 8 | NOTE: This `Task` is based on WikiText-2. 
9 | 10 | Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ 11 | """ 12 | import re 13 | import inspect 14 | import lm_eval.datasets.wikitext.wikitext 15 | from lm_eval.base import PerplexityTask 16 | 17 | 18 | _CITATION = """ 19 | @misc{merity2016pointer, 20 | title={Pointer Sentinel Mixture Models}, 21 | author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher}, 22 | year={2016}, 23 | eprint={1609.07843}, 24 | archivePrefix={arXiv}, 25 | primaryClass={cs.CL} 26 | } 27 | """ 28 | 29 | 30 | def wikitext_detokenizer(string): 31 | # contractions 32 | string = string.replace("s '", "s'") 33 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 34 | # number separators 35 | string = string.replace(" @-@ ", "-") 36 | string = string.replace(" @,@ ", ",") 37 | string = string.replace(" @.@ ", ".") 38 | # punctuation 39 | string = string.replace(" : ", ": ") 40 | string = string.replace(" ; ", "; ") 41 | string = string.replace(" . ", ". ") 42 | string = string.replace(" ! ", "! ") 43 | string = string.replace(" ? ", "? ") 44 | string = string.replace(" , ", ", ") 45 | # double brackets 46 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 47 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 48 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 49 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 50 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 51 | # miscellaneous 52 | string = string.replace("= = = =", "====") 53 | string = string.replace("= = =", "===") 54 | string = string.replace("= =", "==") 55 | string = string.replace(" " + chr(176) + " ", chr(176)) 56 | string = string.replace(" \n", "\n") 57 | string = string.replace("\n ", "\n") 58 | string = string.replace(" N ", " 1 ") 59 | string = string.replace(" 's", "'s") 60 | 61 | return string 62 | 63 | 64 | class WikiText(PerplexityTask): 65 | VERSION = 1 66 | DATASET_PATH = inspect.getfile(lm_eval.datasets.wikitext.wikitext) 67 | DATASET_NAME = "wikitext-2-raw-v1" 68 | 69 | def has_training_docs(self): 70 | return True 71 | 72 | def has_validation_docs(self): 73 | return True 74 | 75 | def has_test_docs(self): 76 | return True 77 | 78 | def training_docs(self): 79 | return map(self._process_doc, self.dataset["train"]) 80 | 81 | def validation_docs(self): 82 | return map(self._process_doc, self.dataset["validation"]) 83 | 84 | def test_docs(self): 85 | return map(self._process_doc, self.dataset["test"]) 86 | 87 | def _process_doc(self, doc): 88 | return doc["page"] 89 | 90 | def doc_to_target(self, doc): 91 | return wikitext_detokenizer(doc) 92 | 93 | def should_decontaminate(self): 94 | return True 95 | 96 | def count_words(self, doc): 97 | # count number of words in *original doc before detokenization* 98 | return len(re.split(r"\s+", doc)) 99 | -------------------------------------------------------------------------------- /outlier_analysis.md: -------------------------------------------------------------------------------- 1 | # Outlier Analysis 2 | Quantization, especially post-training quantization (PTQ) which operates with limited data and GPU resources, has become increasingly challenging for transformer language models (e.g., a 12% accuracy drop in BERT [1] and catastrophic degradation in OPT-175B [2]. 3 | 4 |
[figure: channel-wise and token-wise outlier visualization for BERT and OPT, referenced in the text below] 5 | 6 |
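To make the quantization challenge concrete, the snippet below is a minimal, self-contained sketch (not code from this repository; the helper name and the symmetric 8-bit scheme are illustrative assumptions) showing how a single outlier channel dictates the per-tensor quantization step and destroys the resolution left for ordinary activations:

```python
import torch

def fake_quant_per_tensor(x, n_bits=8):
    # symmetric uniform fake quantization: the step size is set by the largest
    # magnitude, so a few extreme values decide the resolution for everything else
    qmax = 2 ** (n_bits - 1) - 1
    scale = x.abs().max() / qmax
    return torch.clamp(torch.round(x / scale), -qmax - 1, qmax) * scale

torch.manual_seed(0)
act = torch.randn(4, 768)     # "ordinary" activations
act_out = act.clone()
act_out[:, 42] = 60.0         # one hypothetical outlier channel

for name, t in [("no outlier", act), ("with outlier", act_out)]:
    err = (fake_quant_per_tensor(t) - t).abs().mean().item()
    print(f"{name}: mean absolute quantization error = {err:.4f}")
```

With the outlier present, the step size is dictated by the outlier channel and the average error on ordinary values grows by more than an order of magnitude; this is the failure mode that the channel and token analyses below break down.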
7 | 8 | Outliers on these models show structural phenomena. Firstly, they present in asymmetric shape and concentrate on certain channels. For example, from the colored part in the above figure, it can be seen that almost all the tokens contribute to outliers on certain channels. OPT-66B has hard negative outliers on the 8725-th channel and hard positive ones on the 6353-th channel. For BERT, outliers concentrate on the 308 and 381 channels. Second, a few tokens provide even larger values compared to others such as [SEP] in BERT and [EOT] in OPT (look at the orange part in the figure). 9 | 10 | ## Channel Aspect 11 | In terms of channels, outliers consistently emerge in certain channels over different inputs. [1, 2] find that these problematic channels are limited and propose some fine-grained methods. [1] employs a per-embedding-group quantization scheme that uses different quantization parameters for distinct channel groups. [2] proposes to utilize FP16 representations for channels holding signals over 6. [3] identifies this feature lying in LayerNorm’s output and migrates the scaling parameter of LayerNorm to subsequent modules with an equivalent transformation to attenuate outliers. [4] propose to calculate scaling values by equalizing ranges between activations and weights. Furthermore, [5] designs the scaling factors that concern the interactive results of troublesome activation and following weights to scale down outlier channels. Also, it notices the asymmetric presentation of outliers and designs a shifting operation. Besides, [6] discovers that normal values are not that important and discards those adjacent to outliers to make room for outliers. 12 | 13 | ## Token Aspect 14 | In terms of tokens, different tokens exhibit varying degrees of outliers. We find that this phenomenon is obvious in BERT and BLOOM, but less obvious in OPTs. Observing that tokens that denote more aggressive outliers often appear in examples, we conjecture that token divergence might relate to token frequency during the pre-training phase. 15 | 16 | To combat this challenge, [2, 7] introduce a novel scheme called per-token quantization that dynamically computes quantization parameters for each token. [5] investigates the clipping impact of outliers and recommends finding an appropriate clipping range in a token-wise manner. 17 | 18 | ## Related works 19 | [1]. Yelysei Bondarenko, et al, Understanding and overcoming the challenges of efficient transformer quantization. EMNLP 2021. 20 | [2]. Tim Dettmers, et al, LLM.int8 (): 8-bit matrix mul- tiplication for transformers at scale. NeurIPS 2022. 21 | [3]. Xiuying Wei, et al. Outlier suppression: Pushing the limit of low-bit transformer language models. NeurIPS 2022. 22 | [4]. Guangxuan Xiao, Ji Lin, et al. Smoothquant: Accurate and efficient post-training quantization for large language models. ICML 2023. 23 | [5]. Xiuying Wei, et al. Outlier Suppression+: Accurate quantization of large language models by equivalent and optimal shifting and scaling. arXiv preprint arXiv:2304.09145. 24 | [6]. Cong Guo, et al. OliVe: Accelerating Large Language Models via Hardware-friendly Outlier-Victim Pair Quantization. ISCA 2023. 25 | [7]. Zhewei Yao, et al. ZeroQuant: Efficient and affordable post-training quantization for large-scale transformers. NeurIPS 2022. 
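To make the remedies above concrete, here is a hedged sketch of the two ingredients discussed in the Channel Aspect and Token Aspect sections: an equivalent shift-and-scale migration that folds per-channel offsets and scales of a LayerNorm output into the following `nn.Linear`, and a per-token dynamic quantizer. Everything here is illustrative rather than this repository's API: the function names are invented, the scale heuristic is a simplified stand-in for the optimized scales of [5], and the Linear is assumed to carry a bias. See `quant_transformer/quantization/migration.py` for the repository's own implementation.

```python
import torch

@torch.no_grad()
def migrate_shift_scale(x, linear):
    # x: LayerNorm output [tokens, channels]; linear: the following nn.Linear (with bias).
    # The output is unchanged because ((x - z) / s) @ (W * s).T + (b + z @ W.T) == x @ W.T + b.
    cmax, cmin = x.max(dim=0).values, x.min(dim=0).values
    shift = (cmax + cmin) / 2                                   # centre asymmetric outlier channels
    half_range = (cmax - cmin) / 2
    scale = (half_range / half_range.median()).clamp(min=1.0)   # toy heuristic; [5] optimizes this
    linear.bias.add_(shift @ linear.weight.t())                 # absorb the shift (uses the original W)
    linear.weight.mul_(scale)                                   # W'[:, j] = W[:, j] * s[j]
    return (x - shift) / scale                                  # outlier-suppressed activation


def per_token_fake_quant(x, n_bits=8):
    # per-token dynamic quantization: each token gets its own step size, so a few
    # aggressive tokens no longer inflate the quantization error of all the others
    qmax = 2 ** (n_bits - 1) - 1
    scale = x.abs().amax(dim=-1, keepdim=True) / qmax
    return torch.clamp(torch.round(x / scale), -qmax - 1, qmax) * scale


torch.manual_seed(0)
ln_out = torch.randn(16, 768)
ln_out[:, 300] += 30.0                            # hypothetical asymmetric outlier channel
fc = torch.nn.Linear(768, 3072)
ref = fc(ln_out)
x_eq = migrate_shift_scale(ln_out, fc)
print(torch.allclose(fc(x_eq), ref, atol=1e-4))   # True: the transform is output-preserving
print((per_token_fake_quant(x_eq) - x_eq).abs().mean().item())  # small per-token INT8 error
```

These two ideas appear to correspond, in simplified form, to the migration utilities and the per-token observers/token-wise clipping found under `quant_transformer/quantization` and `quant_transformer/solver` in this repository.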
26 | 27 | -------------------------------------------------------------------------------- /pile_statistics.json: -------------------------------------------------------------------------------- 1 | { 2 | "Data": "Pile statistics", 3 | "Document Count": 210607728, 4 | "Total Pile Characters": 421215456, 5 | "File Start Offsets": [ 6 | 0, 7 | 7021438, 8 | 14042822, 9 | 21066113, 10 | 28086515, 11 | 35106072, 12 | 42123306, 13 | 49145091, 14 | 56165817, 15 | 63185587, 16 | 70211208, 17 | 77234322, 18 | 84249267, 19 | 91267634, 20 | 98285983, 21 | 105305110, 22 | 112322489, 23 | 119342491, 24 | 126367373, 25 | 133389153, 26 | 140412039, 27 | 147432373, 28 | 154452516, 29 | 161470190, 30 | 168492733, 31 | 175512521, 32 | 182526939, 33 | 189547478, 34 | 196565318, 35 | 203583306 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /quant_transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/quant_transformer/__init__.py -------------------------------------------------------------------------------- /quant_transformer/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/quant_transformer/model/__init__.py -------------------------------------------------------------------------------- /quant_transformer/model/quant_model.py: -------------------------------------------------------------------------------- 1 | from .quant_opt import QuantizedOPTForCausalLM # noqa: F401 2 | from .quant_bloom import QuantizedBloomForCausalLM # noqa: F401 3 | from .quant_llama import QuantizedLlamaForCausalLM 4 | from quant_transformer.quantization.observer import ObserverBase 5 | _SUPPORT_MODELS = ['opt', 'bloom'] 6 | 7 | 8 | def quantize_model(fp_model, config): 9 | config_quant = config.quant 10 | config_quant.is_remove_padding = config_quant.get('is_remove_padding', True) 11 | config_quant.migrate = config_quant.get('migrate', False) 12 | fp_model.eval() 13 | model = eval("Quantized" + str(fp_model.__class__.__name__))( 14 | fp_model, config_quant.w_qconfig, config_quant.a_qconfig, qinput=False, 15 | is_remove_padding=config_quant.is_remove_padding, 16 | ) 17 | for name, module in model.named_modules(): 18 | if isinstance(module, ObserverBase) and 'act' in name: 19 | module.set_name(name) 20 | model.eval() 21 | return model 22 | -------------------------------------------------------------------------------- /quant_transformer/model/util_layernorm.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class QuantizedLayerNorm(nn.Module): 6 | 7 | def __init__(self, org_module): 8 | super(QuantizedLayerNorm, self).__init__() 9 | self.normalized_shape = org_module.normalized_shape 10 | self.eps = org_module.eps 11 | self.elementwise_affine = org_module.elementwise_affine 12 | self.weight = org_module.weight 13 | self.bias = org_module.bias 14 | 15 | def forward(self, input): 16 | return F.layer_norm( 17 | input, self.normalized_shape, self.weight, self.bias, self.eps) 18 | 19 | def extra_repr(self) -> str: 20 | return '{normalized_shape}, eps={eps}, ' \ 21 | 'elementwise_affine={elementwise_affine}'.format(**self.__dict__) 22 | 23 | 24 | class 
Identity(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | self.migrate = False 28 | self.migrate_scale = None 29 | 30 | def set_migrate(self, state): 31 | if self.migrate_scale is None: 32 | self.migrate = False 33 | else: 34 | self.migrate = state 35 | 36 | def set_migrate_scale(self, migrate_scale): 37 | self.migrate_scale = migrate_scale 38 | self.migrate = True 39 | 40 | def set_migrate_bias(self, migrate_bias): 41 | self.migrate_bias = migrate_bias 42 | self.migrate = True 43 | 44 | def forward(self, X): 45 | if self.migrate: 46 | X = X * self.migrate_scale + self.migrate_bias 47 | return X 48 | -------------------------------------------------------------------------------- /quant_transformer/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .quantized_module import Quantizer 2 | from .quantized_module import QuantizedModule, QuantizedLayer 3 | from .state import enable_calibration_quantization, enable_calibration_woquantization, \ 4 | enable_quantization, disable_all 5 | -------------------------------------------------------------------------------- /quant_transformer/quantization/state.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .fake_quant import QuantizeBase 3 | from .observer import ObserverBase 4 | logger = logging.getLogger("OS+") 5 | 6 | 7 | def enable_calibration_woquantization(model, quantizer_type='fake_quant', except_quantizer=None): 8 | logger.info('Enable observer and Disable quantize for {}'.format(quantizer_type)) 9 | for name, submodule in model.named_modules(): 10 | if isinstance(submodule, QuantizeBase): 11 | if (quantizer_type not in name) or \ 12 | (except_quantizer is not None and name.split('.')[-1] in except_quantizer): 13 | logger.info('The except_quantizer is {}'.format(name)) 14 | submodule.disable_observer() 15 | submodule.disable_fake_quant() 16 | continue 17 | logger.debug('Enable observer and Disable quant: {}'.format(name)) 18 | submodule.enable_observer() 19 | submodule.disable_fake_quant() 20 | 21 | 22 | def enable_calibration_quantization(model, quantizer_type='fake_quant', except_quantizer=None): 23 | logger.info('Enable observer and Enable quantize for {}'.format(quantizer_type)) 24 | for name, submodule in model.named_modules(): 25 | if isinstance(submodule, QuantizeBase): 26 | if (quantizer_type not in name) or \ 27 | (except_quantizer is not None and name.split('.')[-1] in except_quantizer): 28 | logger.debug('The except_quantizer is {}'.format(name)) 29 | submodule.disable_observer() 30 | submodule.disable_fake_quant() 31 | continue 32 | logger.debug('Enable observer and Enable quant: {}'.format(name)) 33 | submodule.enable_observer() 34 | submodule.enable_fake_quant() 35 | 36 | 37 | def enable_quantization(model, quantizer_type='fake_quant', except_quantizer=None): 38 | logger.info('Disable observer and Enable quantize.') 39 | for name, submodule in model.named_modules(): 40 | if isinstance(submodule, QuantizeBase): 41 | if (quantizer_type not in name) or \ 42 | (except_quantizer is not None and name.split('.')[-1] in except_quantizer): 43 | logger.debug('The except_quantizer is {}'.format(name)) 44 | submodule.disable_observer() 45 | submodule.disable_fake_quant() 46 | continue 47 | logger.debug('Disable observer and Enable quant: {}'.format(name)) 48 | submodule.disable_observer() 49 | submodule.enable_fake_quant() 50 | 51 | 52 | def disable_all(model): 53 | logger.info('Disable observer 
and disable quantize.') 54 | for name, submodule in model.named_modules(): 55 | if isinstance(submodule, QuantizeBase): 56 | logger.debug('Disable observer and disable quant: {}'.format(name)) 57 | submodule.disable_observer() 58 | submodule.disable_fake_quant() 59 | 60 | 61 | def set_observer_name(model): 62 | logger.info('set name for obsever') 63 | for name, submodule in model.named_modules(): 64 | if isinstance(submodule, ObserverBase): 65 | submodule.set_name(name) 66 | -------------------------------------------------------------------------------- /quant_transformer/quantization/util_quant.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def round_ste(x: torch.Tensor): 5 | """ 6 | Implement Straight-Through Estimator for rounding operation. 7 | """ 8 | return (x.round() - x).detach() + x 9 | 10 | 11 | def fake_quantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max): 12 | x_int = round_ste(x / scale) + zero_point 13 | x_quant = torch.clamp(x_int, quant_min, quant_max) 14 | x_dequant = (x_quant - zero_point) * scale 15 | return x_dequant 16 | 17 | 18 | def quantize_per_channel_affine(x, scale, zero_point, ch_axis, quant_min, quant_max): 19 | new_shape = [1] * len(x.shape) 20 | new_shape[ch_axis] = x.shape[ch_axis] 21 | scale = scale.reshape(new_shape) 22 | zero_point = zero_point.reshape(new_shape) 23 | x_int = round_ste(x / scale) + zero_point 24 | x_quant = torch.clamp(x_int, quant_min, quant_max) 25 | return x_quant 26 | 27 | 28 | def dequantize_per_channel_affine(x, scale, zero_point, ch_axis, quant_min, quant_max): 29 | new_shape = [1] * len(x.shape) 30 | new_shape[ch_axis] = x.shape[ch_axis] 31 | scale = scale.reshape(new_shape) 32 | zero_point = zero_point.reshape(new_shape) 33 | x_dequant = (x - zero_point) * scale 34 | return x_dequant 35 | 36 | 37 | def quantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max): 38 | x_int = round_ste(x / scale) + zero_point 39 | x_quant = torch.clamp(x_int, quant_min, quant_max) 40 | return x_quant 41 | 42 | 43 | def dequantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max): 44 | x_dequant = (x - zero_point) * scale 45 | return x_dequant 46 | 47 | 48 | def fake_quantize_per_channel_affine(x, scale, zero_point, ch_axis, quant_min, quant_max): 49 | new_shape = [1] * len(x.shape) 50 | new_shape[ch_axis] = x.shape[ch_axis] 51 | scale = scale.reshape(new_shape) 52 | zero_point = zero_point.reshape(new_shape) 53 | x_int = round_ste(x / scale) + zero_point 54 | x_quant = torch.clamp(x_int, quant_min, quant_max) 55 | x_dequant = (x_quant - zero_point) * scale 56 | return x_dequant 57 | -------------------------------------------------------------------------------- /quant_transformer/solver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/quant_transformer/solver/__init__.py -------------------------------------------------------------------------------- /quant_transformer/solver/token_wise_clipping.py: -------------------------------------------------------------------------------- 1 | from torch.nn import MSELoss 2 | import logging 3 | from quant_transformer.quantization.fake_quant import QuantizeBase 4 | logger = logging.getLogger("OS+") 5 | 6 | 7 | def set_ratio(model, ratio): 8 | for name, module in model.named_modules(): 9 | if isinstance(module, QuantizeBase): 10 | if 'act' in name: 11 
| module.observer.set_percentile(ratio) 12 | module.observer.cnt = 0 13 | module.disable_fake_quant() 14 | module.enable_observer() 15 | if 'weight' in name: 16 | module.disable_fake_quant() 17 | 18 | 19 | def enable_quantization(model): 20 | for name, submodule in model.named_modules(): 21 | if isinstance(submodule, QuantizeBase): 22 | if 'act' in name: 23 | submodule.disable_observer() 24 | submodule.enable_fake_quant() 25 | if 'weight' in name: 26 | submodule.enable_fake_quant() 27 | 28 | 29 | def calibrate(model, fp_input, fp_output=False): 30 | loss = 0 31 | for i, batch in enumerate(fp_input): 32 | if fp_output: 33 | loss += model(**batch, labels=fp_input[i]['input_ids']).loss 34 | else: 35 | model(**batch) 36 | return loss 37 | 38 | 39 | def find_ratio(model, fp_input, fp_output, param): 40 | p, loss = 0, None 41 | iters = param['iters'] 42 | step = param['step'] 43 | for i in range(iters): 44 | set_ratio(model, 1.0 - step * i) 45 | calibrate(model, fp_input) 46 | enable_quantization(model) 47 | cur_loss = calibrate(model, fp_input, True) 48 | logger.info('the ratio is {}, the loss is {}'.format(1.0 - step * i, cur_loss)) 49 | if loss is None or loss > cur_loss: 50 | loss = cur_loss 51 | p = i 52 | ratio = 1.0 - step * p 53 | logger.info('the best percentile is {}'.format(ratio)) 54 | set_ratio(model, ratio) 55 | calibrate(model, fp_input) 56 | 57 | 58 | loss_fct = MSELoss() 59 | 60 | 61 | a_bit_iters = { 62 | 8: 0.05, 63 | 6: 0.1, 64 | } 65 | 66 | 67 | def cac_step_iters(a_bit, bs): 68 | step = 0.005 69 | step = float(format(step, '.2g')) 70 | iters = int(a_bit_iters[a_bit] / step) 71 | print('the step is {}, the iter is {}'.format(step, iters)) 72 | return step, iters 73 | 74 | 75 | def token_wise_clipping(model, fp_input, fp_output, config, batch_size): 76 | config_quant = config.quant 77 | 78 | logger.info("*** Evaluate Token Percentile ***") 79 | step, iters = cac_step_iters(config_quant.a_qconfig.bit, batch_size) 80 | 81 | if hasattr(config_quant.a_qconfig, 'token_quantile'): 82 | set_ratio(model, config_quant.a_qconfig.token_quantile) 83 | calibrate(model, fp_input) 84 | logger.info('the best percentile is {}'.format(config_quant.a_qconfig.token_quantile)) 85 | else: 86 | step, iters = cac_step_iters(config_quant.a_qconfig.bit, batch_size) 87 | find_ratio(model, fp_input, fp_output, 88 | {'iters': getattr(config.quant, 'iters', iters), 89 | 'step': getattr(config.quant, 'step', step)}) 90 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/clean_training_data/README.md: -------------------------------------------------------------------------------- 1 | janitor.py contains a script to remove benchmark data contamination from training data sets. 2 | It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165). 3 | 4 | ## Algorithm 5 | 1) Collects all contamination text files that are to be removed from training data 6 | 2) Filters training data by finding `N`gram matches between the training data 7 | and any contamination 8 | 1) `N`grams ignore case and punctuation and are split on whitespace. 
9 | 2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around 10 | the match, splitting the training data into chunks 11 | 3) Any chunks less than `minimum_slice_length` are removed 12 | 4) Training data sets split into more than `too_dirty_cutoff` are considered 13 | completey contaminated and removed 14 | 15 | OpenAI used: 16 | ``` 17 | ngram_n = 13 18 | window_to_remove = 200 19 | minimum_slice_length = 200 20 | too_dirty_cutoff = 10 21 | ``` 22 | 23 | ## Compiling 24 | 25 | Janitor can be used as a pure python program, but it is much faster if the ngram 26 | code is run in C++. To compile the C++ code, run 27 | 28 | ``` 29 | pip install pybind11 30 | c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) 31 | ``` 32 | 33 | If your your compiler isn't linked to python, you may need to add to the above `-undefined dynamic_lookup` 34 | -------------------------------------------------------------------------------- /scripts/clean_training_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/scripts/clean_training_data/__init__.py -------------------------------------------------------------------------------- /scripts/clean_training_data/compress_and_package.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import argparse 3 | import os 4 | import subprocess 5 | import shutil 6 | 7 | from tqdm import tqdm 8 | from tqdm_multiprocess import TqdmMultiProcessPool 9 | 10 | import logging 11 | from tqdm_multiprocess.logger import setup_logger_tqdm 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def process_task( 17 | working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm 18 | ): 19 | command = f"zstd {bucket_file_path}" 20 | logger.info(command) 21 | subprocess.call(command, shell=True) 22 | 23 | compressed_file = bucket_file_path + ".zst" 24 | if output_directory: 25 | shutil.move(compressed_file, output_directory) 26 | 27 | os.remove(bucket_file_path) 28 | global_tqdm.update() 29 | 30 | 31 | def compress_and_move(working_directory, output_directory, process_count): 32 | os.makedirs(output_directory, exist_ok=True) 33 | original_info_file_path = os.path.join(working_directory, "info.json") 34 | assert os.path.exists(original_info_file_path) 35 | 36 | tasks = [] 37 | bucket_file_paths = glob.glob( 38 | os.path.join(working_directory, "output", f"*.bkt.txt.sorted") 39 | ) 40 | for bucket_file_path in bucket_file_paths: 41 | task = (process_task, (working_directory, output_directory, bucket_file_path)) 42 | tasks.append(task) 43 | 44 | pool = TqdmMultiProcessPool(process_count) 45 | 46 | def on_done(_): 47 | return None 48 | 49 | def on_error(_): 50 | return None 51 | 52 | global_progress = tqdm( 53 | total=len(bucket_file_paths), dynamic_ncols=True, unit="file" 54 | ) 55 | _ = pool.map(global_progress, tasks, on_error, on_done) 56 | 57 | shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json")) 58 | 59 | 60 | parser = argparse.ArgumentParser(description="sort 13gram buckets") 61 | parser.add_argument("-dir", "--working_directory", required=True) 62 | parser.add_argument("-output", "--output_directory", required=True) 63 | parser.add_argument("-procs", "--process_count", type=int, default=8) 64 | 65 | if 
__name__ == "__main__": 66 | version = 1.00 67 | print(f"Running version {version}") 68 | 69 | logfile_path = "compress_and_package.log" 70 | setup_logger_tqdm(logfile_path) 71 | 72 | args = parser.parse_args() 73 | compress_and_move(args.working_directory, args.output_directory, args.process_count) 74 | -------------------------------------------------------------------------------- /scripts/clean_training_data/investigate_pile.py: -------------------------------------------------------------------------------- 1 | from lm_eval.decontamination.archiver import Reader 2 | import os 3 | import json 4 | from functools import reduce 5 | import glob 6 | import tqdm 7 | 8 | from tqdm_multiprocess import TqdmMultiProcessPool 9 | 10 | 11 | def get_file_stats(file_path, tqdm_func, global_tqdm): 12 | reader = Reader() 13 | total_documents = 0 14 | total_size = 0 15 | update_frequency = 10000 16 | current_file_position = 0 17 | 18 | with tqdm_func( 19 | total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1 20 | ) as progress: 21 | for document in reader.read(file_path, get_meta=True): 22 | total_size += len(document) 23 | total_documents += 1 24 | 25 | if total_documents % update_frequency == 0: 26 | new_file_pos = reader.fh.tell() 27 | bytes_read = new_file_pos - current_file_position 28 | current_file_position = new_file_pos 29 | progress.update(bytes_read) 30 | global_tqdm.update(bytes_read) 31 | 32 | return (total_documents, total_size) 33 | 34 | 35 | def get_files(): 36 | directory = "pile" 37 | files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) 38 | print(files) 39 | return files 40 | 41 | 42 | def get_stats(): 43 | files = get_files() 44 | total_size_bytes = sum(map(lambda x: os.path.getsize(x), files)) 45 | 46 | pool = TqdmMultiProcessPool(4) 47 | global_tqdm = tqdm.tqdm( 48 | total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1 49 | ) 50 | 51 | # Generate minhashes with pool 52 | tasks = [(get_file_stats, (file,)) for file in files] 53 | 54 | def on_done(_): 55 | return None 56 | 57 | def on_error(_): 58 | return None 59 | 60 | results = pool.map(global_tqdm, tasks, on_error, on_done) 61 | 62 | total_documents, total_size = reduce( 63 | lambda x, y: (x[0] + y[0], x[1] + y[1]), results 64 | ) 65 | 66 | start_offsets = [] 67 | current_offset = 0 68 | for file_document_count, _ in results: 69 | start_offsets.append(current_offset) 70 | current_offset += file_document_count 71 | 72 | return (total_documents, total_size, start_offsets) 73 | 74 | 75 | if __name__ == "__main__": 76 | version = 1.01 77 | print(f"Running version {version}") 78 | 79 | stats_file_path = "pile_statistics.json" 80 | if os.path.exists(stats_file_path): 81 | stats = json.load(open(stats_file_path, "r")) 82 | else: 83 | document_count, total_document_size_chars, start_offsets = get_stats() 84 | stats = { 85 | "Data": "Pile statistics", 86 | "Document Count": document_count, 87 | "Total Pile Characters": total_document_size_chars, 88 | "File Start Offsets": start_offsets, 89 | } 90 | json.dump(stats, open(stats_file_path, "w"), indent=4) 91 | 92 | print(f"document_count: {stats['Document Count']}") 93 | print(f"total_chars: {stats['Total Pile Characters']}") 94 | print(f"start_offsets: {stats['File Start Offsets']}") 95 | -------------------------------------------------------------------------------- /scripts/clean_training_data/process_sorted_buckets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processes 
each sorted bucket, creating a new file listing all ngrams that matched more then 10 3 | unique documents with their unique document counts. Uses multiprocessing and very little memory 4 | as we stream from presorted buckets. Will use a lot of disk though. 5 | 6 | Arguments 7 | --------- 8 | --working_directory (-dir) 9 | Directory containing the sorted buckets, processed files will be deposited here. Default: current directory 10 | --move_dir (-move) 11 | Directory to move processed 13grams too. Default: Do nothing 12 | --process_count (-procs) 13 | Number of processes to use. Default: 4 14 | """ 15 | 16 | import argparse 17 | import glob 18 | import os 19 | from pathlib import Path 20 | import re 21 | import shutil 22 | 23 | from tqdm import tqdm 24 | from tqdm_multiprocess import TqdmMultiProcessPool 25 | 26 | from scripts.clean_training_data.archiver import TextReader, TextArchive 27 | 28 | import logging 29 | from tqdm_multiprocess.logger import setup_logger_tqdm 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | # Multiprocessed 35 | def process_bucket( 36 | bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm 37 | ): 38 | 39 | bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 40 | done_file = os.path.join( 41 | processed_directory, f"ngram_bucket_processing_{bucket_id}.done" 42 | ) 43 | if os.path.exists(done_file): 44 | logger.info(f"bucket {bucket_id} already processed, skipping") 45 | return 46 | 47 | # For managing tqdm 48 | file_size = os.path.getsize(bucket_file_path) 49 | bucket_progress = tqdm_func( 50 | total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1 51 | ) 52 | current_file_position = 0 53 | update_frequency = 100 * 1000000 # 100mb 54 | update_counter = 0 55 | 56 | # Iterate through and output ngrams which occur in more then 10 documents 57 | bucket = TextReader(bucket_file_path) 58 | 59 | output_file_path = bucket_file_path + ".processed" 60 | output_archive = TextArchive(output_file_path, mode="wb") 61 | 62 | current_ngram = "" 63 | current_ngram_document_ids = set() 64 | for line in bucket.read(): 65 | [ngram, document_id] = line.rsplit(" ", 1) 66 | 67 | # Write ngram if more then 10 unique document occurrences 68 | if ngram != current_ngram: 69 | if len(current_ngram_document_ids) > 10: 70 | output_archive.add_data( 71 | f"{current_ngram} {len(current_ngram_document_ids)}" 72 | ) 73 | current_ngram = ngram 74 | current_ngram_document_ids = set() 75 | 76 | current_ngram_document_ids.add(document_id) 77 | 78 | # Update tqdm 79 | update_counter += bucket.fh.tell() - current_file_position 80 | current_file_position = bucket.fh.tell() 81 | if update_counter > update_frequency: 82 | bucket_progress.update(update_counter) 83 | update_counter = 0 84 | 85 | # Remainder 86 | if len(current_ngram_document_ids) > 10: 87 | output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}") 88 | 89 | output_archive.commit() 90 | Path(done_file).touch() 91 | 92 | if move_dir: 93 | shutil.move(output_file_path, move_dir) 94 | 95 | global_tqdm.update() 96 | 97 | 98 | def process_sorted_buckets(working_directory, move_dir, process_count): 99 | bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted")) 100 | processed_directory = os.path.join(working_directory, "processed") 101 | os.makedirs(processed_directory, exist_ok=True) 102 | 103 | pool = TqdmMultiProcessPool(process_count) 104 | tasks = [ 105 | (process_bucket, (bucket_file, processed_directory, move_dir)) 106 | for 
bucket_file in bucket_file_paths 107 | ] 108 | 109 | global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket") 110 | 111 | def on_done(_): 112 | return None 113 | 114 | def on_error(_): 115 | return None 116 | 117 | _ = pool.map(global_tqdm, tasks, on_error, on_done) 118 | 119 | 120 | parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.") 121 | parser.add_argument("-dir", "--working_directory", default="") 122 | parser.add_argument("-move", "--move_dir", default="") 123 | parser.add_argument("-procs", "--process_count", type=int, default=4) 124 | 125 | if __name__ == "__main__": 126 | 127 | logfile_path = "process13grams.log" 128 | setup_logger_tqdm(logfile_path) 129 | 130 | args = parser.parse_args() 131 | process_sorted_buckets(args.working_directory, args.move_dir, args.process_count) 132 | -------------------------------------------------------------------------------- /scripts/clean_training_data/sort_13_gram_buckets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Iteratively runs gnu sort on each bucket, uses up to 8 cores. 3 | 4 | Arguments 5 | --------- 6 | --working_directory (-dir) 7 | Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same 8 | directory and the unsorted buckets are removed after. 9 | """ 10 | 11 | import glob 12 | import argparse 13 | import os 14 | import signal 15 | from signal import SIGINT 16 | import subprocess 17 | 18 | from tqdm import tqdm 19 | 20 | import logging 21 | from tqdm_multiprocess.logger import setup_logger_tqdm 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | terminate = False 26 | 27 | 28 | def handler(signal_received, frame): 29 | global terminate 30 | terminate = True 31 | 32 | 33 | def sort_13_gram_buckets(working_directory): 34 | bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) 35 | 36 | for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): 37 | sorted_file_path = bucket_file_path + ".sorted" 38 | command = f"sort {bucket_file_path} > {sorted_file_path}" 39 | logger.info(command) 40 | subprocess.call(command, shell=True) 41 | 42 | if terminate: 43 | return 44 | 45 | os.remove(bucket_file_path) 46 | 47 | 48 | parser = argparse.ArgumentParser(description="sort 13gram buckets") 49 | parser.add_argument("-dir", "--working_directory", default="") 50 | 51 | if __name__ == "__main__": 52 | 53 | version = 1.00 54 | print(f"Running version {version}") 55 | 56 | # Handle sigint (ctrl-c) cleanly 57 | previous_signal_int = signal.signal(SIGINT, handler) 58 | 59 | logfile_path = "sort13grambuckets.log" 60 | setup_logger_tqdm(logfile_path) 61 | 62 | args = parser.parse_args() 63 | sort_13_gram_buckets(args.working_directory) 64 | -------------------------------------------------------------------------------- /scripts/cost_estimate.py: -------------------------------------------------------------------------------- 1 | import random 2 | import transformers 3 | from lm_eval import tasks, evaluator 4 | from lm_eval.base import LM 5 | 6 | 7 | class DryrunLM(LM): 8 | def __init__(self): 9 | self.tokencost = 0 10 | self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") 11 | self.tokenizer.pad_token = "<|endoftext|>" 12 | 13 | @classmethod 14 | def create_from_arg_string(cls, arg_string): 15 | return cls() 16 | 17 | def loglikelihood(self, requests): 18 | res = [] 19 | 20 | for ctx, cont in requests: 21 | res.append((-random.random(), False)) 22 | 
self.tokencost += len(self.tokenizer.tokenize(ctx + cont)) 23 | 24 | return res 25 | 26 | def greedy_until(self, requests): 27 | res = [] 28 | 29 | for ctx, until in requests: 30 | res.append("lol") 31 | 32 | # assume worst case - generates until 256 33 | self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256 34 | 35 | return res 36 | 37 | def loglikelihood_rolling(self, requests): 38 | res = [] 39 | 40 | for (s,) in requests: 41 | # assume worst case: extra full context 42 | self.tokencost += len(self.tokenizer.tokenize(s)) + 2048 43 | 44 | return res 45 | 46 | 47 | def main(): 48 | lm = DryrunLM() 49 | 50 | task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc" 51 | values = [] 52 | for taskname in task_list.split(","): 53 | lm.tokencost = 0 54 | evaluator.evaluate( 55 | lm=lm, 56 | task_dict={taskname: tasks.get_task(taskname)()}, 57 | num_fewshot=0, 58 | limit=None, 59 | bootstrap_iters=10, 60 | description_dict=None, 61 | ) 62 | 63 | print(taskname, lm.tokencost) 64 | values.append( 65 | [ 66 | taskname, 67 | lm.tokencost, 68 | lm.tokencost / 1000 * 0.0008, 69 | lm.tokencost / 1000 * 0.0012, 70 | lm.tokencost / 1000 * 0.006, 71 | lm.tokencost / 1000 * 0.06, 72 | ] 73 | ) 74 | from pytablewriter import MarkdownTableWriter 75 | 76 | writer = MarkdownTableWriter() 77 | writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"] 78 | 79 | values.sort(key=lambda x: -x[1]) 80 | totcost = sum([x[1] for x in values]) 81 | values.append( 82 | [ 83 | "**Total**", 84 | totcost, 85 | totcost / 1000 * 0.0008, 86 | totcost / 1000 * 0.0012, 87 | totcost / 1000 * 0.006, 88 | totcost / 1000 * 0.06, 89 | ] 90 | ) 91 | 92 | writer.value_matrix = values 93 | 94 | print(writer.dumps()) 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /scripts/get_prompts.py: -------------------------------------------------------------------------------- 1 | from lm_eval import tasks 2 | from itertools import islice 3 | 4 | ct = 3 5 | 6 | for ( 7 | tname, 8 | Task, 9 | ) in tasks.TASK_REGISTRY.items(): # [('record', tasks.superglue.ReCoRD)]:# 10 | task = Task() 11 | 12 | print("#", tname) 13 | docs = islice( 14 | task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct 15 | ) 16 | print() 17 | for i in range(ct): 18 | print() 19 | doc = next(docs) 20 | print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n") 21 | print() 22 | print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n") 23 | print() 24 | -------------------------------------------------------------------------------- /scripts/make_gpt2_test_cases.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import random 6 | 7 | random.seed(42) 8 | 9 | 10 | data = [ 11 | "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", 12 | "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology", 13 | 'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]', 14 | "An MLP 
consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.", 15 | "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]", 16 | "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ", 17 | "Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. 
We discuss broader societal impacts of this finding and of GPT-3 in general.", 18 | "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", 19 | "Hello World", 20 | ] 21 | 22 | 23 | model = transformers.GPT2LMHeadModel.from_pretrained("gpt2") 24 | tok = transformers.GPT2Tokenizer.from_pretrained("gpt2") 25 | 26 | tgs = [] 27 | 28 | for dat in data: 29 | random.seed(dat) 30 | # print(model(tok.encode(dat, return_tensors="pt"))[0][0]) 31 | 32 | toks = tok.encode(dat, return_tensors="pt") 33 | ind = random.randrange(len(toks[0]) - 1) 34 | logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1] # [batch, seq, vocab] 35 | 36 | res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0] 37 | 38 | tgs.append(float(res[ind:].sum())) 39 | print( 40 | r'("""' 41 | + tok.decode(toks[0, : ind + 1]) 42 | + r'""", """' 43 | + tok.decode(toks[0, ind + 1 :]) 44 | + r'"""), ' 45 | ) 46 | 47 | print(tgs) 48 | -------------------------------------------------------------------------------- /scripts/make_table_tasks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python make_table_tasks.py --output 4 | """ 5 | import argparse 6 | import logging 7 | from lm_eval import tasks 8 | from pytablewriter import MarkdownTableWriter 9 | 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def check(tf): 16 | if tf: 17 | return "✓" 18 | else: 19 | return " " 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--output", type=str, default="task_table.md") 25 | args = parser.parse_args() 26 | 27 | writer = MarkdownTableWriter() 28 | writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"] 29 | values = [] 30 | 31 | tasks = tasks.TASK_REGISTRY.items() 32 | tasks = sorted(tasks, key=lambda x: x[0]) 33 | for tname, Task in tasks: 34 | task = Task() 35 | v = [ 36 | tname, 37 | check(task.has_training_docs()), 38 | check(task.has_validation_docs()), 39 | check(task.has_test_docs()), 40 | len( 41 | list( 42 | task.test_docs() if task.has_test_docs() else task.validation_docs() 43 | ) 44 | ), 45 | ", ".join(task.aggregation().keys()), 46 | ] 47 | logger.info(v) 48 | values.append(v) 49 | writer.value_matrix = values 50 | table = writer.dumps() 51 | with open(args.output, "w") as f: 52 | f.write(table) 53 | -------------------------------------------------------------------------------- /scripts/write_out.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import json 4 | import os 5 | import random 6 | from lm_eval import tasks 7 | from lm_eval.utils import join_iters 8 | 9 | EXAMPLE_DIVIDER = "!!@@##@@!! 
-- Example {i}\n" 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--output_base_path", required=True) 15 | parser.add_argument("--tasks", default="all_tasks") 16 | parser.add_argument("--provide_description", action="store_true") 17 | parser.add_argument("--sets", type=str, default="val") # example: val,test 18 | parser.add_argument("--num_fewshot", type=int, default=1) 19 | parser.add_argument("--seed", type=int, default=42) 20 | parser.add_argument("--num_examples", type=int, default=1) 21 | parser.add_argument("--description_dict_path", default=None) 22 | return parser.parse_args() 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | np.random.seed(args.seed) 28 | 29 | if args.tasks == "all_tasks": 30 | task_names = tasks.ALL_TASKS 31 | else: 32 | task_names = args.tasks.split(",") 33 | task_dict = tasks.get_task_dict(task_names) 34 | 35 | description_dict = {} 36 | if args.description_dict_path: 37 | with open(args.description_dict_path, "r") as f: 38 | description_dict = json.load(f) 39 | 40 | os.makedirs(args.output_base_path, exist_ok=True) 41 | for task_name, task in task_dict.items(): 42 | rnd = random.Random() 43 | rnd.seed(args.seed) 44 | 45 | iters = [] 46 | 47 | for set in args.sets.split(","): 48 | if set == "train" and task.has_training_docs(): 49 | docs = task.training_docs() 50 | if set == "val" and task.has_validation_docs(): 51 | docs = task.validation_docs() 52 | if set == "test" and task.has_test_docs(): 53 | docs = task.test_docs() 54 | iters.append(docs) 55 | 56 | docs = join_iters(iters) 57 | 58 | description = ( 59 | description_dict[task_name] 60 | if description_dict and task_name in description_dict 61 | else "" 62 | ) 63 | 64 | with open(os.path.join(args.output_base_path, task_name), "w") as f: 65 | for i, doc in ( 66 | zip(range(args.num_examples), docs) 67 | if args.num_examples > 0 68 | else enumerate(docs) 69 | ): 70 | f.write(EXAMPLE_DIVIDER.format(i=i)) 71 | ctx = task.fewshot_context( 72 | doc=doc, 73 | num_fewshot=args.num_fewshot, 74 | rnd=rnd, 75 | description=description, 76 | ) 77 | f.write(ctx + "\n") 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /templates/new_multiple_choice_task.py: -------------------------------------------------------------------------------- 1 | # TODO: Remove all TODO comments once the implementation is complete. 2 | """ 3 | TODO: Add the Paper Title on this line. 4 | TODO: Add the paper's PDF URL (preferably from arXiv) on this line. 5 | 6 | TODO: Write a Short Description of the task. 7 | 8 | Homepage: TODO: Add the URL to the task's Homepage here. 9 | """ 10 | from lm_eval.base import MultipleChoiceTask 11 | 12 | 13 | # TODO: Add the BibTeX citation for the task. 14 | _CITATION = """ 15 | """ 16 | 17 | 18 | # TODO: Replace `NewTask` with the name of your Task. 19 | class NewTask(MultipleChoiceTask): 20 | VERSION = 0 21 | # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` 22 | # dataset as denoted in HuggingFace `datasets`. 23 | DATASET_PATH = "" 24 | # TODO: Add the `DATASET_NAME` string. This is the name of a subset within 25 | # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. 26 | DATASET_NAME = None 27 | 28 | def has_training_docs(self): 29 | # TODO: Fill in the return with `True` if the Task has training data; else `False`. 
30 | return False 31 | 32 | def has_validation_docs(self): 33 | # TODO: Fill in the return with `True` if the Task has validation data; else `False`. 34 | return False 35 | 36 | def has_test_docs(self): 37 | # TODO: Fill in the return with `True` if the Task has test data; else `False`. 38 | return False 39 | 40 | def training_docs(self): 41 | if self.has_training_docs(): 42 | # We cache training documents in `self._training_docs` for faster 43 | # few-shot processing. If the data is too large to fit in memory, 44 | # return the training data as a generator instead of a list. 45 | if self._training_docs is None: 46 | # TODO: Return the training document generator from `self.dataset`. 47 | # In most case you can leave this as is unless the dataset split is 48 | # named differently than the default `"train"`. 49 | self._training_docs = list( 50 | map(self._process_doc, self.dataset["train"]) 51 | ) 52 | return self._training_docs 53 | 54 | def validation_docs(self): 55 | if self.has_validation_docs(): 56 | # TODO: Return the validation document generator from `self.dataset`. 57 | # In most case you can leave this as is unless the dataset split is 58 | # named differently than the default `"validation"`. 59 | return map(self._process_doc, self.dataset["validation"]) 60 | 61 | def test_docs(self): 62 | if self.has_test_docs(): 63 | # TODO: Return the test document generator from `self.dataset`. 64 | # In most case you can leave this as is unless the dataset split is 65 | # named differently than the default `"test"`. 66 | return map(self._process_doc, self.dataset["test"]) 67 | 68 | def _process_doc(self, doc): 69 | # TODO: Process the documents into a dictionary with the following keys: 70 | return { 71 | "query": "", # The query prompt. 72 | "choices": [], # The list of choices. 73 | "gold": 0, # The integer used to index into the correct element of `"choices"`. 74 | } 75 | 76 | def doc_to_text(self, doc): 77 | # TODO: Format the query prompt portion of the document example. 78 | return doc["query"] 79 | --------------------------------------------------------------------------------