├── .coveragerc ├── .flake8 ├── .github └── workflows │ ├── pull_request.yml │ └── python-app.yml ├── .gitignore ├── .pre-commit-config.yaml ├── LICENSE.md ├── README.md ├── docs ├── decontamination.md ├── description_guide.md ├── img │ └── fewshot_example_gpt3.png ├── task_guide.md └── task_table.md ├── exp ├── bloom_176b │ ├── int6.yaml │ └── int8.yaml ├── bloomz_176b │ ├── int6.yaml │ └── int8.yaml ├── llama │ ├── int4_token.yaml │ ├── int4_token_disable.yaml │ ├── int6_token.yaml │ └── int6_token_disable.yaml └── opt │ ├── int4_group.yaml │ ├── int5_token.yaml │ ├── int6.yaml │ └── int8.yaml ├── figure ├── outlier_phenomenon.png └── outlier_suppression_plus.png ├── ignore.txt ├── lm_eval ├── __init__.py ├── base.py ├── datasets │ ├── ai2_arc │ │ ├── README.md │ │ ├── ai2_arc.py │ │ └── dataset_infos.json │ ├── arithmetic │ │ ├── __init__.py │ │ ├── arithmetic.py │ │ └── dataset_infos.json │ ├── asdiv │ │ ├── __init__.py │ │ ├── asdiv.py │ │ └── dataset_infos.json │ ├── coqa │ │ ├── __init__.py │ │ ├── coqa.py │ │ └── dataset_infos.json │ ├── drop │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── drop.py │ ├── headqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── headqa.py │ ├── hellaswag │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hellaswag.py │ ├── hendrycks_ethics │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_ethics.py │ ├── hendrycks_math │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── hendrycks_math.py │ ├── lambada_openai │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── lambada_openai.py │ ├── logiqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── logiqa.py │ ├── mutual │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── mutual.py │ ├── pile │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── pile.py │ ├── piqa │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── piqa.py │ ├── quac │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── quac.py │ ├── sat_analogies │ │ ├── __init__.py │ │ └── sat_analogies.py │ ├── story_cloze │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── story_cloze.py │ ├── super_glue │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── super_glue.py │ ├── triviaqa │ │ ├── README.md │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── triviaqa.py │ ├── unscramble │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── unscramble.py │ ├── wikitext │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── wikitext.py │ └── winogrande │ │ ├── __init__.py │ │ ├── dataset_infos.json │ │ └── winogrande.py ├── decontamination │ ├── __init__.py │ ├── archiver.py │ ├── decontaminate.py │ └── janitor.py ├── evaluator.py ├── metrics.py ├── models │ ├── __init__.py │ ├── bloom.py │ ├── dummy.py │ ├── gpt2.py │ ├── gpt3.py │ ├── llama.py │ ├── opt.py │ └── textsynth.py ├── tasks │ ├── __init__.py │ ├── anli.py │ ├── arc.py │ ├── arithmetic.py │ ├── asdiv.py │ ├── blimp.py │ ├── cbt.py │ ├── coqa.py │ ├── drop.py │ ├── glue.py │ ├── gsm8k.py │ ├── headqa.py │ ├── hellaswag.py │ ├── hendrycks_ethics.py │ ├── hendrycks_math.py │ ├── hendrycks_test.py │ ├── lambada.py │ ├── lambada_cloze.py │ ├── lambada_multilingual.py │ ├── logiqa.py │ ├── mathqa.py │ ├── mc_taco.py │ ├── mutual.py │ ├── naturalqs.py │ ├── openbookqa.py │ ├── pile.py │ ├── piqa.py │ ├── prost.py │ ├── pubmedqa.py │ ├── qa4mre.py │ ├── qasper.py │ ├── quac.py │ ├── race.py │ ├── sat.py │ ├── sciq.py │ ├── squad.py │ ├── storycloze.py │ ├── superglue.py │ ├── swag.py │ ├── translation.py │ ├── triviaqa.py │ ├── 
truthfulqa.py │ ├── unscramble.py │ ├── webqs.py │ ├── wikitext.py │ ├── winogrande.py │ └── wsc273.py └── utils.py ├── main.py ├── outlier_analysis.md ├── pile_statistics.json ├── quant_transformer ├── __init__.py ├── model │ ├── __init__.py │ ├── quant_bloom.py │ ├── quant_llama.py │ ├── quant_model.py │ ├── quant_opt.py │ └── util_layernorm.py ├── quantization │ ├── __init__.py │ ├── fake_quant.py │ ├── migration.py │ ├── migration_bloom.py │ ├── migration_llama.py │ ├── observer.py │ ├── quantized_module.py │ ├── state.py │ └── util_quant.py └── solver │ ├── __init__.py │ ├── calibrate.py │ ├── export.py │ └── token_wise_clipping.py ├── scripts ├── __init__.py ├── clean_training_data │ ├── README.md │ ├── __init__.py │ ├── compress_and_package.py │ ├── generate_13_grams.py │ ├── investigate_pile.py │ ├── janitor_util.cpp │ ├── process_sorted_buckets.py │ └── sort_13_gram_buckets.py ├── cost_estimate.py ├── get_prompts.py ├── make_gpt2_test_cases.py ├── make_table_tasks.py └── write_out.py └── templates ├── new_multiple_choice_task.py └── new_task.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | 3 | # tasks that aren't wired up. 4 | omit = 5 | lm_eval/tasks/quac.py 6 | lm_eval/tasks/storycloze.py 7 | lm_eval/tasks/cbt.py 8 | lm_eval/tasks/sat.py 9 | lm_eval/tasks/triviaqa.py 10 | lm_eval/tasks/naturalqs.py 11 | lm_eval/models/dummy.py 12 | 13 | [report] 14 | exclude_lines = 15 | # Skip any pass lines such as may be used for @abstractmethod 16 | pass 17 | 18 | # Have to re-enable the standard pragma 19 | pragma: no cover 20 | 21 | # Don't complain about missing debug-only code: 22 | def __repr__ 23 | if self\.debug 24 | 25 | # Don't complain if tests don't hit defensive assertion code: 26 | raise AssertionError 27 | raise NotImplementedError 28 | return NotImplemented 29 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203, E266, E501, W503, F403, F401, C901 3 | max-line-length = 127 4 | max-complexity = 10 5 | select = B,C,E,F,W,T4,B9 6 | -------------------------------------------------------------------------------- /.github/workflows/pull_request.yml: -------------------------------------------------------------------------------- 1 | name: Pull Request 2 | 3 | on: [pull_request] 4 | 5 | jobs: 6 | pre-commit: 7 | runs-on: ubuntu-20.04 8 | steps: 9 | - uses: actions/checkout@v3 10 | - uses: actions/setup-python@v4 11 | with: 12 | python-version: 3.8 13 | - uses: pre-commit/action@v2.0.3 14 | -------------------------------------------------------------------------------- /.github/workflows/python-app.yml: -------------------------------------------------------------------------------- 1 | # This workflow will install Python dependencies, run tests and lint with a single version of Python 2 | # For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Build 5 | 6 | on: 7 | push: 8 | branches: [ master ] 9 | pull_request: 10 | branches: [ master ] 11 | 12 | jobs: 13 | build: 14 | 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Cache 20 | uses: actions/cache@v2.1.3 21 | with: 22 | # A list of files, directories, and wildcard patterns to cache and restore 23 | path: | 24 | ~/.cache 25 | # An explicit key for restoring and saving the cache 26 | key: 
evaldata-cache-4 27 | - name: Set up Python 3.9 28 | uses: actions/setup-python@v4 29 | with: 30 | python-version: 3.9 31 | - name: Install dependencies 32 | run: | 33 | python -m pip install --upgrade pip 34 | pip install flake8 pytest pytest-cov 35 | pip install -e .[dev,multilingual] 36 | # Install optional git dependencies 37 | pip install bleurt@https://github.com/google-research/bleurt/archive/b610120347ef22b494b6d69b4316e303f5932516.zip#egg=bleurt 38 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 39 | - name: Lint with flake8 40 | run: | 41 | # stop the build if there are Python syntax errors or undefined names 42 | flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics 43 | # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide 44 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics 45 | - name: Test with pytest 46 | run: | 47 | pytest -vv --cov=lm_eval/ tests/ 48 | - name: Upload to codecov 49 | run: | 50 | bash <(curl -s https://codecov.io/bash) -t $CODECOV_TOKEN 51 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | env 2 | *.pyc 3 | data/ 4 | lm_cache 5 | .idea 6 | tests 7 | experiment 8 | lj_exp 9 | lj_experiment 10 | lm_eval.egg-info/ 11 | 12 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Ignore test linting to avoid conflicting changes to version stability. 2 | exclude: ^tests/testdata/ 3 | repos: 4 | - repo: https://github.com/pre-commit/pre-commit-hooks 5 | rev: v4.1.0 6 | hooks: 7 | - id: check-added-large-files 8 | - id: check-ast 9 | - id: check-byte-order-marker 10 | - id: check-case-conflict 11 | - id: check-json 12 | - id: check-merge-conflict 13 | - id: check-symlinks 14 | - id: check-yaml 15 | - id: destroyed-symlinks 16 | - id: detect-private-key 17 | - id: end-of-file-fixer 18 | - id: no-commit-to-branch 19 | - id: requirements-txt-fixer 20 | - id: trailing-whitespace 21 | - id: fix-byte-order-marker 22 | exclude: docs/CNAME 23 | - id: fix-encoding-pragma 24 | args: [--remove] 25 | - id: mixed-line-ending 26 | args: [--fix=lf] 27 | - repo: https://github.com/pycqa/flake8 28 | rev: 3.7.9 29 | hooks: 30 | - id: flake8 31 | - repo: https://github.com/psf/black 32 | rev: 22.3.0 33 | hooks: 34 | - id: black 35 | language_version: python3.8 36 | - repo: https://github.com/codespell-project/codespell 37 | rev: v2.1.0 38 | hooks: 39 | - id: codespell 40 | exclude: > 41 | (?x)^( 42 | .*\.json|ignore.txt 43 | )$ 44 | args: [--check-filenames, --check-hidden, --ignore-words=ignore.txt] 45 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 EleutherAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and 
this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /docs/decontamination.md: -------------------------------------------------------------------------------- 1 | # Decontamination 2 | 3 | ## Usage 4 | 5 | Simply add a "--decontamination_ngrams_path" argument when running main.py. The provided directory should contain 6 | the ngram files and info.json produced in "Pile Ngram Generation" further down. 7 | 8 | ```bash 9 | python main.py \ 10 | --model gpt2 \ 11 | --device 0 \ 12 | --tasks sciq \ 13 | --decontamination_ngrams_path path/containing/training/set/ngrams 14 | ``` 15 | 16 | ## Background 17 | Downstream evaluations test model generalization, and are less useful when test set data also exists in the training set, referred to as leakage or contamination. 18 | 19 | Filtering your training set against the test set is a good first step; however, this isn't always possible, as in the case of a new benchmark or one that wasn't considered prior to model training. When training set filtering isn't possible, it is useful to measure the impact of test set leakage by detecting the contaminated test examples and producing a clean version of the benchmark. 20 | 21 | The basis for our decontamination procedure can be found in Appendix C of "Language Models are Few-Shot Learners". OpenAI defined a test document as contaminated if any N-gram overlap existed with any training document. They used a range of N values between 8 and 13 depending on the dataset, while we just used 13 for simplicity. 22 | 23 | ## Implementation 24 | Contamination detection can be found in `lm_eval/decontamination/decontaminate.py` with supporting code in `lm_eval/decontamination/`. 25 | 26 | decontaminate.py does the following: 27 | 1. Build dictionaries of all ngrams and their corresponding evaluation/document ids. 28 | 2. Scan through sorted files containing training set n-grams. 29 | 3. If a match is found, the corresponding evaluation/document combinations are marked as contaminated. 30 | 31 | `lm_eval/evaluator.py` can then produce a clean version of the benchmark by excluding the results of contaminated documents. For each metric, a clean version will be shown in the results with a "decontaminate" suffix. 32 | 33 | This is disabled by default for new tasks; to support decontamination on a task, override the "should_decontaminate" and "doc_to_decontamination_query" methods. For more details see the [task guide](task_guide.md). 34 | 35 | ## Pile Ngram Generation 36 | The relevant scripts can be found in `scripts/clean_training_data`, which also import from 37 | `lm_eval/decontamination/`. 38 | 39 | 1. git clone https://github.com/EleutherAI/lm-evaluation-harness.git 40 | 2. pip install -r requirements.txt 41 | 3. Download The Pile from [The Eye](https://the-eye.eu/public/AI/pile/train/) 42 | 4. Place the Pile files in a "pile" directory under "lm-evaluation-harness" (or create a symlink) 43 | 5. Run generate_13_grams. 
44 | 45 | ```bash 46 | export PYTHONHASHSEED=0 47 | python -m scripts.clean_training_data.generate_13_grams \ 48 | -dir path/to/working/directory \ 49 | -n 13 \ 50 | -buckets 500 51 | ``` 52 | 53 | This took approximately 4 days for us. We had the time to wait, but this could be scaled out by doing partial pile scans on multiple instances of this script and merging the relevant buckets. We fixed PYTHONHASHSEED to ensure reproducibility of bucket hashing in case you need to stop and start. 54 | 55 | 6. Sort the generated 13-grams. 56 | ```bash 57 | python -m scripts.clean_training_data.sort_13_gram_buckets \ 58 | -dir path/to/working/directory/output 59 | ``` 60 | 61 | This took approximately 5 days for us. You could speed this up by spreading the files around to different machines and running the sort script before gathering them together. 62 | 63 | 7. Compress the sorted 13-gram files and place them together with info.json. 64 | 65 | This step only takes a few hours. 66 | 67 | ```bash 68 | python -m scripts.clean_training_data.compress_and_package \ 69 | -dir path/to/working/directory \ 70 | -output path/to/final/directory \ 71 | -procs 8 72 | ``` 73 | 74 | Congratulations, the final directory can now be passed to lm-evaluation-harness with the "--decontamination_ngrams_path" argument. 75 | -------------------------------------------------------------------------------- /docs/description_guide.md: -------------------------------------------------------------------------------- 1 | # Description Guide 2 | 3 | ![fewshot-example](./img/fewshot_example_gpt3.png) 4 | (Figure from [Brown et al., 2020](https://arxiv.org/pdf/2005.14165.pdf)) 5 | 6 | Task descriptions provide in-context task instruction for your language model. If you'd like to prepend a natural language description to your few-shot examples and prompt, you can do so on a per-task basis via the `description_dict` arg of [`evaluator.evaluate`](../lm_eval/evaluator.py). This `description_dict` must adhere to the following key-value structure: 7 | 8 | - **key**: the task name (`str`) as specified in the lm-eval-harness [task registry](../lm_eval/tasks/__init__.py). 9 | - **value**: the corresponding (`str`) description/prompt for the task identified by **key**. 10 | 11 | ```python 12 | description_dict = { 13 | "task_name_1": "description", 14 | "task_name_2": "description", 15 | ... 16 | } 17 | ``` 18 | 19 | Note that a task's description will be separated from its following few-shot examples and prompt by a new line as such: 20 | 21 | ```python 22 | """ 23 | <description> 24 | 25 | <few-shot examples> 26 | 27 | <prompt> 28 | """ 29 | ``` 30 | 31 | ## Descriptions in File 32 | 33 | One can also interface with the aforementioned [`evaluator.evaluate`](../lm_eval/evaluator.py) (or `evaluator.simple_evaluate`) method from a higher level by simply passing a JSON file path to the `description_dict_path` arg of the command-line interface (CLI) program, `main.py`. The JSON file pointed to should be structured the same as the `description_dict`. E.g. for some file at `/your/path/descriptions.json` you may have: 34 | 35 | ```json 36 | { 37 | "cycle_letters": "Please unscramble the letters into a word, and write that word:", 38 | "copa": "Given a premise and one alternative with a causal relation to the premise and another without, choose the more plausible alternative" 39 | } 40 | ``` 41 | 42 | which can then be supplied to the CLI as: 43 | 44 | ```bash 45 | python main.py \ 46 | --tasks cycle_letters,copa \ 47 | --description_dict_path /your/path/descriptions.json \ 48 | ... 
49 | ``` 50 | -------------------------------------------------------------------------------- /docs/img/fewshot_example_gpt3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/docs/img/fewshot_example_gpt3.png -------------------------------------------------------------------------------- /exp/bloom_176b/int6.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgTokenQuantileObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: -1 8 | token_quantile: 0.985 9 | w_qconfig: 10 | quantizer: FixedFakeQuantize 11 | observer: MinMaxObserver 12 | bit: 6 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 17 | is_remove_padding: True 18 | migrate: True 19 | model: 20 | max_length: 512 -------------------------------------------------------------------------------- /exp/bloom_176b/int8.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgTokenQuantileObserver 5 | bit: 8 6 | symmetric: True 7 | ch_axis: -1 8 | token_quantile: 0.995 9 | w_qconfig: 10 | quantizer: FixedFakeQuantize 11 | observer: MinMaxObserver 12 | bit: 8 13 | symmetric: True 14 | ch_axis: -1 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 17 | is_remove_padding: True 18 | migrate: True 19 | model: 20 | max_length: 512 -------------------------------------------------------------------------------- /exp/bloomz_176b/int6.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgTokenQuantileObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: -1 8 | token_quantile: 0.995 9 | w_qconfig: 10 | quantizer: FixedFakeQuantize 11 | observer: MinMaxObserver 12 | bit: 6 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 17 | is_remove_padding: True 18 | migrate: True 19 | model: 20 | max_length: 512 -------------------------------------------------------------------------------- /exp/bloomz_176b/int8.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgMinMaxObserver 5 | bit: 8 6 | symmetric: True 7 | ch_axis: -1 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 8 12 | symmetric: True 13 | ch_axis: -1 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | migrate: True 18 | model: 19 | max_length: 512 -------------------------------------------------------------------------------- /exp/llama/int4_token.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 4 6 | symmetric: False 7 | ch_axis: 0 8 | w_qconfig: 9 | 
quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 4 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 16 | is_remove_padding: True 17 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 18 | migrate: True 19 | model: 20 | max_length: 2048 21 | -------------------------------------------------------------------------------- /exp/llama/int4_token_disable.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 4 6 | symmetric: False 7 | ch_axis: 0 8 | disable_down_proj: True 9 | w_qconfig: 10 | quantizer: FixedQuantize 11 | observer: MinMaxObserver 12 | bit: 4 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 17 | is_remove_padding: True 18 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 19 | migrate: True 20 | model: 21 | max_length: 2048 22 | -------------------------------------------------------------------------------- /exp/llama/int6_token.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 6 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 16 | is_remove_padding: True 17 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 18 | migrate: True 19 | model: 20 | max_length: 2048 21 | -------------------------------------------------------------------------------- /exp/llama/int6_token_disable.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: 0 8 | disable_down_proj: True 9 | w_qconfig: 10 | quantizer: FixedQuantize 11 | observer: MinMaxObserver 12 | bit: 6 13 | symmetric: False 14 | ch_axis: 0 # perchannel 0 perlayer -1 15 | calibrate: 128 16 | calibrate_path: /mnt/cache/weixiuying.vendor/wikitext/wiki_cali 17 | is_remove_padding: True 18 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 19 | migrate: True 20 | model: 21 | max_length: 2048 22 | -------------------------------------------------------------------------------- /exp/opt/int4_group.yaml: -------------------------------------------------------------------------------- 1 | 
quant: 2 | a_qconfig: 3 | quantizer: GroupFixedFakeQuantize 4 | group_size: 1024 5 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 6 | bit: 4 7 | symmetric: False 8 | ch_axis: 0 9 | w_qconfig: 10 | quantizer: GroupFixedQuantize 11 | group_size: 1024 12 | observer: MinMaxObserver 13 | bit: 4 14 | symmetric: False 15 | ch_axis: 0 # perchannel 0 perlayer -1 16 | calibrate: 128 17 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 18 | is_remove_padding: True 19 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 20 | migrate: True 21 | model: 22 | max_length: 2048 23 | -------------------------------------------------------------------------------- /exp/opt/int5_token.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: TokenFixedFakeQuantize 4 | observer: MinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 5 6 | symmetric: False 7 | ch_axis: 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 5 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | except_quantizer: [query_permute_post_act_fake_quant, key_transpose_post_act_fake_quant, value_permute_post_act_fake_quant, attention_probs_post_act_fake_quant] # disable bmm quantization 18 | migrate: True 19 | model: 20 | max_length: 2048 21 | -------------------------------------------------------------------------------- /exp/opt/int6.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgMinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 6 6 | symmetric: False 7 | ch_axis: -1 # perlayer -1 perchannel 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 6 12 | symmetric: False 13 | ch_axis: 0 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | migrate: True 18 | model: 19 | max_length: 512 -------------------------------------------------------------------------------- /exp/opt/int8.yaml: -------------------------------------------------------------------------------- 1 | quant: 2 | a_qconfig: 3 | quantizer: FixedFakeQuantize 4 | observer: AvgMinMaxObserver # EMAMSEObserver EMAMinMaxObserver EMAQuantileObserver EMAPruneMinMaxObserver 5 | bit: 8 6 | symmetric: True 7 | ch_axis: -1 # perlayer -1 perchannel 0 8 | w_qconfig: 9 | quantizer: FixedQuantize 10 | observer: MinMaxObserver 11 | bit: 8 12 | symmetric: True 13 | ch_axis: -1 # perchannel 0 perlayer -1 14 | calibrate: 128 15 | calibrate_path: /mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/pile_cali 16 | is_remove_padding: True 17 | migrate: True 18 | model: 19 | max_length: 512 -------------------------------------------------------------------------------- /figure/outlier_phenomenon.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/figure/outlier_phenomenon.png -------------------------------------------------------------------------------- /figure/outlier_suppression_plus.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/figure/outlier_suppression_plus.png -------------------------------------------------------------------------------- /ignore.txt: -------------------------------------------------------------------------------- 1 | ROUGE 2 | rouge 3 | nin 4 | -------------------------------------------------------------------------------- /lm_eval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/ai2_arc/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"ARC-Challenge": {"description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "citation": "@article{allenai:arc,\n author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and\n Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},\n title = {Think you have Solved Question Answering? 
Try ARC, the AI2 Reasoning Challenge},\n journal = {arXiv:1803.05457v1},\n year = {2018},\n}\n", "homepage": "https://allenai.org/data/arc", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerKey": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "ai2_arc", "config_name": "ARC-Challenge", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 377740, "num_examples": 1172, "dataset_name": "ai2_arc"}, "train": {"name": "train", "num_bytes": 351888, "num_examples": 1119, "dataset_name": "ai2_arc"}, "validation": {"name": "validation", "num_bytes": 97254, "num_examples": 299, "dataset_name": "ai2_arc"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/arc/ARC-V1-Feb2018.zip": {"num_bytes": 680841265, "checksum": "6d2d5ab50b2ceec6ba5f79c921be77cf2de712ea25a2b3f4fff3acc101cecfa0"}}, "download_size": 680841265, "dataset_size": 826882, "size_in_bytes": 681668147}, "ARC-Easy": {"description": "A new dataset of 7,787 genuine grade-school level, multiple-choice science questions, assembled to encourage research in\n advanced question-answering. The dataset is partitioned into a Challenge Set and an Easy Set, where the former contains\n only questions answered incorrectly by both a retrieval-based algorithm and a word co-occurrence algorithm. We are also\n including a corpus of over 14 million science sentences relevant to the task, and an implementation of three neural baseline models for this dataset. We pose ARC as a challenge to the community.\n", "citation": "@article{allenai:arc,\n author = {Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and\n Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord},\n title = {Think you have Solved Question Answering? 
Try ARC, the AI2 Reasoning Challenge},\n journal = {arXiv:1803.05457v1},\n year = {2018},\n}\n", "homepage": "https://allenai.org/data/arc", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "choices": {"feature": {"text": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answerKey": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "ai2_arc", "config_name": "ARC-Easy", "version": {"version_str": "1.0.0", "description": "", "datasets_version_to_prepare": null, "major": 1, "minor": 0, "patch": 0}, "splits": {"test": {"name": "test", "num_bytes": 661997, "num_examples": 2376, "dataset_name": "ai2_arc"}, "train": {"name": "train", "num_bytes": 623254, "num_examples": 2251, "dataset_name": "ai2_arc"}, "validation": {"name": "validation", "num_bytes": 158498, "num_examples": 570, "dataset_name": "ai2_arc"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/arc/ARC-V1-Feb2018.zip": {"num_bytes": 680841265, "checksum": "6d2d5ab50b2ceec6ba5f79c921be77cf2de712ea25a2b3f4fff3acc101cecfa0"}}, "download_size": 680841265, "dataset_size": 1443749, "size_in_bytes": 682285014}} -------------------------------------------------------------------------------- /lm_eval/datasets/arithmetic/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/arithmetic/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/asdiv/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/asdiv.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """ASDIV dataset.""" 15 | 16 | 17 | import os 18 | import xml.etree.ElementTree as ET 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | _DESCRIPTION = """\ 35 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 36 | patterns and problem types) English math word problem (MWP) corpus for evaluating 37 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 38 | remain limited either in language usage patterns or in problem types. We thus present 39 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 40 | types taught in elementary school. Each MWP is annotated with its problem type and grade 41 | level (for indicating the level of difficulty). 42 | """ 43 | 44 | _HOMEPAGE = "https://github.com/chaochun/nlu-asdiv-dataset" 45 | 46 | # TODO: Add the licence for the dataset here if you can find it 47 | _LICENSE = "" 48 | 49 | _URLS = "https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip" 50 | 51 | 52 | class ASDiv(datasets.GeneratorBasedBuilder): 53 | """ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers""" 54 | 55 | VERSION = datasets.Version("0.0.1") 56 | 57 | BUILDER_CONFIGS = [ 58 | datasets.BuilderConfig( 59 | name="asdiv", 60 | version=VERSION, 61 | description="A diverse corpus for evaluating and developing english math word problem solvers", 62 | ) 63 | ] 64 | 65 | def _info(self): 66 | features = datasets.Features( 67 | { 68 | "body": datasets.Value("string"), 69 | "question": datasets.Value("string"), 70 | "solution_type": datasets.Value("string"), 71 | "answer": datasets.Value("string"), 72 | "formula": datasets.Value("string"), 73 | } 74 | ) 75 | return datasets.DatasetInfo( 76 | description=_DESCRIPTION, 77 | features=features, 78 | homepage=_HOMEPAGE, 79 | license=_LICENSE, 80 | citation=_CITATION, 81 | ) 82 | 83 | def _split_generators(self, dl_manager): 84 | urls = _URLS 85 | data_dir = dl_manager.download_and_extract(urls) 86 | base_filepath = "nlu-asdiv-dataset-55790e5270bb91ccfa5053194b25732534696b50" 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.VALIDATION, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "filepath": os.path.join( 93 | data_dir, base_filepath, "dataset", "ASDiv.xml" 94 | ), 95 | "split": datasets.Split.VALIDATION, 96 | }, 97 | ), 98 | ] 99 | 100 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 101 | def _generate_examples(self, filepath, split): 102 | tree = ET.parse(filepath) 103 | root = tree.getroot() 104 | for key, problem in enumerate(root.iter("Problem")): 105 | yield key, { 106 | "body": problem.find("Body").text, 107 | "question": problem.find("Question").text, 108 | "solution_type": problem.find("Solution-Type").text, 109 | "answer": problem.find("Answer").text, 110 | "formula": problem.find("Formula").text, 111 | } 112 | -------------------------------------------------------------------------------- /lm_eval/datasets/asdiv/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"asdiv": {"description": "ASDiv 
(Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language\npatterns and problem types) English math word problem (MWP) corpus for evaluating\nthe capability of various MWP solvers. Existing MWP corpora for studying AI progress\nremain limited either in language usage patterns or in problem types. We thus present\na new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem\ntypes taught in elementary school. Each MWP is annotated with its problem type and grade\nlevel (for indicating the level of difficulty).\n", "citation": "@misc{miao2021diverse,\n title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers},\n author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su},\n year={2021},\n eprint={2106.15772},\n archivePrefix={arXiv},\n primaryClass={cs.AI}\n}\n", "homepage": "https://github.com/chaochun/nlu-asdiv-dataset", "license": "", "features": {"body": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "solution_type": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}, "formula": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "as_div", "config_name": "asdiv", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"validation": {"name": "validation", "num_bytes": 501489, "num_examples": 2305, "dataset_name": "as_div"}}, "download_checksums": {"https://github.com/chaochun/nlu-asdiv-dataset/archive/55790e5270bb91ccfa5053194b25732534696b50.zip": {"num_bytes": 440966, "checksum": "8f1fe4f6d5f170ec1e24ab78c244153c14c568b1bb2b1dad0324e71f37939a2d"}}, "download_size": 440966, "post_processing_size": null, "dataset_size": 501489, "size_in_bytes": 942455}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/coqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/coqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"coqa": {"description": "CoQA is a large-scale dataset for building Conversational Question Answering\nsystems. The goal of the CoQA challenge is to measure the ability of machines to\nunderstand a text passage and answer a series of interconnected questions that\nappear in a conversation.\n", "citation": "@misc{reddy2018coqa,\n title={CoQA: A Conversational Question Answering Challenge},\n author={Siva Reddy and Danqi Chen and Christopher D. 
Manning},\n year={2018},\n eprint={1808.07042},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://stanfordnlp.github.io/coqa/", "license": "", "features": {"id": {"dtype": "string", "id": null, "_type": "Value"}, "source": {"dtype": "string", "id": null, "_type": "Value"}, "story": {"dtype": "string", "id": null, "_type": "Value"}, "questions": {"feature": {"input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "answers": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "additional_answers": {"0": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "1": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}, "2": {"feature": {"span_start": {"dtype": "int32", "id": null, "_type": "Value"}, "span_end": {"dtype": "int32", "id": null, "_type": "Value"}, "span_text": {"dtype": "string", "id": null, "_type": "Value"}, "input_text": {"dtype": "string", "id": null, "_type": "Value"}, "turn_id": {"dtype": "int32", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "coqa", "config_name": "coqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 26250528, "num_examples": 7199, "dataset_name": "coqa"}, "validation": {"name": "validation", "num_bytes": 3765933, "num_examples": 500, "dataset_name": "coqa"}}, "download_checksums": {"https://nlp.stanford.edu/data/coqa/coqa-train-v1.0.json": {"num_bytes": 49001836, "checksum": "b0fdb2bc1bd38dd3ca2ce5fa2ac3e02c6288ac914f241ac409a655ffb6619fa6"}, "https://nlp.stanford.edu/data/coqa/coqa-dev-v1.0.json": {"num_bytes": 9090845, "checksum": "dfa367a9733ce53222918d0231d9b3bedc2b8ee831a2845f62dfc70701f2540a"}}, "download_size": 58092681, "post_processing_size": null, "dataset_size": 30016461, "size_in_bytes": 88109142}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/drop/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/drop/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/drop/dataset_infos.json: 
-------------------------------------------------------------------------------- 1 | {"drop": {"description": "DROP is a QA dataset which tests comprehensive understanding of paragraphs. In \nthis crowdsourced, adversarially-created, 96k question-answering benchmark, a \nsystem must resolve multiple references in a question, map them onto a paragraph,\nand perform discrete operations over them (such as addition, counting, or sorting).\n", "citation": "@misc{dua2019drop,\n title={DROP: A Reading Comprehension Benchmark Requiring Discrete Reasoning Over Paragraphs}, \n author={Dheeru Dua and Yizhong Wang and Pradeep Dasigi and Gabriel Stanovsky and Sameer Singh and Matt Gardner},\n year={2019},\n eprint={1903.00161},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://allenai.org/data/drop", "license": "", "features": {"section_id": {"dtype": "string", "id": null, "_type": "Value"}, "passage": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "query_id": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "validated_answers": {"feature": {"number": {"dtype": "string", "id": null, "_type": "Value"}, "date": {"day": {"dtype": "string", "id": null, "_type": "Value"}, "month": {"dtype": "string", "id": null, "_type": "Value"}, "year": {"dtype": "string", "id": null, "_type": "Value"}}, "spans": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "worker_id": {"dtype": "string", "id": null, "_type": "Value"}, "hit_id": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "drop", "config_name": "drop", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 108858121, "num_examples": 77409, "dataset_name": "drop"}, "validation": {"name": "validation", "num_bytes": 12560739, "num_examples": 9536, "dataset_name": "drop"}}, "download_checksums": {"https://s3-us-west-2.amazonaws.com/allennlp/datasets/drop/drop_dataset.zip": {"num_bytes": 8308692, "checksum": "39d2278a29fd729de301b111a45f434c24834f40df8f4ff116d864589e3249d6"}}, "download_size": 8308692, "post_processing_size": null, "dataset_size": 121418860, "size_in_bytes": 129727552}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/headqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/headqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hellaswag/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/hellaswag/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hellaswag/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"default": {"description": "\n", "citation": "@inproceedings{zellers2019hellaswag,\n title={HellaSwag: Can a Machine Really Finish Your Sentence?},\n author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin},\n booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},\n year={2019}\n}\n", "homepage": "https://rowanzellers.com/hellaswag/", "license": "", "features": {"ind": {"dtype": "int32", "id": null, "_type": "Value"}, "activity_label": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_a": {"dtype": "string", "id": null, "_type": "Value"}, "ctx_b": {"dtype": "string", "id": null, "_type": "Value"}, "ctx": {"dtype": "string", "id": null, "_type": "Value"}, "endings": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "source_id": {"dtype": "string", "id": null, "_type": "Value"}, "split": {"dtype": "string", "id": null, "_type": "Value"}, "split_type": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"dtype": "string", "id": null, "_type": "Value"}}, "supervised_keys": null, "builder_name": "hellaswag", "config_name": "default", "version": {"version_str": "0.1.0", "description": null, "datasets_version_to_prepare": null, "major": 0, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 43307616, "num_examples": 39905, "dataset_name": "hellaswag"}, "test": {"name": "test", "num_bytes": 10810696, "num_examples": 10003, "dataset_name": "hellaswag"}, "validation": {"name": "validation", "num_bytes": 11194634, "num_examples": 10042, "dataset_name": "hellaswag"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/hellaswag_train.jsonl": {"num_bytes": 47496131, "checksum": "dae5e69249868cb9fe4e23ff925c60b66169564cfb7072d793cd7356a2b69f8d"}, "/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/hellaswag_test.jsonl": {"num_bytes": 11752147, "checksum": "da082b00543e422b8d25394614d102944586986def4de5cd1bd36d86bcb76261"}, "/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/hellaswag_val.jsonl": {"num_bytes": 12246618, "checksum": "0aa3b88843990f3f10a97b9575c94d7b71fb2205240ba04ae4884d9e9c992588"}}, "download_size": 71494896, "dataset_size": 65312946, "size_in_bytes": 136807842}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/hellaswag/hellaswag.py: -------------------------------------------------------------------------------- 1 | """TODO(hellaswag): Add a description here.""" 2 | 3 | 4 | import json 5 | 6 | import datasets 7 | 8 | 9 | # TODO(hellaswag): BibTeX citation 10 | _CITATION = """\ 11 | @inproceedings{zellers2019hellaswag, 12 | title={HellaSwag: Can a Machine Really Finish Your Sentence?}, 13 | author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, 14 | booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 15 | year={2019} 16 | } 17 | """ 18 | 19 | # TODO(hellaswag): 20 | _DESCRIPTION = """ 21 | """ 22 | _URL = 
"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/hellaswag/" 23 | _URLS = { 24 | "train": _URL + "hellaswag_train.jsonl", 25 | "test": _URL + "hellaswag_test.jsonl", 26 | "dev": _URL + "hellaswag_val.jsonl", 27 | } 28 | 29 | 30 | class Hellaswag(datasets.GeneratorBasedBuilder): 31 | """TODO(hellaswag): Short description of my dataset.""" 32 | 33 | # TODO(hellaswag): Set up version. 34 | VERSION = datasets.Version("0.1.0") 35 | 36 | def _info(self): 37 | # TODO(hellaswag): Specifies the datasets.DatasetInfo object 38 | return datasets.DatasetInfo( 39 | # This is the description that will appear on the datasets page. 40 | description=_DESCRIPTION, 41 | # datasets.features.FeatureConnectors 42 | features=datasets.Features( 43 | { 44 | # These are the features of your dataset like images, labels ... 45 | "ind": datasets.Value("int32"), 46 | "activity_label": datasets.Value("string"), 47 | "ctx_a": datasets.Value("string"), 48 | "ctx_b": datasets.Value("string"), 49 | "ctx": datasets.Value("string"), 50 | "endings": datasets.features.Sequence(datasets.Value("string")), 51 | "source_id": datasets.Value("string"), 52 | "split": datasets.Value("string"), 53 | "split_type": datasets.Value("string"), 54 | "label": datasets.Value("string"), 55 | } 56 | ), 57 | # If there's a common (input, target) tuple from the features, 58 | # specify them here. They'll be used if as_supervised=True in 59 | # builder.as_dataset. 60 | supervised_keys=None, 61 | # Homepage of the dataset for documentation 62 | homepage="https://rowanzellers.com/hellaswag/", 63 | citation=_CITATION, 64 | ) 65 | 66 | def _split_generators(self, dl_manager): 67 | """Returns SplitGenerators.""" 68 | # TODO(hellaswag): Downloads the data and defines the splits 69 | # dl_manager is a datasets.download.DownloadManager that can be used to 70 | # download and extract URLs 71 | urls_to_download = _URLS 72 | dl_dir = dl_manager.download_and_extract(urls_to_download) 73 | return [ 74 | datasets.SplitGenerator( 75 | name=datasets.Split.TRAIN, 76 | # These kwargs will be passed to _generate_examples 77 | gen_kwargs={"filepath": dl_dir["train"]}, 78 | ), 79 | datasets.SplitGenerator( 80 | name=datasets.Split.TEST, 81 | # These kwargs will be passed to _generate_examples 82 | gen_kwargs={"filepath": dl_dir["test"]}, 83 | ), 84 | datasets.SplitGenerator( 85 | name=datasets.Split.VALIDATION, 86 | # These kwargs will be passed to _generate_examples 87 | gen_kwargs={"filepath": dl_dir["dev"]}, 88 | ), 89 | ] 90 | 91 | def _generate_examples(self, filepath): 92 | """Yields examples.""" 93 | # TODO(hellaswag): Yields (key, example) tuples from the dataset 94 | with open(filepath, encoding="utf-8") as f: 95 | for id_, row in enumerate(f): 96 | data = json.loads(row) 97 | yield id_, { 98 | "ind": int(data["ind"]), 99 | "activity_label": data["activity_label"], 100 | "ctx_a": data.get("ctx_a", ""), 101 | "ctx_b": data.get("ctx_b", ""), 102 | "ctx": data["ctx"], 103 | "endings": data.get("endings", []), 104 | "source_id": data["source_id"], 105 | "split": data["split"], 106 | "split_type": data["split_type"], 107 | "label": str(data.get("label", "")), 108 | } 109 | -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_ethics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/hendrycks_ethics/__init__.py 
-------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_math/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/hendrycks_math/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/hendrycks_math/hendrycks_math.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """MATH dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | import pathlib 20 | 21 | import datasets 22 | 23 | 24 | _CITATION = """\ 25 | @article{hendrycksmath2021, 26 | title={Measuring Mathematical Problem Solving With the Math Dataset}, 27 | author={Dan Hendrycks and Collin Burns and Saurav Kadavath and Akul Arora and Steven Basart and Eric Tang and Dawn Song and Jacob Steinhardt}, 28 | journal={NeurIPS}, 29 | year={2021} 30 | } 31 | """ 32 | 33 | _DESCRIPTION = """\ 34 | MATH is a dataset of 12,500 challenging competition mathematics problems. Each 35 | problem in Math has a full step-by-step solution which can be used to teach 36 | models to generate answer derivations and explanations. 
37 | """ 38 | 39 | _HOMEPAGE = "https://github.com/hendrycks/math" 40 | 41 | # TODO: Add the licence for the dataset here if you can find it 42 | _LICENSE = "" 43 | 44 | _URLS = "https://people.eecs.berkeley.edu/~hendrycks/MATH.tar" 45 | 46 | _NAMES = [ 47 | "algebra", 48 | "counting_and_probability", 49 | "geometry", 50 | "intermediate_algebra", 51 | "number_theory", 52 | "prealgebra", 53 | "precalculus", 54 | ] 55 | 56 | 57 | class HendrycksMath(datasets.GeneratorBasedBuilder): 58 | """MATH is a dataset of 12,500 challenging competition mathematics problems.""" 59 | 60 | VERSION = datasets.Version("0.0.1") 61 | 62 | BUILDER_CONFIGS = [ 63 | datasets.BuilderConfig(name=name, version=version, description=name) 64 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 65 | ] 66 | 67 | def _info(self): 68 | features = datasets.Features( 69 | { 70 | "problem": datasets.Value("string"), 71 | "level": datasets.Value("string"), 72 | "type": datasets.Value("string"), 73 | "solution": datasets.Value("string"), 74 | } 75 | ) 76 | return datasets.DatasetInfo( 77 | description=_DESCRIPTION, 78 | features=features, 79 | homepage=_HOMEPAGE, 80 | license=_LICENSE, 81 | citation=_CITATION, 82 | ) 83 | 84 | def _split_generators(self, dl_manager): 85 | urls = _URLS 86 | data_dir = dl_manager.download_and_extract(urls) 87 | return [ 88 | datasets.SplitGenerator( 89 | name=datasets.Split.TRAIN, 90 | # These kwargs will be passed to _generate_examples 91 | gen_kwargs={ 92 | "basepath": os.path.join( 93 | data_dir, "MATH", "train", self.config.name 94 | ), 95 | "split": "train", 96 | }, 97 | ), 98 | datasets.SplitGenerator( 99 | name=datasets.Split.TEST, 100 | # These kwargs will be passed to _generate_examples 101 | gen_kwargs={ 102 | "basepath": os.path.join( 103 | data_dir, "MATH", "test", self.config.name 104 | ), 105 | "split": "test", 106 | }, 107 | ), 108 | ] 109 | 110 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 111 | def _generate_examples(self, basepath, split): 112 | key = 0 113 | for file in sorted(pathlib.Path(basepath).iterdir()): 114 | with open(file, "r", encoding="utf-8") as f: 115 | data = json.load(f) 116 | yield key, { 117 | "problem": data["problem"], 118 | "level": data["level"], 119 | "type": data["type"], 120 | "solution": data["solution"], 121 | } 122 | key += 1 123 | -------------------------------------------------------------------------------- /lm_eval/datasets/lambada_openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/lambada_openai/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/logiqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/logiqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"logiqa": {"description": "LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA\ninstances, covering multiple types of deductive reasoning. Results show that state-\nof-the-art neural models perform by far worse than human ceiling. 
The dataset can\nalso serve as a benchmark for reinvestigating logical AI under the deep learning\nNLP setting.\n", "citation": "@misc{liu2020logiqa,\n title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, \n author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang},\n year={2020},\n eprint={2007.08124},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}\n", "homepage": "https://github.com/lgw863/LogiQA-dataset", "license": "", "features": {"label": {"dtype": "string", "id": null, "_type": "Value"}, "context": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "logiqa", "config_name": "logiqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 6419852, "num_examples": 7376, "dataset_name": "logiqa"}, "test": {"name": "test", "num_bytes": 571705, "num_examples": 651, "dataset_name": "logiqa"}, "validation": {"name": "validation", "num_bytes": 562437, "num_examples": 651, "dataset_name": "logiqa"}}, "download_checksums": {"https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Train.txt": {"num_bytes": 6281272, "checksum": "7d5bb1f58278e33b395744cd2ad8d7600faa0b3c4d615c659a44ec1181d759fa"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Test.txt": {"num_bytes": 559060, "checksum": "359acb78c37802208f7fde9e2f6574b8526527c63d6a336f90a53f1932cb4701"}, "https://raw.githubusercontent.com/lgw863/LogiQA-dataset/master/Eval.txt": {"num_bytes": 550021, "checksum": "4c49e6753b7262c001506b9151135abf722247035ab075dad93acdea5789c01f"}}, "download_size": 7390353, "post_processing_size": null, "dataset_size": 7553994, "size_in_bytes": 14944347}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/mutual/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/mutual/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"mutual": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nThe MuTual dataset.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": 
"Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 5141602, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 634396, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 624271, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6400269, "size_in_bytes": 17398147}, "mutual_plus": {"description": "MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is\nmodified from Chinese high school English listening comprehension test data.\n\nMuTualPlus is a more difficult MuTual that replaces positive responses with a safe responses.", "citation": "@inproceedings{mutual,\n title = \"MuTual: A Dataset for Multi-Turn Dialogue Reasoning\",\n author = \"Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming\" ,\n booktitle = \"Proceedings of the 58th Conference of the Association for Computational Linguistics\",\n year = \"2020\",\n publisher = \"Association for Computational Linguistics\",\n}\n", "homepage": "https://github.com/Nealcly/MuTual", "license": "", "features": {"answers": {"dtype": "string", "id": null, "_type": "Value"}, "options": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "article": {"dtype": "string", "id": null, "_type": "Value"}, "id": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "mutual", "config_name": "mutual_plus", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 4921179, "num_examples": 7088, "dataset_name": "mutual"}, "test": {"name": "test", "num_bytes": 606620, "num_examples": 886, "dataset_name": "mutual"}, "validation": {"name": "validation", "num_bytes": 597340, "num_examples": 886, "dataset_name": "mutual"}}, "download_checksums": {"https://github.com/Nealcly/MuTual/archive/master.zip": {"num_bytes": 10997878, "checksum": "bb325cf6c672f0f02699993a37138b0fa0af6fcfc77ec81dfbe46add4d7b29f9"}}, "download_size": 10997878, "post_processing_size": null, "dataset_size": 6125139, "size_in_bytes": 17123017}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/pile/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/pile/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/piqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/piqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/piqa/dataset_infos.json: 
-------------------------------------------------------------------------------- 1 | {"plain_text": {"description": "To apply eyeshadow without a brush, should I use a cotton swab or a toothpick?\nQuestions requiring this kind of physical commonsense pose a challenge to state-of-the-art\nnatural language understanding systems. The PIQA dataset introduces the task of physical commonsense reasoning\nand a corresponding benchmark dataset Physical Interaction: Question Answering or PIQA.\n\nPhysical commonsense knowledge is a major challenge on the road to true AI-completeness,\nincluding robots that interact with the world and understand natural language.\n\nThe dataset focuses on everyday situations with a preference for atypical solutions.\nThe dataset is inspired by instructables.com, which provides users with instructions on how to build, craft,\nbake, or manipulate objects using everyday materials.\n\nThe underlying task is formualted as multiple choice question answering:\ngiven a question `q` and two possible solutions `s1`, `s2`, a model or\na human must choose the most appropriate solution, of which exactly one is correct.\nThe dataset is further cleaned of basic artifacts using the AFLite algorithm which is an improvement of\nadversarial filtering. The dataset contains 16,000 examples for training, 2,000 for development and 3,000 for testing.\n", "citation": "@inproceedings{Bisk2020,\n author = {Yonatan Bisk and Rowan Zellers and\n Ronan Le Bras and Jianfeng Gao\n and Yejin Choi},\n title = {PIQA: Reasoning about Physical Commonsense in\n Natural Language},\n booktitle = {Thirty-Fourth AAAI Conference on\n Artificial Intelligence},\n year = {2020},\n}\n", "homepage": "https://yonatanbisk.com/piqa/", "license": "", "features": {"goal": {"dtype": "string", "id": null, "_type": "Value"}, "sol1": {"dtype": "string", "id": null, "_type": "Value"}, "sol2": {"dtype": "string", "id": null, "_type": "Value"}, "label": {"num_classes": 2, "names": ["0", "1"], "names_file": null, "id": null, "_type": "ClassLabel"}}, "post_processed": null, "supervised_keys": null, "builder_name": "piqa", "config_name": "plain_text", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 4104026, "num_examples": 16113, "dataset_name": "piqa"}, "test": {"name": "test", "num_bytes": 761521, "num_examples": 3084, "dataset_name": "piqa"}, "validation": {"name": "validation", "num_bytes": 464321, "num_examples": 1838, "dataset_name": "piqa"}}, "download_checksums": {"/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/piqa/physicaliqa-train-dev.zip": {"num_bytes": 1824009, "checksum": "54d32a04f59a7e354396f321723c8d7ec35cc6b08506563d8d1ffcc15ce98ddd"}, "/mnt/lustre/weixiuying.vendor/datasets/nlp_datasets/piqa/tests.jsonl": {"num_bytes": 814616, "checksum": "402f1e2e61347db773e6e5e0a6b24f97396b59f6fd046dcdcbc12f483ac8553b"}}, "download_size": 2638625, "post_processing_size": null, "dataset_size": 5329868, "size_in_bytes": 7968493}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/quac/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/quac/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/quac/dataset_infos.json: 
-------------------------------------------------------------------------------- 1 | {"quac": {"description": "Question Answering in Context (QuAC) is a dataset for modeling, understanding, and \nparticipating in information seeking dialog. Data instances consist of an interactive\ndialog between two crowd workers: (1) a student who poses a sequence of freeform\nquestions to learn as much as possible about a hidden Wikipedia text, and (2)\na teacher who answers the questions by providing short excerpts (spans) from the text.\n", "citation": "@article{choi2018quac,\n title={Quac: Question answering in context},\n author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke},\n journal={arXiv preprint arXiv:1808.07036},\n year={2018}\n}\n", "homepage": "https://quac.ai/", "license": "", "features": {"title": {"dtype": "string", "id": null, "_type": "Value"}, "section_title": {"dtype": "string", "id": null, "_type": "Value"}, "paragraph": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"dtype": "string", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "quac", "config_name": "quac", "version": {"version_str": "1.1.0", "description": null, "major": 1, "minor": 1, "patch": 0}, "splits": {"train": {"name": "train", "num_bytes": 212391958, "num_examples": 83568, "dataset_name": "quac"}, "validation": {"name": "validation", "num_bytes": 20678483, "num_examples": 7354, "dataset_name": "quac"}}, "download_checksums": {"https://s3.amazonaws.com/my89public/quac/train_v0.2.json": {"num_bytes": 68114819, "checksum": "ff5cca5a2e4b4d1cb5b5ced68b9fce88394ef6d93117426d6d4baafbcc05c56a"}, "https://s3.amazonaws.com/my89public/quac/val_v0.2.json": {"num_bytes": 8929167, "checksum": "09e622916280ba04c9352acb1bc5bbe80f11a2598f6f34e934c51d9e6570f378"}}, "download_size": 77043986, "post_processing_size": null, "dataset_size": 233070441, "size_in_bytes": 310114427}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/sat_analogies/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/sat_analogies/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/story_cloze/README.md: -------------------------------------------------------------------------------- 1 | # datasets 2 | 3 | This directory contains custom HuggingFace [dataset loading scripts](https://huggingface.co/docs/datasets/dataset_script). They are provided to maintain backward compatibility with the ad-hoc data downloaders in earlier versions of the `lm-evaluation-harness` before HuggingFace [`datasets`](https://huggingface.co/docs/datasets/index) was adopted as the default downloading manager. For example, some instances in the HuggingFace `datasets` repository process features (e.g. whitespace stripping, lower-casing, etc.) in ways that the `lm-evaluation-harness` did not. 4 | 5 | __NOTE__: We are __not__ accepting any additional loading scripts into the main branch! If you'd like to use a custom dataset, fork the repo and follow HuggingFace's loading script guide found [here](https://huggingface.co/docs/datasets/dataset_script). 
You can then override your `Task`'s `DATASET_PATH` attribute to point to this script's local path. 6 | 7 | 8 | __WARNING__: A handful of loading scripts are included in this collection because they have not yet been pushed to the Huggingface Hub or a HuggingFace organization repo. We will remove such scripts once pushed. 9 | -------------------------------------------------------------------------------- /lm_eval/datasets/story_cloze/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/story_cloze/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/story_cloze/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"2016": {"description": "\nStory Cloze Test' is a commonsense reasoning framework for evaluating story understanding,\nstory generation, and script learning.This test requires a system to choose the correct ending\nto a four-sentence story.\n", "citation": "@inproceedings{mostafazadeh2017lsdsem,\n title={Lsdsem 2017 shared task: The story cloze test},\n author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},\n booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},\n pages={46--51},\n year={2017}\n}\n", "homepage": "https://cs.rochester.edu/nlp/rocstories/", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2016", "version": {"version_str": "0.0.0", "description": null, "major": 0, "minor": 0, "patch": 0}, "splits": {"validation": {"name": "validation", "num_bytes": 614084, "num_examples": 1871, "dataset_name": "story_cloze"}, "test": {"name": "test", "num_bytes": 613184, "num_examples": 1871, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 1227268, "size_in_bytes": 1227268}, "2018": {"description": "\nStory Cloze Test' is a commonsense reasoning framework for evaluating story understanding,\nstory generation, and script learning.This test requires a system to choose the correct ending\nto a four-sentence story.\n", "citation": "@inproceedings{mostafazadeh2017lsdsem,\n title={Lsdsem 2017 shared task: The story cloze test},\n author={Mostafazadeh, Nasrin and Roth, Michael and Louis, Annie and Chambers, Nathanael and Allen, James},\n booktitle={Proceedings of the 2nd Workshop on Linking Models of Lexical, Sentential and Discourse-level Semantics},\n pages={46--51},\n year={2017}\n}\n", "homepage": "https://cs.rochester.edu/nlp/rocstories/", "license": "", "features": {"story_id": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_1": {"dtype": "string", "id": null, 
"_type": "Value"}, "input_sentence_2": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_3": {"dtype": "string", "id": null, "_type": "Value"}, "input_sentence_4": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz1": {"dtype": "string", "id": null, "_type": "Value"}, "sentence_quiz2": {"dtype": "string", "id": null, "_type": "Value"}, "answer_right_ending": {"dtype": "int32", "id": null, "_type": "Value"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "story_cloze", "config_name": "2018", "version": "0.0.0", "splits": {"validation": {"name": "validation", "num_bytes": 515439, "num_examples": 1571, "dataset_name": "story_cloze"}}, "download_checksums": {}, "download_size": 0, "post_processing_size": null, "dataset_size": 515439, "size_in_bytes": 515439}} -------------------------------------------------------------------------------- /lm_eval/datasets/super_glue/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/super_glue/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/README.md: -------------------------------------------------------------------------------- 1 | --- 2 | dataset_info: 3 | features: 4 | - name: question_id 5 | dtype: string 6 | - name: question_source 7 | dtype: string 8 | - name: question 9 | dtype: string 10 | - name: answer 11 | struct: 12 | - name: aliases 13 | sequence: string 14 | - name: value 15 | dtype: string 16 | - name: search_results 17 | sequence: 18 | - name: description 19 | dtype: string 20 | - name: filename 21 | dtype: string 22 | - name: rank 23 | dtype: int32 24 | - name: title 25 | dtype: string 26 | - name: url 27 | dtype: string 28 | - name: search_context 29 | dtype: string 30 | config_name: triviaqa 31 | splits: 32 | - name: train 33 | num_bytes: 1270894387 34 | num_examples: 87622 35 | - name: validation 36 | num_bytes: 163755044 37 | num_examples: 11313 38 | download_size: 632549060 39 | dataset_size: 1434649431 40 | --- 41 | -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/triviaqa/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/triviaqa/dataset_infos.json: -------------------------------------------------------------------------------- 1 | {"triviaqa": {"description": "TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence\ntriples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts\nand independently gathered evidence documents, six per question on average, that provide\nhigh quality distant supervision for answering the questions.\n", "citation": "@InProceedings{JoshiTriviaQA2017,\n author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke},\n title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension},\n booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics},\n month = {July},\n year = {2017},\n address = {Vancouver, Canada},\n publisher = {Association for Computational Linguistics},\n}\n", "homepage": "https://nlp.cs.washington.edu/triviaqa/", "license": "Apache License 2.0", "features": {"question_id": {"dtype": "string", "id": null, "_type": "Value"}, "question_source": {"dtype": "string", "id": null, "_type": "Value"}, "question": {"dtype": "string", "id": null, "_type": "Value"}, "answer": {"aliases": {"feature": {"dtype": "string", "id": null, "_type": "Value"}, "length": -1, "id": null, "_type": "Sequence"}, "value": {"dtype": "string", "id": null, "_type": "Value"}}, "search_results": {"feature": {"description": {"dtype": "string", "id": null, "_type": "Value"}, "filename": {"dtype": "string", "id": null, "_type": "Value"}, "rank": {"dtype": "int32", "id": null, "_type": "Value"}, "title": {"dtype": "string", "id": null, "_type": "Value"}, "url": {"dtype": "string", "id": null, "_type": "Value"}, "search_context": {"dtype": "string", "id": null, "_type": "Value"}}, "length": -1, "id": null, "_type": "Sequence"}}, "post_processed": null, "supervised_keys": null, "task_templates": null, "builder_name": "triviaqa", "config_name": "triviaqa", "version": {"version_str": "0.0.1", "description": null, "major": 0, "minor": 0, "patch": 1}, "splits": {"train": {"name": "train", "num_bytes": 1271393601, "num_examples": 87622, "dataset_name": "triviaqa"}, "validation": {"name": "validation", "num_bytes": 163819509, "num_examples": 11313, "dataset_name": "triviaqa"}}, "download_checksums": {"http://eaidata.bmk.sh/data/triviaqa-unfiltered.tar.gz": {"num_bytes": 546481381, "checksum": "adc19b42769062d241a8fbe834c56e58598d9322eb6c614e9f33a68a2cf5523e"}}, "download_size": 546481381, "post_processing_size": null, "dataset_size": 1435213110, "size_in_bytes": 1981694491}} 2 | -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/unscramble/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/unscramble/unscramble.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | """Unscramble dataset.""" 15 | 16 | 17 | import json 18 | import os 19 | 20 | import datasets 21 | 22 | 23 | _CITATION = """\ 24 | @inproceedings{NEURIPS2020_1457c0d6, 25 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 26 | booktitle = {Advances in Neural Information Processing Systems}, 27 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 28 | pages = {1877--1901}, 29 | publisher = {Curran Associates, Inc.}, 30 | title = {Language Models are Few-Shot Learners}, 31 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 32 | volume = {33}, 33 | year = {2020} 34 | } 35 | """ 36 | 37 | _DESCRIPTION = """\ 38 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 39 | involves giving the model a word distorted by some combination of scrambling, 40 | addition, or deletion of characters, and asking it to recover the original word. 41 | """ 42 | 43 | _HOMEPAGE = "https://github.com/openai/gpt-3/tree/master/data" 44 | 45 | # TODO: Add the licence for the dataset here if you can find it 46 | _LICENSE = "" 47 | 48 | _BASE_URL = "https://raw.githubusercontent.com/openai/gpt-3/master/data" 49 | 50 | 51 | _DESCRIPTIONS = { 52 | "mid_word_1_anagrams": "Anagrams of all but the first and last letter.", 53 | "mid_word_2_anagrams": "Anagrams of all but the first and last 2 letters.", 54 | "cycle_letters_in_word": "Cycle letters in the word.", 55 | "random_insertion_in_word": "Random insertions in the word that must be removed.", 56 | "reversed_words": "Words spelled backwards that must be reversed.", 57 | } 58 | _NAMES = _DESCRIPTIONS.keys() 59 | 60 | 61 | class Unscramble(datasets.GeneratorBasedBuilder): 62 | """Unscramble is a small battery of 5 “character manipulation” tasks.""" 63 | 64 | VERSION = datasets.Version("0.0.1") 65 | 66 | BUILDER_CONFIGS = [ 67 | datasets.BuilderConfig( 68 | name=name, version=version, description=_DESCRIPTIONS[name] 69 | ) 70 | for name, version in zip(_NAMES, [VERSION] * len(_NAMES)) 71 | ] 72 | 73 | def _info(self): 74 | features = datasets.Features( 75 | { 76 | "context": datasets.Value("string"), 77 | "completion": datasets.Value("string"), 78 | } 79 | ) 80 | return datasets.DatasetInfo( 81 | description=_DESCRIPTION, 82 | features=features, 83 | homepage=_HOMEPAGE, 84 | license=_LICENSE, 85 | citation=_CITATION, 86 | ) 87 | 88 | def _split_generators(self, dl_manager): 89 | urls = os.path.join(_BASE_URL, f"{self.config.name}.jsonl.gz") 90 | data_dir = dl_manager.download_and_extract(urls) 91 | return [ 92 | datasets.SplitGenerator( 93 | name=datasets.Split.VALIDATION, 94 | # These kwargs will be passed to _generate_examples 95 | gen_kwargs={ 96 | "filepath": data_dir, 97 | "split": "validation", 98 | }, 99 | ), 100 | ] 101 | 102 | # method parameters are unpacked from `gen_kwargs` as given in `_split_generators` 103 | def _generate_examples(self, filepath, split): 104 | with open(filepath, encoding="utf-8") as f: 105 
| for key, row in enumerate(f): 106 | data = json.loads(row) 107 | yield key, { 108 | "context": data["context"], 109 | "completion": data["completion"], 110 | } 111 | -------------------------------------------------------------------------------- /lm_eval/datasets/wikitext/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/wikitext/__init__.py -------------------------------------------------------------------------------- /lm_eval/datasets/winogrande/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/datasets/winogrande/__init__.py -------------------------------------------------------------------------------- /lm_eval/decontamination/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/lm_eval/decontamination/__init__.py -------------------------------------------------------------------------------- /lm_eval/models/__init__.py: -------------------------------------------------------------------------------- 1 | from . import gpt2 2 | from . import gpt3 3 | from . import textsynth 4 | from . import dummy 5 | from . import opt 6 | from . import bloom 7 | from . import llama 8 | 9 | 10 | MODEL_REGISTRY = { 11 | "hf": gpt2.HFLM, 12 | "gpt2": gpt2.GPT2LM, 13 | "gpt3": gpt3.GPT3LM, 14 | "textsynth": textsynth.TextSynthLM, 15 | "dummy": dummy.DummyLM, 16 | 'opt': opt.OPTLM, 17 | 'bloom': bloom.BLOOMLM, 18 | 'llama': llama.LLAMALM, 19 | } 20 | 21 | 22 | def get_model(model_name): 23 | return MODEL_REGISTRY[model_name] 24 | -------------------------------------------------------------------------------- /lm_eval/models/bloom.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | import torch 3 | from lm_eval.base import BaseLM 4 | from accelerate.big_modeling import dispatch_model, infer_auto_device_map, get_balanced_memory 5 | 6 | 7 | class BLOOMLM(BaseLM): 8 | 9 | def __init__( 10 | self, 11 | device="cuda", 12 | pretrained="bloom", 13 | revision="main", 14 | subfolder=None, 15 | tokenizer=None, 16 | batch_size=1, 17 | dtype=torch.float32, 18 | max_length=-1 19 | ): 20 | super().__init__() 21 | 22 | assert isinstance(device, str) 23 | assert isinstance(pretrained, str) 24 | assert isinstance(batch_size, int) 25 | 26 | if device: 27 | if device not in ["cuda", "cpu"]: 28 | device = int(device) 29 | self._device = torch.device(device) 30 | print(f"Using device '{device}'") 31 | else: 32 | print("Device not specified") 33 | print(f"Cuda Available? 
{torch.cuda.is_available()}") 34 | self._device = ( 35 | torch.device("cuda") 36 | if torch.cuda.is_available() 37 | else torch.device("cpu") 38 | ) 39 | self.dtype = dtype 40 | self.model = transformers.AutoModelForCausalLM.from_pretrained( 41 | pretrained, 42 | revision=revision + ("/" + subfolder if subfolder is not None else ""), 43 | torch_dtype=self.dtype 44 | ) 45 | if max_length != -1: 46 | self.model.config.n_ctx = max_length 47 | else: 48 | self.model.config.n_ctx = 512 49 | self.pretrained = pretrained 50 | self.no_split_modules = self.model._no_split_modules 51 | self.model.eval() 52 | # pretrained tokenizer for neo is broken for now so just hard-coding this to gpt2 53 | self.tokenizer = transformers.AutoTokenizer.from_pretrained( 54 | pretrained if tokenizer is None else tokenizer, 55 | revision=revision, 56 | # subfolder=subfolder, 57 | use_fast=True, 58 | ) 59 | self.vocab_size = self.tokenizer.vocab_size 60 | self.batch_size_per_gpu = batch_size # todo: adaptive batch size 61 | 62 | def prepare_for_inference(self): 63 | self.no_split_modules = self.model._no_split_modules 64 | self.model.to(self.dtype) 65 | max_memory = get_balanced_memory( 66 | self.model, 67 | no_split_module_classes=self.no_split_modules, 68 | dtype=self.dtype 69 | ) 70 | device_map = infer_auto_device_map( 71 | self.model, 72 | no_split_module_classes=self.no_split_modules, 73 | dtype=self.dtype, 74 | max_memory=max_memory, 75 | ) 76 | print(device_map) 77 | dispatch_model(self.model, device_map=device_map) 78 | self.model.eval() 79 | 80 | @property 81 | def eot_token_id(self): 82 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 83 | return self.tokenizer.eos_token_id 84 | 85 | @property 86 | def max_length(self,): 87 | return self.model.config.n_ctx 88 | 89 | @property 90 | def max_gen_toks(self): 91 | return 256 92 | 93 | @property 94 | def batch_size(self): 95 | # TODO: fix multi-gpu 96 | return self.batch_size_per_gpu # * gpus 97 | 98 | @property 99 | def device(self): 100 | # TODO: fix multi-gpu 101 | return self._device 102 | 103 | def tok_encode(self, string: str): 104 | return self.tokenizer.encode(string, add_special_tokens=False) 105 | 106 | def tok_decode(self, tokens): 107 | return self.tokenizer.decode(tokens) 108 | 109 | def _model_call(self, inps, attention_mask=None): 110 | """ 111 | inps: a torch tensor of shape [batch, sequence] 112 | the size of sequence may vary from call to call 113 | 114 | returns: a torch tensor of shape [batch, sequence, vocab] with the 115 | logits returned from the model 116 | """ 117 | with torch.no_grad(): 118 | return self.model(inps, attention_mask=attention_mask)[0][:, :, :250680] 119 | 120 | def _model_generate(self, context, max_length, eos_token_id): 121 | return self.model.generate( 122 | context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False 123 | ) 124 | -------------------------------------------------------------------------------- /lm_eval/models/dummy.py: -------------------------------------------------------------------------------- 1 | import random 2 | from lm_eval.base import LM 3 | 4 | 5 | class DummyLM(LM): 6 | def __init__(self): 7 | pass 8 | 9 | @classmethod 10 | def create_from_arg_string(cls, arg_string, additional_config=None): 11 | return cls() 12 | 13 | def loglikelihood(self, requests): 14 | res = [] 15 | 16 | for _ in requests: 17 | res.append((-random.random(), False)) 18 | 19 | return res 20 | 21 | def greedy_until(self, requests): 22 | res = [] 23 | 24 | for 
ctx, _ in requests: 25 | res.append("lol") 26 | assert ctx.strip() != "" 27 | 28 | return res 29 | 30 | def loglikelihood_rolling(self, requests): 31 | res = [] 32 | 33 | for _ in requests: 34 | res.append(-random.random()) 35 | 36 | return res 37 | -------------------------------------------------------------------------------- /lm_eval/models/llama.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from lm_eval.base import BaseLM 3 | from transformers import LlamaForCausalLM, LlamaTokenizer 4 | from accelerate.big_modeling import dispatch_model, infer_auto_device_map, get_balanced_memory 5 | 6 | 7 | class LLAMALM(BaseLM): 8 | 9 | def __init__( 10 | self, 11 | device="cuda", 12 | pretrained="llama", 13 | revision="main", 14 | subfolder=None, 15 | tokenizer=None, 16 | batch_size=1, 17 | dtype=torch.float32, 18 | max_length=-1, 19 | ): 20 | super().__init__() 21 | 22 | assert isinstance(device, str) 23 | assert isinstance(pretrained, str) 24 | assert isinstance(batch_size, int) 25 | 26 | if device: 27 | if device not in ["cuda", "cpu"]: 28 | device = int(device) 29 | self._device = torch.device(device) 30 | print(f"Using device '{device}'") 31 | else: 32 | print("Device not specified") 33 | print(f"Cuda Available? {torch.cuda.is_available()}") 34 | self._device = ( 35 | torch.device("cuda") 36 | if torch.cuda.is_available() 37 | else torch.device("cpu") 38 | ) 39 | self.dtype = dtype 40 | self.model = LlamaForCausalLM.from_pretrained( 41 | pretrained, 42 | revision=revision + ("/" + subfolder if subfolder is not None else ""), 43 | torch_dtype=self.dtype 44 | ) 45 | if max_length != -1: 46 | self.model.config.max_sequence_length = max_length 47 | self.pretrained = pretrained 48 | self.no_split_modules = self.model._no_split_modules 49 | self.model.eval() 50 | self.tokenizer = LlamaTokenizer.from_pretrained( 51 | pretrained if tokenizer is None else tokenizer, 52 | revision=revision, 53 | # subfolder=subfolder, 54 | ) 55 | if self.tokenizer.pad_token_id is None: 56 | self.tokenizer.pad_token_id = self.tokenizer.eos_token_id 57 | self.vocab_size = self.tokenizer.vocab_size 58 | self.batch_size_per_gpu = batch_size # todo: adaptive batch size 59 | 60 | def prepare_for_inference(self): 61 | self.no_split_modules = self.model._no_split_modules 62 | self.model.to(self.dtype) 63 | max_memory = get_balanced_memory( 64 | self.model, 65 | no_split_module_classes=self.no_split_modules, 66 | dtype=self.dtype, 67 | ) 68 | device_map = infer_auto_device_map( 69 | self.model, 70 | no_split_module_classes=self.no_split_modules, 71 | dtype=self.dtype, 72 | max_memory=max_memory, 73 | ) 74 | print(device_map) 75 | dispatch_model(self.model, device_map=device_map) 76 | self.model.eval() 77 | 78 | @property 79 | def eot_token_id(self): 80 | # we use EOT because end of *text* is more accurate for what we're doing than end of *sentence* 81 | return self.tokenizer.eos_token_id 82 | 83 | @property 84 | def max_length(self): 85 | return self.model.config.max_sequence_length 86 | 87 | @property 88 | def max_gen_toks(self): 89 | return 256 90 | 91 | @property 92 | def batch_size(self): 93 | # TODO: fix multi-gpu 94 | return self.batch_size_per_gpu # * gpus 95 | 96 | @property 97 | def device(self): 98 | # TODO: fix multi-gpu 99 | return self._device 100 | 101 | def tok_encode(self, string: str): 102 | return self.tokenizer.encode(string, add_special_tokens=False) 103 | 104 | def tok_decode(self, tokens): 105 | return self.tokenizer.decode(tokens) 106 | 107 | def 
_model_call(self, inps, attention_mask=None): 108 | """ 109 | inps: a torch tensor of shape [batch, sequence] 110 | the size of sequence may vary from call to call 111 | 112 | returns: a torch tensor of shape [batch, sequence, vocab] with the 113 | logits returned from the model 114 | """ 115 | with torch.no_grad(): 116 | return self.model(inps, attention_mask=attention_mask)[0][:, :, :len(self.tokenizer)] 117 | 118 | def _model_generate(self, context, max_length, eos_token_id): 119 | return self.model.generate( 120 | context, max_length=max_length, eos_token_id=eos_token_id, do_sample=False 121 | ) 122 | -------------------------------------------------------------------------------- /lm_eval/tasks/arc.py: -------------------------------------------------------------------------------- 1 | """ 2 | Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge 3 | https://arxiv.org/pdf/1803.05457.pdf 4 | 5 | The ARC dataset consists of 7,787 science exam questions drawn from a variety 6 | of sources, including science questions provided under license by a research 7 | partner affiliated with AI2. These are text-only, English language exam questions 8 | that span several grade levels as indicated in the files. Each question has a 9 | multiple choice structure (typically 4 answer options). The questions are sorted 10 | into a Challenge Set of 2,590 “hard” questions (those that both a retrieval and 11 | a co-occurrence method fail to answer correctly) and an Easy Set of 5,197 questions. 12 | 13 | Homepage: https://allenai.org/data/arc 14 | """ 15 | import inspect 16 | from lm_eval.base import MultipleChoiceTask 17 | import lm_eval.datasets.ai2_arc.ai2_arc 18 | 19 | _CITATION = """ 20 | @article{Clark2018ThinkYH, 21 | title={Think you have Solved Question Answering? Try ARC, the AI2 Reasoning Challenge}, 22 | author={Peter Clark and Isaac Cowhey and Oren Etzioni and Tushar Khot and Ashish Sabharwal and Carissa Schoenick and Oyvind Tafjord}, 23 | journal={ArXiv}, 24 | year={2018}, 25 | volume={abs/1803.05457} 26 | } 27 | """ 28 | 29 | 30 | class ARCEasy(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.ai2_arc.ai2_arc) 33 | DATASET_NAME = "ARC-Easy" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | # NOTE: Some `doc["answerKey"]`s are in numeric string format being one 57 | # of {'1', '2', '3', '4', '5'}. We map them back to letters. 
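# (Hedged illustration, not taken from the dataset itself: a doc whose answerKey is "3"
# is remapped to "C" below, so it can be looked up in the ["A", "B", "C", "D", "E"] index.)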
58 | num_to_letter = {"1": "A", "2": "B", "3": "C", "4": "D", "5": "E"} 59 | doc["answerKey"] = num_to_letter.get(doc["answerKey"], doc["answerKey"]) 60 | out_doc = { 61 | "id": doc["id"], 62 | "query": "Question: " + doc["question"] + "\nAnswer:", 63 | "choices": doc["choices"]["text"], 64 | "gold": ["A", "B", "C", "D", "E"].index(doc["answerKey"]), 65 | } 66 | return out_doc 67 | 68 | def doc_to_text(self, doc): 69 | return doc["query"] 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["query"] 76 | 77 | 78 | class ARCChallenge(ARCEasy): 79 | DATASET_PATH = inspect.getfile(lm_eval.datasets.ai2_arc.ai2_arc) 80 | DATASET_NAME = "ARC-Challenge" 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/arithmetic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | A small battery of 10 tests that involve asking language models a simple arithmetic 6 | problem in natural language. 7 | 8 | Homepage: https://github.com/openai/gpt-3/tree/master/data 9 | """ 10 | import inspect 11 | import lm_eval.datasets.arithmetic.arithmetic 12 | from lm_eval.base import Task, rf 13 | from lm_eval.metrics import mean 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{NEURIPS2020_1457c0d6, 18 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 19 | booktitle = {Advances in Neural Information Processing Systems}, 20 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. 
Lin}, 21 | pages = {1877--1901}, 22 | publisher = {Curran Associates, Inc.}, 23 | title = {Language Models are Few-Shot Learners}, 24 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 25 | volume = {33}, 26 | year = {2020} 27 | } 28 | """ 29 | 30 | 31 | class Arithmetic(Task): 32 | VERSION = 0 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.arithmetic.arithmetic) 34 | 35 | def has_training_docs(self): 36 | return False 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return False 43 | 44 | def training_docs(self): 45 | return NotImplemented 46 | 47 | def validation_docs(self): 48 | return self.dataset["validation"] 49 | 50 | def test_docs(self): 51 | return NotImplemented 52 | 53 | def doc_to_text(self, doc): 54 | return doc["context"] 55 | 56 | def should_decontaminate(self): 57 | return True 58 | 59 | def doc_to_decontamination_query(self, doc): 60 | return doc["context"] 61 | 62 | def doc_to_target(self, doc): 63 | return doc["completion"] 64 | 65 | def construct_requests(self, doc, ctx): 66 | ll, is_prediction = rf.loglikelihood(ctx, doc["completion"]) 67 | return is_prediction 68 | 69 | def process_results(self, doc, results): 70 | (is_prediction,) = results 71 | return {"acc": is_prediction} 72 | 73 | def aggregation(self): 74 | return { 75 | "acc": mean, 76 | } 77 | 78 | def higher_is_better(self): 79 | return {"acc": True} 80 | 81 | 82 | class Arithmetic2DPlus(Arithmetic): 83 | DATASET_NAME = "arithmetic_2da" 84 | 85 | 86 | class Arithmetic2DMinus(Arithmetic): 87 | DATASET_NAME = "arithmetic_2ds" 88 | 89 | 90 | class Arithmetic3DPlus(Arithmetic): 91 | DATASET_NAME = "arithmetic_3da" 92 | 93 | 94 | class Arithmetic3DMinus(Arithmetic): 95 | DATASET_NAME = "arithmetic_3ds" 96 | 97 | 98 | class Arithmetic4DPlus(Arithmetic): 99 | DATASET_NAME = "arithmetic_4da" 100 | 101 | 102 | class Arithmetic4DMinus(Arithmetic): 103 | DATASET_NAME = "arithmetic_4ds" 104 | 105 | 106 | class Arithmetic5DPlus(Arithmetic): 107 | DATASET_NAME = "arithmetic_5da" 108 | 109 | 110 | class Arithmetic5DMinus(Arithmetic): 111 | DATASET_NAME = "arithmetic_5ds" 112 | 113 | 114 | class Arithmetic2DMultiplication(Arithmetic): 115 | DATASET_NAME = "arithmetic_2dm" 116 | 117 | 118 | class Arithmetic1DComposite(Arithmetic): 119 | DATASET_NAME = "arithmetic_1dc" 120 | -------------------------------------------------------------------------------- /lm_eval/tasks/asdiv.py: -------------------------------------------------------------------------------- 1 | """ 2 | ASDiv: A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers 3 | https://arxiv.org/abs/2106.15772 4 | 5 | ASDiv (Academia Sinica Diverse MWP Dataset) is a diverse (in terms of both language 6 | patterns and problem types) English math word problem (MWP) corpus for evaluating 7 | the capability of various MWP solvers. Existing MWP corpora for studying AI progress 8 | remain limited either in language usage patterns or in problem types. We thus present 9 | a new English MWP corpus with 2,305 MWPs that cover more text patterns and most problem 10 | types taught in elementary school. Each MWP is annotated with its problem type and grade 11 | level (for indicating the level of difficulty). 12 | 13 | NOTE: We currently ignore formulas for answer generation. 
14 | 15 | Homepage: https://github.com/chaochun/nlu-asdiv-dataset 16 | """ 17 | import inspect 18 | import lm_eval.datasets.asdiv.asdiv 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @misc{miao2021diverse, 25 | title={A Diverse Corpus for Evaluating and Developing English Math Word Problem Solvers}, 26 | author={Shen-Yun Miao and Chao-Chun Liang and Keh-Yih Su}, 27 | year={2021}, 28 | eprint={2106.15772}, 29 | archivePrefix={arXiv}, 30 | primaryClass={cs.AI} 31 | } 32 | """ 33 | 34 | 35 | class Asdiv(Task): 36 | VERSION = 0 37 | DATASET_PATH = inspect.getfile(lm_eval.datasets.asdiv.asdiv) 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return True 44 | 45 | def has_test_docs(self): 46 | return False 47 | 48 | def training_docs(self): 49 | raise NotImplementedError("This dataset has no training docs") 50 | 51 | def validation_docs(self): 52 | return self.dataset["validation"] 53 | 54 | def test_docs(self): 55 | raise NotImplementedError("This dataset has no test docs") 56 | 57 | def fewshot_context( 58 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 59 | ): 60 | assert num_fewshot == 0, "ASDiv is intended only for the zero-shot setting." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def doc_to_text(self, doc): 66 | # TODO: add solution-type 67 | return doc["body"] + "\n" + "Question:" + doc["question"] + "\n" + "Answer:" 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["body"] + " " + doc["question"] 74 | 75 | def doc_to_target(self, doc): 76 | # TODO: add formula 77 | 78 | answer = doc["answer"].split(" (")[0] 79 | return " " + answer 80 | 81 | def construct_requests(self, doc, ctx): 82 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 83 | return ll, is_greedy 84 | 85 | def process_results(self, doc, results): 86 | ll, is_greedy = results 87 | 88 | return {"acc": int(is_greedy)} 89 | 90 | def aggregation(self): 91 | return {"acc": mean} 92 | 93 | def higher_is_better(self): 94 | return {"acc": True} 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/gsm8k.py: -------------------------------------------------------------------------------- 1 | """ 2 | "Training Verifiers to Solve Math Word Problems" 3 | https://arxiv.org/abs/2110.14168 4 | 5 | State-of-the-art language models can match human performance on many tasks, but 6 | they still struggle to robustly perform multi-step mathematical reasoning. To 7 | diagnose the failures of current models and support research, we introduce GSM8K, 8 | a dataset of 8.5K high quality linguistically diverse grade school math word problems. 9 | We find that even the largest transformer models fail to achieve high test performance, 10 | despite the conceptual simplicity of this problem distribution. 11 | 12 | NOTE: See the official implementation of the task: 13 | https://github.com/openai/grade-school-math/blob/master/grade_school_math/calculator.py 14 | for how to make use of the dataset's calculator annotations in your language 15 | model's sample/generation function. 
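As a hedged illustration only (the example text below is made up, not quoted from the
dataset): solutions embed calculator annotations of the form <<expression=result>>,
e.g. "Tom pays 6 * 12 = <<6*12=72>>72 dollars. #### 72". A simple preprocessing hook
can strip the markers while keeping the human-readable text:

    import re

    CALC_RE = re.compile(r"<<([^=>]+)=([^>]*)>>")  # matches <<expr=result>> markers

    def strip_calculator_annotations(solution: str) -> str:
        # "... 6 * 12 = <<6*12=72>>72 dollars." -> "... 6 * 12 = 72 dollars."
        return CALC_RE.sub("", solution)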
16 | 17 | Homepage: https://github.com/openai/grade-school-math 18 | """ 19 | import re 20 | from lm_eval.base import Task, rf 21 | from lm_eval.metrics import mean 22 | 23 | 24 | _CITATION = """ 25 | @misc{cobbe2021training, 26 | title={Training Verifiers to Solve Math Word Problems}, 27 | author={Karl Cobbe and Vineet Kosaraju and Mohammad Bavarian and Jacob Hilton and Reiichiro Nakano and Christopher Hesse and John Schulman}, 28 | year={2021}, 29 | eprint={2110.14168}, 30 | archivePrefix={arXiv}, 31 | primaryClass={cs.LG} 32 | } 33 | """ 34 | 35 | 36 | ANS_RE = re.compile(r"#### (\-?[0-9\.\,]+)") 37 | INVALID_ANS = "[invalid]" 38 | 39 | 40 | class GradeSchoolMath8K(Task): 41 | VERSION = 0 42 | DATASET_PATH = "gsm8k" 43 | DATASET_NAME = "main" 44 | 45 | def has_training_docs(self): 46 | return True 47 | 48 | def has_validation_docs(self): 49 | return False 50 | 51 | def has_test_docs(self): 52 | return True 53 | 54 | def training_docs(self): 55 | return self.dataset["train"] 56 | 57 | def validation_docs(self): 58 | raise NotImplementedError 59 | 60 | def test_docs(self): 61 | return self.dataset["test"] 62 | 63 | def doc_to_text(self, doc): 64 | return "Question: " + doc["question"] + "\nAnswer:" 65 | 66 | def doc_to_target(self, doc): 67 | return " " + doc["answer"] 68 | 69 | def construct_requests(self, doc, ctx): 70 | """Uses RequestFactory to construct Requests and returns an iterable of 71 | Requests which will be sent to the LM. 72 | 73 | :param doc: 74 | The document as returned from training_docs, validation_docs, or test_docs. 75 | :param ctx: str 76 | The context string, generated by fewshot_context. This includes the natural 77 | language description, as well as the few shot examples, and the question 78 | part of the document for `doc`. 79 | """ 80 | # NOTE: The paper implements "verifiers" that assign a score to multiple 81 | # solutions and output the highest ranked solution. 82 | completion = rf.greedy_until(ctx, ["\n"]) 83 | return completion 84 | 85 | def _extract_answer(self, completion): 86 | match = ANS_RE.search(completion) 87 | if match: 88 | match_str = match.group(1).strip() 89 | match_str = match_str.replace(",", "") 90 | return match_str 91 | else: 92 | return INVALID_ANS 93 | 94 | def _is_correct(self, completion, answer): 95 | gold = self._extract_answer(answer) 96 | assert gold != INVALID_ANS, "No ground truth answer found in the document." 97 | return self._extract_answer(completion) == gold 98 | 99 | def process_results(self, doc, results): 100 | """Take a single document and the LM results and evaluates, returning a 101 | dict where keys are the names of submetrics and values are the values of 102 | the metric for that one document 103 | 104 | :param doc: 105 | The document as returned from training_docs, validation_docs, or test_docs. 106 | :param results: 107 | The results of the requests created in construct_requests. 
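Illustrative values only (not drawn from the dataset): for this task `results` holds
the single greedy completion, e.g. ["He pays 6 * 12 = 72 dollars. #### 72"], and it is
scored correct when the number extracted after "#### " matches the one in doc["answer"].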
108 | """ 109 | completion = results[0] 110 | answer = doc["answer"] 111 | return {"acc": self._is_correct(completion, answer)} 112 | 113 | def aggregation(self): 114 | """ 115 | :returns: {str: [float] -> float} 116 | A dictionary where keys are the names of submetrics and values are 117 | functions that aggregate a list of metrics 118 | """ 119 | return {"acc": mean} 120 | 121 | def higher_is_better(self): 122 | """ 123 | :returns: {str: bool} 124 | A dictionary where keys are the names of submetrics and values are 125 | whether a higher value of the submetric is better 126 | """ 127 | return {"acc": True} 128 | -------------------------------------------------------------------------------- /lm_eval/tasks/headqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering 3 | https://aclanthology.org/P19-1092.pdf 4 | 5 | HEAD-QA is a multi-choice HEAlthcare Dataset. The questions come from exams to 6 | access a specialized position in the Spanish healthcare system, and are challenging 7 | even for highly specialized humans. 8 | 9 | Homepage: https://aghie.github.io/head-qa/ 10 | """ 11 | import inspect 12 | import lm_eval.datasets.headqa.headqa 13 | from lm_eval.base import MultipleChoiceTask 14 | 15 | 16 | _CITATION = """ 17 | @misc{liu2020interpretable, 18 | title={Interpretable Multi-Step Reasoning with Knowledge Extraction on Complex Healthcare Question Answering}, 19 | author={Ye Liu and Shaika Chowdhury and Chenwei Zhang and Cornelia Caragea and Philip S. Yu}, 20 | year={2020}, 21 | eprint={2008.02434}, 22 | archivePrefix={arXiv}, 23 | primaryClass={cs.AI} 24 | } 25 | """ 26 | 27 | 28 | class HeadQABase(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.headqa.headqa) 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "id": doc["qid"], 55 | "query": "Question: " + doc["qtext"] + "\nAnswer:", 56 | "choices": [answer["atext"] for answer in doc["answers"]], 57 | "gold": int(doc["ra"]) - 1, 58 | } 59 | return out_doc 60 | 61 | def doc_to_text(self, doc): 62 | return doc["query"] 63 | 64 | def should_decontaminate(self): 65 | return True 66 | 67 | def doc_to_decontamination_query(self, doc): 68 | return doc["query"] 69 | 70 | 71 | class HeadQAEn(HeadQABase): 72 | DATASET_NAME = "en" 73 | 74 | 75 | class HeadQAEs(HeadQABase): 76 | DATASET_NAME = "es" 77 | 78 | 79 | # for backwards compatibility 80 | class HeadQAEsDeprecated(HeadQABase): 81 | DATASET_NAME = "es" 82 | 83 | def __init__(self): 84 | super().__init__() 85 | print( 86 | "WARNING: headqa is deprecated. Please use headqa_es or headqa_en instead. See https://github.com/EleutherAI/lm-evaluation-harness/pull/240 for more info." 
87 | ) 88 | -------------------------------------------------------------------------------- /lm_eval/tasks/hellaswag.py: -------------------------------------------------------------------------------- 1 | """ 2 | HellaSwag: Can a Machine Really Finish Your Sentence? 3 | https://arxiv.org/pdf/1905.07830.pdf 4 | 5 | Hellaswag is a commonsense inference challenge dataset. Though its questions are 6 | trivial for humans (>95% accuracy), state-of-the-art models struggle (<48%). This is 7 | achieved via Adversarial Filtering (AF), a data collection paradigm wherein a 8 | series of discriminators iteratively select an adversarial set of machine-generated 9 | wrong answers. AF proves to be surprisingly robust. The key insight is to scale up 10 | the length and complexity of the dataset examples towards a critical 'Goldilocks' 11 | zone wherein generated text is ridiculous to humans, yet often misclassified by 12 | state-of-the-art models. 13 | 14 | Homepage: https://rowanzellers.com/hellaswag/ 15 | """ 16 | import re 17 | import inspect 18 | from lm_eval.base import MultipleChoiceTask 19 | import lm_eval.datasets.hellaswag.hellaswag 20 | 21 | _CITATION = """ 22 | @inproceedings{zellers2019hellaswag, 23 | title={HellaSwag: Can a Machine Really Finish Your Sentence?}, 24 | author={Zellers, Rowan and Holtzman, Ari and Bisk, Yonatan and Farhadi, Ali and Choi, Yejin}, 25 | booktitle ={Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, 26 | year={2019} 27 | } 28 | """ 29 | 30 | 31 | class HellaSwag(MultipleChoiceTask): 32 | VERSION = 0 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.hellaswag.hellaswag) 34 | DATASET_NAME = "hellaswag" 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | if self._training_docs is None: 47 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 48 | return self._training_docs 49 | 50 | def validation_docs(self): 51 | return map(self._process_doc, self.dataset["validation"]) 52 | 53 | def _process_doc(self, doc): 54 | ctx = doc["ctx_a"] + " " + doc["ctx_b"].capitalize() 55 | out_doc = { 56 | "query": self.preprocess(doc["activity_label"] + ": " + ctx), 57 | "choices": [self.preprocess(ending) for ending in doc["endings"]], 58 | "gold": int(doc["label"]), 59 | } 60 | return out_doc 61 | 62 | @classmethod 63 | def preprocess(cls, text): 64 | text = text.strip() 65 | # NOTE: Brackets are artifacts of the WikiHow dataset portion of HellaSwag. 66 | text = text.replace(" [title]", ". ") 67 | text = re.sub("\\[.*?\\]", "", text) 68 | text = text.replace(" ", " ") 69 | return text 70 | 71 | def doc_to_text(self, doc): 72 | return doc["query"] 73 | 74 | def should_decontaminate(self): 75 | return True 76 | 77 | def doc_to_decontamination_query(self, doc): 78 | return doc["query"] 79 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 6 | understanding by means of a word prediction task. 
LAMBADA is a collection of narrative 7 | passages sharing the characteristic that human subjects are able to guess their last 8 | word if they are exposed to the whole passage, but not if they only see the last 9 | sentence preceding the target word. To succeed on LAMBADA, computational models 10 | cannot simply rely on local context, but must be able to keep track of information 11 | in the broader discourse. 12 | 13 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 14 | """ 15 | import inspect 16 | import lm_eval.datasets.lambada_openai.lambada_openai 17 | from lm_eval.base import Task, rf 18 | from lm_eval.metrics import mean, perplexity 19 | 20 | 21 | _CITATION = """ 22 | @misc{ 23 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 24 | title={The LAMBADA dataset}, 25 | DOI={10.5281/zenodo.2630551}, 26 | publisher={Zenodo}, 27 | year={2016}, 28 | month={Aug} 29 | } 30 | """ 31 | 32 | 33 | class LambadaBase(Task): 34 | VERSION = None 35 | 36 | def training_docs(self): 37 | if self.has_training_docs(): 38 | return self.dataset["train"] 39 | 40 | def validation_docs(self): 41 | if self.has_validation_docs(): 42 | return self.dataset["validation"] 43 | 44 | def test_docs(self): 45 | if self.has_test_docs(): 46 | return self.dataset["test"] 47 | 48 | def doc_to_text(self, doc): 49 | return doc["text"].rsplit(" ", 1)[0] 50 | 51 | def should_decontaminate(self): 52 | return True 53 | 54 | def doc_to_decontamination_query(self, doc): 55 | return doc["text"] 56 | 57 | def doc_to_target(self, doc): 58 | return " " + doc["text"].rsplit(" ", 1)[1] 59 | 60 | def construct_requests(self, doc, ctx): 61 | ll, is_greedy = rf.loglikelihood(ctx, self.doc_to_target(doc)) 62 | 63 | return ll, is_greedy 64 | 65 | def process_results(self, doc, results): 66 | ll, is_greedy = results 67 | 68 | return {"ppl": ll, "acc": int(is_greedy)} 69 | 70 | def aggregation(self): 71 | return {"ppl": perplexity, "acc": mean} 72 | 73 | def higher_is_better(self): 74 | return {"ppl": False, "acc": True} 75 | 76 | 77 | class LambadaStandard(LambadaBase): 78 | """The LAMBADA task using the standard original LAMBADA dataset.""" 79 | 80 | VERSION = 0 81 | DATASET_PATH = "lambada" 82 | 83 | def has_training_docs(self): 84 | return False 85 | 86 | def has_validation_docs(self): 87 | return True 88 | 89 | def has_test_docs(self): 90 | return True 91 | 92 | 93 | class LambadaOpenAI(LambadaBase): 94 | """The LAMBADA task using the LAMBADA OpenAI dataset, a modified version of the 95 | original LAMBADA dataset created by OpenAI for evaluating their GPT-2 model. 96 | 97 | Reference: https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 98 | """ 99 | 100 | VERSION = 0 101 | DATASET_PATH = inspect.getfile(lm_eval.datasets.lambada_openai.lambada_openai) 102 | 103 | def has_training_docs(self): 104 | return False 105 | 106 | def has_validation_docs(self): 107 | return True 108 | 109 | def has_test_docs(self): 110 | return False 111 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_cloze.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | Cloze-style LAMBADA dataset. 
6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | """ 16 | from lm_eval.tasks.lambada import LambadaOpenAI, LambadaStandard 17 | 18 | 19 | _CITATION = """ 20 | @misc{ 21 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 22 | title={The LAMBADA dataset}, 23 | DOI={10.5281/zenodo.2630551}, 24 | publisher={Zenodo}, 25 | year={2016}, 26 | month={Aug} 27 | } 28 | """ 29 | 30 | 31 | class LambadaStandardCloze(LambadaStandard): 32 | """Cloze-style LambadaStandard.""" 33 | 34 | VERSION = 0 35 | 36 | def doc_to_text(self, doc): 37 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 38 | 39 | def should_decontaminate(self): 40 | return True 41 | 42 | def doc_to_decontamination_query(self, doc): 43 | return doc["text"] 44 | 45 | def doc_to_target(self, doc): 46 | return " " + doc["text"].rsplit(" ", 1)[1] 47 | 48 | 49 | class LambadaOpenAICloze(LambadaOpenAI): 50 | """Cloze-style LambadaOpenAI.""" 51 | 52 | VERSION = 0 53 | 54 | def doc_to_text(self, doc): 55 | return doc["text"].rsplit(" ", 1)[0] + " ____. ->" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["text"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["text"].rsplit(" ", 1)[1] 65 | -------------------------------------------------------------------------------- /lm_eval/tasks/lambada_multilingual.py: -------------------------------------------------------------------------------- 1 | """ 2 | The LAMBADA (OpenAI) dataset: Word prediction requiring a broad discourse context∗ 3 | https://arxiv.org/pdf/1606.06031.pdf 4 | 5 | The LAMBADA OpenAI dataset machine-translated to other languages. 6 | LAMBADA is a dataset to evaluate the capabilities of computational models for text 7 | understanding by means of a word prediction task. LAMBADA is a collection of narrative 8 | passages sharing the characteristic that human subjects are able to guess their last 9 | word if they are exposed to the whole passage, but not if they only see the last 10 | sentence preceding the target word. To succeed on LAMBADA, computational models 11 | cannot simply rely on local context, but must be able to keep track of information 12 | in the broader discourse. 
13 | 14 | Homepage: https://zenodo.org/record/2630551#.X4Xzn5NKjUI 15 | 16 | Reference (OpenAI): https://github.com/openai/gpt-2/issues/131#issuecomment-497136199 17 | """ 18 | from .lambada import LambadaOpenAI 19 | 20 | 21 | _CITATION = """ 22 | @misc{ 23 | author={Paperno, Denis and Kruszewski, Germán and Lazaridou, Angeliki and Pham, Quan Ngoc and Bernardi, Raffaella and Pezzelle, Sandro and Baroni, Marco and Boleda, Gemma and Fernández, Raquel}, 24 | title={The LAMBADA dataset}, 25 | DOI={10.5281/zenodo.2630551}, 26 | publisher={Zenodo}, 27 | year={2016}, 28 | month={Aug} 29 | } 30 | """ 31 | 32 | 33 | class LambadaOpenAIMultilingualEnglish(LambadaOpenAI): 34 | VERSION = 0 35 | DATASET_NAME = "en" 36 | 37 | 38 | class LambadaOpenAIMultilingualFrench(LambadaOpenAI): 39 | VERSION = 0 40 | DATASET_NAME = "fr" 41 | 42 | 43 | class LambadaOpenAIMultilingualGerman(LambadaOpenAI): 44 | VERSION = 0 45 | DATASET_NAME = "de" 46 | 47 | 48 | class LambadaOpenAIMultilingualItalian(LambadaOpenAI): 49 | VERSION = 0 50 | DATASET_NAME = "it" 51 | 52 | 53 | class LambadaOpenAIMultilingualSpanish(LambadaOpenAI): 54 | VERSION = 0 55 | DATASET_NAME = "es" 56 | 57 | 58 | LANG_CLASSES = [ 59 | LambadaOpenAIMultilingualEnglish, 60 | LambadaOpenAIMultilingualFrench, 61 | LambadaOpenAIMultilingualGerman, 62 | LambadaOpenAIMultilingualItalian, 63 | LambadaOpenAIMultilingualSpanish, 64 | ] 65 | 66 | 67 | def construct_tasks(): 68 | tasks = {} 69 | for lang_class in LANG_CLASSES: 70 | tasks[f"lambada_openai_mt_{lang_class.DATASET_NAME}"] = lang_class 71 | return tasks 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/logiqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning 3 | https://arxiv.org/pdf/2007.08124.pdf 4 | 5 | LogiQA is a dataset for testing human logical reasoning. It consists of 8,678 QA 6 | instances, covering multiple types of deductive reasoning. Results show that state- 7 | of-the-art neural models perform by far worse than human ceiling. The dataset can 8 | also serve as a benchmark for reinvestigating logical AI under the deep learning 9 | NLP setting. 
10 | 11 | Homepage: https://github.com/lgw863/LogiQA-dataset 12 | """ 13 | import inspect 14 | import lm_eval.datasets.logiqa.logiqa 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @misc{liu2020logiqa, 20 | title={LogiQA: A Challenge Dataset for Machine Reading Comprehension with Logical Reasoning}, 21 | author={Jian Liu and Leyang Cui and Hanmeng Liu and Dandan Huang and Yile Wang and Yue Zhang}, 22 | year={2020}, 23 | eprint={2007.08124}, 24 | archivePrefix={arXiv}, 25 | primaryClass={cs.CL} 26 | } 27 | """ 28 | 29 | 30 | class LogiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.logiqa.logiqa) 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | def format_example(doc, choices): 57 | """ 58 | Passage: 59 | Question: 60 | Choices: 61 | A. 62 | B. 63 | C. 64 | D. 65 | Answer: 66 | """ 67 | prompt = "Passage: " + doc["context"] + "\n" 68 | prompt += "Question: " + doc["question"] + "\nChoices:\n" 69 | for choice, option in zip(choices, doc["options"]): 70 | prompt += f"{choice.upper()}. {option}\n" 71 | prompt += "Answer:" 72 | return prompt 73 | 74 | choices = ["a", "b", "c", "d"] 75 | return { 76 | "passage": doc["context"], # Used for decontamination 77 | "query": format_example(doc, choices), 78 | "choices": doc["options"], 79 | "gold": choices.index(doc["label"]), 80 | } 81 | 82 | def doc_to_text(self, doc): 83 | return doc["query"] 84 | 85 | def should_decontaminate(self): 86 | return True 87 | 88 | def doc_to_decontamination_query(self, doc): 89 | return doc["passage"] 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/mathqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms 3 | https://arxiv.org/pdf/1905.13319.pdf 4 | 5 | MathQA is a large-scale dataset of 37k English multiple-choice math word problems 6 | covering multiple math domain categories by modeling operation programs corresponding 7 | to word problems in the AQuA dataset (Ling et al., 2017). 
8 | 9 | Homepage: https://math-qa.github.io/math-QA/ 10 | """ 11 | import re 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @misc{amini2019mathqa, 17 | title={MathQA: Towards Interpretable Math Word Problem Solving with Operation-Based Formalisms}, 18 | author={Aida Amini and Saadia Gabriel and Peter Lin and Rik Koncel-Kedziorski and Yejin Choi and Hannaneh Hajishirzi}, 19 | year={2019}, 20 | eprint={1905.13319}, 21 | archivePrefix={arXiv}, 22 | primaryClass={cs.CL} 23 | } 24 | """ 25 | 26 | 27 | class MathQA(MultipleChoiceTask): 28 | VERSION = 0 29 | DATASET_PATH = "math_qa" 30 | DATASET_NAME = None 31 | 32 | def has_training_docs(self): 33 | return True 34 | 35 | def has_validation_docs(self): 36 | return True 37 | 38 | def has_test_docs(self): 39 | return True 40 | 41 | def training_docs(self): 42 | if self._training_docs is None: 43 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 44 | return self._training_docs 45 | 46 | def validation_docs(self): 47 | return map(self._process_doc, self.dataset["validation"]) 48 | 49 | def test_docs(self): 50 | return map(self._process_doc, self.dataset["test"]) 51 | 52 | def _process_doc(self, doc): 53 | answer_idx = ["a", "b", "c", "d", "e"].index(doc["correct"]) 54 | choices = [ 55 | c[4:].rstrip(" ,") 56 | for c in re.findall(r"[abcd] \) .*?, |e \) .*?$", doc["options"]) 57 | ] 58 | 59 | out_doc = { 60 | "query": "Question: " + doc["Problem"] + "\nAnswer:", 61 | "choices": choices, 62 | "gold": answer_idx, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return doc["query"] 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/mutual.py: -------------------------------------------------------------------------------- 1 | """ 2 | MuTual: A Dataset for Multi-Turn Dialogue Reasoning 3 | https://www.aclweb.org/anthology/2020.acl-main.130/ 4 | 5 | MuTual is a retrieval-based dataset for multi-turn dialogue reasoning, which is 6 | modified from Chinese high school English listening comprehension test data. 
7 | 8 | Homepage: https://github.com/Nealcly/MuTual 9 | """ 10 | import numpy as np 11 | import inspect 12 | import lm_eval.datasets.mutual.mutual 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{mutual, 19 | title = "MuTual: A Dataset for Multi-Turn Dialogue Reasoning", 20 | author = "Cui, Leyang and Wu, Yu and Liu, Shujie and Zhang, Yue and Zhou, Ming" , 21 | booktitle = "Proceedings of the 58th Conference of the Association for Computational Linguistics", 22 | year = "2020", 23 | publisher = "Association for Computational Linguistics", 24 | } 25 | """ 26 | 27 | 28 | class MuTualBase(Task): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.mutual.mutual) 31 | DATASET_NAME = None 32 | CHOICES = ["A", "B", "C", "D"] 33 | 34 | def has_training_docs(self): 35 | return True 36 | 37 | def has_validation_docs(self): 38 | return True 39 | 40 | def has_test_docs(self): 41 | return False 42 | 43 | def training_docs(self): 44 | return self.dataset["train"] 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def test_docs(self): 50 | return NotImplemented 51 | 52 | def doc_to_text(self, doc): 53 | return self.detokenize(doc["article"]) 54 | 55 | def should_decontaminate(self): 56 | return True 57 | 58 | def doc_to_decontamination_query(self, doc): 59 | return doc["article"] 60 | 61 | def doc_to_target(self, doc): 62 | return " " + self.detokenize(doc["options"][self.CHOICES.index(doc["answers"])]) 63 | 64 | def construct_requests(self, doc, ctx): 65 | lls = [] 66 | for option in doc["options"]: 67 | lls.append(rf.loglikelihood(ctx, f" {self.detokenize(option)}")[0]) 68 | return lls 69 | 70 | def detokenize(self, text): 71 | text = text.replace(" '", "'") 72 | text = text.replace(" \n", "\n") 73 | text = text.replace("\n ", "\n") 74 | text = text.replace(" n't", "n't") 75 | text = text.replace("`` ", '"') 76 | text = text.replace("''", '"') 77 | # punctuation 78 | text = text.replace(" :", ":") 79 | text = text.replace(" ;", ";") 80 | text = text.replace(" !", "!") 81 | text = text.replace(" ?", "?") 82 | text = text.replace(" ,", ",") 83 | text = text.replace(" .", ".") 84 | return text 85 | 86 | def process_results(self, doc, results): 87 | gold = self.CHOICES.index(doc["answers"]) 88 | r4_1 = np.argmax(results) == gold # r4_1 = accuracy 89 | ranks = sorted(results, reverse=True) 90 | r4_2 = (ranks.index(results[gold]) == 1) + r4_1 91 | mrr = 1.0 / (ranks.index(results[gold]) + 1) # `+ 1` for index offset 92 | return {"r@1": r4_1, "r@2": r4_2, "mrr": mrr} 93 | 94 | def aggregation(self): 95 | return {"r@1": mean, "r@2": mean, "mrr": mean} 96 | 97 | def higher_is_better(self): 98 | return {"r@1": True, "r@2": True, "mrr": True} 99 | 100 | 101 | class MuTual(MuTualBase): 102 | DATASET_NAME = "mutual" 103 | 104 | 105 | class MuTualPlus(MuTualBase): 106 | DATASET_NAME = "mutual_plus" 107 | -------------------------------------------------------------------------------- /lm_eval/tasks/openbookqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering 3 | https://arxiv.org/pdf/1809.02789.pdf 4 | 5 | OpenBookQA is a question-answering dataset modeled after open book exams for 6 | assessing human understanding of a subject. 
It consists of 5,957 multiple-choice 7 | elementary-level science questions (4,957 train, 500 dev, 500 test), which probe 8 | the understanding of a small “book” of 1,326 core science facts and the application 9 | of these facts to novel situations. For training, the dataset includes a mapping 10 | from each question to the core science fact it was designed to probe. Answering 11 | OpenBookQA questions requires additional broad common knowledge, not contained 12 | in the book. The questions, by design, are answered incorrectly by both a retrieval- 13 | based algorithm and a word co-occurrence algorithm. 14 | 15 | Homepage: https://allenai.org/data/open-book-qa 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{OpenBookQA2018, 22 | title={Can a Suit of Armor Conduct Electricity? A New Dataset for Open Book Question Answering}, 23 | author={Todor Mihaylov and Peter Clark and Tushar Khot and Ashish Sabharwal}, 24 | booktitle={EMNLP}, 25 | year={2018} 26 | } 27 | """ 28 | 29 | 30 | class OpenBookQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = "openbookqa" 33 | DATASET_NAME = "main" 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return True 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def _process_doc(self, doc): 56 | out_doc = { 57 | "id": doc["id"], 58 | "query": doc["question_stem"], 59 | "choices": doc["choices"]["text"], 60 | "gold": ["A", "B", "C", "D"].index(doc["answerKey"].strip()), 61 | } 62 | return out_doc 63 | 64 | def doc_to_text(self, doc): 65 | return doc["query"] 66 | 67 | def should_decontaminate(self): 68 | return True 69 | 70 | def doc_to_decontamination_query(self, doc): 71 | return doc["query"] 72 | -------------------------------------------------------------------------------- /lm_eval/tasks/pile.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Pile: An 800GB Dataset of Diverse Text for Language Modeling 3 | https://arxiv.org/pdf/2101.00027.pdf 4 | 5 | The Pile is a 825 GiB diverse, open source language modelling data set that consists 6 | of 22 smaller, high-quality datasets combined together. To score well on Pile 7 | BPB (bits per byte), a model must be able to understand many disparate domains 8 | including books, github repositories, webpages, chat logs, and medical, physics, 9 | math, computer science, and philosophy papers. 
10 | 11 | Homepage: https://pile.eleuther.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.pile.pile 15 | from lm_eval.base import PerplexityTask 16 | 17 | 18 | _CITATION = """ 19 | @article{pile, 20 | title={The {P}ile: An 800GB Dataset of Diverse Text for Language Modeling}, 21 | author={Gao, Leo and Biderman, Stella and Black, Sid and Golding, Laurence and Hoppe, Travis and Foster, Charles and Phang, Jason and He, Horace and Thite, Anish and Nabeshima, Noa and Presser, Shawn and Leahy, Connor}, 22 | journal={arXiv preprint arXiv:2101.00027}, 23 | year={2020} 24 | } 25 | """ 26 | 27 | 28 | class PilePerplexityTask(PerplexityTask): 29 | VERSION = 1 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.pile.pile) 31 | DATASET_NAME = None 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def validation_docs(self): 40 | for doc in self.dataset["validation"]: 41 | yield doc["text"] 42 | 43 | def test_docs(self): 44 | for doc in self.dataset["test"]: 45 | yield doc["text"] 46 | 47 | 48 | class PileArxiv(PilePerplexityTask): 49 | DATASET_NAME = "pile_arxiv" 50 | 51 | 52 | class PileBooks3(PilePerplexityTask): 53 | DATASET_NAME = "pile_books3" 54 | 55 | 56 | class PileBookCorpus2(PilePerplexityTask): 57 | DATASET_NAME = "pile_bookcorpus2" 58 | 59 | 60 | class PileDmMathematics(PilePerplexityTask): 61 | DATASET_NAME = "pile_dm-mathematics" 62 | 63 | 64 | class PileEnron(PilePerplexityTask): 65 | DATASET_NAME = "pile_enron" 66 | 67 | 68 | class PileEuroparl(PilePerplexityTask): 69 | DATASET_NAME = "pile_europarl" 70 | 71 | 72 | class PileFreeLaw(PilePerplexityTask): 73 | DATASET_NAME = "pile_freelaw" 74 | 75 | 76 | class PileGithub(PilePerplexityTask): 77 | DATASET_NAME = "pile_github" 78 | 79 | 80 | class PileGutenberg(PilePerplexityTask): 81 | DATASET_NAME = "pile_gutenberg" 82 | 83 | 84 | class PileHackernews(PilePerplexityTask): 85 | DATASET_NAME = "pile_hackernews" 86 | 87 | 88 | class PileNIHExporter(PilePerplexityTask): 89 | DATASET_NAME = "pile_nih-exporter" 90 | 91 | 92 | class PileOpenSubtitles(PilePerplexityTask): 93 | DATASET_NAME = "pile_opensubtitles" 94 | 95 | 96 | class PileOpenWebText2(PilePerplexityTask): 97 | DATASET_NAME = "pile_openwebtext2" 98 | 99 | 100 | class PilePhilPapers(PilePerplexityTask): 101 | DATASET_NAME = "pile_philpapers" 102 | 103 | 104 | class PilePileCc(PilePerplexityTask): 105 | DATASET_NAME = "pile_pile-cc" 106 | 107 | 108 | class PilePubmedAbstracts(PilePerplexityTask): 109 | DATASET_NAME = "pile_pubmed-abstracts" 110 | 111 | 112 | class PilePubmedCentral(PilePerplexityTask): 113 | DATASET_NAME = "pile_pubmed-central" 114 | 115 | 116 | class PileStackExchange(PilePerplexityTask): 117 | DATASET_NAME = "pile_stackexchange" 118 | 119 | 120 | class PileUspto(PilePerplexityTask): 121 | DATASET_NAME = "pile_upsto" 122 | 123 | 124 | class PileUbuntuIrc(PilePerplexityTask): 125 | DATASET_NAME = "pile_ubuntu-irc" 126 | 127 | 128 | class PileWikipedia(PilePerplexityTask): 129 | DATASET_NAME = "pile_wikipedia" 130 | 131 | 132 | class PileYoutubeSubtitles(PilePerplexityTask): 133 | DATASET_NAME = "pile_youtubesubtitles" 134 | -------------------------------------------------------------------------------- /lm_eval/tasks/piqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PIQA: Reasoning about Physical Commonsense in Natural Language 3 | https://arxiv.org/pdf/1911.11641.pdf 4 | 5 | Physical Interaction: Question Answering (PIQA) is a 
physical commonsense 6 | reasoning and a corresponding benchmark dataset. PIQA was designed to investigate 7 | the physical knowledge of existing models. To what extent are current approaches 8 | actually learning about the world? 9 | 10 | Homepage: https://yonatanbisk.com/piqa/ 11 | """ 12 | import inspect 13 | from lm_eval.base import MultipleChoiceTask 14 | import lm_eval.datasets.piqa.piqa 15 | 16 | _CITATION = """ 17 | @inproceedings{Bisk2020, 18 | author = {Yonatan Bisk and Rowan Zellers and 19 | Ronan Le Bras and Jianfeng Gao 20 | and Yejin Choi}, 21 | title = {PIQA: Reasoning about Physical Commonsense in 22 | Natural Language}, 23 | booktitle = {Thirty-Fourth AAAI Conference on 24 | Artificial Intelligence}, 25 | year = {2020}, 26 | } 27 | """ 28 | 29 | 30 | class PiQA(MultipleChoiceTask): 31 | VERSION = 0 32 | DATASET_PATH = inspect.getfile(lm_eval.datasets.piqa.piqa) 33 | DATASET_NAME = None 34 | 35 | def has_training_docs(self): 36 | return True 37 | 38 | def has_validation_docs(self): 39 | return True 40 | 41 | def has_test_docs(self): 42 | return False 43 | 44 | def training_docs(self): 45 | if self._training_docs is None: 46 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 47 | return self._training_docs 48 | 49 | def validation_docs(self): 50 | return map(self._process_doc, self.dataset["validation"]) 51 | 52 | def _process_doc(self, doc): 53 | out_doc = { 54 | "goal": doc["goal"], 55 | "choices": [doc["sol1"], doc["sol2"]], 56 | "gold": doc["label"], 57 | } 58 | return out_doc 59 | 60 | def doc_to_text(self, doc): 61 | return "Question: " + doc["goal"] + "\nAnswer:" 62 | 63 | def should_decontaminate(self): 64 | return True 65 | 66 | def doc_to_decontamination_query(self, doc): 67 | return doc["goal"] 68 | -------------------------------------------------------------------------------- /lm_eval/tasks/prost.py: -------------------------------------------------------------------------------- 1 | """ 2 | PROST: Physical Reasoning about Objects Through Space and Time 3 | https://arxiv.org/pdf/2106.03634.pdf 4 | 5 | PROST, Physical Reasoning about Objects Through Space and Time, is a dataset 6 | consisting of 18,736 multiple-choice questions made from 14 manually curated 7 | templates, covering 10 physical reasoning concepts. All questions are designed 8 | to probe both causal and masked language models in a zero-shot setting. 9 | 10 | NOTE: PROST is limited to the zero-shot setting to adhere to authors' intentions 11 | as discussed in section 7 of the paper: "We hope that the community will use 12 | this dataset in the intended way: in a zero-shot setting to probe models which 13 | have been trained on data not specifically collected to succeed on PROST." 
14 | 15 | Homepage: https://github.com/nala-cub/prost 16 | """ 17 | from lm_eval.base import MultipleChoiceTask 18 | 19 | 20 | _CITATION = """ 21 | @inproceedings{aroca-ouellette-etal-2021-prost, 22 | title = "{PROST}: {P}hysical Reasoning about Objects through Space and Time", 23 | author = "Aroca-Ouellette, St{\'e}phane and 24 | Paik, Cory and 25 | Roncone, Alessandro and 26 | Kann, Katharina", 27 | booktitle = "Findings of the Association for Computational Linguistics: ACL-IJCNLP 2021", 28 | month = aug, 29 | year = "2021", 30 | address = "Online", 31 | publisher = "Association for Computational Linguistics", 32 | url = "https://aclanthology.org/2021.findings-acl.404", 33 | pages = "4597--4608", 34 | } 35 | """ 36 | 37 | 38 | class PROST(MultipleChoiceTask): 39 | VERSION = 0 40 | DATASET_PATH = "corypaik/prost" 41 | DATASET_NAME = None 42 | 43 | def has_training_docs(self): 44 | return False 45 | 46 | def has_validation_docs(self): 47 | return False 48 | 49 | def has_test_docs(self): 50 | return True 51 | 52 | def test_docs(self): 53 | return map(self._process_doc, self.dataset["test"]) 54 | 55 | def fewshot_context( 56 | self, doc, num_fewshot, provide_description=None, rnd=None, description=None 57 | ): 58 | assert ( 59 | num_fewshot == 0 60 | ), "PROST is designed to probe models in a zero-shot fashion only." 61 | return super().fewshot_context( 62 | doc=doc, num_fewshot=num_fewshot, rnd=rnd, description=description 63 | ) 64 | 65 | def _process_doc(self, doc): 66 | out_doc = { 67 | "query": f"{doc['context']}\nQuestion: {doc['ex_question']}\nAnswer:", 68 | "choices": [doc["A"], doc["B"], doc["C"], doc["D"]], 69 | "gold": doc["label"], 70 | } 71 | return out_doc 72 | 73 | def doc_to_text(self, doc): 74 | return doc["query"] 75 | 76 | def should_decontaminate(self): 77 | return True 78 | 79 | def doc_to_decontamination_query(self, doc): 80 | return doc["query"] 81 | -------------------------------------------------------------------------------- /lm_eval/tasks/pubmedqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | PubMedQA: A Dataset for Biomedical Research Question Answering 3 | https://arxiv.org/pdf/1909.06146.pdf 4 | 5 | PubMedQA is a novel biomedical question answering (QA) dataset collected from 6 | PubMed abstracts. The task of PubMedQA is to answer research questions with 7 | yes/no/maybe (e.g.: Do preoperative statins reduce atrial fibrillation after 8 | coronary artery bypass grafting?) using the corresponding abstracts. PubMedQA 9 | has 1k expert-annotated, 61.2k unlabeled and 211.3k artificially generated QA 10 | instances. Each PubMedQA instance is composed of (1) a question which is either 11 | an existing research article title or derived from one, (2) a context which is 12 | the corresponding abstract without its conclusion, (3) a long answer, which is 13 | the conclusion of the abstract and, presumably, answers the research question, 14 | and (4) a yes/no/maybe answer which summarizes the conclusion. 
15 | 16 | Homepage: https://pubmedqa.github.io/ 17 | """ 18 | import numpy as np 19 | from lm_eval.base import rf, Task 20 | from lm_eval.metrics import mean 21 | 22 | 23 | _CITATION = """ 24 | @inproceedings{jin2019pubmedqa, 25 | title={PubMedQA: A Dataset for Biomedical Research Question Answering}, 26 | author={Jin, Qiao and Dhingra, Bhuwan and Liu, Zhengping and Cohen, William and Lu, Xinghua}, 27 | booktitle={Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, 28 | pages={2567--2577}, 29 | year={2019} 30 | } 31 | """ 32 | 33 | 34 | class Pubmed_QA(Task): 35 | VERSION = 0 36 | DATASET_PATH = "pubmed_qa" 37 | DATASET_NAME = "pqa_labeled" 38 | 39 | def has_training_docs(self): 40 | return False 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def test_docs(self): 49 | if self.has_test_docs(): 50 | # HF is labelled as train but its really just for testing 51 | return self.dataset["train"] 52 | 53 | def doc_to_text(self, doc): 54 | ctxs = "\n".join(doc["context"]["contexts"]) 55 | return "Abstract: {}\nQuestion: {}\nAnswer:".format( 56 | ctxs, doc["question"], doc["final_decision"] 57 | ) 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] + " " + "\n".join(doc["context"]["contexts"]) 64 | 65 | def doc_to_target(self, doc): 66 | return " {}".format(doc["final_decision"]) 67 | 68 | def construct_requests(self, doc, ctx): 69 | """Uses RequestFactory to construct Requests and returns 70 | an iterable of Requests which will be sent to the LM. 71 | """ 72 | ll_yes, _ = rf.loglikelihood(ctx, " yes") 73 | ll_no, _ = rf.loglikelihood(ctx, " no") 74 | ll_maybe, _ = rf.loglikelihood(ctx, " maybe") 75 | return ll_yes, ll_no, ll_maybe 76 | 77 | def process_results(self, doc, results): 78 | gold = doc["final_decision"] 79 | ll_yes, ll_no, ll_maybe = results 80 | pred = np.argmax(results) 81 | return { 82 | "acc": ["yes", "no", "maybe"][pred] == gold, 83 | } 84 | 85 | def aggregation(self): 86 | return {"acc": mean} 87 | 88 | def higher_is_better(self): 89 | return {"acc": True} 90 | -------------------------------------------------------------------------------- /lm_eval/tasks/qa4mre.py: -------------------------------------------------------------------------------- 1 | """ 2 | QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation 3 | https://www.cs.cmu.edu/~./hovy/papers/13CLEF-QA4MRE.pdf 4 | 5 | The (English only) QA4MRE challenge which was run as a Lab at CLEF 2011-2013. 6 | The main objective of this exercise is to develop a methodology for evaluating 7 | Machine Reading systems through Question Answering and Reading Comprehension 8 | Tests. Systems should be able to extract knowledge from large volumes of text 9 | and use this knowledge to answer questions. Four different tasks have been 10 | organized during these years: Main Task, Processing Modality and Negation for 11 | Machine Reading, Machine Reading of Biomedical Texts about Alzheimer's disease, 12 | and Entrance Exam. 
13 | 14 | Homepage: http://nlp.uned.es/clef-qa/repository/qa4mre.php 15 | """ 16 | from lm_eval.base import MultipleChoiceTask 17 | 18 | 19 | _CITATION = """ 20 | @inproceedings{Peas2013QA4MRE2O, 21 | title={QA4MRE 2011-2013: Overview of Question Answering for Machine Reading Evaluation}, 22 | author={Anselmo Pe{\~n}as and Eduard H. Hovy and Pamela Forner and {\'A}lvaro Rodrigo and Richard F. E. Sutcliffe and Roser Morante}, 23 | booktitle={CLEF}, 24 | year={2013} 25 | } 26 | """ # noqa: W605 27 | 28 | 29 | class QA4MRE(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = "qa4mre" 32 | DATASET_NAME = None 33 | 34 | def has_training_docs(self): 35 | return False 36 | 37 | def has_validation_docs(self): 38 | return False 39 | 40 | def has_test_docs(self): 41 | return True 42 | 43 | def test_docs(self): 44 | # `qa4mre` only has train data so we use it for the test docs. 45 | return map(self._process_doc, self.dataset["train"]) 46 | 47 | def _process_doc(self, doc): 48 | choices = doc["answer_options"]["answer_str"] 49 | out_doc = { 50 | "source": doc["document_str"].strip().replace("'", "'"), 51 | "query": doc["question_str"], 52 | "choices": choices, 53 | "gold": int(doc["correct_answer_id"]) - 1, 54 | } 55 | return out_doc 56 | 57 | def doc_to_text(self, doc): 58 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]) 59 | 60 | def should_decontaminate(self): 61 | return True 62 | 63 | def doc_to_decontamination_query(self, doc): 64 | return doc["source"] + " " + doc["query"] 65 | 66 | 67 | class QA4MRE_2011(QA4MRE): 68 | DATASET_NAME = "2011.main.EN" 69 | 70 | 71 | class QA4MRE_2012(QA4MRE): 72 | DATASET_NAME = "2012.main.EN" 73 | 74 | 75 | class QA4MRE_2013(QA4MRE): 76 | DATASET_NAME = "2013.main.EN" 77 | -------------------------------------------------------------------------------- /lm_eval/tasks/quac.py: -------------------------------------------------------------------------------- 1 | """ 2 | QuAC: Question Answering in Context 3 | https://arxiv.org/abs/1808.07036 4 | 5 | Question Answering in Context (QuAC) is a dataset for modeling, understanding, and 6 | participating in information seeking dialog. Data instances consist of an interactive 7 | dialog between two crowd workers: (1) a student who poses a sequence of freeform 8 | questions to learn as much as possible about a hidden Wikipedia text, and (2) 9 | a teacher who answers the questions by providing short excerpts (spans) from the text. 
10 | 11 | Homepage: https://quac.ai/ 12 | """ 13 | import inspect 14 | import lm_eval.datasets.quac.quac 15 | from lm_eval.base import Task 16 | 17 | 18 | _CITATION = """ 19 | @article{choi2018quac, 20 | title={Quac: Question answering in context}, 21 | author={Choi, Eunsol and He, He and Iyyer, Mohit and Yatskar, Mark and Yih, Wen-tau and Choi, Yejin and Liang, Percy and Zettlemoyer, Luke}, 22 | journal={arXiv preprint arXiv:1808.07036}, 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class QuAC(Task): 29 | VERSION = 0 30 | DATASET_PATH = inspect.getfile(lm_eval.datasets.quac.quac) 31 | DATASET_NAME = None 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def test_docs(self): 51 | raise NotImplementedError("QuAC has no test docs.") 52 | 53 | def _process_doc(self, doc): 54 | doc["title"] = doc["title"] + " - " + doc["section_title"] 55 | return doc 56 | 57 | def doc_to_text(self, doc): 58 | return ( 59 | "TITLE: " 60 | + doc["title"] 61 | + "\n" 62 | + "PARAGRAPH: " 63 | + doc["paragraph"] 64 | + "\n\n" 65 | + "Q: " 66 | + doc["question"] 67 | + "\n\n" 68 | + "A: " 69 | ) 70 | 71 | def should_decontaminate(self): 72 | return True 73 | 74 | def doc_to_decontamination_query(self, doc): 75 | return doc["paragraph"] 76 | 77 | def doc_to_target(self, doc): 78 | return doc["answer"] 79 | 80 | def construct_requests(self, doc, ctx): 81 | """Uses RequestFactory to construct Requests and returns an iterable of 82 | Requests which will be sent to the LM. 83 | 84 | :param doc: 85 | The document as returned from training_docs, validation_docs, or test_docs. 86 | :param ctx: str 87 | The context string, generated by fewshot_context. This includes the natural 88 | language description, as well as the few shot examples, and the question 89 | part of the document for `doc`. 90 | """ 91 | # TODO: implement evaluation. 92 | raise NotImplementedError("Evaluation not implemented") 93 | 94 | def process_results(self, doc, results): 95 | """Take a single document and the LM results and evaluates, returning a 96 | dict where keys are the names of submetrics and values are the values of 97 | the metric for that one document 98 | 99 | :param doc: 100 | The document as returned from training_docs, validation_docs, or test_docs. 101 | :param results: 102 | The results of the requests created in construct_requests. 103 | """ 104 | # TODO: implement evaluation. 105 | raise NotImplementedError("Evaluation not implemented") 106 | 107 | def aggregation(self): 108 | """ 109 | :returns: {str: [float] -> float} 110 | A dictionary where keys are the names of submetrics and values are 111 | functions that aggregate a list of metrics 112 | """ 113 | # TODO: implement evaluation. 114 | raise NotImplementedError("Evaluation not implemented") 115 | 116 | def higher_is_better(self): 117 | """ 118 | :returns: {str: bool} 119 | A dictionary where keys are the names of submetrics and values are 120 | whether a higher value of the submetric is better 121 | """ 122 | # TODO: implement evaluation. 
123 | raise NotImplementedError("Evaluation not implemented") 124 | -------------------------------------------------------------------------------- /lm_eval/tasks/sat.py: -------------------------------------------------------------------------------- 1 | """ 2 | Similarity of Semantic Relations 3 | https://arxiv.org/pdf/cs/0608100.pdf 4 | 5 | SAT (Scholastic Aptitude Test) Analogy Questions is a dataset comprising 374 6 | multiple-choice analogy questions; 5 choices per question. 7 | 8 | Homepage: https://aclweb.org/aclwiki/SAT_Analogy_Questions_(State_of_the_art) 9 | """ 10 | import inspect 11 | import lm_eval.datasets.sat_analogies.sat_analogies 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @article{article, 17 | author = {Turney, Peter}, 18 | year = {2006}, 19 | month = {09}, 20 | pages = {379-416}, 21 | title = {Similarity of Semantic Relations}, 22 | volume = {32}, 23 | journal = {Computational Linguistics}, 24 | doi = {10.1162/coli.2006.32.3.379} 25 | } 26 | """ 27 | 28 | 29 | class SATAnalogies(MultipleChoiceTask): 30 | VERSION = 0 31 | DATASET_PATH = inspect.getfile(lm_eval.datasets.sat_analogies.sat_analogies) 32 | DATASET_NAME = None 33 | 34 | def __init__(self, data_dir: str): 35 | """ 36 | SAT Analog Questions is not publicly available. You must request the data 37 | by emailing Peter Turney and then download it to a local directory path 38 | which should be passed into the `data_dir` arg. 39 | """ 40 | super().__init__(data_dir=data_dir) 41 | 42 | def has_training_docs(self): 43 | return False 44 | 45 | def has_validation_docs(self): 46 | return True 47 | 48 | def has_test_docs(self): 49 | return False 50 | 51 | def training_docs(self): 52 | return [] 53 | 54 | def validation_docs(self): 55 | return map(self._process_doc, self.dataset["validation"]) 56 | 57 | def test_docs(self): 58 | return [] 59 | 60 | def _process_doc(self, doc): 61 | return { 62 | "source": doc["source"], 63 | "query": doc["stem"].split(" ")[:2], 64 | "choices": [ 65 | "{} is to {}".format(*c.split(" ")[:2]) for c in doc["choices"] 66 | ], 67 | "gold": ["a", "b", "c", "d", "e"].index(doc["solution"].strip()), 68 | } 69 | 70 | def doc_to_text(self, doc): 71 | return "{} is to {} as".format(*doc["query"]) 72 | 73 | def should_decontaminate(self): 74 | return True 75 | 76 | def doc_to_decontamination_query(self, doc): 77 | return doc["source"] + "\n" + " ".join(doc["query"]) 78 | -------------------------------------------------------------------------------- /lm_eval/tasks/sciq.py: -------------------------------------------------------------------------------- 1 | """ 2 | Crowdsourcing Multiple Choice Science Questions 3 | https://aclanthology.org/W17-4413.pdf 4 | 5 | The SciQ dataset contains 13,679 crowdsourced science exam questions about Physics, 6 | Chemistry and Biology, among others. The questions are in multiple-choice format 7 | with 4 answer options each. For the majority of the questions, an additional paragraph 8 | with supporting evidence for the correct answer is provided. 9 | 10 | Homepage: https://allenai.org/data/sciq 11 | """ 12 | from lm_eval.base import MultipleChoiceTask 13 | 14 | 15 | _CITATION = """ 16 | @inproceedings{Welbl2017CrowdsourcingMC, 17 | title={Crowdsourcing Multiple Choice Science Questions}, 18 | author={Johannes Welbl and Nelson F. 
Liu and Matt Gardner}, 19 | booktitle={NUT@EMNLP}, 20 | year={2017} 21 | } 22 | """ 23 | 24 | 25 | class SciQ(MultipleChoiceTask): 26 | VERSION = 0 27 | DATASET_PATH = "sciq" 28 | DATASET_NAME = None 29 | 30 | def has_training_docs(self): 31 | return True 32 | 33 | def has_validation_docs(self): 34 | return True 35 | 36 | def has_test_docs(self): 37 | return True 38 | 39 | def training_docs(self): 40 | if self._training_docs is None: 41 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 42 | return self._training_docs 43 | 44 | def validation_docs(self): 45 | return map(self._process_doc, self.dataset["validation"]) 46 | 47 | def test_docs(self): 48 | return map(self._process_doc, self.dataset["test"]) 49 | 50 | def _process_doc(self, doc): 51 | choices = [ 52 | doc["distractor1"], 53 | doc["distractor2"], 54 | doc["distractor3"], 55 | doc["correct_answer"], 56 | ] 57 | src = doc["support"] 58 | out_doc = { 59 | "source": src, 60 | "query": doc["question"], 61 | "choices": choices, 62 | "gold": 3, 63 | } 64 | return out_doc 65 | 66 | def doc_to_text(self, doc): 67 | return "{}\nQuestion: {}\nAnswer:".format(doc["source"], doc["query"]).strip() 68 | 69 | def should_decontaminate(self): 70 | return True 71 | 72 | def doc_to_decontamination_query(self, doc): 73 | return doc["source"] + " " + doc["query"] 74 | -------------------------------------------------------------------------------- /lm_eval/tasks/swag.py: -------------------------------------------------------------------------------- 1 | """ 2 | SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference 3 | https://arxiv.org/pdf/1808.05326.pdf 4 | 5 | SWAG (Situations With Adversarial Generations) is an adversarial dataset 6 | that consists of 113k multiple choice questions about grounded situations. Each 7 | question is a video caption from LSMDC or ActivityNet Captions, with four answer 8 | choices about what might happen next in the scene. The correct answer is the 9 | (real) video caption for the next event in the video; the three incorrect 10 | answers are adversarially generated and human verified, so as to fool machines 11 | but not humans. 
12 | 13 | Homepage: https://rowanzellers.com/swag/ 14 | """ 15 | from lm_eval.base import MultipleChoiceTask 16 | 17 | 18 | _CITATION = """ 19 | @inproceedings{zellers2018swagaf, 20 | title={SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference}, 21 | author={Zellers, Rowan and Bisk, Yonatan and Schwartz, Roy and Choi, Yejin}, 22 | booktitle = "Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing (EMNLP)", 23 | year={2018} 24 | } 25 | """ 26 | 27 | 28 | class SWAG(MultipleChoiceTask): 29 | VERSION = 0 30 | DATASET_PATH = "swag" 31 | DATASET_NAME = "regular" 32 | 33 | def has_training_docs(self): 34 | return True 35 | 36 | def has_validation_docs(self): 37 | return True 38 | 39 | def has_test_docs(self): 40 | return False 41 | 42 | def training_docs(self): 43 | if self._training_docs is None: 44 | self._training_docs = list(map(self._process_doc, self.dataset["train"])) 45 | return self._training_docs 46 | 47 | def validation_docs(self): 48 | return map(self._process_doc, self.dataset["validation"]) 49 | 50 | def _process_doc(self, doc): 51 | out_doc = { 52 | "query": doc["startphrase"], 53 | "choices": [doc["ending0"], doc["ending1"], doc["ending2"], doc["ending3"]], 54 | "gold": int(doc["label"]), 55 | } 56 | return out_doc 57 | 58 | def doc_to_text(self, doc): 59 | return doc["query"] 60 | -------------------------------------------------------------------------------- /lm_eval/tasks/triviaqa.py: -------------------------------------------------------------------------------- 1 | """ 2 | TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension 3 | https://arxiv.org/pdf/1705.03551.pdf 4 | 5 | TriviaQA is a reading comprehension dataset containing over 650K question-answer-evidence 6 | triples. TriviaQA includes 95K question-answer pairs authored by trivia enthusiasts 7 | and independently gathered evidence documents, six per question on average, that provide 8 | high quality distant supervision for answering the questions. 9 | 10 | Homepage: https://nlp.cs.washington.edu/triviaqa/ 11 | """ 12 | import inspect 13 | import lm_eval.datasets.triviaqa.triviaqa 14 | from lm_eval.base import Task, rf 15 | from lm_eval.metrics import mean 16 | 17 | 18 | _CITATION = """ 19 | @InProceedings{JoshiTriviaQA2017, 20 | author = {Joshi, Mandar and Choi, Eunsol and Weld, Daniel S. 
and Zettlemoyer, Luke}, 21 | title = {TriviaQA: A Large Scale Distantly Supervised Challenge Dataset for Reading Comprehension}, 22 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics}, 23 | month = {July}, 24 | year = {2017}, 25 | address = {Vancouver, Canada}, 26 | publisher = {Association for Computational Linguistics}, 27 | } 28 | """ 29 | 30 | 31 | class TriviaQA(Task): 32 | VERSION = 1 33 | DATASET_PATH = inspect.getfile(lm_eval.datasets.triviaqa.triviaqa) 34 | DATASET_NAME = None 35 | 36 | def has_training_docs(self): 37 | return True 38 | 39 | def has_validation_docs(self): 40 | return True 41 | 42 | def has_test_docs(self): 43 | return False 44 | 45 | def training_docs(self): 46 | return self.dataset["train"] 47 | 48 | def validation_docs(self): 49 | return self.dataset["validation"] 50 | 51 | def test_docs(self): 52 | raise NotImplementedError() 53 | 54 | def doc_to_text(self, doc): 55 | return f"Question: {doc['question']}\nAnswer:" 56 | 57 | def should_decontaminate(self): 58 | return True 59 | 60 | def doc_to_decontamination_query(self, doc): 61 | return doc["question"] 62 | 63 | def doc_to_target(self, doc): 64 | return " " + doc["answer"]["value"] 65 | 66 | def _remove_prefixes(self, aliases): 67 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 68 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 69 | aliases.sort() 70 | ret = [aliases[0]] 71 | for alias in aliases[1:]: 72 | if not alias.startswith(ret[-1]): 73 | ret.append(alias) 74 | return ret 75 | 76 | def construct_requests(self, doc, ctx): 77 | ret = [] 78 | for alias in self._remove_prefixes(doc["answer"]["aliases"]): 79 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 80 | ret.append(is_prediction) 81 | return ret 82 | 83 | def process_results(self, doc, results): 84 | return {"acc": float(any(results))} 85 | 86 | def aggregation(self): 87 | return { 88 | "acc": mean, 89 | } 90 | 91 | def higher_is_better(self): 92 | return {"acc": True} 93 | -------------------------------------------------------------------------------- /lm_eval/tasks/unscramble.py: -------------------------------------------------------------------------------- 1 | """ 2 | Language Models are Few-Shot Learners 3 | https://arxiv.org/pdf/2005.14165.pdf 4 | 5 | Unscramble is a small battery of 5 “character manipulation” tasks. Each task 6 | involves giving the model a word distorted by some combination of scrambling, 7 | addition, or deletion of characters, and asking it to recover the original word. 
8 | 9 | Homepage: https://github.com/openai/gpt-3/tree/master/data 10 | """ 11 | import inspect 12 | import lm_eval.datasets.unscramble.unscramble 13 | from lm_eval.base import Task, rf 14 | from lm_eval.metrics import mean 15 | 16 | 17 | _CITATION = """ 18 | @inproceedings{NEURIPS2020_1457c0d6, 19 | author = {Brown, Tom and Mann, Benjamin and Ryder, Nick and Subbiah, Melanie and Kaplan, Jared D and Dhariwal, Prafulla and Neelakantan, Arvind and Shyam, Pranav and Sastry, Girish and Askell, Amanda and Agarwal, Sandhini and Herbert-Voss, Ariel and Krueger, Gretchen and Henighan, Tom and Child, Rewon and Ramesh, Aditya and Ziegler, Daniel and Wu, Jeffrey and Winter, Clemens and Hesse, Chris and Chen, Mark and Sigler, Eric and Litwin, Mateusz and Gray, Scott and Chess, Benjamin and Clark, Jack and Berner, Christopher and McCandlish, Sam and Radford, Alec and Sutskever, Ilya and Amodei, Dario}, 20 | booktitle = {Advances in Neural Information Processing Systems}, 21 | editor = {H. Larochelle and M. Ranzato and R. Hadsell and M. F. Balcan and H. Lin}, 22 | pages = {1877--1901}, 23 | publisher = {Curran Associates, Inc.}, 24 | title = {Language Models are Few-Shot Learners}, 25 | url = {https://proceedings.neurips.cc/paper/2020/file/1457c0d6bfcb4967418bfb8ac142f64a-Paper.pdf}, 26 | volume = {33}, 27 | year = {2020} 28 | } 29 | """ 30 | 31 | 32 | class WordUnscrambleTask(Task): 33 | VERSION = 0 34 | DATASET_PATH = inspect.getfile(lm_eval.datasets.unscramble.unscramble) 35 | DATASET_NAME = None 36 | 37 | def has_training_docs(self): 38 | return False 39 | 40 | def has_validation_docs(self): 41 | return True 42 | 43 | def has_test_docs(self): 44 | return False 45 | 46 | def validation_docs(self): 47 | return self.dataset["validation"] 48 | 49 | def doc_to_text(self, doc): 50 | return doc["context"] 51 | 52 | def should_decontaminate(self): 53 | return True 54 | 55 | def doc_to_decontamination_query(self, doc): 56 | return doc["context"] 57 | 58 | def doc_to_target(self, doc): 59 | return doc["completion"] 60 | 61 | def construct_requests(self, doc, ctx): 62 | completion = rf.greedy_until(ctx, ["\n"]) 63 | return completion 64 | 65 | def process_results(self, doc, results): 66 | pred = results[0] 67 | gold = doc["completion"] 68 | return {"acc": int(pred == gold)} 69 | 70 | def aggregation(self): 71 | return {"acc": mean} 72 | 73 | def higher_is_better(self): 74 | return {"acc": True} 75 | 76 | 77 | class Anagrams1(WordUnscrambleTask): 78 | DATASET_NAME = "mid_word_1_anagrams" 79 | 80 | 81 | class Anagrams2(WordUnscrambleTask): 82 | DATASET_NAME = "mid_word_2_anagrams" 83 | 84 | 85 | class CycleLetters(WordUnscrambleTask): 86 | DATASET_NAME = "cycle_letters_in_word" 87 | 88 | 89 | class RandomInsertion(WordUnscrambleTask): 90 | DATASET_NAME = "random_insertion_in_word" 91 | 92 | 93 | class ReversedWords(WordUnscrambleTask): 94 | DATASET_NAME = "reversed_words" 95 | -------------------------------------------------------------------------------- /lm_eval/tasks/webqs.py: -------------------------------------------------------------------------------- 1 | """ 2 | Semantic Parsing on Freebase from Question-Answer Pairs 3 | https://cs.stanford.edu/~pliang/papers/freebase-emnlp2013.pdf 4 | 5 | WebQuestions is a benchmark for question answering. The dataset consists of 6,642 6 | question/answer pairs. The questions are supposed to be answerable by Freebase, a 7 | large knowledge graph. The questions are mostly centered around a single named entity. 
8 | The questions are popular ones asked on the web (at least in 2013). 9 | 10 | Homepage: https://worksheets.codalab.org/worksheets/0xba659fe363cb46e7a505c5b6a774dc8a 11 | """ 12 | from lm_eval.base import rf, Task 13 | from lm_eval.metrics import mean 14 | 15 | 16 | _CITATION = """ 17 | @inproceedings{berant-etal-2013-semantic, 18 | title = "Semantic Parsing on {F}reebase from Question-Answer Pairs", 19 | author = "Berant, Jonathan and 20 | Chou, Andrew and 21 | Frostig, Roy and 22 | Liang, Percy", 23 | booktitle = "Proceedings of the 2013 Conference on Empirical Methods in Natural Language Processing", 24 | month = oct, 25 | year = "2013", 26 | address = "Seattle, Washington, USA", 27 | publisher = "Association for Computational Linguistics", 28 | url = "https://aclanthology.org/D13-1160", 29 | pages = "1533--1544", 30 | } 31 | """ 32 | 33 | 34 | class WebQs(Task): 35 | VERSION = 0 36 | DATASET_PATH = "web_questions" 37 | DATASET_NAME = None 38 | 39 | def has_training_docs(self): 40 | return True 41 | 42 | def has_validation_docs(self): 43 | return False 44 | 45 | def has_test_docs(self): 46 | return True 47 | 48 | def training_docs(self): 49 | if self._training_docs is None: 50 | self._training_docs = list(self.dataset["train"]) 51 | return self._training_docs 52 | 53 | def test_docs(self): 54 | return self.dataset["test"] 55 | 56 | def doc_to_text(self, doc): 57 | return "Question: " + doc["question"] + "\nAnswer:" 58 | 59 | def should_decontaminate(self): 60 | return True 61 | 62 | def doc_to_decontamination_query(self, doc): 63 | return doc["question"] 64 | 65 | def doc_to_target(self, doc): 66 | # this picks one answer to be the "correct" one, despite sometimes 67 | # multiple correct answers being possible. 68 | # TODO: make sure we're actually handling multi-answer correctly 69 | return " " + doc["answers"][0] 70 | 71 | def _remove_prefixes(self, aliases): 72 | # Optimization: Remove any alias that has a strict prefix elsewhere in the list 73 | # we can do this because if the prefix is acceptable by isgreedy, we can stop looking 74 | aliases.sort() 75 | ret = [aliases[0]] 76 | for alias in aliases[1:]: 77 | if not alias.startswith(ret[-1]): 78 | ret.append(alias) 79 | 80 | return ret 81 | 82 | def construct_requests(self, doc, ctx): 83 | ret = [] 84 | for alias in self._remove_prefixes(doc["answers"]): 85 | _, is_prediction = rf.loglikelihood(ctx, " " + alias) 86 | ret.append(is_prediction) 87 | return ret 88 | 89 | def process_results(self, doc, results): 90 | return {"acc": float(any(results))} 91 | 92 | def aggregation(self): 93 | return { 94 | "acc": mean, 95 | } 96 | 97 | def higher_is_better(self): 98 | return {"acc": True} 99 | -------------------------------------------------------------------------------- /lm_eval/tasks/wikitext.py: -------------------------------------------------------------------------------- 1 | """ 2 | Pointer Sentinel Mixture Models 3 | https://arxiv.org/pdf/1609.07843.pdf 4 | 5 | The WikiText language modeling dataset is a collection of over 100 million tokens 6 | extracted from the set of verified Good and Featured articles on Wikipedia. 7 | 8 | NOTE: This `Task` is based on WikiText-2. 
9 | 10 | Homepage: https://www.salesforce.com/products/einstein/ai-research/the-wikitext-dependency-language-modeling-dataset/ 11 | """ 12 | import re 13 | import inspect 14 | import lm_eval.datasets.wikitext.wikitext 15 | from lm_eval.base import PerplexityTask 16 | 17 | 18 | _CITATION = """ 19 | @misc{merity2016pointer, 20 | title={Pointer Sentinel Mixture Models}, 21 | author={Stephen Merity and Caiming Xiong and James Bradbury and Richard Socher}, 22 | year={2016}, 23 | eprint={1609.07843}, 24 | archivePrefix={arXiv}, 25 | primaryClass={cs.CL} 26 | } 27 | """ 28 | 29 | 30 | def wikitext_detokenizer(string): 31 | # contractions 32 | string = string.replace("s '", "s'") 33 | string = re.sub(r"/' [0-9]/", r"/'[0-9]/", string) 34 | # number separators 35 | string = string.replace(" @-@ ", "-") 36 | string = string.replace(" @,@ ", ",") 37 | string = string.replace(" @.@ ", ".") 38 | # punctuation 39 | string = string.replace(" : ", ": ") 40 | string = string.replace(" ; ", "; ") 41 | string = string.replace(" . ", ". ") 42 | string = string.replace(" ! ", "! ") 43 | string = string.replace(" ? ", "? ") 44 | string = string.replace(" , ", ", ") 45 | # double brackets 46 | string = re.sub(r"\(\s*([^\)]*?)\s*\)", r"(\1)", string) 47 | string = re.sub(r"\[\s*([^\]]*?)\s*\]", r"[\1]", string) 48 | string = re.sub(r"{\s*([^}]*?)\s*}", r"{\1}", string) 49 | string = re.sub(r"\"\s*([^\"]*?)\s*\"", r'"\1"', string) 50 | string = re.sub(r"'\s*([^']*?)\s*'", r"'\1'", string) 51 | # miscellaneous 52 | string = string.replace("= = = =", "====") 53 | string = string.replace("= = =", "===") 54 | string = string.replace("= =", "==") 55 | string = string.replace(" " + chr(176) + " ", chr(176)) 56 | string = string.replace(" \n", "\n") 57 | string = string.replace("\n ", "\n") 58 | string = string.replace(" N ", " 1 ") 59 | string = string.replace(" 's", "'s") 60 | 61 | return string 62 | 63 | 64 | class WikiText(PerplexityTask): 65 | VERSION = 1 66 | DATASET_PATH = inspect.getfile(lm_eval.datasets.wikitext.wikitext) 67 | DATASET_NAME = "wikitext-2-raw-v1" 68 | 69 | def has_training_docs(self): 70 | return True 71 | 72 | def has_validation_docs(self): 73 | return True 74 | 75 | def has_test_docs(self): 76 | return True 77 | 78 | def training_docs(self): 79 | return map(self._process_doc, self.dataset["train"]) 80 | 81 | def validation_docs(self): 82 | return map(self._process_doc, self.dataset["validation"]) 83 | 84 | def test_docs(self): 85 | return map(self._process_doc, self.dataset["test"]) 86 | 87 | def _process_doc(self, doc): 88 | return doc["page"] 89 | 90 | def doc_to_target(self, doc): 91 | return wikitext_detokenizer(doc) 92 | 93 | def should_decontaminate(self): 94 | return True 95 | 96 | def count_words(self, doc): 97 | # count number of words in *original doc before detokenization* 98 | return len(re.split(r"\s+", doc)) 99 | -------------------------------------------------------------------------------- /outlier_analysis.md: -------------------------------------------------------------------------------- 1 | # Outlier Analysis 2 | Quantization, especially post-training quantization (PTQ) which operates with limited data and GPU resources, has become increasingly challenging for transformer language models (e.g., a 12% accuracy drop in BERT [1] and catastrophic degradation in OPT-175B [2]. 3 | 4 |
[figure: channel-wise and token-wise outlier visualization for BERT and OPT, referenced in the text below] 5 | 6 |
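To make the quantization challenge concrete, the snippet below is a minimal, self-contained sketch (not code from this repository; the helper name and the symmetric 8-bit scheme are illustrative assumptions) showing how a single outlier channel dictates the per-tensor quantization step and destroys the resolution left for ordinary activations:

```python
import torch

def fake_quant_per_tensor(x, n_bits=8):
    # symmetric uniform fake quantization: the step size is set by the largest
    # magnitude, so a few extreme values decide the resolution for everything else
    qmax = 2 ** (n_bits - 1) - 1
    scale = x.abs().max() / qmax
    return torch.clamp(torch.round(x / scale), -qmax - 1, qmax) * scale

torch.manual_seed(0)
act = torch.randn(4, 768)     # "ordinary" activations
act_out = act.clone()
act_out[:, 42] = 60.0         # one hypothetical outlier channel

for name, t in [("no outlier", act), ("with outlier", act_out)]:
    err = (fake_quant_per_tensor(t) - t).abs().mean().item()
    print(f"{name}: mean absolute quantization error = {err:.4f}")
```

With the outlier present, the step size is dictated by the outlier channel and the average error on ordinary values grows by more than an order of magnitude; this is the failure mode that the channel and token analyses below break down.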
7 | 8 | Outliers on these models show structural phenomena. Firstly, they present in asymmetric shape and concentrate on certain channels. For example, from the colored part in the above figure, it can be seen that almost all the tokens contribute to outliers on certain channels. OPT-66B has hard negative outliers on the 8725-th channel and hard positive ones on the 6353-th channel. For BERT, outliers concentrate on the 308 and 381 channels. Second, a few tokens provide even larger values compared to others such as [SEP] in BERT and [EOT] in OPT (look at the orange part in the figure). 9 | 10 | ## Channel Aspect 11 | In terms of channels, outliers consistently emerge in certain channels over different inputs. [1, 2] find that these problematic channels are limited and propose some fine-grained methods. [1] employs a per-embedding-group quantization scheme that uses different quantization parameters for distinct channel groups. [2] proposes to utilize FP16 representations for channels holding signals over 6. [3] identifies this feature lying in LayerNorm’s output and migrates the scaling parameter of LayerNorm to subsequent modules with an equivalent transformation to attenuate outliers. [4] propose to calculate scaling values by equalizing ranges between activations and weights. Furthermore, [5] designs the scaling factors that concern the interactive results of troublesome activation and following weights to scale down outlier channels. Also, it notices the asymmetric presentation of outliers and designs a shifting operation. Besides, [6] discovers that normal values are not that important and discards those adjacent to outliers to make room for outliers. 12 | 13 | ## Token Aspect 14 | In terms of tokens, different tokens exhibit varying degrees of outliers. We find that this phenomenon is obvious in BERT and BLOOM, but less obvious in OPTs. Observing that tokens that denote more aggressive outliers often appear in examples, we conjecture that token divergence might relate to token frequency during the pre-training phase. 15 | 16 | To combat this challenge, [2, 7] introduce a novel scheme called per-token quantization that dynamically computes quantization parameters for each token. [5] investigates the clipping impact of outliers and recommends finding an appropriate clipping range in a token-wise manner. 17 | 18 | ## Related works 19 | [1]. Yelysei Bondarenko, et al, Understanding and overcoming the challenges of efficient transformer quantization. EMNLP 2021. 20 | [2]. Tim Dettmers, et al, LLM.int8 (): 8-bit matrix mul- tiplication for transformers at scale. NeurIPS 2022. 21 | [3]. Xiuying Wei, et al. Outlier suppression: Pushing the limit of low-bit transformer language models. NeurIPS 2022. 22 | [4]. Guangxuan Xiao, Ji Lin, et al. Smoothquant: Accurate and efficient post-training quantization for large language models. ICML 2023. 23 | [5]. Xiuying Wei, et al. Outlier Suppression+: Accurate quantization of large language models by equivalent and optimal shifting and scaling. arXiv preprint arXiv:2304.09145. 24 | [6]. Cong Guo, et al. OliVe: Accelerating Large Language Models via Hardware-friendly Outlier-Victim Pair Quantization. ISCA 2023. 25 | [7]. Zhewei Yao, et al. ZeroQuant: Efficient and affordable post-training quantization for large-scale transformers. NeurIPS 2022. 
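To make the remedies above concrete, here is a hedged sketch of the two ingredients discussed in the Channel Aspect and Token Aspect sections: an equivalent shift-and-scale migration that folds per-channel offsets and scales of a LayerNorm output into the following `nn.Linear`, and a per-token dynamic quantizer. Everything here is illustrative rather than this repository's API: the function names are invented, the scale heuristic is a simplified stand-in for the optimized scales of [5], and the Linear is assumed to carry a bias. See `quant_transformer/quantization/migration.py` for the repository's own implementation.

```python
import torch

@torch.no_grad()
def migrate_shift_scale(x, linear):
    # x: LayerNorm output [tokens, channels]; linear: the following nn.Linear (with bias).
    # The output is unchanged because ((x - z) / s) @ (W * s).T + (b + z @ W.T) == x @ W.T + b.
    cmax, cmin = x.max(dim=0).values, x.min(dim=0).values
    shift = (cmax + cmin) / 2                                   # centre asymmetric outlier channels
    half_range = (cmax - cmin) / 2
    scale = (half_range / half_range.median()).clamp(min=1.0)   # toy heuristic; [5] optimizes this
    linear.bias.add_(shift @ linear.weight.t())                 # absorb the shift (uses the original W)
    linear.weight.mul_(scale)                                   # W'[:, j] = W[:, j] * s[j]
    return (x - shift) / scale                                  # outlier-suppressed activation


def per_token_fake_quant(x, n_bits=8):
    # per-token dynamic quantization: each token gets its own step size, so a few
    # aggressive tokens no longer inflate the quantization error of all the others
    qmax = 2 ** (n_bits - 1) - 1
    scale = x.abs().amax(dim=-1, keepdim=True) / qmax
    return torch.clamp(torch.round(x / scale), -qmax - 1, qmax) * scale


torch.manual_seed(0)
ln_out = torch.randn(16, 768)
ln_out[:, 300] += 30.0                            # hypothetical asymmetric outlier channel
fc = torch.nn.Linear(768, 3072)
ref = fc(ln_out)
x_eq = migrate_shift_scale(ln_out, fc)
print(torch.allclose(fc(x_eq), ref, atol=1e-4))   # True: the transform is output-preserving
print((per_token_fake_quant(x_eq) - x_eq).abs().mean().item())  # small per-token INT8 error
```

These two ideas appear to correspond, in simplified form, to the migration utilities and the per-token observers/token-wise clipping found under `quant_transformer/quantization` and `quant_transformer/solver` in this repository.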
26 | 27 | -------------------------------------------------------------------------------- /pile_statistics.json: -------------------------------------------------------------------------------- 1 | { 2 | "Data": "Pile statistics", 3 | "Document Count": 210607728, 4 | "Total Pile Characters": 421215456, 5 | "File Start Offsets": [ 6 | 0, 7 | 7021438, 8 | 14042822, 9 | 21066113, 10 | 28086515, 11 | 35106072, 12 | 42123306, 13 | 49145091, 14 | 56165817, 15 | 63185587, 16 | 70211208, 17 | 77234322, 18 | 84249267, 19 | 91267634, 20 | 98285983, 21 | 105305110, 22 | 112322489, 23 | 119342491, 24 | 126367373, 25 | 133389153, 26 | 140412039, 27 | 147432373, 28 | 154452516, 29 | 161470190, 30 | 168492733, 31 | 175512521, 32 | 182526939, 33 | 189547478, 34 | 196565318, 35 | 203583306 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /quant_transformer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/quant_transformer/__init__.py -------------------------------------------------------------------------------- /quant_transformer/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/quant_transformer/model/__init__.py -------------------------------------------------------------------------------- /quant_transformer/model/quant_model.py: -------------------------------------------------------------------------------- 1 | from .quant_opt import QuantizedOPTForCausalLM # noqa: F401 2 | from .quant_bloom import QuantizedBloomForCausalLM # noqa: F401 3 | from .quant_llama import QuantizedLlamaForCausalLM 4 | from quant_transformer.quantization.observer import ObserverBase 5 | _SUPPORT_MODELS = ['opt', 'bloom'] 6 | 7 | 8 | def quantize_model(fp_model, config): 9 | config_quant = config.quant 10 | config_quant.is_remove_padding = config_quant.get('is_remove_padding', True) 11 | config_quant.migrate = config_quant.get('migrate', False) 12 | fp_model.eval() 13 | model = eval("Quantized" + str(fp_model.__class__.__name__))( 14 | fp_model, config_quant.w_qconfig, config_quant.a_qconfig, qinput=False, 15 | is_remove_padding=config_quant.is_remove_padding, 16 | ) 17 | for name, module in model.named_modules(): 18 | if isinstance(module, ObserverBase) and 'act' in name: 19 | module.set_name(name) 20 | model.eval() 21 | return model 22 | -------------------------------------------------------------------------------- /quant_transformer/model/util_layernorm.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | import torch.nn.functional as F 3 | 4 | 5 | class QuantizedLayerNorm(nn.Module): 6 | 7 | def __init__(self, org_module): 8 | super(QuantizedLayerNorm, self).__init__() 9 | self.normalized_shape = org_module.normalized_shape 10 | self.eps = org_module.eps 11 | self.elementwise_affine = org_module.elementwise_affine 12 | self.weight = org_module.weight 13 | self.bias = org_module.bias 14 | 15 | def forward(self, input): 16 | return F.layer_norm( 17 | input, self.normalized_shape, self.weight, self.bias, self.eps) 18 | 19 | def extra_repr(self) -> str: 20 | return '{normalized_shape}, eps={eps}, ' \ 21 | 'elementwise_affine={elementwise_affine}'.format(**self.__dict__) 22 | 23 | 24 | class 
Identity(nn.Module): 25 | def __init__(self): 26 | super().__init__() 27 | self.migrate = False 28 | self.migrate_scale = None 29 | 30 | def set_migrate(self, state): 31 | if self.migrate_scale is None: 32 | self.migrate = False 33 | else: 34 | self.migrate = state 35 | 36 | def set_migrate_scale(self, migrate_scale): 37 | self.migrate_scale = migrate_scale 38 | self.migrate = True 39 | 40 | def set_migrate_bias(self, migrate_bias): 41 | self.migrate_bias = migrate_bias 42 | self.migrate = True 43 | 44 | def forward(self, X): 45 | if self.migrate: 46 | X = X * self.migrate_scale + self.migrate_bias 47 | return X 48 | -------------------------------------------------------------------------------- /quant_transformer/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from .quantized_module import Quantizer 2 | from .quantized_module import QuantizedModule, QuantizedLayer 3 | from .state import enable_calibration_quantization, enable_calibration_woquantization, \ 4 | enable_quantization, disable_all 5 | -------------------------------------------------------------------------------- /quant_transformer/quantization/state.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from .fake_quant import QuantizeBase 3 | from .observer import ObserverBase 4 | logger = logging.getLogger("OS+") 5 | 6 | 7 | def enable_calibration_woquantization(model, quantizer_type='fake_quant', except_quantizer=None): 8 | logger.info('Enable observer and Disable quantize for {}'.format(quantizer_type)) 9 | for name, submodule in model.named_modules(): 10 | if isinstance(submodule, QuantizeBase): 11 | if (quantizer_type not in name) or \ 12 | (except_quantizer is not None and name.split('.')[-1] in except_quantizer): 13 | logger.info('The except_quantizer is {}'.format(name)) 14 | submodule.disable_observer() 15 | submodule.disable_fake_quant() 16 | continue 17 | logger.debug('Enable observer and Disable quant: {}'.format(name)) 18 | submodule.enable_observer() 19 | submodule.disable_fake_quant() 20 | 21 | 22 | def enable_calibration_quantization(model, quantizer_type='fake_quant', except_quantizer=None): 23 | logger.info('Enable observer and Enable quantize for {}'.format(quantizer_type)) 24 | for name, submodule in model.named_modules(): 25 | if isinstance(submodule, QuantizeBase): 26 | if (quantizer_type not in name) or \ 27 | (except_quantizer is not None and name.split('.')[-1] in except_quantizer): 28 | logger.debug('The except_quantizer is {}'.format(name)) 29 | submodule.disable_observer() 30 | submodule.disable_fake_quant() 31 | continue 32 | logger.debug('Enable observer and Enable quant: {}'.format(name)) 33 | submodule.enable_observer() 34 | submodule.enable_fake_quant() 35 | 36 | 37 | def enable_quantization(model, quantizer_type='fake_quant', except_quantizer=None): 38 | logger.info('Disable observer and Enable quantize.') 39 | for name, submodule in model.named_modules(): 40 | if isinstance(submodule, QuantizeBase): 41 | if (quantizer_type not in name) or \ 42 | (except_quantizer is not None and name.split('.')[-1] in except_quantizer): 43 | logger.debug('The except_quantizer is {}'.format(name)) 44 | submodule.disable_observer() 45 | submodule.disable_fake_quant() 46 | continue 47 | logger.debug('Disable observer and Enable quant: {}'.format(name)) 48 | submodule.disable_observer() 49 | submodule.enable_fake_quant() 50 | 51 | 52 | def disable_all(model): 53 | logger.info('Disable observer 
and disable quantize.') 54 | for name, submodule in model.named_modules(): 55 | if isinstance(submodule, QuantizeBase): 56 | logger.debug('Disable observer and disable quant: {}'.format(name)) 57 | submodule.disable_observer() 58 | submodule.disable_fake_quant() 59 | 60 | 61 | def set_observer_name(model): 62 | logger.info('set name for obsever') 63 | for name, submodule in model.named_modules(): 64 | if isinstance(submodule, ObserverBase): 65 | submodule.set_name(name) 66 | -------------------------------------------------------------------------------- /quant_transformer/quantization/util_quant.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def round_ste(x: torch.Tensor): 5 | """ 6 | Implement Straight-Through Estimator for rounding operation. 7 | """ 8 | return (x.round() - x).detach() + x 9 | 10 | 11 | def fake_quantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max): 12 | x_int = round_ste(x / scale) + zero_point 13 | x_quant = torch.clamp(x_int, quant_min, quant_max) 14 | x_dequant = (x_quant - zero_point) * scale 15 | return x_dequant 16 | 17 | 18 | def quantize_per_channel_affine(x, scale, zero_point, ch_axis, quant_min, quant_max): 19 | new_shape = [1] * len(x.shape) 20 | new_shape[ch_axis] = x.shape[ch_axis] 21 | scale = scale.reshape(new_shape) 22 | zero_point = zero_point.reshape(new_shape) 23 | x_int = round_ste(x / scale) + zero_point 24 | x_quant = torch.clamp(x_int, quant_min, quant_max) 25 | return x_quant 26 | 27 | 28 | def dequantize_per_channel_affine(x, scale, zero_point, ch_axis, quant_min, quant_max): 29 | new_shape = [1] * len(x.shape) 30 | new_shape[ch_axis] = x.shape[ch_axis] 31 | scale = scale.reshape(new_shape) 32 | zero_point = zero_point.reshape(new_shape) 33 | x_dequant = (x - zero_point) * scale 34 | return x_dequant 35 | 36 | 37 | def quantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max): 38 | x_int = round_ste(x / scale) + zero_point 39 | x_quant = torch.clamp(x_int, quant_min, quant_max) 40 | return x_quant 41 | 42 | 43 | def dequantize_per_tensor_affine(x, scale, zero_point, quant_min, quant_max): 44 | x_dequant = (x - zero_point) * scale 45 | return x_dequant 46 | 47 | 48 | def fake_quantize_per_channel_affine(x, scale, zero_point, ch_axis, quant_min, quant_max): 49 | new_shape = [1] * len(x.shape) 50 | new_shape[ch_axis] = x.shape[ch_axis] 51 | scale = scale.reshape(new_shape) 52 | zero_point = zero_point.reshape(new_shape) 53 | x_int = round_ste(x / scale) + zero_point 54 | x_quant = torch.clamp(x_int, quant_min, quant_max) 55 | x_dequant = (x_quant - zero_point) * scale 56 | return x_dequant 57 | -------------------------------------------------------------------------------- /quant_transformer/solver/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/quant_transformer/solver/__init__.py -------------------------------------------------------------------------------- /quant_transformer/solver/token_wise_clipping.py: -------------------------------------------------------------------------------- 1 | from torch.nn import MSELoss 2 | import logging 3 | from quant_transformer.quantization.fake_quant import QuantizeBase 4 | logger = logging.getLogger("OS+") 5 | 6 | 7 | def set_ratio(model, ratio): 8 | for name, module in model.named_modules(): 9 | if isinstance(module, QuantizeBase): 10 | if 'act' in name: 11 
| module.observer.set_percentile(ratio) 12 | module.observer.cnt = 0 13 | module.disable_fake_quant() 14 | module.enable_observer() 15 | if 'weight' in name: 16 | module.disable_fake_quant() 17 | 18 | 19 | def enable_quantization(model): 20 | for name, submodule in model.named_modules(): 21 | if isinstance(submodule, QuantizeBase): 22 | if 'act' in name: 23 | submodule.disable_observer() 24 | submodule.enable_fake_quant() 25 | if 'weight' in name: 26 | submodule.enable_fake_quant() 27 | 28 | 29 | def calibrate(model, fp_input, fp_output=False): 30 | loss = 0 31 | for i, batch in enumerate(fp_input): 32 | if fp_output: 33 | loss += model(**batch, labels=fp_input[i]['input_ids']).loss 34 | else: 35 | model(**batch) 36 | return loss 37 | 38 | 39 | def find_ratio(model, fp_input, fp_output, param): 40 | p, loss = 0, None 41 | iters = param['iters'] 42 | step = param['step'] 43 | for i in range(iters): 44 | set_ratio(model, 1.0 - step * i) 45 | calibrate(model, fp_input) 46 | enable_quantization(model) 47 | cur_loss = calibrate(model, fp_input, True) 48 | logger.info('the ratio is {}, the loss is {}'.format(1.0 - step * i, cur_loss)) 49 | if loss is None or loss > cur_loss: 50 | loss = cur_loss 51 | p = i 52 | ratio = 1.0 - step * p 53 | logger.info('the best percentile is {}'.format(ratio)) 54 | set_ratio(model, ratio) 55 | calibrate(model, fp_input) 56 | 57 | 58 | loss_fct = MSELoss() 59 | 60 | 61 | a_bit_iters = { 62 | 8: 0.05, 63 | 6: 0.1, 64 | } 65 | 66 | 67 | def cac_step_iters(a_bit, bs): 68 | step = 0.005 69 | step = float(format(step, '.2g')) 70 | iters = int(a_bit_iters[a_bit] / step) 71 | print('the step is {}, the iter is {}'.format(step, iters)) 72 | return step, iters 73 | 74 | 75 | def token_wise_clipping(model, fp_input, fp_output, config, batch_size): 76 | config_quant = config.quant 77 | 78 | logger.info("*** Evaluate Token Percentile ***") 79 | step, iters = cac_step_iters(config_quant.a_qconfig.bit, batch_size) 80 | 81 | if hasattr(config_quant.a_qconfig, 'token_quantile'): 82 | set_ratio(model, config_quant.a_qconfig.token_quantile) 83 | calibrate(model, fp_input) 84 | logger.info('the best percentile is {}'.format(config_quant.a_qconfig.token_quantile)) 85 | else: 86 | step, iters = cac_step_iters(config_quant.a_qconfig.bit, batch_size) 87 | find_ratio(model, fp_input, fp_output, 88 | {'iters': getattr(config.quant, 'iters', iters), 89 | 'step': getattr(config.quant, 'step', step)}) 90 | -------------------------------------------------------------------------------- /scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/scripts/__init__.py -------------------------------------------------------------------------------- /scripts/clean_training_data/README.md: -------------------------------------------------------------------------------- 1 | janitor.py contains a script to remove benchmark data contamination from training data sets. 2 | It uses the approach described in the [GPT-3 paper](https://arxiv.org/abs/2005.14165). 3 | 4 | ## Algorithm 5 | 1) Collects all contamination text files that are to be removed from training data 6 | 2) Filters training data by finding `N`gram matches between the training data 7 | and any contamination 8 | 1) `N`grams ignore case and punctuation and are split on whitespace. 
9 | 2) Matching `N`gram substrings are removed, as is a `window_to_remove` character window around 10 | the match, splitting the training data into chunks 11 | 3) Any chunks less than `minimum_slice_length` are removed 12 | 4) Training data sets split into more than `too_dirty_cutoff` are considered 13 | completey contaminated and removed 14 | 15 | OpenAI used: 16 | ``` 17 | ngram_n = 13 18 | window_to_remove = 200 19 | minimum_slice_length = 200 20 | too_dirty_cutoff = 10 21 | ``` 22 | 23 | ## Compiling 24 | 25 | Janitor can be used as a pure python program, but it is much faster if the ngram 26 | code is run in C++. To compile the C++ code, run 27 | 28 | ``` 29 | pip install pybind11 30 | c++ -O3 -Wall -shared -std=c++11 -fPIC $(python3 -m pybind11 --includes) janitor_util.cpp -o janitor_util$(python3-config --extension-suffix) 31 | ``` 32 | 33 | If your your compiler isn't linked to python, you may need to add to the above `-undefined dynamic_lookup` 34 | -------------------------------------------------------------------------------- /scripts/clean_training_data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ModelTC/Outlier_Suppression_Plus/3ba97ae2dab0e6e5ead5da1795f50fd47025a49d/scripts/clean_training_data/__init__.py -------------------------------------------------------------------------------- /scripts/clean_training_data/compress_and_package.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import argparse 3 | import os 4 | import subprocess 5 | import shutil 6 | 7 | from tqdm import tqdm 8 | from tqdm_multiprocess import TqdmMultiProcessPool 9 | 10 | import logging 11 | from tqdm_multiprocess.logger import setup_logger_tqdm 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def process_task( 17 | working_directory, output_directory, bucket_file_path, tqdm_func, global_tqdm 18 | ): 19 | command = f"zstd {bucket_file_path}" 20 | logger.info(command) 21 | subprocess.call(command, shell=True) 22 | 23 | compressed_file = bucket_file_path + ".zst" 24 | if output_directory: 25 | shutil.move(compressed_file, output_directory) 26 | 27 | os.remove(bucket_file_path) 28 | global_tqdm.update() 29 | 30 | 31 | def compress_and_move(working_directory, output_directory, process_count): 32 | os.makedirs(output_directory, exist_ok=True) 33 | original_info_file_path = os.path.join(working_directory, "info.json") 34 | assert os.path.exists(original_info_file_path) 35 | 36 | tasks = [] 37 | bucket_file_paths = glob.glob( 38 | os.path.join(working_directory, "output", f"*.bkt.txt.sorted") 39 | ) 40 | for bucket_file_path in bucket_file_paths: 41 | task = (process_task, (working_directory, output_directory, bucket_file_path)) 42 | tasks.append(task) 43 | 44 | pool = TqdmMultiProcessPool(process_count) 45 | 46 | def on_done(_): 47 | return None 48 | 49 | def on_error(_): 50 | return None 51 | 52 | global_progress = tqdm( 53 | total=len(bucket_file_paths), dynamic_ncols=True, unit="file" 54 | ) 55 | _ = pool.map(global_progress, tasks, on_error, on_done) 56 | 57 | shutil.copy(original_info_file_path, os.path.join(output_directory, "info.json")) 58 | 59 | 60 | parser = argparse.ArgumentParser(description="sort 13gram buckets") 61 | parser.add_argument("-dir", "--working_directory", required=True) 62 | parser.add_argument("-output", "--output_directory", required=True) 63 | parser.add_argument("-procs", "--process_count", type=int, default=8) 64 | 65 | if 
__name__ == "__main__": 66 | version = 1.00 67 | print(f"Running version {version}") 68 | 69 | logfile_path = "compress_and_package.log" 70 | setup_logger_tqdm(logfile_path) 71 | 72 | args = parser.parse_args() 73 | compress_and_move(args.working_directory, args.output_directory, args.process_count) 74 | -------------------------------------------------------------------------------- /scripts/clean_training_data/investigate_pile.py: -------------------------------------------------------------------------------- 1 | from lm_eval.decontamination.archiver import Reader 2 | import os 3 | import json 4 | from functools import reduce 5 | import glob 6 | import tqdm 7 | 8 | from tqdm_multiprocess import TqdmMultiProcessPool 9 | 10 | 11 | def get_file_stats(file_path, tqdm_func, global_tqdm): 12 | reader = Reader() 13 | total_documents = 0 14 | total_size = 0 15 | update_frequency = 10000 16 | current_file_position = 0 17 | 18 | with tqdm_func( 19 | total=os.path.getsize(file_path), dynamic_ncols=True, unit="byte", unit_scale=1 20 | ) as progress: 21 | for document in reader.read(file_path, get_meta=True): 22 | total_size += len(document) 23 | total_documents += 1 24 | 25 | if total_documents % update_frequency == 0: 26 | new_file_pos = reader.fh.tell() 27 | bytes_read = new_file_pos - current_file_position 28 | current_file_position = new_file_pos 29 | progress.update(bytes_read) 30 | global_tqdm.update(bytes_read) 31 | 32 | return (total_documents, total_size) 33 | 34 | 35 | def get_files(): 36 | directory = "pile" 37 | files = list(sorted(glob.glob(os.path.join(directory, "*.jsonl.zst*")))) 38 | print(files) 39 | return files 40 | 41 | 42 | def get_stats(): 43 | files = get_files() 44 | total_size_bytes = sum(map(lambda x: os.path.getsize(x), files)) 45 | 46 | pool = TqdmMultiProcessPool(4) 47 | global_tqdm = tqdm.tqdm( 48 | total=total_size_bytes, dynamic_ncols=True, unit="byte", unit_scale=1 49 | ) 50 | 51 | # Generate minhashes with pool 52 | tasks = [(get_file_stats, (file,)) for file in files] 53 | 54 | def on_done(_): 55 | return None 56 | 57 | def on_error(_): 58 | return None 59 | 60 | results = pool.map(global_tqdm, tasks, on_error, on_done) 61 | 62 | total_documents, total_size = reduce( 63 | lambda x, y: (x[0] + y[0], x[1] + y[1]), results 64 | ) 65 | 66 | start_offsets = [] 67 | current_offset = 0 68 | for file_document_count, _ in results: 69 | start_offsets.append(current_offset) 70 | current_offset += file_document_count 71 | 72 | return (total_documents, total_size, start_offsets) 73 | 74 | 75 | if __name__ == "__main__": 76 | version = 1.01 77 | print(f"Running version {version}") 78 | 79 | stats_file_path = "pile_statistics.json" 80 | if os.path.exists(stats_file_path): 81 | stats = json.load(open(stats_file_path, "r")) 82 | else: 83 | document_count, total_document_size_chars, start_offsets = get_stats() 84 | stats = { 85 | "Data": "Pile statistics", 86 | "Document Count": document_count, 87 | "Total Pile Characters": total_document_size_chars, 88 | "File Start Offsets": start_offsets, 89 | } 90 | json.dump(stats, open(stats_file_path, "w"), indent=4) 91 | 92 | print(f"document_count: {stats['Document Count']}") 93 | print(f"total_chars: {stats['Total Pile Characters']}") 94 | print(f"start_offsets: {stats['File Start Offsets']}") 95 | -------------------------------------------------------------------------------- /scripts/clean_training_data/process_sorted_buckets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Processes 
each sorted bucket, creating a new file listing all ngrams that matched more then 10 3 | unique documents with their unique document counts. Uses multiprocessing and very little memory 4 | as we stream from presorted buckets. Will use a lot of disk though. 5 | 6 | Arguments 7 | --------- 8 | --working_directory (-dir) 9 | Directory containing the sorted buckets, processed files will be deposited here. Default: current directory 10 | --move_dir (-move) 11 | Directory to move processed 13grams too. Default: Do nothing 12 | --process_count (-procs) 13 | Number of processes to use. Default: 4 14 | """ 15 | 16 | import argparse 17 | import glob 18 | import os 19 | from pathlib import Path 20 | import re 21 | import shutil 22 | 23 | from tqdm import tqdm 24 | from tqdm_multiprocess import TqdmMultiProcessPool 25 | 26 | from scripts.clean_training_data.archiver import TextReader, TextArchive 27 | 28 | import logging 29 | from tqdm_multiprocess.logger import setup_logger_tqdm 30 | 31 | logger = logging.getLogger(__name__) 32 | 33 | 34 | # Multiprocessed 35 | def process_bucket( 36 | bucket_file_path, processed_directory, move_dir, tqdm_func, global_tqdm 37 | ): 38 | 39 | bucket_id = re.sub("\D", "", os.path.basename(bucket_file_path)) # noqa: W605 40 | done_file = os.path.join( 41 | processed_directory, f"ngram_bucket_processing_{bucket_id}.done" 42 | ) 43 | if os.path.exists(done_file): 44 | logger.info(f"bucket {bucket_id} already processed, skipping") 45 | return 46 | 47 | # For managing tqdm 48 | file_size = os.path.getsize(bucket_file_path) 49 | bucket_progress = tqdm_func( 50 | total=file_size, dynamic_ncols=True, unit="byte", unit_scale=1 51 | ) 52 | current_file_position = 0 53 | update_frequency = 100 * 1000000 # 100mb 54 | update_counter = 0 55 | 56 | # Iterate through and output ngrams which occur in more then 10 documents 57 | bucket = TextReader(bucket_file_path) 58 | 59 | output_file_path = bucket_file_path + ".processed" 60 | output_archive = TextArchive(output_file_path, mode="wb") 61 | 62 | current_ngram = "" 63 | current_ngram_document_ids = set() 64 | for line in bucket.read(): 65 | [ngram, document_id] = line.rsplit(" ", 1) 66 | 67 | # Write ngram if more then 10 unique document occurrences 68 | if ngram != current_ngram: 69 | if len(current_ngram_document_ids) > 10: 70 | output_archive.add_data( 71 | f"{current_ngram} {len(current_ngram_document_ids)}" 72 | ) 73 | current_ngram = ngram 74 | current_ngram_document_ids = set() 75 | 76 | current_ngram_document_ids.add(document_id) 77 | 78 | # Update tqdm 79 | update_counter += bucket.fh.tell() - current_file_position 80 | current_file_position = bucket.fh.tell() 81 | if update_counter > update_frequency: 82 | bucket_progress.update(update_counter) 83 | update_counter = 0 84 | 85 | # Remainder 86 | if len(current_ngram_document_ids) > 10: 87 | output_archive.add_data(f"{current_ngram} {len(current_ngram_document_ids)}") 88 | 89 | output_archive.commit() 90 | Path(done_file).touch() 91 | 92 | if move_dir: 93 | shutil.move(output_file_path, move_dir) 94 | 95 | global_tqdm.update() 96 | 97 | 98 | def process_sorted_buckets(working_directory, move_dir, process_count): 99 | bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt.sorted")) 100 | processed_directory = os.path.join(working_directory, "processed") 101 | os.makedirs(processed_directory, exist_ok=True) 102 | 103 | pool = TqdmMultiProcessPool(process_count) 104 | tasks = [ 105 | (process_bucket, (bucket_file, processed_directory, move_dir)) 106 | for 
bucket_file in bucket_file_paths 107 | ] 108 | 109 | global_tqdm = tqdm(total=len(bucket_file_paths), dynamic_ncols=True, unit="bucket") 110 | 111 | def on_done(_): 112 | return None 113 | 114 | def on_error(_): 115 | return None 116 | 117 | _ = pool.map(global_tqdm, tasks, on_error, on_done) 118 | 119 | 120 | parser = argparse.ArgumentParser(description="Process 13 grams from sorted buckets.") 121 | parser.add_argument("-dir", "--working_directory", default="") 122 | parser.add_argument("-move", "--move_dir", default="") 123 | parser.add_argument("-procs", "--process_count", type=int, default=4) 124 | 125 | if __name__ == "__main__": 126 | 127 | logfile_path = "process13grams.log" 128 | setup_logger_tqdm(logfile_path) 129 | 130 | args = parser.parse_args() 131 | process_sorted_buckets(args.working_directory, args.move_dir, args.process_count) 132 | -------------------------------------------------------------------------------- /scripts/clean_training_data/sort_13_gram_buckets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Iteratively runs gnu sort on each bucket, uses up to 8 cores. 3 | 4 | Arguments 5 | --------- 6 | --working_directory (-dir) 7 | Directory containing the bucketed 13-grams. Sorted buckets will be deposited in the same 8 | directory and the unsorted buckets are removed after. 9 | """ 10 | 11 | import glob 12 | import argparse 13 | import os 14 | import signal 15 | from signal import SIGINT 16 | import subprocess 17 | 18 | from tqdm import tqdm 19 | 20 | import logging 21 | from tqdm_multiprocess.logger import setup_logger_tqdm 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | terminate = False 26 | 27 | 28 | def handler(signal_received, frame): 29 | global terminate 30 | terminate = True 31 | 32 | 33 | def sort_13_gram_buckets(working_directory): 34 | bucket_file_paths = glob.glob(os.path.join(working_directory, f"*.bkt.txt")) 35 | 36 | for bucket_file_path in tqdm(bucket_file_paths, dynamic_ncols=True): 37 | sorted_file_path = bucket_file_path + ".sorted" 38 | command = f"sort {bucket_file_path} > {sorted_file_path}" 39 | logger.info(command) 40 | subprocess.call(command, shell=True) 41 | 42 | if terminate: 43 | return 44 | 45 | os.remove(bucket_file_path) 46 | 47 | 48 | parser = argparse.ArgumentParser(description="sort 13gram buckets") 49 | parser.add_argument("-dir", "--working_directory", default="") 50 | 51 | if __name__ == "__main__": 52 | 53 | version = 1.00 54 | print(f"Running version {version}") 55 | 56 | # Handle sigint (ctrl-c) cleanly 57 | previous_signal_int = signal.signal(SIGINT, handler) 58 | 59 | logfile_path = "sort13grambuckets.log" 60 | setup_logger_tqdm(logfile_path) 61 | 62 | args = parser.parse_args() 63 | sort_13_gram_buckets(args.working_directory) 64 | -------------------------------------------------------------------------------- /scripts/cost_estimate.py: -------------------------------------------------------------------------------- 1 | import random 2 | import transformers 3 | from lm_eval import tasks, evaluator 4 | from lm_eval.base import LM 5 | 6 | 7 | class DryrunLM(LM): 8 | def __init__(self): 9 | self.tokencost = 0 10 | self.tokenizer = transformers.GPT2TokenizerFast.from_pretrained("gpt2") 11 | self.tokenizer.pad_token = "<|endoftext|>" 12 | 13 | @classmethod 14 | def create_from_arg_string(cls, arg_string): 15 | return cls() 16 | 17 | def loglikelihood(self, requests): 18 | res = [] 19 | 20 | for ctx, cont in requests: 21 | res.append((-random.random(), False)) 22 | 
self.tokencost += len(self.tokenizer.tokenize(ctx + cont)) 23 | 24 | return res 25 | 26 | def greedy_until(self, requests): 27 | res = [] 28 | 29 | for ctx, until in requests: 30 | res.append("lol") 31 | 32 | # assume worst case - generates until 256 33 | self.tokencost += len(self.tokenizer.tokenize(ctx)) + 256 34 | 35 | return res 36 | 37 | def loglikelihood_rolling(self, requests): 38 | res = [] 39 | 40 | for (s,) in requests: 41 | # assume worst case: extra full context 42 | self.tokencost += len(self.tokenizer.tokenize(s)) + 2048 43 | 44 | return res 45 | 46 | 47 | def main(): 48 | lm = DryrunLM() 49 | 50 | task_list = "arc_challenge,arc_easy,boolq,cola,copa,headqa,hellaswag,lambada,logiqa,mathqa,mc_taco,mrpc,multirc,openbookqa,piqa,prost,pubmedqa,qnli,qqp,race,record,rte,sciq,sst,triviaqa,webqs,wic,wikitext,winogrande,wnli,wsc" 51 | values = [] 52 | for taskname in task_list.split(","): 53 | lm.tokencost = 0 54 | evaluator.evaluate( 55 | lm=lm, 56 | task_dict={taskname: tasks.get_task(taskname)()}, 57 | num_fewshot=0, 58 | limit=None, 59 | bootstrap_iters=10, 60 | description_dict=None, 61 | ) 62 | 63 | print(taskname, lm.tokencost) 64 | values.append( 65 | [ 66 | taskname, 67 | lm.tokencost, 68 | lm.tokencost / 1000 * 0.0008, 69 | lm.tokencost / 1000 * 0.0012, 70 | lm.tokencost / 1000 * 0.006, 71 | lm.tokencost / 1000 * 0.06, 72 | ] 73 | ) 74 | from pytablewriter import MarkdownTableWriter 75 | 76 | writer = MarkdownTableWriter() 77 | writer.headers = ["Task", "Tokens", "Ada", "Babbage", "Curie", "Davinci"] 78 | 79 | values.sort(key=lambda x: -x[1]) 80 | totcost = sum([x[1] for x in values]) 81 | values.append( 82 | [ 83 | "**Total**", 84 | totcost, 85 | totcost / 1000 * 0.0008, 86 | totcost / 1000 * 0.0012, 87 | totcost / 1000 * 0.006, 88 | totcost / 1000 * 0.06, 89 | ] 90 | ) 91 | 92 | writer.value_matrix = values 93 | 94 | print(writer.dumps()) 95 | 96 | 97 | if __name__ == "__main__": 98 | main() 99 | -------------------------------------------------------------------------------- /scripts/get_prompts.py: -------------------------------------------------------------------------------- 1 | from lm_eval import tasks 2 | from itertools import islice 3 | 4 | ct = 3 5 | 6 | for ( 7 | tname, 8 | Task, 9 | ) in tasks.TASK_REGISTRY.items(): # [('record', tasks.superglue.ReCoRD)]:# 10 | task = Task() 11 | 12 | print("#", tname) 13 | docs = islice( 14 | task.validation_docs() if task.has_validation_docs() else task.test_docs(), ct 15 | ) 16 | print() 17 | for i in range(ct): 18 | print() 19 | doc = next(docs) 20 | print("**Context**:", "\n```\n" + task.doc_to_text(doc) + "\n```\n") 21 | print() 22 | print("**Target**:", "\n```\n" + task.doc_to_target(doc) + "\n```\n") 23 | print() 24 | -------------------------------------------------------------------------------- /scripts/make_gpt2_test_cases.py: -------------------------------------------------------------------------------- 1 | import transformers 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | import random 6 | 7 | random.seed(42) 8 | 9 | 10 | data = [ 11 | "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", 12 | "The term MLP is used ambiguously, sometimes loosely to any feedforward ANN, sometimes strictly to refer to networks composed of multiple layers of perceptrons (with threshold activation); see § Terminology", 13 | 'Multilayer perceptrons are sometimes colloquially referred to as "vanilla" neural networks, especially when they have a single hidden layer.[1]', 14 | "An MLP 
consists of at least three layers of nodes: an input layer, a hidden layer and an output layer. Except for the input nodes, each node is a neuron that uses a nonlinear activation function.", 15 | "MLP utilizes a supervised learning technique called backpropagation for training.[2][3] Its multiple layers and non-linear activation distinguish MLP from a linear perceptron. It can distinguish data that is not linearly separable.[4]", 16 | "Recent work has demonstrated substantial gains on many NLP tasks and benchmarks by pre-training on a large corpus of text followed by fine-tuning on a specific task. While typically task-agnostic in architecture, this method still requires task-specific fine-tuning datasets of thousands or tens of thousands of examples. By contrast, humans can generally perform a new language task from only a few examples or from simple instructions - something which current NLP systems still largely struggle to do. Here we show that scaling up language models greatly improves task-agnostic, few-shot performance, sometimes even reaching competitiveness with prior state-of-the-art fine-tuning approaches. ", 17 | "Specifically, we train GPT-3, an autoregressive language model with 175 billion parameters, 10x more than any previous non-sparse language model, and test its performance in the few-shot setting. For all tasks, GPT-3 is applied without any gradient updates or fine-tuning, with tasks and few-shot demonstrations specified purely via text interaction with the model. GPT-3 achieves strong performance on many NLP datasets, including translation, question-answering, and cloze tasks, as well as several tasks that require on-the-fly reasoning or domain adaptation, such as unscrambling words, using a novel word in a sentence, or performing 3-digit arithmetic. At the same time, we also identify some datasets where GPT-3's few-shot learning still struggles, as well as some datasets where GPT-3 faces methodological issues related to training on large web corpora. Finally, we find that GPT-3 can generate samples of news articles which human evaluators have difficulty distinguishing from articles written by humans. 
We discuss broader societal impacts of this finding and of GPT-3 in general.", 18 | "A multilayer perceptron (MLP) is a class of feedforward artificial neural network (ANN)", 19 | "Hello World", 20 | ] 21 | 22 | 23 | model = transformers.GPT2LMHeadModel.from_pretrained("gpt2") 24 | tok = transformers.GPT2Tokenizer.from_pretrained("gpt2") 25 | 26 | tgs = [] 27 | 28 | for dat in data: 29 | random.seed(dat) 30 | # print(model(tok.encode(dat, return_tensors="pt"))[0][0]) 31 | 32 | toks = tok.encode(dat, return_tensors="pt") 33 | ind = random.randrange(len(toks[0]) - 1) 34 | logits = F.log_softmax(model(toks)[0], dim=-1)[:, :-1] # [batch, seq, vocab] 35 | 36 | res = torch.gather(logits, 2, toks[:, 1:].unsqueeze(-1)).squeeze(-1)[0] 37 | 38 | tgs.append(float(res[ind:].sum())) 39 | print( 40 | r'("""' 41 | + tok.decode(toks[0, : ind + 1]) 42 | + r'""", """' 43 | + tok.decode(toks[0, ind + 1 :]) 44 | + r'"""), ' 45 | ) 46 | 47 | print(tgs) 48 | -------------------------------------------------------------------------------- /scripts/make_table_tasks.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python make_table_tasks.py --output 4 | """ 5 | import argparse 6 | import logging 7 | from lm_eval import tasks 8 | from pytablewriter import MarkdownTableWriter 9 | 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | def check(tf): 16 | if tf: 17 | return "✓" 18 | else: 19 | return " " 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--output", type=str, default="task_table.md") 25 | args = parser.parse_args() 26 | 27 | writer = MarkdownTableWriter() 28 | writer.headers = ["Task Name", "Train", "Val", "Test", "Val/Test Docs", "Metrics"] 29 | values = [] 30 | 31 | tasks = tasks.TASK_REGISTRY.items() 32 | tasks = sorted(tasks, key=lambda x: x[0]) 33 | for tname, Task in tasks: 34 | task = Task() 35 | v = [ 36 | tname, 37 | check(task.has_training_docs()), 38 | check(task.has_validation_docs()), 39 | check(task.has_test_docs()), 40 | len( 41 | list( 42 | task.test_docs() if task.has_test_docs() else task.validation_docs() 43 | ) 44 | ), 45 | ", ".join(task.aggregation().keys()), 46 | ] 47 | logger.info(v) 48 | values.append(v) 49 | writer.value_matrix = values 50 | table = writer.dumps() 51 | with open(args.output, "w") as f: 52 | f.write(table) 53 | -------------------------------------------------------------------------------- /scripts/write_out.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import json 4 | import os 5 | import random 6 | from lm_eval import tasks 7 | from lm_eval.utils import join_iters 8 | 9 | EXAMPLE_DIVIDER = "!!@@##@@!! 
-- Example {i}\n" 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--output_base_path", required=True) 15 | parser.add_argument("--tasks", default="all_tasks") 16 | parser.add_argument("--provide_description", action="store_true") 17 | parser.add_argument("--sets", type=str, default="val") # example: val,test 18 | parser.add_argument("--num_fewshot", type=int, default=1) 19 | parser.add_argument("--seed", type=int, default=42) 20 | parser.add_argument("--num_examples", type=int, default=1) 21 | parser.add_argument("--description_dict_path", default=None) 22 | return parser.parse_args() 23 | 24 | 25 | def main(): 26 | args = parse_args() 27 | np.random.seed(args.seed) 28 | 29 | if args.tasks == "all_tasks": 30 | task_names = tasks.ALL_TASKS 31 | else: 32 | task_names = args.tasks.split(",") 33 | task_dict = tasks.get_task_dict(task_names) 34 | 35 | description_dict = {} 36 | if args.description_dict_path: 37 | with open(args.description_dict_path, "r") as f: 38 | description_dict = json.load(f) 39 | 40 | os.makedirs(args.output_base_path, exist_ok=True) 41 | for task_name, task in task_dict.items(): 42 | rnd = random.Random() 43 | rnd.seed(args.seed) 44 | 45 | iters = [] 46 | 47 | for set in args.sets.split(","): 48 | if set == "train" and task.has_training_docs(): 49 | docs = task.training_docs() 50 | if set == "val" and task.has_validation_docs(): 51 | docs = task.validation_docs() 52 | if set == "test" and task.has_test_docs(): 53 | docs = task.test_docs() 54 | iters.append(docs) 55 | 56 | docs = join_iters(iters) 57 | 58 | description = ( 59 | description_dict[task_name] 60 | if description_dict and task_name in description_dict 61 | else "" 62 | ) 63 | 64 | with open(os.path.join(args.output_base_path, task_name), "w") as f: 65 | for i, doc in ( 66 | zip(range(args.num_examples), docs) 67 | if args.num_examples > 0 68 | else enumerate(docs) 69 | ): 70 | f.write(EXAMPLE_DIVIDER.format(i=i)) 71 | ctx = task.fewshot_context( 72 | doc=doc, 73 | num_fewshot=args.num_fewshot, 74 | rnd=rnd, 75 | description=description, 76 | ) 77 | f.write(ctx + "\n") 78 | 79 | 80 | if __name__ == "__main__": 81 | main() 82 | -------------------------------------------------------------------------------- /templates/new_multiple_choice_task.py: -------------------------------------------------------------------------------- 1 | # TODO: Remove all TODO comments once the implementation is complete. 2 | """ 3 | TODO: Add the Paper Title on this line. 4 | TODO: Add the paper's PDF URL (preferably from arXiv) on this line. 5 | 6 | TODO: Write a Short Description of the task. 7 | 8 | Homepage: TODO: Add the URL to the task's Homepage here. 9 | """ 10 | from lm_eval.base import MultipleChoiceTask 11 | 12 | 13 | # TODO: Add the BibTeX citation for the task. 14 | _CITATION = """ 15 | """ 16 | 17 | 18 | # TODO: Replace `NewTask` with the name of your Task. 19 | class NewTask(MultipleChoiceTask): 20 | VERSION = 0 21 | # TODO: Add the `DATASET_PATH` string. This will be the name of the `Task` 22 | # dataset as denoted in HuggingFace `datasets`. 23 | DATASET_PATH = "" 24 | # TODO: Add the `DATASET_NAME` string. This is the name of a subset within 25 | # `DATASET_PATH`. If there aren't specific subsets you need, leave this as `None`. 26 | DATASET_NAME = None 27 | 28 | def has_training_docs(self): 29 | # TODO: Fill in the return with `True` if the Task has training data; else `False`. 
30 | return False 31 | 32 | def has_validation_docs(self): 33 | # TODO: Fill in the return with `True` if the Task has validation data; else `False`. 34 | return False 35 | 36 | def has_test_docs(self): 37 | # TODO: Fill in the return with `True` if the Task has test data; else `False`. 38 | return False 39 | 40 | def training_docs(self): 41 | if self.has_training_docs(): 42 | # We cache training documents in `self._training_docs` for faster 43 | # few-shot processing. If the data is too large to fit in memory, 44 | # return the training data as a generator instead of a list. 45 | if self._training_docs is None: 46 | # TODO: Return the training document generator from `self.dataset`. 47 | # In most case you can leave this as is unless the dataset split is 48 | # named differently than the default `"train"`. 49 | self._training_docs = list( 50 | map(self._process_doc, self.dataset["train"]) 51 | ) 52 | return self._training_docs 53 | 54 | def validation_docs(self): 55 | if self.has_validation_docs(): 56 | # TODO: Return the validation document generator from `self.dataset`. 57 | # In most case you can leave this as is unless the dataset split is 58 | # named differently than the default `"validation"`. 59 | return map(self._process_doc, self.dataset["validation"]) 60 | 61 | def test_docs(self): 62 | if self.has_test_docs(): 63 | # TODO: Return the test document generator from `self.dataset`. 64 | # In most case you can leave this as is unless the dataset split is 65 | # named differently than the default `"test"`. 66 | return map(self._process_doc, self.dataset["test"]) 67 | 68 | def _process_doc(self, doc): 69 | # TODO: Process the documents into a dictionary with the following keys: 70 | return { 71 | "query": "", # The query prompt. 72 | "choices": [], # The list of choices. 73 | "gold": 0, # The integer used to index into the correct element of `"choices"`. 74 | } 75 | 76 | def doc_to_text(self, doc): 77 | # TODO: Format the query prompt portion of the document example. 78 | return doc["query"] 79 | --------------------------------------------------------------------------------