├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── README.md ├── arguments.py ├── assets ├── benchmark_overview.png ├── logo.jpeg └── task_correlation.png ├── claude.tokenizer.json ├── configs ├── alce_nocite.yaml ├── alce_nocite_short.yaml ├── cite.yaml ├── cite_short.yaml ├── icl.yaml ├── icl_short.yaml ├── longqa.yaml ├── longqa_short.yaml ├── niah.yaml ├── niah_long.yaml ├── rag.yaml ├── rag_short.yaml ├── rag_vllm.yaml ├── recall.yaml ├── recall_demo.yaml ├── recall_short.yaml ├── recall_vllm.yaml ├── rerank.yaml ├── rerank_short.yaml ├── ruler.yaml ├── ruler_short.yaml ├── summ.yaml └── summ_short.yaml ├── data.py ├── eval.py ├── eval_alce.py ├── longproc_addon ├── README.md ├── __init__.py ├── configs │ ├── countdown.yaml │ ├── html_to_tsv.yaml │ ├── path_traversal.yaml │ ├── pseudo_to_code.yaml │ ├── tom_tracking.yaml │ └── travel_planning.yaml └── longproc_helmet_loader.py ├── model_utils.py ├── prompts ├── asqa_nocite.json ├── asqa_revised.json ├── qampari_nocite.json └── qampari_revised.json ├── requirements.txt ├── scripts ├── collect_results.py ├── download_data.sh ├── eval_gpt4_longqa.py ├── eval_gpt4_longqa.sh ├── eval_gpt4_summ.py ├── eval_gpt4_summ.sh ├── generate_configs.py ├── run_api.sh ├── run_eval.sh ├── run_eval_hf_endpoint.sh ├── run_eval_slurm.sh ├── run_eval_tgi.sh ├── run_eval_vllm_gaudi.sh ├── run_short_slurm.sh └── vllm-gaudi │ ├── build_image.sh │ ├── compose.yaml │ └── launch_container.sh ├── utils.py └── visualization.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | .*.swo 3 | output/ 4 | env/ 5 | *_env/ 6 | joblog/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | #.idea/ 168 | 169 | data 170 | output 171 | configs/_* 172 | slurm 173 | 174 | *.ipynb 175 | gty* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "longproc_addon/longproc"] 2 | path = longproc_addon/longproc 3 | url = https://github.com/princeton-pli/LongProc.git 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes will be documented in this file. 3 | 4 | ## 2025-02-25 5 | 6 | In this version, we make some significant improvements to reduce the cost of running the experiments. 7 | 8 | - Add support for batch evaluation 9 | - For OpenAI and Anthropic, we use their batch API to reduce the cost of API calls by 50% ([OpenAI documentation](https://cookbook.openai.com/examples/batch_processing), [Anthropic documentation](https://docs.anthropic.com/en/docs/build-with-claude/message-batches)). The model-based evaluation script have also been updated to reduce cost. 10 | - For other API providers, we use a simple multi-threading approach to parallelize the API calls 11 | - For open-source models, we use batching from the VLLM library for more speed-up. 
12 | - Changes to the datasets pre-processing — the paper will be updated in a future version. 13 | - ICL datasets now evaluate 500 samples instead of 100, use a different set of demonstrations for each test instance, and we balance the number of test labels—this is to make the evaluation more consistent and robust. 14 | - RAG, Re-ranking, and Citation use `hashlib` for consistent hashing 15 | - Visualization jupyter notebook for plotting results. 16 | - Support for SGLang, which can be faster for certain supported models. 17 | - Support for reasoning models, such as DeepSeek's R1 models, where we parse out the reasoning steps from the model's output. 18 | - Other minor changes, such as adding documentation. 19 | 20 | ## 2024-10-04 21 | 22 | Thanks to @8188zq and @chtmp223 for pointing out some issues in the current repo: some results are not fully reproducible due to random seeding problems. 23 | This affects the results for ICL, Re-reranking, and the RAG tasks, where the demo sample for each question may differ. 24 | We have updated the code to fix this issue, and will update the results on the paper and the spreadsheet soon to reflect the changes. 25 | Specifically, we make sure the seeding is consistent across runs and independent of system settings. 26 | 27 | Other minor changes: 28 | - Clean up `data.py` and remove unused code. 29 | - Update argument descriptions. 30 | - Log exceptions -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Princeton Natural Language Processing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import ast 4 | import os 5 | 6 | import logging 7 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 8 | datefmt='%m/%d/%Y %H:%M:%S') 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | def parse_arguments(): 14 | parser = argparse.ArgumentParser(description="evaluation on downstream tasks") 15 | parser.add_argument("--config", type=str, default=None, help="path to config file") 16 | parser.add_argument("--tag", type=str, default="eval", help="tag to add to the output file") 17 | 18 | # model setting 19 | parser.add_argument("--model_name_or_path", type=str, default=None) 20 | parser.add_argument("--use_vllm", action="store_true", help="whether to use vllm engine") 21 | parser.add_argument("--use_sglang", action="store_true", help="whether to use sglang engine") 22 | parser.add_argument("--use_vllm_serving", action="store_true", help="whether to use vllm serving engine") 23 | parser.add_argument("--use_tgi_serving", action="store_true", help="whether to use tgi serving engine") 24 | parser.add_argument("--endpoint_url", type=str,default="http://localhost:8080/v1/", help="endpoint url for tgi or vllm serving engine") 25 | parser.add_argument("--api_key", type=str, default="EMPTY", help="api key for model endpoint") 26 | 27 | # data settings 28 | parser.add_argument("--datasets", type=str, default=None, help="comma separated list of dataset names") 29 | parser.add_argument("--demo_files", type=str, default=None, help="comma separated list of demo files") 30 | parser.add_argument("--test_files", type=str, default=None, help="comma separated list of test files") 31 | parser.add_argument("--output_dir", type=str, default=None, help="path to save the predictions") 32 | parser.add_argument("--overwrite", action="store_true", help="whether to the saved file") 33 | parser.add_argument("--max_test_samples", type=int, default=None) 34 | parser.add_argument("--num_workers", type=int, default=4, help="number of workers for data loading") 35 | 36 | # dataset specific settings 37 | parser.add_argument("--popularity_threshold", type=int, default=3, help="popularity threshold for popqa, in log scale") 38 | 39 | # evaluation settings 40 | parser.add_argument("--shots", type=int, default=2, help="total number of ICL demos") 41 | parser.add_argument("--input_max_length", type=str, default='8192', help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets") 42 | 43 | # generation settings 44 | parser.add_argument("--do_sample", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use sampling (false is greedy), overwrites temperature") 45 | parser.add_argument("--generation_max_length", type=str, default='10', help="max number of tokens to generate, can be separated by comma to match the specified datasets") 46 | parser.add_argument("--generation_min_length", type=int, default=0, help="min number of tokens to generate") 47 | parser.add_argument("--temperature", type=float, default=0.0, help="generation temperature") 48 | parser.add_argument("--top_p", type=float, default=1.0, help="top-p parameter for nucleus sampling") 49 | parser.add_argument("--stop_newline", type=ast.literal_eval, choices=[True, False], 
default=False, help="whether to stop generation at newline") 50 | parser.add_argument("--system_message", type=str, default=None, help="system message to add to the beginning of context") 51 | 52 | # model specific settings 53 | parser.add_argument("--seed", type=int, default=42, help="random seed") 54 | parser.add_argument("--no_cuda", action="store_true", help="disable cuda") 55 | parser.add_argument("--no_bf16", action="store_true", help="disable bf16 and use fp32") 56 | parser.add_argument("--no_torch_compile", action="store_true", help="disable torchcompile") 57 | parser.add_argument("--use_chat_template", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use chat template") 58 | parser.add_argument("--rope_theta", type=int, default=None, help="override rope theta") 59 | parser.add_argument("--thinking", action="store_true", help="for reasoning models (e.g., Deepseek-r1), when this is set, we allow the model to generate an additional 32k tokens and exclude all texts between * from the output for evaluation") 60 | 61 | # misc 62 | parser.add_argument("--debug", action="store_true", help="for debugging") 63 | parser.add_argument("--count_tokens", action="store_true", help="instead of running generation, just count the number of tokens (only for HF models not API)") 64 | 65 | args = parser.parse_args() 66 | config = yaml.safe_load(open(args.config)) if args.config is not None else {} 67 | parser.set_defaults(**config) 68 | args = parser.parse_args() 69 | 70 | if args.output_dir is None: 71 | args.output_dir = f"output/{os.path.basename(args.model_name_or_path)}" 72 | 73 | if args.rope_theta is not None: 74 | args.output_dir = args.output_dir + f"-override-rope{args.rope_theta}" 75 | 76 | if not args.do_sample and args.temperature != 0.0: 77 | args.temperature = 0.0 78 | logger.info("overwriting temperature to 0.0 since do_sample is False") 79 | 80 | return args 81 | -------------------------------------------------------------------------------- /assets/benchmark_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/assets/benchmark_overview.png -------------------------------------------------------------------------------- /assets/logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/assets/logo.jpeg -------------------------------------------------------------------------------- /assets/task_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/assets/task_correlation.png -------------------------------------------------------------------------------- /configs/alce_nocite.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: '131072' 2 | datasets: alce_asqa_nocite_700 3 | generation_max_length: 600 4 | test_files: data/alce/asqa_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_nocite.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/alce_nocite_short.yaml: 
-------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536 2 | datasets: alce_asqa_nocite_30,alce_asqa_nocite_75,alce_asqa_nocite_165,alce_asqa_nocite_345 3 | generation_max_length: 600 4 | test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_nocite.json,prompts/asqa_nocite.json,prompts/asqa_nocite.json,prompts/asqa_nocite.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/cite.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072 2 | datasets: alce_asqa_700,alce_qampari_700 3 | generation_max_length: 300,300 4 | test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_revised.json,prompts/qampari_revised.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/cite_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: alce_asqa_30,alce_asqa_75,alce_asqa_165,alce_asqa_345,alce_qampari_30,alce_qampari_75,alce_qampari_165,alce_qampari_345 3 | generation_max_length: 300,300,300,300,300,300,300,300 4 | test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/icl.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072,131072 2 | datasets: icl_trec_coarse_6600shot_balance,icl_trec_fine_6400shot_balance,icl_banking77_5900shot_balance,icl_clinic150_7050shot_balance,icl_nlu_8296shot_balance 3 | generation_max_length: 20,20,20,20,20 4 | test_files: ',,,,' 5 | demo_files: ',,,,' 6 | use_chat_template: false 7 | max_test_samples: 500 8 | shots: 0 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/icl_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: 
icl_trec_coarse_400shot_balance,icl_trec_coarse_800shot_balance,icl_trec_coarse_1600shot_balance,icl_trec_coarse_3300shot_balance,icl_trec_fine_400shot_balance,icl_trec_fine_800shot_balance,icl_trec_fine_1600shot_balance,icl_trec_fine_3200shot_balance,icl_banking77_360shot_balance,icl_banking77_720shot_balance,icl_banking77_1450shot_balance,icl_banking77_2900shot_balance,icl_clinic150_440shot_balance,icl_clinic150_880shot_balance,icl_clinic150_1750shot_balance,icl_clinic150_3525shot_balance,icl_nlu_510shot_balance,icl_nlu_1020shot_balance,icl_nlu_2040shot_balance,icl_nlu_4080shot_balance 3 | generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 4 | test_files: ',,,,,,,,,,,,,,,,,,,' 5 | demo_files: ',,,,,,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 500 8 | shots: 0 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/longqa.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072 2 | datasets: narrativeqa_130772,infbench_qa_eng_130862,infbench_choice_eng_130862 3 | generation_max_length: 100,10,10 4 | test_files: ',,' 5 | demo_files: ',,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/longqa_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: narrativeqa_7892,narrativeqa_16084,narrativeqa_32468,narrativeqa_65236,infbench_qa_eng_7982,infbench_qa_eng_16174,infbench_qa_eng_32558,infbench_qa_eng_65326,infbench_choice_eng_7982,infbench_choice_eng_16174,infbench_choice_eng_32558,infbench_choice_eng_65326 3 | generation_max_length: 100,100,100,100,10,10,10,10,10,10,10,10 4 | test_files: ',,,,,,,,,,,' 5 | demo_files: ',,,,,,,,,,,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/niah.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072 2 | datasets: ruler_niah_s_2 3 | generation_max_length: 50 4 | test_files: data/ruler/niah_single_2/validation_131072.jsonl 5 | demo_files: '' 6 | -------------------------------------------------------------------------------- /configs/niah_long.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072 2 | datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2 3 | generation_max_length: 50,50,50,50,50,50,50,50,50,50,100,100,100,100,50,50,100,100,50,50,50,50,50,50,50,50 4 | test_files: 
data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl 5 | demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rag.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3 3 | generation_max_length: 20,20,20,20 4 | test_files: data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k1000_dep3.jsonl,data/kilt/popqa_test_1000_k1000_dep6.jsonl 5 | demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rag_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: kilt_nq,kilt_nq,kilt_nq,kilt_nq,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3 3 | generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 4 | test_files: 
data/kilt/nq-dev-multikilt_1000_k50_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k105_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k220_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k440_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k50_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k105_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k220_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k440_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k50_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k105_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k220_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k440_dep3.jsonl,data/kilt/popqa_test_1000_k50_dep6.jsonl,data/kilt/popqa_test_1000_k105_dep6.jsonl,data/kilt/popqa_test_1000_k220_dep6.jsonl,data/kilt/popqa_test_1000_k440_dep6.jsonl 5 | demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rag_vllm.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3 3 | generation_max_length: 20,20,20,20 4 | test_files: data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k1000_dep3.jsonl,data/kilt/popqa_test_1000_k1000_dep6.jsonl 5 | demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.3-70B-Instruct 11 | output_dir: output/vllm-gaudi/Llama-3.3-70B-Instruct 12 | use_vllm_serving: true 13 | -------------------------------------------------------------------------------- /configs/recall.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv 3 | generation_max_length: 50,100,50,100 4 | test_files: data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/json_kv/test_k1800_dep6.jsonl 5 | demo_files: ',,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | 
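The comma-separated fields in these configs are positionally aligned: the i-th entries of `datasets`, `test_files`, `demo_files`, `input_max_length`, and `generation_max_length` together define the i-th run, and a single value is broadcast to every dataset. Below is a minimal illustrative sketch of that expansion, mirroring the splitting and zipping done in eval.py's main() (shown later in this dump); the helper names `expand_config` and `broadcast` are hypothetical and not part of the repo.

import yaml

def expand_config(path):
    # Hypothetical helper for illustration only; the actual logic lives in eval.py's main().
    with open(path) as f:
        cfg = yaml.safe_load(f)
    datasets = str(cfg["datasets"]).split(",")
    test_files = str(cfg["test_files"]).split(",")
    demo_files = str(cfg["demo_files"]).split(",")

    def broadcast(value):
        # A single value applies to every dataset; a comma-separated list is matched positionally.
        parts = str(value).split(",")
        return [int(parts[0])] * len(datasets) if len(parts) == 1 else [int(p) for p in parts]

    max_lengths = broadcast(cfg["input_max_length"])
    gen_lengths = broadcast(cfg["generation_max_length"])
    assert len(datasets) == len(test_files) == len(demo_files) == len(max_lengths) == len(gen_lengths)
    return list(zip(datasets, test_files, demo_files, max_lengths, gen_lengths))

For configs/recall.yaml above, this yields four runs, one per dataset, each with an input_max_length of 131072 and its own generation_max_length.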
-------------------------------------------------------------------------------- /configs/recall_demo.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192 2 | datasets: ruler_niah_mk_2 3 | generation_max_length: 50 4 | test_files: data/ruler/niah_multikey_2/validation_8192.jsonl 5 | demo_files: '' 6 | use_chat_template: true 7 | max_test_samples: 5 8 | shots: 2 9 | top_p: 0.95 # need to be >0 and <1 10 | stop_new_line: false 11 | model_name_or_path: tgi:meta-llama/Llama-3.2-1B-Instruct 12 | output_dir: output/tgi/meta-llama/Llama-3.2-1B-Instruct 13 | use_tgi_serving: true -------------------------------------------------------------------------------- /configs/recall_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,json_kv,json_kv,json_kv,json_kv 3 | generation_max_length: 50,50,50,50,100,100,100,100,50,50,50,50,100,100,100,100 4 | test_files: data/ruler/niah_multikey_2/validation_8192.jsonl,data/ruler/niah_multikey_2/validation_16384.jsonl,data/ruler/niah_multikey_2/validation_32768.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_8192.jsonl,data/ruler/niah_multikey_3/validation_16384.jsonl,data/ruler/niah_multikey_3/validation_32768.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multivalue/validation_8192.jsonl,data/ruler/niah_multivalue/validation_16384.jsonl,data/ruler/niah_multivalue/validation_32768.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/json_kv/test_k105_dep6.jsonl,data/json_kv/test_k220_dep6.jsonl,data/json_kv/test_k440_dep6.jsonl,data/json_kv/test_k900_dep6.jsonl 5 | demo_files: ',,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/recall_vllm.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv 3 | generation_max_length: 50,100,50,100 4 | test_files: data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/json_kv/test_k1800_dep6.jsonl 5 | demo_files: ',,,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.3-70B-Instruct 11 | output_dir: output/vllm-gaudi/Llama-3.3-70B-Instruct 12 | use_vllm_serving: true 13 | -------------------------------------------------------------------------------- /configs/rerank.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: '131072' 2 | datasets: msmarco_rerank_psg 3 | generation_max_length: '200' 4 | test_files: data/msmarco/test_reranking_data_k1000_dep3.jsonl 5 | demo_files: data/msmarco/test_reranking_data_k10_dep3.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: 
meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rerank_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536 2 | datasets: msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg 3 | generation_max_length: 200,200,200,200 4 | test_files: data/msmarco/test_reranking_data_k50_dep3.jsonl,data/msmarco/test_reranking_data_k130_dep3.jsonl,data/msmarco/test_reranking_data_k285_dep3.jsonl,data/msmarco/test_reranking_data_k600_dep3.jsonl 5 | demo_files: data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/ruler.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072,131072,131072,131072,131072,131072,131072,131072,131072,131072 2 | datasets: ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mv,ruler_cwe,ruler_fwe,ruler_vt,ruler_qa_1,ruler_qa_2 3 | generation_max_length: 50,50,50,50,50,100,100,50,100,50,50,50,50 4 | test_files: data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_131072.jsonl 5 | demo_files: ',,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/ruler_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2,ruler_qa_2,ruler_qa_2 3 | 
generation_max_length: 50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,100,100,100,100,100,100,100,100,50,50,50,50,100,100,100,100,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50 4 | test_files: data/ruler/niah_single_1/validation_8192.jsonl,data/ruler/niah_single_1/validation_16384.jsonl,data/ruler/niah_single_1/validation_32768.jsonl,data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_2/validation_8192.jsonl,data/ruler/niah_single_2/validation_16384.jsonl,data/ruler/niah_single_2/validation_32768.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_3/validation_8192.jsonl,data/ruler/niah_single_3/validation_16384.jsonl,data/ruler/niah_single_3/validation_32768.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_8192.jsonl,data/ruler/niah_multikey_1/validation_16384.jsonl,data/ruler/niah_multikey_1/validation_32768.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_8192.jsonl,data/ruler/niah_multikey_2/validation_16384.jsonl,data/ruler/niah_multikey_2/validation_32768.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_8192.jsonl,data/ruler/niah_multikey_3/validation_16384.jsonl,data/ruler/niah_multikey_3/validation_32768.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multiquery/validation_8192.jsonl,data/ruler/niah_multiquery/validation_16384.jsonl,data/ruler/niah_multiquery/validation_32768.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multivalue/validation_8192.jsonl,data/ruler/niah_multivalue/validation_16384.jsonl,data/ruler/niah_multivalue/validation_32768.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/cwe/validation_8192.jsonl,data/ruler/cwe/validation_16384.jsonl,data/ruler/cwe/validation_32768.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/fwe/validation_8192.jsonl,data/ruler/fwe/validation_16384.jsonl,data/ruler/fwe/validation_32768.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/vt/validation_8192.jsonl,data/ruler/vt/validation_16384.jsonl,data/ruler/vt/validation_32768.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/qa_1/validation_8192.jsonl,data/ruler/qa_1/validation_16384.jsonl,data/ruler/qa_1/validation_32768.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_2/validation_8192.jsonl,data/ruler/qa_2/validation_16384.jsonl,data/ruler/qa_2/validation_32768.jsonl,data/ruler/qa_2/validation_65536.jsonl 5 | demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/summ.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072 2 | datasets: infbench_sum_eng_129672,multi_lexsum_130372 3 | generation_max_length: 1200,400 4 | test_files: ',' 5 | demo_files: ',' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/summ_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: 
infbench_sum_eng_6792,infbench_sum_eng_14984,infbench_sum_eng_31368,infbench_sum_eng_64136,multi_lexsum_7492,multi_lexsum_15684,multi_lexsum_32068,multi_lexsum_64836 3 | generation_max_length: 1200,1200,1200,1200,400,400,400,400 4 | test_files: ',,,,,,,' 5 | demo_files: ',,,,,,,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from collections import defaultdict 4 | import re 5 | import random 6 | import json 7 | import time 8 | 9 | from tqdm import tqdm 10 | import numpy as np 11 | import torch 12 | from torch.utils.data import DataLoader 13 | 14 | from arguments import parse_arguments 15 | from model_utils import load_LLM, OpenAIModel, AnthropicModel, TgiVllmModel 16 | 17 | from data import ( 18 | load_data, 19 | TestItemDataset, 20 | ) 21 | 22 | import logging 23 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 24 | datefmt='%m/%d/%Y %H:%M:%S') 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(logging.INFO) 27 | 28 | 29 | def run_test(args, model, dataset, test_file, demo_file): 30 | logger.info(f"running test on {dataset} with test {test_file} and demo {demo_file}") 31 | # dataset specific changes tag 32 | tag = args.tag 33 | if dataset == "popqa": 34 | tag += f"_pop{args.popularity_threshold}" 35 | 36 | test_name = os.path.splitext(os.path.basename(test_file))[0] 37 | output_path = os.path.join(args.output_dir, f"{dataset}_{tag}_{test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json") 38 | if os.path.exists(output_path) and not args.overwrite and not args.debug: 39 | logger.info(f"{output_path} already exists, skipping...") 40 | return output_path 41 | 42 | random.seed(args.seed) 43 | data = load_data(args, dataset, test_file, demo_file) 44 | logger.info(f"loaded {len(data['data'])} samples from {dataset}") 45 | 46 | dataloader = DataLoader( 47 | TestItemDataset(data, model, model.tokenizer), 48 | batch_size=1, 49 | shuffle=False, 50 | collate_fn=lambda x: x, 51 | num_workers=args.num_workers if not args.debug else 0, 52 | ) 53 | 54 | # we first prepare all inputs and then run the evaluation in batch 55 | # the dataloader is a bit of an overkill here, but it makes it easier to switch back to iterative instead of batch eval 56 | metrics = defaultdict(list) 57 | all_inputs = [] 58 | all_input_texts = [] 59 | for idx, inputs in enumerate(tqdm(dataloader, desc="Preparing inputs")): 60 | inputs, input_text = inputs[0] 61 | if args.count_tokens: 62 | # count_tokens is only available for models that tokenizes the input 63 | metrics['input_len'].append(inputs.input_ids.shape[1]) 64 | continue 65 | all_inputs.append(inputs) 66 | all_input_texts.append(input_text) 67 | 68 | # HY: for the thinking mode, we add additional 32k tokens to allow models to generate thinking process 69 | if args.thinking: 70 | args.generation_max_length += 32768 71 | args.input_max_length += 32768 72 | model.max_length = args.input_max_length 73 | model.generation_max_length = args.generation_max_length 74 | args.stop_newline = False 75 | logger.info(f"thinking mode, adding 32k tokens to 
generation and input max length, also disabling stop_newline") 76 | 77 | logger.info("Running generation...") 78 | start_time = time.time() 79 | # generate all outputs 80 | if (isinstance(model, OpenAIModel) or isinstance(model, AnthropicModel)) and (not isinstance(model, TgiVllmModel)): 81 | # using the batch API makes it cheaper and faster 82 | logger.info(f"Using the OpenAI/Anthropic batch API by default, if you want to use the iterative API, please change the code") 83 | all_outputs = model.generate_batch(all_inputs, batch_file=output_path+".batch") 84 | else: 85 | all_outputs = model.generate_batch(all_inputs) 86 | end_time = time.time() 87 | 88 | # then we do all the postprocessing + evaluation 89 | results = [] 90 | for idx, output in enumerate(all_outputs): 91 | test_item = data["data"][idx] 92 | input_text = all_input_texts[idx] 93 | 94 | if output is None: 95 | logger.info(f"skipping example {idx+1} because the model returned None") 96 | continue 97 | 98 | # If we do not use the chat template, then we are doing completion, and for the sake of parsing, we want to prepend the system prompt to the input. 99 | # For example, since we are autocompleting "Answer:"" in the input, then we should prepend the system prompt to the output as well. 100 | # This requires some coordination from the dataset preprocessing 101 | if not args.use_chat_template: 102 | prepend_text = data["system_template"].format(**test_item) 103 | output["output"] = prepend_text + output["output"] 104 | 105 | if args.thinking: 106 | matches = re.search(r"(.*)(.*)", output['output'], flags=re.DOTALL) 107 | if matches: 108 | output["output"] = matches.group(2).strip() 109 | output["thoughts"] = matches.group(1).strip() 110 | 111 | mets, others = data['post_process'](output, test_item) 112 | output.update({**others, **mets}) 113 | for k, v in mets.items(): 114 | metrics[k].append(v) 115 | 116 | metrics["input_len"].append(output["input_len"]) 117 | metrics["output_len"].append(output["output_len"]) 118 | result = {**test_item, **output} 119 | result.pop("context", None) 120 | result.pop("input_ids", None) 121 | if input_text is None: 122 | input_text = result['input_text'] 123 | results.append(result) 124 | 125 | # print out some examples, we also limit how much we print out since it can get really long 126 | if idx < 5 or args.debug: 127 | logger.info(f"Example {idx+1}: ") 128 | logger.info(f"Decoder inputs:\n{input_text}\n") 129 | 130 | logger.info(f"Input length: {output['input_len']}") 131 | # currently we hardcode somethings to print out, but you may change these to print out other things 132 | logger.info(f"Question: {test_item['question'] if 'question' in test_item else ''}") 133 | logger.info(f"Answer: {test_item['answer'] if 'answer' in test_item else ''}") 134 | logger.info(f"Output: {output['output']}") 135 | logger.info(f"Parsed output: {output['parsed_output']}") 136 | logger.info(f"Metrics: {mets}") 137 | 138 | if args.debug: 139 | import pdb; pdb.set_trace() 140 | 141 | if not args.no_cuda: 142 | mem_usage = sum([torch.cuda.max_memory_allocated(i) for i in range(torch.cuda.device_count())]) 143 | logger.info(f"Memory usage: {mem_usage/1000**3:.02f} GB") 144 | logger.info(f"Total time: {end_time - start_time:.02f} s") 145 | logger.info(f"Throughput: {len(results) / (end_time - start_time):.02f} samples/s") 146 | 147 | if args.count_tokens: 148 | logger.info(f"----{dataset}----\nAverage input length: {np.mean(metrics['input_len']):.02f}, std input length: {np.std(metrics['input_len']):.02f}, max input 
length: {max(metrics['input_len'])}, min input length: {min(metrics['input_len'])}\n----returning----") 149 | return output_path 150 | 151 | if len(results) == 0: 152 | logger.error("No results to evaluate, something went wrong, returning...") 153 | return output_path 154 | 155 | averaged_metrics = {k: np.mean(v)*(100 if "_len" not in k else 1) for k, v in metrics.items()} 156 | 157 | logger.info("Averaged metrics:") 158 | for k, v in averaged_metrics.items(): 159 | logger.info(f"{k}: {v:.02f}") 160 | 161 | output = { 162 | "args": args.__dict__, 163 | "data": results, 164 | "metrics": metrics, 165 | "averaged_metrics": averaged_metrics, 166 | "throughput": len(results) / (end_time - start_time), 167 | } 168 | if not args.no_cuda: 169 | output["memory_usage"] = mem_usage 170 | 171 | if args.output_dir is not None: 172 | with open(output_path, "w") as f: 173 | json.dump(output, f, indent=4) 174 | # this makes it easier to parse results, but alce uses a different evaluation script 175 | if not "alce" in dataset: 176 | with open(output_path + ".score", "w") as f: 177 | json.dump(output["averaged_metrics"], f, indent=4) 178 | logger.info(f"done, results are written to {output_path}") 179 | 180 | return output_path 181 | 182 | 183 | def main(): 184 | args = parse_arguments() 185 | 186 | logger.info(f"Arguments: {args}") 187 | assert args.model_name_or_path is not None 188 | os.makedirs(args.output_dir, exist_ok=True) 189 | 190 | datasets = args.datasets.split(",") 191 | test_files = args.test_files.split(",") 192 | demo_files = args.demo_files.split(",") 193 | max_lengths = ([int(args.input_max_length)] * len(datasets)) if isinstance(args.input_max_length, int) or len(args.input_max_length.split(",")) == 1 else [int(l) for l in args.input_max_length.split(",")] 194 | gen_lengths = ([int(args.generation_max_length)] * len(datasets)) if isinstance(args.generation_max_length, int) or len(args.generation_max_length.split(",")) == 1 else [int(l) for l in args.generation_max_length.split(",")] 195 | assert len(test_files) == len(demo_files) 196 | 197 | args.input_max_length = max(max_lengths) 198 | model = load_LLM(args) 199 | 200 | for dataset, test_file, demo_file, max_length, gen_length in zip(datasets, test_files, demo_files, max_lengths, gen_lengths): 201 | args.datasets = dataset 202 | args.test_files = test_file 203 | args.demo_files = demo_file 204 | args.input_max_length = max_length 205 | args.generation_max_length = gen_length 206 | model.max_length = max_length 207 | model.generation_max_length = gen_length 208 | 209 | try: 210 | output_path = run_test(args, model, dataset, test_file, demo_file) 211 | 212 | if "alce" in dataset and not args.count_tokens and (not os.path.exists(output_path+".score") or args.overwrite): 213 | import eval_alce 214 | logger.info("running eval_alce.py...") 215 | cli_args = ["--f", output_path] 216 | if not "nocite" in dataset: 217 | cli_args.append("--citations") 218 | # HY: If you want to run the full ALCE evaluation, you should uncomment the following lines 219 | # In HELMET, we don't use the MAUVE scores. 
220 | # if "asqa" in dataset: 221 | # cli_args.append("--mauve") 222 | # elif "eli5" in dataset: 223 | # cli_args += ["mauve", "--claims_nli"] 224 | eval_alce.main(cli_args) 225 | 226 | except Exception as e: 227 | # in case we run into some kind of error 228 | logger.exception(e) 229 | logger.error(f"Error in {dataset}, continuing...") 230 | if args.debug: 231 | raise e 232 | 233 | if __name__ == "__main__": 234 | main() 235 | 236 | -------------------------------------------------------------------------------- /eval_alce.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import json 4 | import re 5 | import string 6 | import torch 7 | import copy 8 | 9 | from nltk import sent_tokenize 10 | import numpy as np 11 | from rouge_score import rouge_scorer, scoring 12 | from tqdm import tqdm 13 | import sys 14 | import logging 15 | from collections import defaultdict 16 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 17 | datefmt='%m/%d/%Y %H:%M:%S') 18 | logger = logging.getLogger(__name__) 19 | logger.setLevel(logging.INFO) 20 | 21 | from transformers import ( 22 | AutoModelForSeq2SeqLM, 23 | AutoTokenizer, 24 | pipeline 25 | ) 26 | 27 | from utils import normalize_answer, get_max_memory, remove_citations 28 | 29 | QA_MODEL="gaotianyu1350/roberta-large-squad" 30 | AUTOAIS_MODEL="google/t5_xxl_true_nli_mixture" 31 | 32 | global autoais_model, autoais_tokenizer 33 | autoais_model, autoais_tokenizer = None, None 34 | 35 | 36 | def compute_f1(a_gold, a_pred): 37 | """Compute F1 score between two strings.""" 38 | 39 | def _get_tokens(s): 40 | if not s: 41 | return [] 42 | return normalize_answer(s).split() 43 | 44 | gold_toks = _get_tokens(a_gold) 45 | pred_toks = _get_tokens(a_pred) 46 | 47 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 48 | num_same = sum(common.values()) 49 | 50 | if len(gold_toks) == 0 or len(pred_toks) == 0: 51 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 52 | return int(gold_toks == pred_toks) 53 | 54 | if num_same == 0: 55 | return 0 56 | 57 | precision = 1.0 * num_same / len(pred_toks) 58 | recall = 1.0 * num_same / len(gold_toks) 59 | f1 = (2 * precision * recall) / (precision + recall) 60 | 61 | return f1 62 | 63 | 64 | def compute_exact(a_gold, a_pred): 65 | """Check whether two strings are equal up to normalization.""" 66 | 67 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 68 | 69 | 70 | def exact_presence(short_answers, context): 71 | """Verify if any of the answers is present in the given context. 72 | Args: 73 | short_answers: list of short answers to look for in the context 74 | context: a paragraph to search for short answers 75 | Returns: 76 | true if any of the short answers is present in the context 77 | """ 78 | 79 | n_short_answers = [normalize_answer(sa) for sa in short_answers] 80 | n_context = normalize_answer(context) 81 | 82 | for ans in n_short_answers: 83 | if ans in n_context: 84 | return True 85 | 86 | return False 87 | 88 | 89 | def compute_rouge(data): 90 | """Main function for rouge scoring. 91 | If two references are provided, 92 | the best score is chosen for each instance. 
93 | Args: 94 | data: requires field `output` and `answer` (or `annotations` for ASQA) 95 | metrics: list of evaluation metrics 96 | Returns: 97 | dictionary representation of rouge scores 98 | """ 99 | def _rouge_calculation(hypotheses, 100 | references1, 101 | references2=[], 102 | metrics=['rougeLsum']): 103 | 104 | if references2 == []: 105 | references2 = references1 106 | 107 | scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True) 108 | aggregator = scoring.BootstrapAggregator() 109 | 110 | for i in range(len(hypotheses)): 111 | scores1 = scorer.score(references1[i], hypotheses[i]) 112 | scores2 = scorer.score(references2[i], hypotheses[i]) 113 | if scores1['rougeLsum'].fmeasure > scores2['rougeLsum'].fmeasure: 114 | aggregator.add_scores(scores1) 115 | else: 116 | aggregator.add_scores(scores2) 117 | 118 | scores = {m: [] for m in metrics} 119 | 120 | for m in metrics: 121 | fmeasure = aggregator.aggregate()[m].mid.fmeasure 122 | scores[m].append(fmeasure) 123 | 124 | for m in scores: 125 | scores[m] = 100 * sum(scores[m]) / len(scores[m]) 126 | 127 | return scores 128 | 129 | hypotheses = {} 130 | references1 = {} 131 | references2 = {} 132 | 133 | for idx, item in enumerate(data): 134 | hypotheses[idx] = item["output"] 135 | if "annotations" in item and item['annotations'] is not None: # For ASQA 136 | references1[idx] = item["annotations"][0]["long_answer"] 137 | references2[idx] = item["annotations"][1]["long_answer"] 138 | else: 139 | references1[idx] = item["answer"] 140 | references2[idx] = item["answer"] 141 | 142 | h, r1, r2 = [], [], [] 143 | 144 | for key in references1: 145 | h.append(hypotheses[key]) 146 | r1.append(references1[key]) 147 | 148 | if references2 is not None: 149 | r2.append(references2[key]) 150 | 151 | h = ['\n'.join(sent_tokenize(text.lower())) for text in h] 152 | r1 = ['\n'.join(sent_tokenize(text.lower())) for text in r1] 153 | r2 = ['\n'.join(sent_tokenize(text.lower())) for text in r2] 154 | scores = _rouge_calculation(h, r1, r2) 155 | 156 | return scores['rougeLsum'] 157 | 158 | 159 | def compute_str_em(data): 160 | """Compute STR-EM metric (only for ASQA) 161 | Args: 162 | data: requires field `qa_pairs/short_answers` and `output` 163 | Returns: 164 | STR-EM and STR-EM-HIT () 165 | """ 166 | 167 | if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None: 168 | return 0, 0 169 | 170 | acc = [] 171 | hit = [] 172 | 173 | for item in data: 174 | loc_acc = [] 175 | for qa_pair in item['qa_pairs']: 176 | loc_acc.append(exact_presence(qa_pair['short_answers'], item["output"])) 177 | acc.append(np.mean(loc_acc)) 178 | hit.append( int(np.mean(loc_acc) == 1) ) 179 | 180 | return 100 * np.mean(acc), 100 * np.mean(hit) 181 | 182 | 183 | def compute_len(data): 184 | """Compute average length of predictions.""" 185 | 186 | res, cntr = 0, 0 187 | for item in data: 188 | res += len(item["output"].split()) 189 | cntr += 1 190 | return res / cntr 191 | 192 | 193 | def compute_qa(data): 194 | """Compute QA-based accuracy. 
195 | Args: 196 | data: requires field `qa_pairs/short_answers` and `output` 197 | Returns: 198 | QA metrics (QA-EM, QA-F1, QA-Hit) 199 | """ 200 | 201 | if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None: 202 | logger.warning("Warning: no QA pairs found in data") 203 | return { 204 | 'QA-EM': 0, 205 | 'QA-F1': 0, 206 | 'QA-Hit': 0, 207 | } 208 | 209 | # Load model 210 | logger.info("Loading the RoBERTa-large SQuAD model for QA-based accuracy...") 211 | qa_pipeline = pipeline("question-answering", model=QA_MODEL, device=0) 212 | logger.info("Done") 213 | 214 | # Get prediction 215 | logger.info("Computing the QA-based accuracy...") 216 | em, f1, bins = [], [], [] 217 | for item in tqdm(data): 218 | question = [qa_pair['question'] for qa_pair in item['qa_pairs']] 219 | context = item['output'] if len(item['output']) > 0 else " " 220 | results = qa_pipeline(question=question, context=context, handle_impossible_answer=True) 221 | loc_counter, loc_em, loc_f1 = 0, 0, 0 222 | 223 | for idx, res in enumerate(results): 224 | answers = item["qa_pairs"][idx]["short_answers"] 225 | prediction = res["answer"] 226 | 227 | loc_em += max([compute_exact(a, prediction) for a in answers]) 228 | loc_f1 += max([compute_f1(a, prediction) for a in answers]) 229 | loc_counter += 1 230 | 231 | em.append(loc_em / loc_counter) 232 | f1.append(loc_f1 / loc_counter) 233 | bins.append(loc_em == loc_counter) 234 | 235 | return { 236 | 'QA-EM': 100 * np.mean(em), 237 | 'QA-F1': 100 * np.mean(f1), 238 | 'QA-Hit': 100 * np.mean(bins) 239 | } 240 | 241 | 242 | def compute_mauve(data): 243 | """Compute Mauve score.""" 244 | 245 | logger.info("Computing MAUVE...") 246 | human_data = [] 247 | model_data = [] 248 | for item in data: 249 | # Remove ending punctuations 250 | # Remove any new lines 251 | # Truncate by 100 words 252 | human_data.append(' '.join((item['question'] + " " + item['answer'].strip()).split()[:100]).rstrip(string.punctuation)) 253 | model_data.append(' '.join((item['question'] + " " + item['output'].strip()).split()[:100]).rstrip(string.punctuation)) 254 | 255 | import mauve 256 | out = mauve.compute_mauve( 257 | p_text=human_data, 258 | q_text=model_data, 259 | device_id=0, 260 | max_text_length=512, 261 | verbose=True, 262 | batch_size=8, 263 | featurize_model_name="gpt2-large" 264 | ) 265 | return out.mauve * 100 266 | 267 | 268 | def _run_nli_autoais(passage, claim): 269 | """ 270 | Run inference for assessing AIS between a premise and hypothesis. 
271 | Adapted from https://github.com/google-research-datasets/Attributed-QA/blob/main/evaluation.py 272 | """ 273 | global autoais_model, autoais_tokenizer 274 | input_text = "premise: {} hypothesis: {}".format(passage, claim) 275 | input_ids = autoais_tokenizer(input_text, return_tensors="pt").input_ids.to(autoais_model.device) 276 | with torch.inference_mode(): 277 | outputs = autoais_model.generate(input_ids, max_new_tokens=10) 278 | result = autoais_tokenizer.decode(outputs[0], skip_special_tokens=True) 279 | inference = 1 if result == "1" else 0 280 | return inference 281 | 282 | 283 | def compute_claims(data): 284 | global autoais_model, autoais_tokenizer 285 | if autoais_model is None: 286 | logger.info("Loading AutoAIS model...") 287 | autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto") 288 | autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False) 289 | 290 | logger.info("Computing claims...") 291 | scores = [] 292 | for item in tqdm(data): 293 | normalized_output = remove_citations(item['output']) 294 | entail = 0 295 | claims = item["claims"] 296 | for claim in claims: 297 | entail += _run_nli_autoais(normalized_output, claim) 298 | scores.append(entail / len(claims)) 299 | return 100 * np.mean(scores) 300 | 301 | 302 | def compute_autoais(data, 303 | decontext=False, 304 | concat=False, 305 | qampari=False, 306 | at_most_citations=None,): 307 | """ 308 | Compute AutoAIS score. 309 | 310 | Args: 311 | data: requires field `output` and `docs` 312 | - docs should be a list of items with fields `title` and `text` (or `phrase` and `sent` for QA-extracted docs) 313 | citation: check citations and use the corresponding references. 314 | decontext: decontextualize the output 315 | """ 316 | 317 | global autoais_model, autoais_tokenizer 318 | if autoais_model is None: 319 | logger.info("Loading AutoAIS model...") 320 | autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto") 321 | autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False) 322 | 323 | logger.info(f"Running AutoAIS...") 324 | 325 | def _format_document(doc): 326 | """Format document for AutoAIS.""" 327 | 328 | if "sent" in doc: 329 | # QA-extracted docs 330 | return "Title: %s\n%s" % (doc['title'], doc['sent']) 331 | else: 332 | return "Title: %s\n%s" % (doc['title'], doc['text']) 333 | 334 | ais_scores = [] 335 | ais_scores_prec = [] 336 | 337 | sent_total = 0 338 | sent_mcite = 0 339 | sent_mcite_support = 0 340 | sent_mcite_overcite = 0 341 | autoais_log = [] 342 | citation_position_count = defaultdict(lambda: 0) 343 | for item in tqdm(data): 344 | # Get sentences by using NLTK 345 | if qampari: 346 | sents = [item['question'] + " " + x.strip() for x in item['output'].rstrip().rstrip(".").rstrip(",").split(",")] 347 | else: 348 | sents = sent_tokenize(item['output']) 349 | # we also ignore sentences that are < 5 characters long, they are unlikely to be meaningful 350 | # this resolves the case where the sentencizer takes "1." 
as a sentence 351 | sents = [x for x in sents if len(x.strip()) >= 5] 352 | if len(sents) == 0: 353 | continue 354 | 355 | target_sents = [remove_citations(sent).strip() for sent in sents] 356 | 357 | entail = 0 358 | entail_prec = 0 359 | total_citations = 0 360 | for sent_id, sent in enumerate(sents): 361 | target_sent = target_sents[sent_id] # Citation removed and (if opted for) decontextualized 362 | joint_entail = -1 # Undecided 363 | 364 | # Find references 365 | ref = [int(r[1:])-1 for r in re.findall(r"\[\d+", sent)] # In text citation id starts from 1 366 | for r in ref: 367 | citation_position_count[r] += 1 368 | logger.info(f"For `{sent}`, find citations {ref}") 369 | if len(ref) == 0: 370 | # No citations 371 | joint_entail = 0 372 | elif any([ref_id >= len(item['docs']) for ref_id in ref]): 373 | # Citations out of range 374 | joint_entail = 0 375 | else: 376 | if at_most_citations is not None: 377 | ref = ref[:at_most_citations] 378 | total_citations += len(ref) 379 | joint_passage = '\n'.join([_format_document(item['docs'][psgs_id]) for psgs_id in ref]) 380 | 381 | # If not directly rejected by citation format error, calculate the recall score 382 | if joint_entail == -1: 383 | joint_entail = _run_nli_autoais(joint_passage, target_sent) 384 | autoais_log.append({ 385 | "question": item['question'], 386 | "output": item['output'], 387 | "claim": sent, 388 | "passage": [joint_passage], 389 | "model_type": "NLI", 390 | "model_output": joint_entail, 391 | }) 392 | 393 | entail += joint_entail 394 | if len(ref) > 1: 395 | sent_mcite += 1 396 | 397 | # calculate the precision score if applicable 398 | if joint_entail and len(ref) > 1: 399 | sent_mcite_support += 1 400 | # Precision check: did the model cite any unnecessary documents? 401 | for psgs_id in ref: 402 | # condition A 403 | passage = _format_document(item['docs'][psgs_id]) 404 | nli_result = _run_nli_autoais(passage, target_sent) 405 | 406 | # condition B 407 | if not nli_result: 408 | subset_exclude = copy.deepcopy(ref) 409 | subset_exclude.remove(psgs_id) 410 | passage = '\n'.join([_format_document(item['docs'][pid]) for pid in subset_exclude]) 411 | nli_result = _run_nli_autoais(passage, target_sent) 412 | if nli_result: # psgs_id is not necessary 413 | flag = 0 414 | sent_mcite_overcite += 1 415 | else: 416 | entail_prec += 1 417 | else: 418 | entail_prec += 1 419 | else: 420 | entail_prec += joint_entail 421 | 422 | sent_total += len(sents) 423 | ais_scores.append(entail / len(sents)) 424 | ais_scores_prec.append(entail_prec / total_citations if total_citations > 0 else 0) # len(sents)) 425 | 426 | if sent_mcite > 0 and sent_mcite_support > 0: 427 | print("Among all sentences, %.2f%% have multiple citations, among which %.2f%% are supported by the joint set, among which %.2f%% overcite." 
% ( 428 | 100 * sent_mcite / sent_total, 429 | 100 * sent_mcite_support / sent_mcite, 430 | 100 * sent_mcite_overcite / sent_mcite_support 431 | )) 432 | 433 | return { 434 | "citation_rec": 100 * np.mean(ais_scores) if len(ais_scores) > 0 else 0, 435 | "citation_prec": 100 * np.mean(ais_scores_prec) if len(ais_scores_prec) > 0 else 0, 436 | "citation_positions": dict(citation_position_count), 437 | } 438 | 439 | 440 | def compute_qampari_f1(data, cot=False): 441 | prec = [] 442 | rec = [] 443 | rec_top5 = [] 444 | f1 = [] 445 | f1_top5 = [] 446 | 447 | num_preds = [] 448 | for item in data: 449 | if cot: 450 | if ":" in item['output']: 451 | o = ':'.join(item['output'].split(":")[1:]) # try to separate the COT part and the answer list part. 452 | else: 453 | o = "" 454 | else: 455 | o = item['output'] 456 | preds = [normalize_answer(x.strip()) for x in o.rstrip().rstrip(".").rstrip(",").split(",")] 457 | preds = [p for p in preds if len(p) > 0] # delete empty answers 458 | num_preds.append(len(preds)) 459 | answers = [[normalize_answer(x) for x in ans] for ans in item['answers']] 460 | flat_answers = [item for sublist in answers for item in sublist] 461 | 462 | prec.append(sum([p in flat_answers for p in preds]) / len(preds) if len(preds) > 0 else 0) 463 | rec.append(sum([any([x in preds for x in a]) for a in answers]) / len(answers)) 464 | rec_top5.append(min(5, sum([any([x in preds for x in a]) for a in answers])) / min(5, len(answers))) 465 | if (prec[-1] + rec[-1]) == 0: 466 | f1.append(0) 467 | else: 468 | f1.append(2 * prec[-1] * rec[-1] / (prec[-1] + rec[-1])) 469 | if (prec[-1] + rec_top5[-1]) == 0: 470 | f1_top5.append(0) 471 | else: 472 | f1_top5.append(2 * prec[-1] * rec_top5[-1] / (prec[-1] + rec_top5[-1])) 473 | 474 | return { 475 | "num_preds": np.mean(num_preds), 476 | "qampari_prec": 100 * np.mean(prec), 477 | "qampari_rec": 100 * np.mean(rec), 478 | "qampari_rec_top5": 100 * np.mean(rec_top5), 479 | "qampari_f1": 100 * np.mean(f1), 480 | "qampari_f1_top5": 100 * np.mean(f1_top5), 481 | } 482 | 483 | def main(args=None): 484 | parser = argparse.ArgumentParser() 485 | parser.add_argument("--f", type=str, required=True, help="Output file. 
Should have field `question`, `output`, (ROUGE) `answer`, \ 486 | (accuracy) `qa_pairs`, (AIS) `docs`") 487 | parser.add_argument("--no_rouge", action="store_true", help="Do not evaluate ROUGE score") 488 | parser.add_argument("--qa", action="store_true", help="Use the QA model") 489 | parser.add_argument("--mauve", action="store_true", help="Use the mauve score model") 490 | parser.add_argument("--citations", action="store_true", help="Evaluation with citation") 491 | parser.add_argument("--at_most_citations", type=int, default=3, help="At most take this many documents (mostly for precision)") 492 | parser.add_argument("--claims_nli", action="store_true", help="Use claims for ELI5") 493 | 494 | # QAMPARI 495 | parser.add_argument("--cot", action="store_true", help="For QAMPARI, try to find colon and separate the COT and answer listing") 496 | 497 | if args is None: 498 | args = parser.parse_args() 499 | else: 500 | args = parser.parse_args(args) 501 | 502 | with open(args.f) as f: 503 | data_with_config = json.load(f) 504 | data = data_with_config['data'] 505 | 506 | if "qampari" in args.f: 507 | args.no_rouge = True 508 | args.qa = False 509 | args.mauve = False 510 | args.decontext = False 511 | qampari = True 512 | else: 513 | qampari = False 514 | 515 | # Truncate by newline and remove on the fly search result 516 | # logger.warning("We remove all the pre/appended space/newlines and we truncate the answer by the first newline.") 517 | logger.warning("We remove all the pre/appended space/newlines and replace newlines with spaces.") 518 | logger.warning("We replace any on the fly search result to standard bracket citation format.") 519 | for i in range(len(data)): 520 | # data[i]['output'] = data[i]['output'].strip().split("\n")[0] 521 | data[i]['output'] = re.sub(r"\n+", " ", data[i]['output']) 522 | data[i]['output'] = data[i]['output'].replace("<|im_end|>", "") 523 | 524 | 525 | # Remove all citations for all non-AutoAIS evaluation 526 | normalized_data = copy.deepcopy(data) 527 | for i in range(len(normalized_data)): 528 | normalized_data[i]['output'] = remove_citations(normalized_data[i]['output']) 529 | 530 | result = {} 531 | result['length'] = compute_len(normalized_data) 532 | result['str_em'], result['str_hit'] = compute_str_em(normalized_data) 533 | if qampari: 534 | result.update(compute_qampari_f1(normalized_data, cot=args.cot)) 535 | if not args.no_rouge: 536 | result['rougeLsum'] = compute_rouge(normalized_data) 537 | if args.qa: 538 | result.update(compute_qa(normalized_data)) 539 | if args.mauve: 540 | result['mauve'] = compute_mauve(normalized_data) 541 | if args.citations: 542 | result.update(compute_autoais(data, qampari=qampari, at_most_citations=args.at_most_citations)) 543 | if args.claims_nli: 544 | result["claims_nli"] = compute_claims(normalized_data) 545 | 546 | print(result) 547 | with open(args.f + ".score", "w") as f: 548 | json.dump(result, f, indent=4) 549 | 550 | 551 | if __name__ == "__main__": 552 | main() 553 | -------------------------------------------------------------------------------- /longproc_addon/README.md: -------------------------------------------------------------------------------- 1 | ## LongProc Add-On on HELMET 2 | We integrated [LongProc](https://github.com/princeton-pli/LongProc) in HELMET to support convenient evaluation. 
3 | 4 | **Additional Setup** 5 | Pull the submodule from LongProc and add `__init__.py` files to make the import work: 6 | ```bash 7 | git submodule update --init --recursive 8 | touch longproc_addon/__init__.py 9 | touch longproc_addon/longproc/__init__.py 10 | ``` 11 | 12 | To quickly test if everything is working, you can try running the evaluations. 13 | 14 | **Running Evaluation** 15 | You can now run evaluation just as you would in HELMET. The config files are stored in `longproc_addon/configs`. 16 | 17 | For example: 18 | ```bash 19 | python eval.py --config longproc_addon/configs/html_to_tsv.yaml --model_name_or_path {local model path or huggingface model name} --output_dir {output directory, defaults to output/{model_name}} 20 | ``` 21 | -------------------------------------------------------------------------------- /longproc_addon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/longproc_addon/__init__.py -------------------------------------------------------------------------------- /longproc_addon/configs/countdown.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000,32000 2 | generation_max_length: 1024,3072,10240 3 | datasets: countdown_0.5k,countdown_2k,countdown_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/html_to_tsv.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 128000,128000,128000 2 | generation_max_length: 1024,3072,10240 3 | datasets: html_to_tsv_0.5k,html_to_tsv_2k,html_to_tsv_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/path_traversal.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000,32000 2 | generation_max_length: 1024,3072,10240 3 | datasets: path_traversal_0.5k,path_traversal_2k,path_traversal_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/pseudo_to_code.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8000,8000 2 | generation_max_length: 1024,3072 3 | datasets: pseudo_to_code_0.5k,pseudo_to_code_2k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: "," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | 
model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/tom_tracking.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000,32000 2 | generation_max_length: 1024,3072,10240 3 | datasets: tom_tracking_0.5k,tom_tracking_2k,tom_tracking_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/travel_planning.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000 2 | generation_max_length: 3072,10240 3 | datasets: travel_planning_2k,travel_planning_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: "," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/longproc_helmet_loader.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset as HFDataset 2 | 3 | try: 4 | from .longproc.longproc.longproc_data import load_longproc_data 5 | except ImportError as e: 6 | raise ImportError("LongProc cannot be loaded.") 7 | 8 | 9 | def load_longproc_data_for_helmet(dataset: str, path="longproc_addon/longproc/data", max_test_samples=None, seed=42): 10 | # packed data: list of "input_prompt", "reference_output", "item" 11 | packed_data, eval_func = load_longproc_data(dataset, path) 12 | 13 | packed_data = HFDataset.from_list(packed_data) 14 | if max_test_samples is not None: 15 | packed_data = packed_data.shuffle(seed=seed).select(range(min(max_test_samples, len(packed_data)))) 16 | 17 | def helmet_eval_wrapper(output: dict, example: dict): 18 | predict = output["output"] 19 | return eval_func(predict, example) 20 | 21 | return { 22 | "data": packed_data, 23 | "prompt_template": "{input_prompt}", 24 | "user_template": "{input_prompt}", 25 | "post_process": helmet_eval_wrapper, 26 | } 27 | 28 | 29 | def _test_load_all(): 30 | def test_loading(dataset): 31 | data, eval_func = load_longproc_data(dataset, "longproc_addon/longproc/data") 32 | print(f"Dataset: {dataset}") 33 | print(f"N samples: {len(data)}") 34 | print(f"Eval func: {eval_func}") 35 | print(f"Max input chars: {max([len(d['input_prompt']) for d in data])}") 36 | print(f"Max output chars: {max([len(d['reference_output']) for d in data])}") 37 | 38 | [test_loading(d) for d in ["path_traversal_0.5k", "path_traversal_2k", "path_traversal_8k"]] 39 | 40 | [test_loading(d) for d in ["html_to_tsv_0.5k", "html_to_tsv_2k", "html_to_tsv_8k"]] 41 | 42 | [test_loading(d) for d in ["pseudo_to_code_0.5k", "pseudo_to_code_2k",]] 43 | 44 | [test_loading(d) for d in ["travel_planning_2k", "travel_planning_8k"]] 45 | 46 | [test_loading(d) for d in ["tom_tracking_0.5k", "tom_tracking_2k", "tom_tracking_8k"]] 47 | 48 | [test_loading(d) for d in ["countdown_0.5k", "countdown_2k", "countdown_8k"]] 49 | 50 | 51 | if __name__ == "__main__": 52 | _test_load_all() 53 | 54 | 
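For reference, here is a minimal sketch (not part of the repository) of how the dictionary returned by `load_longproc_data_for_helmet` above might be consumed directly. The dataset name, the tiny sample count, and the placeholder "model call" are illustrative assumptions; the field names (`input_prompt`, `reference_output`) and dictionary keys (`data`, `user_template`, `post_process`) come from the loader itself:

```python
# Illustrative sketch only -- the "model call" below is a placeholder, not HELMET's actual pipeline.
from longproc_addon.longproc_helmet_loader import load_longproc_data_for_helmet

# Dataset name taken from the configs above; max_test_samples kept tiny for a smoke test.
task = load_longproc_data_for_helmet("html_to_tsv_0.5k", max_test_samples=2)

for example in task["data"]:
    # Fill the prompt template with the packed input.
    prompt = task["user_template"].format(input_prompt=example["input_prompt"])
    # Placeholder "model": echo the reference output; replace with a real generation call.
    output = {"output": example["reference_output"]}
    # The wrapped LongProc scorer compares the prediction against the example.
    score = task["post_process"](output, example)
    print(score)
```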
-------------------------------------------------------------------------------- /prompts/asqa_nocite.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant). Use an unbiased and journalistic tone.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\n\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which is the most rainy place on earth?", 9 | "answer": "Several places on Earth claim to be the most rainy, such as Lloró, Colombia, which reported an average annual rainfall of 12,717 mm between 1952 and 1989, and López de Micay, Colombia, which reported an annual 12,892 mm between 1960 and 2012. However, the official record is held by Mawsynram, India with an average annual rainfall of 11,872 mm, although nearby town Sohra, India, also known as Cherrapunji, holds the record for most rain in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861.", 10 | "docs": [ 11 | { 12 | "title": "Cherrapunji", 13 | "text": "Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861, however: it received in" 14 | }, 15 | { 16 | "title": "Cherrapunji", 17 | "text": "Radio relay station known as Akashvani Cherrapunji. It broadcasts on FM frequencies. Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall" 18 | }, 19 | { 20 | "title": "Mawsynram", 21 | "text": "Mawsynram Mawsynram () is a village in the East Khasi Hills district of Meghalaya state in north-eastern India, 65 kilometres from Shillong. Mawsynram receives one of the highest rainfalls in India. It is reportedly the wettest place on Earth, with an average annual rainfall of 11,872 mm, but that claim is disputed by Lloró, Colombia, which reported an average yearly rainfall of 12,717 mm between 1952 and 1989 and López de Micay, also in Colombia, which reported an annual 12,892 mm per year between 1960 and 2012. According to the \"Guinness Book of World Records\", Mawsynram received of rainfall in 1985. 
Mawsynram is located at 25° 18′" 22 | }, 23 | { 24 | "title": "Earth rainfall climatology", 25 | "text": "Pacific Northwest, and the Sierra Nevada range are the wetter portions of the nation, with average rainfall exceeding per year. The drier areas are the Desert Southwest, Great Basin, valleys of northeast Arizona, eastern Utah, central Wyoming, eastern Oregon and Washington and the northeast of the Olympic Peninsula. The Big Bog on the island of Maui receives, on average, every year, making it the wettest location in the US, and all of Oceania. The annual average rainfall maxima across the continent lie across the northwest from northwest Brazil into northern Peru, Colombia, and Ecuador, then along the Atlantic coast of" 26 | }, 27 | { 28 | "title": "Going to Extremes", 29 | "text": "in the world. Oymyakon in Siberia, where the average winter temperature is −47 °F (− 44 °C). Arica in Chile, where there had been fourteen consecutive years without rain. Fog is the only local source of water. Mawsynram in India, where average annual rainfall is 14 meters, falling within a four-month period in the monsoon season. The rainfall is approximately equal to that of its neighbor Cherrapunji. Dallol in Ethiopia, known as the 'Hell-hole of creation' where the temperature averages 94 °F (34 °C) over the year. In his second series, Middleton visited places without permanent towns, locations where \"survival\"" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "When did the us break away from england?", 35 | "answer": "The United States took the first step towards gaining independence from Great Britain when it declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, 1776, the date when the Declaration of Independence was officially adopted by Congress). The Treaty of Paris was later signed on September 3, 1783, formally separating the United States from the British Empire.", 36 | "docs": [ 37 | { 38 | "title": "United States withdrawal from Saudi Arabia", 39 | "text": "United States withdrawal from Saudi Arabia Beginning during Operation Desert Shield in August 1990, while preparing for the Gulf War, the United States sent a large troop contingent to Saudi Arabia. After the war, remnant troops, primarily U.S. Air Force personnel, augmented by a smaller number of coordinating and training personnel from the U.S. Navy, U.S. Army and U.S. Marine Corps remained in Saudi Arabia under the aegis of Joint Task Force Southwest Asia (JTF-SWA), as part of Operation Southern Watch (OSW). The United Kingdom and France also maintained a small contingent of Royal Air Force and French Air Force" 40 | }, 41 | { 42 | "title": "Decolonization of the Americas", 43 | "text": "and France has fully \"integrated\" most of its former colonies as fully constituent \"departments\" of France. The United States of America declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, the date when the Declaration of Independence was officially adopted by Congress), in so doing becoming the first independent, foreign-recognized nation in the Americas and the first European colonial entity to break from its mother country. Britain formally acknowledged American independence in 1783 after its defeat in the American Revolutionary War. 
Although initially occupying only the land east of the Mississippi" 44 | }, 45 | { 46 | "title": "American Revolution", 47 | "text": "second British army at Yorktown in the fall of 1781, effectively ending the war. The Treaty of Paris was signed September 3, 1783, formally ending the conflict and confirming the new nation's complete separation from the British Empire. The United States took possession of nearly all the territory east of the Mississippi River and south of the Great Lakes, with the British retaining control of Canada and Spain taking Florida. Among the significant results of the revolution was the creation of the United States Constitution, establishing a relatively strong federal national government that included an executive, a national judiciary, and" 48 | }, 49 | { 50 | "title": "Decolonization", 51 | "text": "accelerate decolonialization and bring an end to the colonial empires of its Western allies, most importantly during the 1956 Suez Crisis, but American military bases were established around the world and direct and indirect interventions continued in Korea, Indochina, Latin America (\"inter alia\", the 1965 occupation of the Dominican Republic), Africa, and the Middle East to oppose Communist invasions and insurgencies. Since the dissolution of the Soviet Union, the United States has been far less active in the Americas, but invaded Afghanistan and Iraq following the September 11 attacks in 2001, establishing army and air bases in Central Asia. Before" 52 | }, 53 | { 54 | "title": "Decolonization", 55 | "text": "the responsibility of the United Kingdom (with a copy of the new constitution annexed), and finally, if approved, issuance of an Order of Council fixing the exact date of independence. After World War I, several former German and Ottoman territories in the Middle East, Africa, and the Pacific were governed by the UK as League of Nations mandates. Some were administered directly by the UK, and others by British dominions – Nauru and the Territory of New Guinea by Australia, South West Africa by the Union of South Africa, and Western Samoa by New Zealand. Egypt became independent in 1922," 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "Who set the record for longest field goal?", 61 | "answer": "The record for the longest field goal in an NFL game was set by Matt Prater at 64 yards, but the record for the longest field goal at any level was 69 yards, kicked by collegiate kicker Ove Johansson in a 1976 Abilene Christian University football game against East Texas State University.", 62 | "docs": [ 63 | { 64 | "title": "Field goal", 65 | "text": "toward its own end. The longest field goal kick in NFL history is 64 yards, a record set by Matt Prater on December 8, 2013. The previous record was 63, originally set by Tom Dempsey (1970) and then matched by Jason Elam (1998), Sebastian Janikowski (2011), David Akers (2012), and Graham Gano (2018). High school, college and most professional football leagues offer only a three-point field goal; however, some professional leagues have encouraged more rare kicks through \"four-point field goals\". NFL Europe encouraged long field goals of 50 yards or more by making those worth four points instead of three" 66 | }, 67 | { 68 | "title": "Field goal range", 69 | "text": "35 and 40 yard lines (closer in a crosswind) often will go for the more risky fourth down conversion rather than risk either the touchback or the missed field goal. 
The longest field goal in recorded football history was 69 yards, set by collegiate kicker Ove Johansson, who was born in Sweden, in a 1976 Abilene Christian University football game against East Texas State University (now Texas A&M Commerce) at Shotwell Stadium in Abilene. The longest successful field goal in the NFL was 64 yards and was completed by Matt Prater in 2013. The NCAA record is 67 yards held" 70 | }, 71 | { 72 | "title": "Field goal", 73 | "text": "both end zones) is only 66 yards. Scaccia, while playing indoor football, attempted a 64-yard kick that was inches short of success, hitting the crossbar. Longer field goals have been attempted at times; the longest attempt in the NFL, which was well short and was kicked into the wind, was 76 yards, attempted by Sebastian Janikowski of the Oakland Raiders, in a September 28, 2008 game against the San Diego Chargers. NFL Europe rewarded kickers that successfully kicked a field goal of longer than 50 yards with a bonus point, making such field goals worth 4 points instead of 3;" 74 | }, 75 | { 76 | "title": "Field goal", 77 | "text": "this accomplishment is not the official record. All of the above kicks were successful with the use of a kicking tee, which was banned by the NCAA after the 1988 season. The longest known drop-kicked field goal in college football was a 62-yard kick from Pat O'Dea, an Australian kicker who played on the Wisconsin Badgers football team. O'Dea's kick took place in a blizzard against Northwestern on November 15, 1898. The longest field goal in U Sports football history is 59 yards, by Niko Difonte of Calgary Dinos, playing against the UBC Thunderbirds on November 11, 2017. The field" 78 | }, 79 | { 80 | "title": "Field goal range", 81 | "text": "NFL and have been banned from NCAA since 1989) is 68 yards held by Fabrizio Scaccia, and the high school record 68 yards held by Dirk Borgognone; high school has wider goal posts and treats a field goal attempt that lands short in the field of play the same as a punt, making longer attempts much less risky. The indoor football record, with narrower and higher goal posts, is 63 yards (set by Aaron Mills), which is practically as long of a field goal as is possible in that variant of the sport, since the field in indoor football (including" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Who played galen in planet of the apes?", 87 | "answer": "In the 1968 film Planet of the Apes, Galen was played by Wright King. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall.", 88 | "docs": [ 89 | { 90 | "title": "Planet of the Apes", 91 | "text": "installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts." 92 | }, 93 | { 94 | "title": "Planet of the Apes (1968 film)", 95 | "text": "chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). 
While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as" 96 | }, 97 | { 98 | "title": "Planet of the Apes (1968 film)", 99 | "text": "Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the" 100 | }, 101 | { 102 | "title": "Planet of the Apes", 103 | "text": "Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination" 104 | }, 105 | { 106 | "title": "Planet of the Apes", 107 | "text": "film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the" 108 | } 109 | ] 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /prompts/asqa_revised.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing a document, surround its ID with square brackets, such as [x] to cite document x. To cite multiple documents, simply concatenate the citation markers; for example, use [x][y][z] to cite the documents with ID x, y, and z. Cite at least one document and at most three documents in each sentence. 
If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\n\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which is the most rainy place on earth?", 9 | "answer": "Several places on Earth claim to be the most rainy, such as Lloró, Colombia, which reported an average annual rainfall of 12,717 mm between 1952 and 1989, and López de Micay, Colombia, which reported an annual 12,892 mm between 1960 and 2012 [3]. However, the official record is held by Mawsynram, India with an average annual rainfall of 11,872 mm [3], although nearby town Sohra, India, also known as Cherrapunji, holds the record for most rain in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861 [1].", 10 | "docs": [ 11 | { 12 | "title": "Cherrapunji", 13 | "text": "Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861, however: it received in" 14 | }, 15 | { 16 | "title": "Cherrapunji", 17 | "text": "Radio relay station known as Akashvani Cherrapunji. It broadcasts on FM frequencies. Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall" 18 | }, 19 | { 20 | "title": "Mawsynram", 21 | "text": "Mawsynram Mawsynram () is a village in the East Khasi Hills district of Meghalaya state in north-eastern India, 65 kilometres from Shillong. Mawsynram receives one of the highest rainfalls in India. It is reportedly the wettest place on Earth, with an average annual rainfall of 11,872 mm, but that claim is disputed by Lloró, Colombia, which reported an average yearly rainfall of 12,717 mm between 1952 and 1989 and López de Micay, also in Colombia, which reported an annual 12,892 mm per year between 1960 and 2012. According to the \"Guinness Book of World Records\", Mawsynram received of rainfall in 1985. Mawsynram is located at 25° 18′" 22 | }, 23 | { 24 | "title": "Earth rainfall climatology", 25 | "text": "Pacific Northwest, and the Sierra Nevada range are the wetter portions of the nation, with average rainfall exceeding per year. The drier areas are the Desert Southwest, Great Basin, valleys of northeast Arizona, eastern Utah, central Wyoming, eastern Oregon and Washington and the northeast of the Olympic Peninsula. 
The Big Bog on the island of Maui receives, on average, every year, making it the wettest location in the US, and all of Oceania. The annual average rainfall maxima across the continent lie across the northwest from northwest Brazil into northern Peru, Colombia, and Ecuador, then along the Atlantic coast of" 26 | }, 27 | { 28 | "title": "Going to Extremes", 29 | "text": "in the world. Oymyakon in Siberia, where the average winter temperature is −47 °F (− 44 °C). Arica in Chile, where there had been fourteen consecutive years without rain. Fog is the only local source of water. Mawsynram in India, where average annual rainfall is 14 meters, falling within a four-month period in the monsoon season. The rainfall is approximately equal to that of its neighbor Cherrapunji. Dallol in Ethiopia, known as the 'Hell-hole of creation' where the temperature averages 94 °F (34 °C) over the year. In his second series, Middleton visited places without permanent towns, locations where \"survival\"" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "When did the us break away from england?", 35 | "answer": "The United States took the first step towards gaining independence from Great Britain when it declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, 1776, the date when the Declaration of Independence was officially adopted by Congress) [2]. The Treaty of Paris was later signed on September 3, 1783, formally separating the United States from the British Empire [3].", 36 | "docs": [ 37 | { 38 | "title": "United States withdrawal from Saudi Arabia", 39 | "text": "United States withdrawal from Saudi Arabia Beginning during Operation Desert Shield in August 1990, while preparing for the Gulf War, the United States sent a large troop contingent to Saudi Arabia. After the war, remnant troops, primarily U.S. Air Force personnel, augmented by a smaller number of coordinating and training personnel from the U.S. Navy, U.S. Army and U.S. Marine Corps remained in Saudi Arabia under the aegis of Joint Task Force Southwest Asia (JTF-SWA), as part of Operation Southern Watch (OSW). The United Kingdom and France also maintained a small contingent of Royal Air Force and French Air Force" 40 | }, 41 | { 42 | "title": "Decolonization of the Americas", 43 | "text": "and France has fully \"integrated\" most of its former colonies as fully constituent \"departments\" of France. The United States of America declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, the date when the Declaration of Independence was officially adopted by Congress), in so doing becoming the first independent, foreign-recognized nation in the Americas and the first European colonial entity to break from its mother country. Britain formally acknowledged American independence in 1783 after its defeat in the American Revolutionary War. Although initially occupying only the land east of the Mississippi" 44 | }, 45 | { 46 | "title": "American Revolution", 47 | "text": "second British army at Yorktown in the fall of 1781, effectively ending the war. The Treaty of Paris was signed September 3, 1783, formally ending the conflict and confirming the new nation's complete separation from the British Empire. The United States took possession of nearly all the territory east of the Mississippi River and south of the Great Lakes, with the British retaining control of Canada and Spain taking Florida. 
Among the significant results of the revolution was the creation of the United States Constitution, establishing a relatively strong federal national government that included an executive, a national judiciary, and" 48 | }, 49 | { 50 | "title": "Decolonization", 51 | "text": "accelerate decolonialization and bring an end to the colonial empires of its Western allies, most importantly during the 1956 Suez Crisis, but American military bases were established around the world and direct and indirect interventions continued in Korea, Indochina, Latin America (\"inter alia\", the 1965 occupation of the Dominican Republic), Africa, and the Middle East to oppose Communist invasions and insurgencies. Since the dissolution of the Soviet Union, the United States has been far less active in the Americas, but invaded Afghanistan and Iraq following the September 11 attacks in 2001, establishing army and air bases in Central Asia. Before" 52 | }, 53 | { 54 | "title": "Decolonization", 55 | "text": "the responsibility of the United Kingdom (with a copy of the new constitution annexed), and finally, if approved, issuance of an Order of Council fixing the exact date of independence. After World War I, several former German and Ottoman territories in the Middle East, Africa, and the Pacific were governed by the UK as League of Nations mandates. Some were administered directly by the UK, and others by British dominions – Nauru and the Territory of New Guinea by Australia, South West Africa by the Union of South Africa, and Western Samoa by New Zealand. Egypt became independent in 1922," 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "Who set the record for longest field goal?", 61 | "answer": "The record for the longest field goal in an NFL game was set by Matt Prater at 64 yards [1], but the record for the longest field goal at any level was 69 yards, kicked by collegiate kicker Ove Johansson in a 1976 Abilene Christian University football game against East Texas State University [2].", 62 | "docs": [ 63 | { 64 | "title": "Field goal", 65 | "text": "toward its own end. The longest field goal kick in NFL history is 64 yards, a record set by Matt Prater on December 8, 2013. The previous record was 63, originally set by Tom Dempsey (1970) and then matched by Jason Elam (1998), Sebastian Janikowski (2011), David Akers (2012), and Graham Gano (2018). High school, college and most professional football leagues offer only a three-point field goal; however, some professional leagues have encouraged more rare kicks through \"four-point field goals\". NFL Europe encouraged long field goals of 50 yards or more by making those worth four points instead of three" 66 | }, 67 | { 68 | "title": "Field goal range", 69 | "text": "35 and 40 yard lines (closer in a crosswind) often will go for the more risky fourth down conversion rather than risk either the touchback or the missed field goal. The longest field goal in recorded football history was 69 yards, set by collegiate kicker Ove Johansson, who was born in Sweden, in a 1976 Abilene Christian University football game against East Texas State University (now Texas A&M Commerce) at Shotwell Stadium in Abilene. The longest successful field goal in the NFL was 64 yards and was completed by Matt Prater in 2013. The NCAA record is 67 yards held" 70 | }, 71 | { 72 | "title": "Field goal", 73 | "text": "both end zones) is only 66 yards. Scaccia, while playing indoor football, attempted a 64-yard kick that was inches short of success, hitting the crossbar. 
Longer field goals have been attempted at times; the longest attempt in the NFL, which was well short and was kicked into the wind, was 76 yards, attempted by Sebastian Janikowski of the Oakland Raiders, in a September 28, 2008 game against the San Diego Chargers. NFL Europe rewarded kickers that successfully kicked a field goal of longer than 50 yards with a bonus point, making such field goals worth 4 points instead of 3;" 74 | }, 75 | { 76 | "title": "Field goal", 77 | "text": "this accomplishment is not the official record. All of the above kicks were successful with the use of a kicking tee, which was banned by the NCAA after the 1988 season. The longest known drop-kicked field goal in college football was a 62-yard kick from Pat O'Dea, an Australian kicker who played on the Wisconsin Badgers football team. O'Dea's kick took place in a blizzard against Northwestern on November 15, 1898. The longest field goal in U Sports football history is 59 yards, by Niko Difonte of Calgary Dinos, playing against the UBC Thunderbirds on November 11, 2017. The field" 78 | }, 79 | { 80 | "title": "Field goal range", 81 | "text": "NFL and have been banned from NCAA since 1989) is 68 yards held by Fabrizio Scaccia, and the high school record 68 yards held by Dirk Borgognone; high school has wider goal posts and treats a field goal attempt that lands short in the field of play the same as a punt, making longer attempts much less risky. The indoor football record, with narrower and higher goal posts, is 63 yards (set by Aaron Mills), which is practically as long of a field goal as is possible in that variant of the sport, since the field in indoor football (including" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Who played galen in planet of the apes?", 87 | "answer": "In the 1968 film Planet of the Apes, Galen was played by Wright King [2]. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall [1].", 88 | "docs": [ 89 | { 90 | "title": "Planet of the Apes", 91 | "text": "installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts." 92 | }, 93 | { 94 | "title": "Planet of the Apes (1968 film)", 95 | "text": "chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as" 96 | }, 97 | { 98 | "title": "Planet of the Apes (1968 film)", 99 | "text": "Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. 
Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the" 100 | }, 101 | { 102 | "title": "Planet of the Apes", 103 | "text": "Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination" 104 | }, 105 | { 106 | "title": "Planet of the Apes", 107 | "text": "film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the" 108 | } 109 | ] 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /prompts/qampari_nocite.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Provide a list of accurate answers for the given question using only the provided search results (some of which might be irrelevant). Separate answers by commas. For questions that have more than 5 answers, write at least 5 answers.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which books were written by Nevil Shute?", 9 | "answer": "Marazan, Stephen Morris, Beyond the Black Stump, Lonely Road, The Chequer Board, In the Wet, Trustee from the Toolroom, Round the Bend, No Highway, Ruined City, On the Beach.", 10 | "docs": [ 11 | { 12 | "title": "Nevil Shute", 13 | "text": "early stages. My congratulations.\" His celebrity as a writer caused the Ministry of Information to send him to the Normandy Landings on 6 June 1944 and later to Burma as a correspondent. He finished the war with the rank of lieutenant commander in the Royal Navy Volunteer Reserves (RNVR). Shute's first novel, \"Stephen Morris\", was written in 1923, but not published until 1961. His first published novel was \"Marazan\", which came out in 1926. 
After that he averaged one novel every two years through the 1950s, with the exception of a six-year hiatus while he was establishing his own aircraft" 14 | }, 15 | { 16 | "title": "Nevil Shute", 17 | "text": "theme is the bridging of social barriers such as class (\"Lonely Road\" and \"Landfall\"), race (\"The Chequer Board\"), or religion (\"Round the Bend\"). The Australian novels are individual hymns to that country, with subtle disparagement of the mores of the United States (\"Beyond the Black Stump\") and overt antipathy towards the post-World War II socialist government of Shute's native Britain (\"The Far Country\" and \"In the Wet\"). Shute's heroes tended to be like himself: middle class solicitors, doctors, accountants, bank managers, engineers, generally university graduates. However (as in \"Trustee from the Toolroom\"), Shute valued the honest artisans and their social" 18 | }, 19 | { 20 | "title": "Nevil Shute", 21 | "text": "construction company, Airspeed Ltd. His popularity grew slowly with each novel, but he became much more famous after the publication of \"On the Beach\" in 1957. Shute's novels are written in a simple, highly readable style, with clearly delineated plot lines. Where there is a romantic element, sex is referred to only obliquely. Many of the stories are introduced by a narrator who is not a character in the story. The most common theme in Shute's novels is the dignity of work, spanning all classes, whether an Eastern European bar \"hostess\" (\"Ruined City\") or brilliant boffin (\"No Highway\"). Another recurrent" 22 | }, 23 | { 24 | "title": "The Chequer Board", 25 | "text": "the Burmese people\", both of which are central to the book's story. Shute was concerned that sales of the book in the United States would be negatively impacted by the book's open-minded handling of racial issues; as it turned out, sales soared. Shute and his wife traveled the U.S. on Greyhound buses to \"\"get in touch with the man on the street,\"\" finding the experience refreshing. Afterwards he wrote \"\"Sincerity is the first attribute for making money in the business of writing novels.\"\" The Chequer Board The Chequer Board is a novel by Nevil Shute, first published in the United" 26 | }, 27 | { 28 | "title": "In the Wet", 29 | "text": "had used the idea of multiple votes for merit in his short story \"The Curious Republic of Gondour\". In the Wet In The Wet is a novel by Nevil Shute that was first published in the United Kingdom in 1953. It contains many of the typical elements of a hearty and adventurous Shute yarn such as flying, the future, mystic states, and ordinary people doing extraordinary things. The story is opened by its initial narrator \u2013 an Anglican priest in the Bush Brotherhood named Roger Hargreaves \u2013 who describes his ordinary circumstances in a large parish of the Australian outback" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "Which film has Gong Li as a member of its cast?", 35 | "answer": "The Story of Qiu Ju, Farewell My Concubine, Flirting Scholar, The Monkey King 2, Mulan, Saturday Fiction, Coming Home.", 36 | "docs": [ 37 | { 38 | "title": "Gong Li", 39 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". 
She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 40 | }, 41 | { 42 | "title": "Gong Li", 43 | "text": "making her realize that she has assisted the dark cynical system. In 1993, she received a New York Film Critics Circle award for her role in \"Farewell My Concubine\" (1993). Directed by Chen Kaige, the film was her first major role with a director other than Zhang Yimou. In the same year, she was awarded with the Berlinale Camera at the 43rd Berlin International Film Festival. \"Premiere\" magazine ranked her performance in \"Farewell My Concubine\" as the 89th greatest performance of all time. She also worked with renowned director Stephen Chow in comedy films \"\" (1991) and \"Flirting Scholar\" (1993)." 44 | }, 45 | { 46 | "title": "Gong Li", 47 | "text": "International Film Festival. Later that same year, she reunited with Zhang Yimou for the film \"Coming Home\", which is set during the throes of the Cultural Revolution; this film was their first collaboration since 2006. In 2016, Gong took on her first action role in \"The Monkey King 2\", playing the White Bone Demon. In 2018, Gong was cast in Lou Ye's period drama \"Saturday Fiction\", where she plays an actress who is working undercover gathering intelligence for the Allies. That year, she was also cast in the live-action adaptation of the 1998 Disney animated film \"Mulan\", as an unspecified" 48 | }, 49 | { 50 | "title": "Zhang Yimou", 51 | "text": "in Zhang's earlier films. \"Raise the Red Lantern\" was nominated in the Best Foreign Language Film category at the 1992 Academy Awards, becoming the second Chinese film to earn this distinction (after Zhang's \"Ju Dou\"). It eventually lost out to Gabriele Salvatores's \"Mediterraneo\". Zhang's next directorial work, \"The Story of Qiu Ju\", in 1992, once again starring Gong Li in the lead role. The film, which tells the tale of a peasant woman seeking justice for her husband after he was beaten by a village official, was a hit at film festivals and won the Golden Lion award at the" 52 | }, 53 | { 54 | "title": "Gong Li", 55 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "In which years did Patti LaBelle publish music?", 61 | "answer": "2006, 1977, 2004, 2005, 2000, 2006.", 62 | "docs": [ 63 | { 64 | "title": "The Gospel According to Patti LaBelle", 65 | "text": "The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November 2006. 
This project began three years ago when Patti's late musical director and close friend Budd Ellison told a skeptical LaBelle that \"it's now or never, Patti.\" The album is dedicated to his memory as he succumbed to prostate cancer before the album saw a release. The album was released on November 21, 2006 through indie label Umbrella/Bungalow Records, also home to Carl Thomas, Rodney Jerkins, Dean \"DC\" Charles, and other artists. \"The Gospel According" 66 | }, 67 | { 68 | "title": "Patti LaBelle (album)", 69 | "text": "scaled the high sixties on the \"Billboard\" R&B chart, it soon became one of her famous show-stoppers while performing the song. LaBelle performed the song at her first solo concert in London, getting a standing ovation, which helped to give LaBelle motivation to continue her career. The album, when released, performed successfully, reaching number 62 on the \"Billboard\" 200 and number 31 on the R&B albums chart, while critics hailed the album. Patti LaBelle (album) Patti LaBelle is the debut solo album by singer Patti LaBelle, released in 1977. The first album LaBelle recorded after sixteen years fronting the band" 70 | }, 71 | { 72 | "title": "Patti LaBelle", 73 | "text": "win. In 2000, LaBelle released her final MCA album, \"When a Woman Loves\", before signing with Def Soul Classics to release the 2004 album, \"Timeless Journey\". Following the release of her 2005 covers album, \"Classic Moments\", LaBelle engaged in a rivalry with Antonio \"L.A.\" Reid over the direction of her career, leading to her leaving the label.In the same year, the World Music Awards recognized her years in the music business by awarding her the Legend Award. In 2006, she released her first gospel album, \"The Gospel According to Patti LaBelle\" on the Bungalo label, the album later peaking at" 74 | }, 75 | { 76 | "title": "Patti LaBelle", 77 | "text": "Patti LaBelle Patti LaBelle (born Patricia Louise Holt; May 24, 1944) is an American singer, actress, and entrepreneur. LaBelle began her career in the early 1960s as lead singer and front woman of the vocal group, Patti LaBelle and the Bluebelles. Following the group's name change to Labelle in the early 1970s, they released the iconic disco song \"Lady Marmalade\" and the group later became the first African-American vocal group to land the cover of \"Rolling Stone\" magazine. After the group split in 1976, LaBelle began a successful solo career, starting with her critically acclaimed debut album, which included the" 78 | }, 79 | { 80 | "title": "The Gospel According to Patti LaBelle", 81 | "text": "Billboard's Top Gospel Albums chart for 17 weeks. \"Where Love Begins,\" a duet with Yolanda Adams was played frequently on R&B and gospel radio stations and debuted at #68 on Billboard's Hot R&B/Hip-Hop tracks. The second single \"Anything\" featuring Kanye West, Mary Mary and Consequence hit #64 on Billboards Hot R&B/Hip-Hop tracks. In 2008, the album was nominated for a Dove Award for Contemporary Gospel Album of the Year at the 39th GMA Dove Awards. 
The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Glenn Ford was a member of cast in which film?", 87 | "answer": "So Ends Our Night, Heaven with a Barbed Wire Fence, Happy Birthday to Me, The Greatest Gift, The Gift, The Brotherhood of the Bell.", 88 | "docs": [ 89 | { 90 | "title": "Glenn Ford", 91 | "text": "name came from his father's hometown of Glenford, Alberta. His first major movie part was in the 1939 film, \"Heaven with a Barbed Wire Fence\". Top Hollywood director John Cromwell was impressed enough with his work to borrow him from Columbia for the independently produced drama, \"So Ends Our Night\" (1941), where Ford delivered a poignant portrayal of a 19-year-old German exile on the run in Nazi-occupied Europe. Working with Academy Award-winning Fredric March and wooing (onscreen) 30-year-old Margaret Sullavan, recently nominated for an Oscar, Ford's shy, ardent young refugee riveted attention even in such stellar company. \"Glenn Ford, a" 92 | }, 93 | { 94 | "title": "Glenn Ford", 95 | "text": "were Westerns. He suggested doing a Western series, instead, which resulted in the \"modern-day Western\" series, \"Cade's County\". Ford played southwestern Sheriff Cade for one season (1971\u20131972) in a mix of police mystery and western drama. In \"The Family Holvak\" (1975\u20131976), Ford portrayed a Depression-era preacher in a family drama, reprising the same character he had played in the TV film, \"The Greatest Gift\". In 1978 Ford was host, presenter and narrator of the disaster documentary series 'When Havoc Struck'. In 1981, Ford co-starred with Melissa Sue Anderson in the slasher film \"Happy Birthday to Me\". In 1991, Ford agreed" 96 | }, 97 | { 98 | "title": "CBS Thursday Night Movie", 99 | "text": "Night Movie\" opened its fall schedule with the premiere of a low-budget, made-for-TV movie, rather than a proven Hollywood blockbuster guaranteed to lure mass viewership, it became CBS's way of declaring its commitment to product that, although cheaply manufactured, was nevertheless new and topical. In this case, the movie was \"The Brotherhood of the Bell\", and the film's star was Glenn Ford, a movie actor who had never appeared in a television-film. In fact, before shooting on the project even began, Ford had been warned by friends in the industry that he would hate the experience. Instead, the actor reported" 100 | }, 101 | { 102 | "title": "The Trouble with Girls (film) ", 103 | "text": "with Charlene, but when she refuses to give in, he deceives her and uses the local police force to be sure that she must leave on the train with the rest of the troupe. Cast notes In June 1959 it was announced that Don Mankiewicz would write a screenplay of an unpublished story by Mauri Grashin, Day Keene, and Dwight Babcock. By December 1960, with the project titled \"Chautauqua\", MGM was ready to make the film with Glenn Ford. Rumours circulating in Hollywood at the time stated that Presley would co-star with Ford, Hope Lange, and Arthur O'Connell, but nothing" 104 | }, 105 | { 106 | "title": "Trouble in the Glen", 107 | "text": "Mel Ferrer. It was Orson Welles' fifth British movie in six months. Filming started 15 December 1953. The film received very poor reviews. 
Trouble in the Glen Trouble in the Glen is a 1954 British comedy film directed by Herbert Wilcox and starring Margaret Lockwood, Orson Welles, Forrest Tucker and Victor McLaglen. It is loosely based on Maurice Walsh's 1950 novel of the same name. It was filmed in Trucolor for Republic Pictures. After moving from South America to the Scottish Highlands, millionaire Sanin Cejador y Mengues (Welles) reassumes the title of laird of Glen Easan, which he inherited from" 108 | } 109 | ] 110 | } 111 | ] 112 | } 113 | -------------------------------------------------------------------------------- /prompts/qampari_revised.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Provide a list of accurate answers for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Always cite one and only one document for each answer. When citing a document, surround its ID with square brackets, such as [x] to cite document x. Separate answers by commas. For questions that have more than 5 answers, write at least 5 answers.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which books were written by Nevil Shute?", 9 | "answer": "Marazan [1], Stephen Morris [1], Beyond the Black Stump [2], Lonely Road [2], The Chequer Board [2], In the Wet [2], Trustee from the Toolroom [2], Round the Bend [2], No Highway [3], Ruined City [3], On the Beach [3].", 10 | "docs": [ 11 | { 12 | "title": "Nevil Shute", 13 | "text": "early stages. My congratulations.\" His celebrity as a writer caused the Ministry of Information to send him to the Normandy Landings on 6 June 1944 and later to Burma as a correspondent. He finished the war with the rank of lieutenant commander in the Royal Navy Volunteer Reserves (RNVR). Shute's first novel, \"Stephen Morris\", was written in 1923, but not published until 1961. His first published novel was \"Marazan\", which came out in 1926. After that he averaged one novel every two years through the 1950s, with the exception of a six-year hiatus while he was establishing his own aircraft" 14 | }, 15 | { 16 | "title": "Nevil Shute", 17 | "text": "theme is the bridging of social barriers such as class (\"Lonely Road\" and \"Landfall\"), race (\"The Chequer Board\"), or religion (\"Round the Bend\"). The Australian novels are individual hymns to that country, with subtle disparagement of the mores of the United States (\"Beyond the Black Stump\") and overt antipathy towards the post-World War II socialist government of Shute's native Britain (\"The Far Country\" and \"In the Wet\"). Shute's heroes tended to be like himself: middle class solicitors, doctors, accountants, bank managers, engineers, generally university graduates. However (as in \"Trustee from the Toolroom\"), Shute valued the honest artisans and their social" 18 | }, 19 | { 20 | "title": "Nevil Shute", 21 | "text": "construction company, Airspeed Ltd. His popularity grew slowly with each novel, but he became much more famous after the publication of \"On the Beach\" in 1957. Shute's novels are written in a simple, highly readable style, with clearly delineated plot lines. Where there is a romantic element, sex is referred to only obliquely. Many of the stories are introduced by a narrator who is not a character in the story. 
The most common theme in Shute's novels is the dignity of work, spanning all classes, whether an Eastern European bar \"hostess\" (\"Ruined City\") or brilliant boffin (\"No Highway\"). Another recurrent" 22 | }, 23 | { 24 | "title": "The Chequer Board", 25 | "text": "the Burmese people\", both of which are central to the book's story. Shute was concerned that sales of the book in the United States would be negatively impacted by the book's open-minded handling of racial issues; as it turned out, sales soared. Shute and his wife traveled the U.S. on Greyhound buses to \"\"get in touch with the man on the street,\"\" finding the experience refreshing. Afterwards he wrote \"\"Sincerity is the first attribute for making money in the business of writing novels.\"\" The Chequer Board The Chequer Board is a novel by Nevil Shute, first published in the United" 26 | }, 27 | { 28 | "title": "In the Wet", 29 | "text": "had used the idea of multiple votes for merit in his short story \"The Curious Republic of Gondour\". In the Wet In The Wet is a novel by Nevil Shute that was first published in the United Kingdom in 1953. It contains many of the typical elements of a hearty and adventurous Shute yarn such as flying, the future, mystic states, and ordinary people doing extraordinary things. The story is opened by its initial narrator \u2013 an Anglican priest in the Bush Brotherhood named Roger Hargreaves \u2013 who describes his ordinary circumstances in a large parish of the Australian outback" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "Which film has Gong Li as a member of its cast?", 35 | "answer": "The Story of Qiu Ju [1], Farewell My Concubine [2], Flirting Scholar [2], The Monkey King 2 [3], Mulan [3], Saturday Fiction [3], Coming Home [3].", 36 | "docs": [ 37 | { 38 | "title": "Gong Li", 39 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 40 | }, 41 | { 42 | "title": "Gong Li", 43 | "text": "making her realize that she has assisted the dark cynical system. In 1993, she received a New York Film Critics Circle award for her role in \"Farewell My Concubine\" (1993). Directed by Chen Kaige, the film was her first major role with a director other than Zhang Yimou. In the same year, she was awarded with the Berlinale Camera at the 43rd Berlin International Film Festival. \"Premiere\" magazine ranked her performance in \"Farewell My Concubine\" as the 89th greatest performance of all time. She also worked with renowned director Stephen Chow in comedy films \"\" (1991) and \"Flirting Scholar\" (1993)." 44 | }, 45 | { 46 | "title": "Gong Li", 47 | "text": "International Film Festival. Later that same year, she reunited with Zhang Yimou for the film \"Coming Home\", which is set during the throes of the Cultural Revolution; this film was their first collaboration since 2006. In 2016, Gong took on her first action role in \"The Monkey King 2\", playing the White Bone Demon. 
In 2018, Gong was cast in Lou Ye's period drama \"Saturday Fiction\", where she plays an actress who is working undercover gathering intelligence for the Allies. That year, she was also cast in the live-action adaptation of the 1998 Disney animated film \"Mulan\", as an unspecified" 48 | }, 49 | { 50 | "title": "Zhang Yimou", 51 | "text": "in Zhang's earlier films. \"Raise the Red Lantern\" was nominated in the Best Foreign Language Film category at the 1992 Academy Awards, becoming the second Chinese film to earn this distinction (after Zhang's \"Ju Dou\"). It eventually lost out to Gabriele Salvatores's \"Mediterraneo\". Zhang's next directorial work, \"The Story of Qiu Ju\", in 1992, once again starring Gong Li in the lead role. The film, which tells the tale of a peasant woman seeking justice for her husband after he was beaten by a village official, was a hit at film festivals and won the Golden Lion award at the" 52 | }, 53 | { 54 | "title": "Gong Li", 55 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "In which years did Patti LaBelle publish music?", 61 | "answer": "2006 [1], 1977 [2], 2004 [3], 2005 [3], 2000 [3], 2006 [3].", 62 | "docs": [ 63 | { 64 | "title": "The Gospel According to Patti LaBelle", 65 | "text": "The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November 2006. This project began three years ago when Patti's late musical director and close friend Budd Ellison told a skeptical LaBelle that \"it's now or never, Patti.\" The album is dedicated to his memory as he succumbed to prostate cancer before the album saw a release. The album was released on November 21, 2006 through indie label Umbrella/Bungalow Records, also home to Carl Thomas, Rodney Jerkins, Dean \"DC\" Charles, and other artists. \"The Gospel According" 66 | }, 67 | { 68 | "title": "Patti LaBelle (album)", 69 | "text": "scaled the high sixties on the \"Billboard\" R&B chart, it soon became one of her famous show-stoppers while performing the song. LaBelle performed the song at her first solo concert in London, getting a standing ovation, which helped to give LaBelle motivation to continue her career. The album, when released, performed successfully, reaching number 62 on the \"Billboard\" 200 and number 31 on the R&B albums chart, while critics hailed the album. Patti LaBelle (album) Patti LaBelle is the debut solo album by singer Patti LaBelle, released in 1977. The first album LaBelle recorded after sixteen years fronting the band" 70 | }, 71 | { 72 | "title": "Patti LaBelle", 73 | "text": "win. In 2000, LaBelle released her final MCA album, \"When a Woman Loves\", before signing with Def Soul Classics to release the 2004 album, \"Timeless Journey\". 
Following the release of her 2005 covers album, \"Classic Moments\", LaBelle engaged in a rivalry with Antonio \"L.A.\" Reid over the direction of her career, leading to her leaving the label.In the same year, the World Music Awards recognized her years in the music business by awarding her the Legend Award. In 2006, she released her first gospel album, \"The Gospel According to Patti LaBelle\" on the Bungalo label, the album later peaking at" 74 | }, 75 | { 76 | "title": "Patti LaBelle", 77 | "text": "Patti LaBelle Patti LaBelle (born Patricia Louise Holt; May 24, 1944) is an American singer, actress, and entrepreneur. LaBelle began her career in the early 1960s as lead singer and front woman of the vocal group, Patti LaBelle and the Bluebelles. Following the group's name change to Labelle in the early 1970s, they released the iconic disco song \"Lady Marmalade\" and the group later became the first African-American vocal group to land the cover of \"Rolling Stone\" magazine. After the group split in 1976, LaBelle began a successful solo career, starting with her critically acclaimed debut album, which included the" 78 | }, 79 | { 80 | "title": "The Gospel According to Patti LaBelle", 81 | "text": "Billboard's Top Gospel Albums chart for 17 weeks. \"Where Love Begins,\" a duet with Yolanda Adams was played frequently on R&B and gospel radio stations and debuted at #68 on Billboard's Hot R&B/Hip-Hop tracks. The second single \"Anything\" featuring Kanye West, Mary Mary and Consequence hit #64 on Billboards Hot R&B/Hip-Hop tracks. In 2008, the album was nominated for a Dove Award for Contemporary Gospel Album of the Year at the 39th GMA Dove Awards. The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Glenn Ford was a member of cast in which film?", 87 | "answer": "So Ends Our Night [1], Heaven with a Barbed Wire Fence [1], Happy Birthday to Me [2], The Greatest Gift [2], The Gift [2], The Brotherhood of the Bell [3].", 88 | "docs": [ 89 | { 90 | "title": "Glenn Ford", 91 | "text": "name came from his father's hometown of Glenford, Alberta. His first major movie part was in the 1939 film, \"Heaven with a Barbed Wire Fence\". Top Hollywood director John Cromwell was impressed enough with his work to borrow him from Columbia for the independently produced drama, \"So Ends Our Night\" (1941), where Ford delivered a poignant portrayal of a 19-year-old German exile on the run in Nazi-occupied Europe. Working with Academy Award-winning Fredric March and wooing (onscreen) 30-year-old Margaret Sullavan, recently nominated for an Oscar, Ford's shy, ardent young refugee riveted attention even in such stellar company. \"Glenn Ford, a" 92 | }, 93 | { 94 | "title": "Glenn Ford", 95 | "text": "were Westerns. He suggested doing a Western series, instead, which resulted in the \"modern-day Western\" series, \"Cade's County\". Ford played southwestern Sheriff Cade for one season (1971\u20131972) in a mix of police mystery and western drama. In \"The Family Holvak\" (1975\u20131976), Ford portrayed a Depression-era preacher in a family drama, reprising the same character he had played in the TV film, \"The Greatest Gift\". In 1978 Ford was host, presenter and narrator of the disaster documentary series 'When Havoc Struck'. In 1981, Ford co-starred with Melissa Sue Anderson in the slasher film \"Happy Birthday to Me\". 
In 1991, Ford agreed" 96 | }, 97 | { 98 | "title": "CBS Thursday Night Movie", 99 | "text": "Night Movie\" opened its fall schedule with the premiere of a low-budget, made-for-TV movie, rather than a proven Hollywood blockbuster guaranteed to lure mass viewership, it became CBS's way of declaring its commitment to product that, although cheaply manufactured, was nevertheless new and topical. In this case, the movie was \"The Brotherhood of the Bell\", and the film's star was Glenn Ford, a movie actor who had never appeared in a television-film. In fact, before shooting on the project even began, Ford had been warned by friends in the industry that he would hate the experience. Instead, the actor reported" 100 | }, 101 | { 102 | "title": "The Trouble with Girls (film) ", 103 | "text": "with Charlene, but when she refuses to give in, he deceives her and uses the local police force to be sure that she must leave on the train with the rest of the troupe. Cast notes In June 1959 it was announced that Don Mankiewicz would write a screenplay of an unpublished story by Mauri Grashin, Day Keene, and Dwight Babcock. By December 1960, with the project titled \"Chautauqua\", MGM was ready to make the film with Glenn Ford. Rumours circulating in Hollywood at the time stated that Presley would co-star with Ford, Hope Lange, and Arthur O'Connell, but nothing" 104 | }, 105 | { 106 | "title": "Trouble in the Glen", 107 | "text": "Mel Ferrer. It was Orson Welles' fifth British movie in six months. Filming started 15 December 1953. The film received very poor reviews. Trouble in the Glen Trouble in the Glen is a 1954 British comedy film directed by Herbert Wilcox and starring Margaret Lockwood, Orson Welles, Forrest Tucker and Victor McLaglen. It is loosely based on Maurice Walsh's 1950 novel of the same name. It was filmed in Trucolor for Republic Pictures. 
After moving from South America to the Scottish Highlands, millionaire Sanin Cejador y Mengues (Welles) reassumes the title of laird of Glen Easan, which he inherited from" 108 | } 109 | ] 110 | } 111 | ] 112 | } 113 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | ninja 3 | packaging 4 | torch 5 | datasets 6 | transformers 7 | accelerate 8 | sentencepiece 9 | pytrec_eval 10 | rouge_score 11 | openai 12 | -------------------------------------------------------------------------------- /scripts/collect_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import pandas as pd 5 | import yaml 6 | from dataclasses import dataclass, asdict 7 | from tqdm import tqdm 8 | 9 | dataset_to_metrics = { 10 | "json_kv": "substring_exact_match", 11 | "nq": "substring_exact_match", 12 | "popqa": "substring_exact_match", 13 | "triviaqa": "substring_exact_match", 14 | "hotpotqa": "substring_exact_match", 15 | 16 | "narrativeqa": ["gpt-4-score"], 17 | "msmarco_rerank_psg": "NDCG@10", 18 | 19 | "trec_coarse": "exact_match", 20 | "trec_fine": "exact_match", 21 | "banking77": "exact_match", 22 | "clinic150": "exact_match", 23 | "nlu": "exact_match", 24 | 25 | "qmsum": "rougeL_recall", 26 | "multi_lexsum": ["gpt-4-f1"], 27 | 28 | "ruler_niah_s_1": "ruler_recall", 29 | "ruler_niah_s_2": "ruler_recall", 30 | "ruler_niah_s_3": "ruler_recall", 31 | "ruler_niah_mk_1": "ruler_recall", 32 | "ruler_niah_mk_2": "ruler_recall", 33 | "ruler_niah_mk_3": "ruler_recall", 34 | "ruler_niah_mq": "ruler_recall", 35 | "ruler_niah_mv": "ruler_recall", 36 | "ruler_fwe": "ruler_recall", 37 | "ruler_cwe": "ruler_recall", 38 | "ruler_vt": "ruler_recall", 39 | "ruler_qa_1": "substring_exact_match", 40 | "ruler_qa_2": "substring_exact_match", 41 | 42 | "infbench_qa": ["rougeL_f1"], 43 | "infbench_choice": ["exact_match"], 44 | "infbench_sum": ["gpt-4-f1"], 45 | 46 | "alce_asqa": ["str_em", "citation_rec", "citation_prec"], 47 | "alce_qampari": ["qampari_rec_top5", "citation_rec", "citation_prec"], 48 | } 49 | 50 | dataset_to_metrics = {k: [v] if isinstance(v, str) else v for k, v in dataset_to_metrics.items()} 51 | custom_avgs = { 52 | "Recall": ["json_kv substring_exact_match", "ruler_niah_mk_2 ruler_recall", "ruler_niah_mk_3 ruler_recall", "ruler_niah_mv ruler_recall"], 53 | "RAG": ['nq substring_exact_match', 'hotpotqa substring_exact_match', 'popqa substring_exact_match', 'triviaqa substring_exact_match',], 54 | "ICL": ['trec_coarse exact_match', 'trec_fine exact_match', 'banking77 exact_match', 'clinic150 exact_match', 'nlu exact_match'], 55 | "Cite": ['alce_asqa str_em', 'alce_asqa citation_rec', 'alce_asqa citation_prec', 'alce_qampari qampari_rec_top5', 'alce_qampari citation_rec', 'alce_qampari citation_prec', ], 56 | "Re-rank": ['msmarco_rerank_psg NDCG@10', ], 57 | "LongQA": ['narrativeqa gpt-4-score', 'infbench_qa rougeL_f1', 'infbench_choice exact_match', ], 58 | "Summ": ['infbench_sum gpt-4-f1', 'multi_lexsum gpt-4-f1', ], 59 | # "RULER": ['ruler_niah_s_1 ruler_recall', 'ruler_niah_s_2 ruler_recall', 'ruler_niah_s_3 ruler_recall', 'ruler_niah_mk_1 ruler_recall', 'ruler_niah_mk_2 ruler_recall', 'ruler_niah_mk_3 ruler_recall', 'ruler_niah_mq ruler_recall', 'ruler_niah_mv ruler_recall', 'ruler_cwe ruler_recall', 'ruler_fwe ruler_recall', 'ruler_vt ruler_recall', 'ruler_qa_1 
substring_exact_match', 'ruler_qa_2 substring_exact_match'], 60 | "Ours": ['Recall', 'RAG', 'ICL', 'Cite', 'Re-rank', 'LongQA', 'Summ'], 61 | } 62 | 63 | @dataclass 64 | class arguments: 65 | tag: str = "v1" 66 | input_max_length: int = 131072 67 | generation_max_length: int = 100 68 | generation_min_length: int = 0 69 | max_test_samples: int = 100 70 | shots: int = 2 71 | do_sample: bool = False 72 | temperature: float = 0.0 73 | top_p: float = 1.0 74 | use_chat_template: bool = False 75 | seed: int = 42 76 | test_name: str = "" 77 | dataset: str = "nq" 78 | output_dir: str = "output" 79 | popularity_threshold: float = 3 80 | 81 | category: str = "synthetic" 82 | 83 | def update(self, new): 84 | for key, value in new.items(): 85 | if hasattr(self, key): 86 | setattr(self, key, value) 87 | 88 | def get_path(self): 89 | tag = self.tag 90 | path = os.path.join(self.output_dir, "{args.dataset}_{tag}_{args.test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json".format(args=self, tag=tag)) 91 | 92 | if os.path.exists(path.replace(".json", "-gpt4eval_o.json")): 93 | return path.replace(".json", "-gpt4eval_o.json") 94 | if "alce" in self.dataset: 95 | return path.replace(".json", ".json.score") 96 | 97 | if os.path.exists(path + ".score"): 98 | return path + ".score" 99 | return path 100 | 101 | def get_metric_name(self): 102 | for d, m in dataset_to_metrics.items(): 103 | if d in self.dataset: 104 | return d, m 105 | return None 106 | 107 | def get_averaged_metric(self): 108 | path = self.get_path() 109 | print(path) 110 | if not os.path.exists(path): 111 | print("path doesn't exist") 112 | return None 113 | with open(path) as f: 114 | results = json.load(f) 115 | 116 | _, metric = self.get_metric_name() 117 | if path.endswith(".score"): 118 | if any([m not in results for m in metric]): 119 | print("metric doesn't exist") 120 | return None 121 | s = {m: results[m] for m in metric} 122 | else: 123 | if any([m not in results["averaged_metrics"] for m in metric]): 124 | print("metric doesn't exist") 125 | return None 126 | s = {m: results['averaged_metrics'][m] for m in metric} 127 | 128 | s = {m : v * (100 if m == "gpt-4-f1" else 1) * (100/3 if m == "gpt-4-score" else 1) for m, v in s.items()} 129 | print("found scores:", s) 130 | return s 131 | 132 | def get_metric_by_depth(self): 133 | path = self.get_path() 134 | path = path.replace(".score", '') 135 | print(path) 136 | if not os.path.exists(path): 137 | return None 138 | with open(path) as f: 139 | results = json.load(f) 140 | 141 | output = [] 142 | _, metric = self.get_metric_name() 143 | metric = metric[0] 144 | keys = ["depth", "k", metric] 145 | for d in results["data"]: 146 | o = {} 147 | for key in keys: 148 | if key == "k" and "ctxs" in d: 149 | d["k"] = len(d['ctxs']) 150 | if key not in d: 151 | print("no", key) 152 | return None 153 | o[key] = d[key] 154 | o["metric"] = o.pop(metric) 155 | output.append(o) 156 | 157 | df = pd.DataFrame(output) 158 | dfs = df.groupby(list(output[0].keys())[:-1]).mean().reset_index() 159 | 160 | return dfs.to_dict("records") 161 | 162 | if __name__ == "__main__": 163 | # comment out the models you don't want to include, or add the new ones 164 | models_configs = [ 165 | {"model": "gpt-4-0125-preview", "use_chat_template": True, "training_length": 128000}, 166 | {"model": "gpt-4o-mini-2024-07-18", 
"use_chat_template": True, "training_length": 128000}, 167 | {"model": "gpt-4o-2024-05-13", "use_chat_template": True, "training_length": 128000}, 168 | {"model": "gpt-4o-2024-08-06", "use_chat_template": True, "training_length": 128000}, 169 | {"model": "claude-3-5-sonnet-20240620", "use_chat_template": True, "training_length": 200000}, 170 | {"model": "gemini-1.5-flash-001", "use_chat_template": True, "training_length": 1048576}, 171 | {"model": "gemini-1.5-pro-001", "use_chat_template": True, "training_length": 2097152}, 172 | 173 | # llama 2 based models 174 | {"model": "Llama-2-7B-32K", "use_chat_template": False, "training_length": 32768}, 175 | {"model": "Llama-2-7B-32K-Instruct", "training_length": 32768}, 176 | {"model": "llama-2-7b-80k", "use_chat_template": False, "training_length": 80000}, 177 | {"model": "Yarn-Llama-2-7b-64k", "use_chat_template": False, "training_length": 65536}, 178 | {"model": "Yarn-Llama-2-7b-128k", "use_chat_template": False, "training_length": 131072}, 179 | 180 | # llama 3 models 181 | {"model": "Meta-Llama-3-8B", "use_chat_template": False, "training_length": 8192}, 182 | {"model": "Meta-Llama-3-8B-Instruct", "training_length": 8192}, 183 | {"model": "Meta-Llama-3-8B-Theta16M", "use_chat_template": False, "training_length": 8192}, 184 | {"model": "Meta-Llama-3-8B-Instruct-Theta16M", "training_length": 8192}, 185 | {"model": "Meta-Llama-3-70B-Theta16M", "use_chat_template": False, "training_length": 8192}, 186 | {"model": "Meta-Llama-3-70B-Instruct-Theta16M", "training_length": 8192}, 187 | 188 | {"model": "Llama-3.1-8B", "use_chat_template": False, "training_length": 131072}, 189 | {"model": "Llama-3.1-8B-Instruct", "training_length": 131072}, 190 | {"model": "Llama-3.1-70B", "use_chat_template": False, "training_length": 131072}, 191 | {"model": "Llama-3.1-70B-Instruct", "training_length": 131072}, 192 | {"model": "Llama-3.3-70B-Instruct", "training_length": 131072}, 193 | 194 | {"model": "Llama-3.2-1B", "use_chat_template": False, "training_length": 131072}, 195 | {"model": "Llama-3.2-1B-Instruct", "training_length": 131072}, 196 | {"model": "Llama-3.2-3B", "use_chat_template": False, "training_length": 131072}, 197 | {"model": "Llama-3.2-3B-Instruct", "training_length": 131072}, 198 | 199 | # mistral models 200 | {"model": "Mistral-7B-v0.1", "use_chat_template": False, "training_length": 8192}, 201 | {"model": "Mistral-7B-Instruct-v0.1", "training_length": 8192}, 202 | {"model": "Mistral-7B-Instruct-v0.2", "training_length": 32768}, 203 | {"model": "Mistral-7B-v0.3", "use_chat_template": False, "training_length": 32768}, 204 | {"model": "Mistral-7B-Instruct-v0.3", "training_length": 32768}, 205 | {"model": "Ministral-8B-Instruct-2410", "training_length": 131072}, 206 | 207 | {"model": "Mistral-Nemo-Base-2407", "use_chat_template": False, "training_length": 128000}, 208 | {"model": "Mistral-Nemo-Instruct-2407", "training_length": 128000}, 209 | {"model": "MegaBeam-Mistral-7B-512k", "training_length": 524288}, 210 | 211 | # yi models 212 | {"model": "Yi-6B-200K", "use_chat_template": False, "training_length": 200000}, 213 | {"model": "Yi-9B-200K", "use_chat_template": False, "training_length": 200000}, 214 | {"model": "Yi-34B-200K", "use_chat_template": False, "training_length": 200000}, 215 | {"model": "Yi-1.5-9B-32K", "use_chat_template": False, "training_length": 32768}, 216 | 217 | # phi models 218 | {"model": "Phi-3-mini-128k-instruct", "training_length": 131072}, 219 | {"model": "Phi-3-small-128k-instruct", "training_length": 131072}, 220 | 
{"model": "Phi-3-medium-128k-instruct", "training_length": 131072}, 221 | {"model": "Phi-3.5-mini-instruct", "training_length": 131072}, 222 | 223 | # qwen models 224 | {"model": "Qwen2-7B", "use_chat_template": False, "training_length": 32768}, 225 | {"model": "Qwen2-7B-Instruct", "training_length": 32768}, 226 | {"model": "Qwen2-57B-A14B", "use_chat_template": False, "training_length": 32768}, 227 | {"model": "Qwen2-57B-A14B-Instruct", "training_length": 32768}, 228 | {"model": "Qwen2.5-1.5B", "use_chat_template": False, "training_length": 32768}, 229 | {"model": "Qwen2.5-1.5B-Instruct", "training_length": 32768}, 230 | {"model": "Qwen2.5-3B", "use_chat_template": False, "training_length": 32768}, 231 | {"model": "Qwen2.5-3B-Instruct", "training_length": 32768}, 232 | {"model": "Qwen2.5-7B", "use_chat_template": False, "training_length": 131072}, 233 | {"model": "Qwen2.5-7B-Instruct", "training_length": 131072}, 234 | {"model": "Qwen2.5-72B-Instruct", "training_length": 131072}, 235 | 236 | # prolong 237 | {"model": "Llama-3-8B-ProLong-512k-Instruct", "training_length": 524288}, 238 | 239 | # gemma 2 models 240 | {"model": "gemma-2-9b", "use_chat_template": False, "training_length": 8192}, 241 | {"model": "gemma-2-9b-it", "training_length": 8192}, 242 | {"model": "gemma-2-9b-it-Theta320K", "training_length": 8192}, 243 | 244 | {"model": "gemma-2-27b", "use_chat_template": False, "training_length": 8192}, 245 | {"model": "gemma-2-27b-it", "training_length": 8192}, 246 | {"model": "gemma-2-27b-it-Theta320K", "training_length": 8192}, 247 | 248 | # others 249 | {"model": "c4ai-command-r-v01", "training_length": 131072}, 250 | {"model": "Jamba-v0.1", "use_chat_template": False, "training_length": 262144}, 251 | {"model": "AI21-Jamba-1.5-Mini", "training_length": 262144}, 252 | ] 253 | 254 | 255 | models_configs = [ 256 | {"model": "Llama-3.1-8B", "use_chat_template": False, "training_length": 131072}, 257 | {"model": "Llama-3.1-8B-Instruct", "training_length": 131072}, 258 | {"model": "DeepSeek-R1-Distill-Llama-8B", "training_length": 131072, "do_sample": True, "temperature": 0.6}, 259 | {"model": "Qwen2-7B", "use_chat_template": False, "training_length": 32768}, 260 | {"model": "Qwen2-7B-Instruct", "training_length": 32768}, 261 | {"model": "DeepSeek-R1-Distill-Qwen-7B", "training_length": 131072, "do_sample": True, "temperature": 0.6}, 262 | ] 263 | 264 | # set your configs here, only include the ones that you ran 265 | config_files = [ 266 | "configs/recall.yaml", "configs/recall_short.yaml", 267 | "configs/rag.yaml", "configs/rag_short.yaml", 268 | "configs/longqa.yaml", "configs/longqa_short.yaml", 269 | "configs/summ.yaml", "configs/summ_short.yaml", 270 | "configs/rerank.yaml", "configs/rerank_short.yaml", 271 | "configs/icl.yaml", "configs/icl_short.yaml", 272 | "configs/cite.yaml", "configs/cite_short.yaml", 273 | "configs/ruler.yaml", "configs/ruler_short.yaml", 274 | ] 275 | 276 | dataset_configs = [] 277 | for file in config_files: 278 | c = yaml.safe_load(open(file)) 279 | 280 | if isinstance(c["generation_max_length"], int): 281 | c["generation_max_length"] = ",".join([str(c["generation_max_length"])] * len(c["datasets"].split(","))) 282 | for d, t, l, g in zip(c['datasets'].split(','), c['test_files'].split(','), c['input_max_length'].split(','), c['generation_max_length'].split(',')): 283 | dataset_configs.append({"dataset": d, "test_name": os.path.basename(os.path.splitext(t)[0]), "input_max_length": int(l), "generation_max_length": int(g), "max_test_samples": 
c['max_test_samples'], 'use_chat_template': c['use_chat_template'], 'shots': c['shots']}) 284 | print(dataset_configs) 285 | 286 | failed_paths = [] 287 | df = [] 288 | for model in tqdm(models_configs): 289 | args = arguments() 290 | args.tag = "v1" # SET YOUR TAG HERE 291 | args.output_dir = f"output/{model['model']}" 292 | 293 | for dataset in dataset_configs: 294 | args.update(dataset) 295 | args.update(model) 296 | 297 | metric = args.get_averaged_metric() 298 | dsimple, mnames = args.get_metric_name() 299 | 300 | if metric is None: 301 | failed_paths.append(args.get_path()) 302 | continue 303 | 304 | for k, m in metric.items(): 305 | df.append({**asdict(args), **model, 306 | "metric name": k, "metric": m, 307 | "dataset_simple": dsimple + " " + k, "test_data": f"{args.dataset}-{args.test_name}-{args.input_max_length}" 308 | }) 309 | 310 | all_df = pd.DataFrame(df) 311 | lf_df = all_df.pivot_table(index=["input_max_length", "model", ], columns="dataset_simple", values="metric", sort=False) 312 | lf_df = lf_df.reset_index() 313 | 314 | for k, v in custom_avgs.items(): 315 | lf_df[k] = lf_df[v].mean(axis=1) 316 | 317 | print(lf_df.to_csv(index=False)) 318 | 319 | print("Warning, failed to get the following paths, make sure that these are correct or the printed results will not be accurate:", failed_paths) 320 | # import pdb; pdb.set_trace() -------------------------------------------------------------------------------- /scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | wget -c https://huggingface.co/datasets/princeton-nlp/HELMET/resolve/main/data.tar.gz 2 | tar -xvzf data.tar.gz 3 | -------------------------------------------------------------------------------- /scripts/eval_gpt4_longqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import sys 5 | import re 6 | from tqdm import tqdm 7 | import glob 8 | 9 | # Get the parent directory path 10 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 11 | # Add the parent directory to the Python path 12 | sys.path.append(parent_dir) 13 | 14 | from model_utils import OpenAIModel 15 | 16 | def parse_output(output, prefix="Answer:"): 17 | output = output.replace("\n", " ") 18 | 19 | def lstrip_string(s, sub): 20 | return re.sub(f'^{re.escape(sub)}', '', s, flags=re.IGNORECASE) 21 | patterns = [re.compile(f"(?:{prefix})(.*)(?:\n|$)", flags=re.IGNORECASE), re.compile(r"(?:^)(.*)(?:\n|$)")] 22 | for pat in patterns: 23 | matches = pat.search(output) 24 | if matches is not None: 25 | return lstrip_string(matches[1].strip(), prefix).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix 26 | # if still not found, return None, but should actually never get this case... 27 | return None 28 | 29 | 30 | # prompts inspired by https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG 31 | judge_prompt = """Please act as an impartial judge and evaluate the quality of the provided answer which attempts to answer the provided question based on a provided context. 32 | Although you are not given the context, you will be given a set of correct answers that achieves full scores on all metrics, and you need to assess the provided answers using the correct answers. 
33 | 34 | Below is your grading rubric: 35 | 36 | Fluency: 37 | - Score 0 (incoherent, repetitive, or incomplete): Incoherent sentences, repetitive sentences (even if not by exact words), incomplete answers, or gibberish. Note that even if the answer is coherent, if it is repetitive or incomplete, it should be given a score of 0. 38 | - Score 1 (coherent, non-repetitive answer): Coherent, non-repetitive, fluent, grammatically correct answers. 39 | 40 | Correctness: 41 | - Score 0 (Incorrect): The answer does not agree with the provided correct answers at all. 42 | - Score 1 (partly correct): Partly agrees with one of the provided correct answers (for example, the question asks for a date and a person; the answer gets the date right but the person wrong). 43 | - Score 2 (correct but not fully relevant): Fully agrees with one of the provided correct answers but mentions other completely irrelevant information. Note that extra details provided in the answer, even if not mentioned in the correct answers, should NOT be seen as irrelevant as long as they are relevant to the question to a reasonable extent. 44 | - Score 3 (correct and relevant): Fully agrees with one of the provided correct answers and only provides information relevant to the question. Note that if the answer is longer than the correct answer, as long as everything in the answer is relevant to the question, it should still be given score 3. For example, if the correct answer is "the North Pole" and the answer is "They are headed for the North Pole", it should still be given a score of 3. 45 | 46 | Now, read the following question, answer, and correct answers. First think step-by-step and provide your reasoning and assessment on the answer. Then output your score in the following json format: {{"fluency": 0, "correctness": 1}}. 47 | 48 | Question: {question} 49 | Correct answers: {correct_answers} 50 | Answer: {parsed_output} 51 | """ 52 | 53 | def parse_json(text): 54 | matches = re.findall(r"\{.*?\}", text, re.DOTALL) 55 | if len(matches) > 0: 56 | try: 57 | r = json.loads(matches[-1]) 58 | except: 59 | return None 60 | return r 61 | return None 62 | 63 | def check_metrics(model, results_file, output_file): 64 | with open(results_file, "r") as f: 65 | results = json.load(f) 66 | 67 | sum_score = 0 68 | count_score = 0 69 | 70 | all_inputs = [] 71 | for d in results["data"]: 72 | p = judge_prompt.format(question=d['question'], correct_answers=d['answer'], parsed_output=parse_output(d['output'])) 73 | all_inputs.append(p) 74 | 75 | outputs = model.generate_batch(prompt=all_inputs, batch_file=output_file+".batch") 76 | for idx, o in enumerate(outputs): 77 | d = results["data"][idx] 78 | s = None 79 | 80 | if o is not None: 81 | scores = parse_json(o["output"]) 82 | if scores is not None and "correctness" in scores and "fluency" in scores: 83 | s = scores 84 | else: 85 | print("Warning! 
Couldn't get a score") 86 | print(f"GPT-4 output: {o['output']}") 87 | 88 | if scores is not None: 89 | sum_score += scores["fluency"] * scores["correctness"] 90 | count_score += 1 91 | 92 | d["gpt-4-scores"] = s 93 | 94 | if idx < 10: 95 | print("=====================================") 96 | print(f"Prompt: {all_inputs[idx]}") 97 | print(f"Output: {o['output']}") 98 | print(f"Final score: {s}") 99 | 100 | results["averaged_metrics"]["gpt-4-score"] = sum_score / count_score 101 | with open(output_file, "w") as f: 102 | json.dump(results, f, indent=4) 103 | 104 | return results 105 | 106 | if __name__ == "__main__": 107 | model = OpenAIModel("gpt-4o-2024-05-13", temperature=0.1) 108 | parser = argparse.ArgumentParser() 109 | parser.add_argument("--num_shards", type=int, default=1) 110 | parser.add_argument("--shard_idx", type=int, default=0) 111 | parser.add_argument("--model_to_check", nargs="+", default=[]) 112 | parser.add_argument("--tag", type=str, default="v1") 113 | args = parser.parse_args() 114 | num_shards = args.num_shards 115 | shard_idx = args.shard_idx 116 | 117 | if len(args.model_to_check) > 0: 118 | model_to_check = args.model_to_check 119 | else: 120 | # all models 121 | model_to_check = ['gpt-4-0125-preview','gpt-4o-mini-2024-07-18','gpt-4o-2024-05-13','gpt-4o-2024-08-06','claude-3-5-sonnet-20240620','gemini-1.5-flash-001','gemini-1.5-pro-001','Llama-2-7B-32K','Llama-2-7B-32K-Instruct','llama-2-7b-80k','Yarn-Llama-2-7b-64k','Yarn-Llama-2-7b-128k','Meta-Llama-3-8B','Meta-Llama-3-8B-Instruct','Meta-Llama-3-8B-Theta16M','Meta-Llama-3-8B-Instruct-Theta16M','Meta-Llama-3-70B-Theta16M','Meta-Llama-3-70B-Instruct-Theta16M','Llama-3.1-8B','Llama-3.1-8B-Instruct','Llama-3.1-70B','Llama-3.1-70B-Instruct','Llama-3.3-70B-Instruct','Llama-3.2-1B','Llama-3.2-1B-Instruct','Llama-3.2-3B','Llama-3.2-3B-Instruct','Mistral-7B-v0.1','Mistral-7B-Instruct-v0.1','Mistral-7B-Instruct-v0.2','Mistral-7B-v0.3','Mistral-7B-Instruct-v0.3','Ministral-8B-Instruct-2410','Mistral-Nemo-Base-2407','Mistral-Nemo-Instruct-2407','MegaBeam-Mistral-7B-512k','Yi-6B-200K','Yi-9B-200K','Yi-34B-200K','Yi-1.5-9B-32K','Phi-3-mini-128k-instruct','Phi-3-small-128k-instruct','Phi-3-medium-128k-instruct','Phi-3.5-mini-instruct','Qwen2-7B','Qwen2-7B-Instruct','Qwen2-57B-A14B','Qwen2-57B-A14B-Instruct','Qwen2.5-1.5B','Qwen2.5-1.5B-Instruct','Qwen2.5-3B','Qwen2.5-3B-Instruct','Qwen2.5-7B','Qwen2.5-7B-Instruct','Qwen2.5-7B-Instruct-1M','Qwen2.5-14B-Instruct-1M','Qwen2.5-72B-Instruct','Llama-3-8B-ProLong-512k-Instruct','gemma-2-9b','gemma-2-9b-it','gemma-2-9b-it-Theta320K','gemma-2-27b','gemma-2-27b-it','gemma-2-27b-it-Theta320K','c4ai-command-r-v01','Jamba-v0.1','AI21-Jamba-1.5-Mini', "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-7B"] 122 | 123 | all_paths = [glob.glob(f"output/{m}/narrativeqa_*_{args.tag}_*.json") for m in model_to_check] 124 | all_paths = [item for sublist in all_paths for item in sublist] 125 | all_paths = [p for p in all_paths if not os.path.exists(p.replace(".json", "-gpt4eval_o.json"))] 126 | all_paths = [p for p in all_paths if not p.endswith("-gpt4eval_o.json")] 127 | all_paths = all_paths[shard_idx::num_shards] 128 | print(f"Found {len(all_paths)} path") 129 | 130 | for p in all_paths: 131 | newp = p.replace(".json", "-gpt4eval_o.json") 132 | print("evaluating path:", p) 133 | check_metrics(model, p, newp) 134 | -------------------------------------------------------------------------------- /scripts/eval_gpt4_longqa.sh: 
-------------------------------------------------------------------------------- 1 | shards=10; for i in $(seq 0 $((shards-1))); do python scripts/eval_gpt4_longqa.py --num_shards $shards --shard_idx $i & done 2 | -------------------------------------------------------------------------------- /scripts/eval_gpt4_summ.sh: -------------------------------------------------------------------------------- 1 | shards=10; for i in $(seq 0 $((shards-1))); do python scripts/eval_gpt4_summ.py --num_shards $shards --shard_idx $i & done 2 | -------------------------------------------------------------------------------- /scripts/generate_configs.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | # settings that cannot be shared across tasks: use_chat_template, shots, and stop_new_line 4 | 5 | lengths_mapping = {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072} 6 | master_mapping = { 7 | # ruler tasks, shots: 0, use_chat_template: False, and stop_new_line: False 8 | "ruler_niah_s_1": { # NIAH Repeat 9 | k: { 10 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_1/validation_{v}.jsonl" 11 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 12 | }, 13 | "ruler_niah_s_2": { # NIAH 14 | k: { 15 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_2/validation_{v}.jsonl" 16 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 17 | }, 18 | "ruler_niah_s_3": { # NIAH UUID 19 | k: { 20 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_3/validation_{v}.jsonl" 21 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 22 | }, 23 | "ruler_niah_mk_1": { # NIAH MK Essay 24 | k: { 25 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multikey_1/validation_{v}.jsonl" 26 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 27 | }, 28 | "ruler_niah_mk_2": { # NIAH MK Needle 29 | k: { 30 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multikey_2/validation_{v}.jsonl" 31 | } for k, v in lengths_mapping.items() 32 | }, 33 | "ruler_niah_mk_3": { # NIAH MK UUID 34 | k: { 35 | "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/niah_multikey_3/validation_{v}.jsonl" 36 | } for k, v in lengths_mapping.items() 37 | }, 38 | "ruler_niah_mq": { # NIAH MQ 39 | k: { 40 | "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/niah_multiquery/validation_{v}.jsonl" 41 | } for k, v in lengths_mapping.items() 42 | }, 43 | "ruler_niah_mv": { # NIAH MV 44 | k: { 45 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multivalue/validation_{v}.jsonl" 46 | } for k, v in lengths_mapping.items() 47 | }, 48 | "ruler_cwe": { # RULER CWE 49 | k: { 50 | "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/cwe/validation_{v}.jsonl" 51 | } for k, v in lengths_mapping.items() 52 | }, 53 | "ruler_fwe": { # RULER FWE 54 | k: { 55 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/fwe/validation_{v}.jsonl" 56 | } for k, v in lengths_mapping.items() 57 | }, 58 | "ruler_vt": { # RULER VT 59 | k: { 60 | "input_length": v, "generation_max_length": 50, "test_files": 
f"data/ruler/vt/validation_{v}.jsonl" 61 | } for k, v in lengths_mapping.items() 62 | }, 63 | "ruler_qa_1": { # SQuAD 64 | k: { 65 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_1/validation_{v}.jsonl" 66 | } for k, v in lengths_mapping.items() 67 | }, 68 | "ruler_qa_2": { # HotpotQA 69 | k: { 70 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_2/validation_{v}.jsonl" 71 | } for k, v in lengths_mapping.items() 72 | }, 73 | 74 | "json_kv": { 75 | k: { 76 | "input_length": v, "generation_max_length": 100, "test_files": f"data/json_kv/test_k" + ["50", "105", "220", "440", "900", "1800"][i] + "_dep6.jsonl", "demo_files": "" 77 | } for i, (k, v) in enumerate(lengths_mapping.items()) 78 | }, 79 | 80 | # generation with citations -- alce 81 | "alce_asqa": { # ASQA 82 | k: { 83 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/asqa_eval_gtr_top2000.json", "demo_files": f"prompts/asqa_revised.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 84 | } for i, (k, v) in enumerate(lengths_mapping.items()) 85 | }, 86 | "alce_qampari": { # QAMPARI 87 | k: { 88 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/qampari_eval_gtr_top2000.json", "demo_files": f"prompts/qampari_revised.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 89 | } for i, (k, v) in enumerate(lengths_mapping.items()) 90 | }, 91 | 92 | "alce_asqa_nocite": { # ASQA 93 | k: { 94 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/asqa_eval_gtr_top2000.json", "demo_files": f"prompts/asqa_nocite.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 95 | } for i, (k, v) in enumerate(lengths_mapping.items()) 96 | }, 97 | "alce_qampari_nocite": { # QAMPARI 98 | k: { 99 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/qampari_eval_gtr_top2000.json", "demo_files": f"prompts/qampari_nocite.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 100 | } for i, (k, v) in enumerate(lengths_mapping.items()) 101 | }, 102 | 103 | # RAG tasks, using KILT's datasets and retrieval corpus 104 | "kilt_nq": { 105 | k: { 106 | "input_length": v, "generation_max_length": 20, 107 | "test_files": "data/kilt/nq-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", 108 | "demo_files": "data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl" 109 | } for i, (k, v) in enumerate(lengths_mapping.items()) 110 | }, 111 | "kilt_triviaqa": { 112 | k: { 113 | "input_length": v, "generation_max_length": 20, 114 | "test_files": "data/kilt/triviaqa-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", 115 | "demo_files": "data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl" 116 | } for i, (k, v) in enumerate(lengths_mapping.items()) 117 | }, 118 | "kilt_hotpotqa": { 119 | k: { 120 | "input_length": v, "generation_max_length": 20, 121 | "test_files": "data/kilt/hotpotqa-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep3.jsonl", 122 | "demo_files": "data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl" 123 | } for i, (k, v) in enumerate(lengths_mapping.items()) 124 | }, 125 | "kilt_popqa": { 126 | k: { 127 | "input_length": v, "generation_max_length": 20, "name_postfix": "_3", 128 | "test_files": "data/kilt/popqa_test_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", 129 | "demo_files": "data/kilt/popqa_test_1000_k3_dep6.jsonl" 130 
| } for i, (k, v) in enumerate(lengths_mapping.items()) 131 | }, 132 | 133 | # for longqa, we truncate by the length - 200 - the generation length 134 | "narrativeqa": { 135 | k: { 136 | "input_length": v, "generation_max_length": 100, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 100}" 137 | } for k, v in lengths_mapping.items() 138 | }, 139 | "infbench_qa_eng": { 140 | k: { 141 | "input_length": v, "generation_max_length": 10, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 10}" 142 | } for k, v in lengths_mapping.items() 143 | }, 144 | "infbench_choice_eng": { 145 | k: { 146 | "input_length": v, "generation_max_length": 10, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 10}" 147 | } for k, v in lengths_mapping.items() 148 | }, 149 | 150 | "infbench_sum_eng": { 151 | k: { 152 | "input_length": v, "generation_max_length": 1200, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 1200}" 153 | } for k, v in lengths_mapping.items() 154 | }, 155 | # for multi lexsum, we truncate by the length - 300 (prompt and buffer) - 400 (generation) 156 | "multi_lexsum": { 157 | k: { 158 | "input_length": v, "generation_max_length": 400, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 300 - 400}" 159 | } for k, v in lengths_mapping.items() 160 | }, 161 | 162 | "msmarco_rerank_psg": { 163 | k: { 164 | "input_length": v, "generation_max_length": 200, 165 | "test_files": "data/msmarco/test_reranking_data_k" + ["14", "50", "130", "285", "600", "1000"][i] + "_dep3.jsonl", 166 | "demo_files": "data/msmarco/test_reranking_data_k10_dep3.jsonl" 167 | } for i, (k, v) in enumerate(lengths_mapping.items()) 168 | }, 169 | 170 | "icl_trec_coarse": { 171 | k: { 172 | "input_length": v, "generation_max_length": 20, 173 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["200", "400", "800", "1600", "3300", "6600"][i] + "shot_balance" 174 | } for i, (k, v) in enumerate(lengths_mapping.items()) 175 | }, 176 | "icl_trec_fine": { 177 | k: { 178 | "input_length": v, "generation_max_length": 20, 179 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["200", "400", "800", "1600", "3200", "6400"][i] + "shot_balance" 180 | } for i, (k, v) in enumerate(lengths_mapping.items()) 181 | }, 182 | "icl_banking77": { 183 | k: { 184 | "input_length": v, "generation_max_length": 20, 185 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["180", "360", "720", "1450", "2900", "5900"][i] + "shot_balance" 186 | } for i, (k, v) in enumerate(lengths_mapping.items()) 187 | }, 188 | "icl_clinic150": { 189 | k: { 190 | "input_length": v, "generation_max_length": 20, 191 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["220", "440", "880", "1750", "3525", "7050"][i] + "shot_balance" 192 | } for i, (k, v) in enumerate(lengths_mapping.items()) 193 | }, 194 | "icl_nlu": { 195 | k: { 196 | "input_length": v, "generation_max_length": 20, 197 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["250", "510", "1020", "2040", "4080", "8296"][i] + "shot_balance" 198 | } for i, (k, v) in enumerate(lengths_mapping.items()) 199 | }, 200 | } 201 | 202 | def process_configs(config_name, datasets, input_lengths, **kwargs): 203 | configs = [] 204 | for i, d in enumerate(datasets): 205 | con = master_mapping[d] 206 | print(d) 207 | for l in input_lengths: 208 | c = con[l] 209 | print(c) 210 | configs.append({ 211 | "input_max_length": c['input_length'], 212 | "datasets": d + c.get("name_postfix", ""), 213 | "generation_max_length": 
c['generation_max_length'], 214 | "test_files": c.get("test_files", ""), 215 | "demo_files": c.get("demo_files", ""), 216 | }) 217 | out_config = {k: ",".join([str(c[k]) for c in configs]) for k in configs[0]} 218 | # llama 3 by default but you can change it to anything else 219 | out_config.update({ 220 | **kwargs, 221 | "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", 222 | }) 223 | with open(config_name, "w") as f: 224 | yaml.dump(out_config, f, sort_keys=False) 225 | 226 | def helmet_configs(input_lengths = ["128k"], fname_postfix = ""): 227 | synthetic = ["ruler_niah_mk_2", "ruler_niah_mk_3", "ruler_niah_mv", "json_kv"] 228 | # ruler actually doesn't support demos so it defaults to 0, json kv uses 2 229 | process_configs( 230 | f"configs/recall{fname_postfix}.yaml", synthetic, input_lengths, 231 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=False 232 | ) 233 | 234 | rag = ['kilt_nq', 'kilt_triviaqa', 'kilt_hotpotqa', 'kilt_popqa'] 235 | process_configs( 236 | f"configs/rag{fname_postfix}.yaml", rag, input_lengths, 237 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True # could be false but set to true so it runs faster 238 | ) 239 | 240 | longqa = ['narrativeqa', 'infbench_qa_eng', 'infbench_choice_eng'] 241 | process_configs( 242 | f"configs/longqa{fname_postfix}.yaml", longqa, input_lengths, 243 | use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False 244 | ) 245 | 246 | summ = ['infbench_sum_eng', 'multi_lexsum'] 247 | process_configs( 248 | f"configs/summ{fname_postfix}.yaml", summ, input_lengths, 249 | use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False 250 | ) 251 | 252 | icl = ['icl_trec_coarse', 'icl_trec_fine', 'icl_banking77', 'icl_clinic150', 'icl_nlu'] 253 | process_configs( 254 | f"configs/icl{fname_postfix}.yaml", icl, input_lengths, 255 | use_chat_template=False, max_test_samples=500, shots=0, stop_new_line=True 256 | ) 257 | 258 | rerank = ["msmarco_rerank_psg"] 259 | process_configs( 260 | f"configs/rerank{fname_postfix}.yaml", rerank, input_lengths, 261 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True 262 | ) 263 | 264 | cite = ["alce_asqa", "alce_qampari"] 265 | process_configs( 266 | f"configs/cite{fname_postfix}.yaml", cite, input_lengths, 267 | use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False 268 | ) 269 | 270 | nocite = ["alce_asqa_nocite"] 271 | process_configs( 272 | f"configs/alce_nocite{fname_postfix}.yaml", nocite, input_lengths, 273 | use_chat_template=True, max_test_samples=100, shots=0, stop_new_line=False, generation_max_length=600, 274 | ) 275 | 276 | ruler = ["ruler_niah_s_1", "ruler_niah_s_2", "ruler_niah_s_3", "ruler_niah_mk_1", "ruler_niah_mk_2", "ruler_niah_mk_3", "ruler_niah_mq", "ruler_niah_mv", "ruler_cwe", "ruler_fwe", "ruler_vt", "ruler_qa_1", "ruler_qa_2"] 277 | process_configs( 278 | f"configs/ruler{fname_postfix}.yaml", ruler, input_lengths, 279 | use_chat_template=False, max_test_samples=100, shots=0, stop_new_line=False 280 | ) 281 | 282 | def separate_configs(input_lengths = ["128k"], fname_postfix = ""): 283 | # separate rag and icl configs into individual files 284 | for name in ['kilt_nq', 'kilt_triviaqa', 'kilt_hotpotqa', 'kilt_popqa']: 285 | process_configs( 286 | f"configs/rag/{name}{fname_postfix}.yaml", [name], input_lengths, 287 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True 288 | ) 289 | 290 | for name in ['icl_trec_coarse', 'icl_trec_fine', 
'icl_banking77', 'icl_clinic150', 'icl_nlu']: 291 | process_configs( 292 | f"configs/icl/{name}{fname_postfix}.yaml", [name], input_lengths, 293 | use_chat_template=False, max_test_samples=500, shots=0, stop_new_line=True 294 | ) 295 | 296 | 297 | def niah_configs(): 298 | input_lengths = [8192, 16384, 32768, 65536, 131072] 299 | dataset=["ruler_niah_s_2"] 300 | gen_lengths = [50] 301 | for i, l in enumerate(input_lengths): 302 | config = { 303 | "input_max_length": l, 304 | "datasets": dataset[0], 305 | "generation_max_length": gen_lengths[0], 306 | "test_files": f'data/ruler/{dataset[0].replace("ruler_", "").replace("_s_", "_single_")}/validation_{l}.jsonl', 307 | "demo_files": "", 308 | } 309 | with open(f"configs/niah.yaml", "w") as f: 310 | yaml.dump(config, f, sort_keys=False) 311 | 312 | 313 | if __name__ == "__main__": 314 | helmet_configs() 315 | helmet_configs(input_lengths=["8k", "16k", "32k", "64k"], fname_postfix="_short") 316 | niah_configs() 317 | separate_configs() 318 | separate_configs(input_lengths=["8k", "16k", "32k", "64k"], fname_postfix="_short") -------------------------------------------------------------------------------- /scripts/run_api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | ############################## 4 | # Job blueprint # 5 | ############################## 6 | 7 | # Give your job a name, so you can recognize it in the queue overview 8 | #SBATCH --job-name=api ## CHANGE JOBNAME HERE 9 | #SBATCH --array=0 10 | 11 | # Remove one # to uncommment 12 | #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout 13 | #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr 14 | 15 | # Define, how many nodes you need. Here, we ask for 1 node. 16 | #SBATCH -N 1 ##nodes 17 | #SBATCH -n 1 ##tasks 18 | #SBATCH --cpus-per-task=8 19 | #SBATCH --mem=32G 20 | #SBATCH --time=0-3:00:00 21 | #SBATCH --gres=gpu:0 --ntasks-per-node=1 -N 1 22 | # Turn on mail notification. There are many possible self-explaining values: 23 | # NONE, BEGIN, END, FAIL, ALL (including all aforementioned) 24 | # For more values, check "man sbatch" 25 | #SBATCH --mail-type=ALL 26 | # Remember to set your email address here instead of nobody 27 | #SBATCH --mail-user=nobody 28 | 29 | echo "Date = $(date)" 30 | echo "Hostname = $(hostname -s)" 31 | echo "Working Directory = $(pwd)" 32 | echo "" 33 | echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" 34 | echo "Number of Tasks Allocated = $SLURM_NTASKS" 35 | echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK" 36 | echo "Array Job ID = $SLURM_ARRAY_JOB_ID" 37 | echo "Array Task ID = $SLURM_ARRAY_TASK_ID" 38 | echo "Cache = $TRANSFORMERS_CACHE" 39 | 40 | source env/bin/activate 41 | 42 | export OMP_NUM_THREADS=8 43 | IDX=$SLURM_ARRAY_TASK_ID 44 | if [[ -z $SLURM_ARRAY_TASK_ID ]]; then 45 | IDX=0 46 | fi 47 | 48 | 49 | TAG=v1 50 | 51 | CONFIGS=(recall.yaml rag.yaml longqa.yaml summ.yaml icl.yaml rerank.yaml cite.yaml) 52 | #CONFIGS=(${CONFIGS[7]}) # you may want to run only one config 53 | SEED=42 54 | 55 | # azure vs. 
non-azure makes no difference, just use whichever you prefer 56 | OD=( 57 | azure/gpt-4-0125-preview # 0 58 | azure/gpt-4o-2024-05-13 # 1 59 | gpt-4o-2024-08-06 # 2 60 | azure/gpt-4o-mini-2024-07-18 # 3 61 | claude-3-5-sonnet-20240620 # 4 62 | gemini-1.5-flash-001 # 5 63 | gemini-1.5-pro-001 # 6 64 | ) 65 | MODEL_NAME="${OD[$IDX]}" 66 | OUTPUT_DIR="output/$(basename $MODEL_NAME)" 67 | 68 | # for the API models we always use use_chat_template=True 69 | OPTIONS="--use_chat_template True --stop_newline False" 70 | 71 | echo "Evaluation output dir = $OUTPUT_DIR" 72 | echo "Tag = $TAG" 73 | echo "Model name = $MODEL_NAME" 74 | echo "Options = $OPTIONS" 75 | 76 | for CONFIG in "${CONFIGS[@]}"; do 77 | echo "Config file: $CONFIG" 78 | 79 | python eval.py \ 80 | --config configs/$CONFIG \ 81 | --seed $SEED \ 82 | --output_dir $OUTPUT_DIR \ 83 | --tag $TAG \ 84 | --model_name_or_path $MODEL_NAME \ 85 | $OPTIONS 86 | done 87 | 88 | echo "finished with $?" 89 | 90 | wait; 91 | -------------------------------------------------------------------------------- /scripts/run_eval.sh: -------------------------------------------------------------------------------- 1 | for task in "recall" "rag" "longqa" "summ" "icl" "rerank" "cite"; do 2 | python eval.py --config configs/${task}.yaml 3 | done 4 | 5 | # this will run the 8k to 64k versions 6 | for task in "recall" "rag" "longqa" "summ" "icl" "rerank" "cite"; do 7 | python eval.py --config configs/${task}_short.yaml 8 | done -------------------------------------------------------------------------------- /scripts/run_eval_hf_endpoint.sh: -------------------------------------------------------------------------------- 1 | 2 | LLM_ENDPOINT="https://${hf_inference_point_url}/v1" # fill in your endpoint url 3 | API_KEY=$HF_TOKEN 4 | 5 | python eval.py --config configs/recall_demo.yaml --endpoint_url $LLM_ENDPOINT --api_key $API_KEY -------------------------------------------------------------------------------- /scripts/run_eval_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | ############################## 4 | # Job blueprint # 5 | ############################## 6 | 7 | # Give your job a name, so you can recognize it in the queue overview 8 | #SBATCH --job-name=helmet ## CHANGE JOBNAME HERE 9 | #SBATCH --array=0-35 10 | 11 | # Remove one # to uncomment 12 | #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout 13 | #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr 14 | 15 | # Define, how many nodes you need. Here, we ask for 1 node. 16 | #SBATCH -N 1 ##nodes 17 | #SBATCH -n 1 ##tasks 18 | #SBATCH --cpus-per-task=8 19 | #SBATCH --mem=100G 20 | #SBATCH --time=0-24:00:00 21 | #SBATCH --gres=gpu:1 --ntasks-per-node=1 -N 1 22 | #SBATCH --constraint=gpu80 23 | # Turn on mail notification.
There are many possible self-explaining values: 24 | # NONE, BEGIN, END, FAIL, ALL (including all aforementioned) 25 | # For more values, check "man sbatch" 26 | #SBATCH --mail-type=ALL 27 | # Remember to set your email address here instead of nobody 28 | #SBATCH --mail-user=nobody 29 | 30 | echo "Date = $(date)" 31 | echo "Hostname = $(hostname -s)" 32 | echo "Working Directory = $(pwd)" 33 | echo "" 34 | echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" 35 | echo "Number of Tasks Allocated = $SLURM_NTASKS" 36 | echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK" 37 | echo "Array Job ID = $SLURM_ARRAY_JOB_ID" 38 | echo "Array Task ID = $SLURM_ARRAY_TASK_ID" 39 | echo "Cache = $TRANSFORMERS_CACHE" 40 | 41 | source env/bin/activate 42 | 43 | IDX=$SLURM_ARRAY_TASK_ID 44 | NGPU=$SLURM_GPUS_ON_NODE 45 | if [[ -z $SLURM_ARRAY_TASK_ID ]]; then 46 | IDX=31 47 | NGPU=1 48 | fi 49 | export OMP_NUM_THREADS=8 50 | 51 | # change the tag to distinguish different runs 52 | TAG=v1 53 | 54 | CONFIGS=(recall.yaml rag.yaml longqa.yaml summ.yaml icl.yaml rerank.yaml cite.yaml) 55 | SEED=42 56 | 57 | OPTIONS="" 58 | 59 | M_IDX=$IDX 60 | 61 | # Array for models larger than 13B (12 models) 62 | L_MODELS=( 63 | "Meta-Llama-3-70B-Theta8M" 64 | "Meta-Llama-3-70B-Instruct-Theta8M" 65 | "Meta-Llama-3.1-70B" 66 | "Meta-Llama-3.1-70B-Instruct" 67 | "Yi-34B-200K" 68 | "Qwen2-57B-A14B" 69 | "Qwen2-57B-A14B-Instruct" 70 | "c4ai-command-r-v01" 71 | "Jamba-v0.1" 72 | "AI21-Jamba-1.5-Mini" 73 | "gemma-2-27b" 74 | "gemma-2-27b-it" 75 | ) 76 | 77 | # Array for models 13B and smaller (36 models) 78 | S_MODELS=( 79 | "LLaMA-2-7B-32K" 80 | "Llama-2-7B-32K-Instruct" 81 | "llama-2-7b-80k-basefixed" 82 | "Yarn-Llama-2-7b-64k" 83 | "Yarn-Llama-2-7b-128k" 84 | "Meta-Llama-3-8B" 85 | "Meta-Llama-3-8B-Instruct" 86 | "Meta-Llama-3-8B-Theta8M" 87 | "Meta-Llama-3-8B-Instruct-Theta8M" 88 | "Meta-Llama-3.1-8B" 89 | "Meta-Llama-3.1-8B-Instruct" 90 | "Mistral-7B-v0.1" 91 | "Mistral-7B-Instruct-v0.1" 92 | "Mistral-7B-Instruct-v0.2" 93 | "Mistral-7B-v0.3" 94 | "Mistral-7B-Instruct-v0.3" 95 | "Yi-6B-200K" 96 | "Yi-9B-200K" 97 | "Yi-1.5-9B-32K" 98 | "Phi-3-mini-128k-instruct" 99 | "Phi-3-small-128k-instruct" 100 | "Phi-3.5-mini-instruct" 101 | "Qwen2-7B" 102 | "Qwen2-7B-Instruct" 103 | "gemma-2-9b" 104 | "gemma-2-9b-it" 105 | "prolong-64k-instruct" 106 | "prolong-512k-instruct-20b-theta128m" 107 | "Mistral-Nemo-Base-2407" 108 | "Mistral-Nemo-Instruct-2407" 109 | "Phi-3-medium-128k-instruct" 110 | "MegaBeam-Mistral-7B-512k" #31 111 | "Llama-3.2-1B" # 32 112 | "Llama-3.2-1B-Instruct" # 33 113 | "Llama-3.2-3B" # 34 114 | "Llama-3.2-3B-Instruct" # 35 115 | ) 116 | MNAME="${S_MODELS[$M_IDX]}" 117 | 118 | OUTPUT_DIR="output/$MNAME" 119 | MODEL_NAME="/path/to/your/model/$MNAME" # CHANGE PATH HERE or you can change the array to load from HF 120 | 121 | shopt -s nocasematch 122 | chat_models=".*(chat|instruct|it$|nous|command|Jamba-1.5|MegaBeam).*" 123 | echo $MNAME 124 | if ! 
[[ $MNAME =~ $chat_models ]]; then 125 | # for the base models we always use use_chat_template=False 126 | OPTIONS="$OPTIONS --use_chat_template False" 127 | fi 128 | 129 | 130 | echo "Evaluation output dir = $OUTPUT_DIR" 131 | echo "Tag = $TAG" 132 | echo "Model name = $MODEL_NAME" 133 | echo "Options = $OPTIONS" 134 | 135 | 136 | for CONFIG in "${CONFIGS[@]}"; do 137 | echo "Config file: $CONFIG" 138 | 139 | python eval.py \ 140 | --config configs/$CONFIG \ 141 | --seed $SEED \ 142 | --output_dir $OUTPUT_DIR \ 143 | --tag $TAG \ 144 | --model_name_or_path $MODEL_NAME \ 145 | $OPTIONS 146 | done 147 | 148 | echo "finished with $?" 149 | 150 | wait; 151 | 152 | #echo "done, check $OUTPUT_DIR for outputs" 153 | 154 | #exit 0 155 | 156 | -------------------------------------------------------------------------------- /scripts/run_eval_tgi.sh: -------------------------------------------------------------------------------- 1 | export host_ip=$(hostname -I | awk '{print $1}') 2 | export LLM_ENDPOINT_PORT=8085 # change this to the port you want to use 3 | export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1" 4 | 5 | python eval.py --config configs/recall_demo.yaml --endpoint_url $LLM_ENDPOINT 6 | -------------------------------------------------------------------------------- /scripts/run_eval_vllm_gaudi.sh: -------------------------------------------------------------------------------- 1 | export host_ip=$(hostname -I | awk '{print $1}') 2 | export LLM_ENDPOINT_PORT=8010 3 | export DATA_PATH="~/.cache/huggingface" 4 | export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1" 5 | export HF_HOME=$DATA_PATH 6 | 7 | for task in "recall" "rag"; do 8 | python eval.py --config configs/${task}_vllm.yaml --endpoint_url $LLM_ENDPOINT --overwrite --no_cuda 9 | done -------------------------------------------------------------------------------- /scripts/run_short_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | ############################## 4 | # Job blueprint # 5 | ############################## 6 | 7 | # Give your job a name, so you can recognize it in the queue overview 8 | #SBATCH --job-name=helmet_short ## CHANGE JOBNAME HERE 9 | #SBATCH --array=0 10 | 11 | # Remove one # to uncommment 12 | #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout 13 | #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr 14 | 15 | # Define, how many nodes you need. Here, we ask for 1 node. 16 | #SBATCH -N 1 ##nodes 17 | #SBATCH -n 1 ##tasks 18 | #SBATCH --cpus-per-task=8 19 | #SBATCH --mem=150G 20 | #SBATCH --time=0-4:00:00 21 | #SBATCH --gres=gpu:1 --ntasks-per-node=1 -N 1 22 | #SBATCH --constraint=gpu80 23 | # Turn on mail notification. 
There are many possible self-explaining values: 24 | # NONE, BEGIN, END, FAIL, ALL (including all aforementioned) 25 | # For more values, check "man sbatch" 26 | #SBATCH --mail-type=ALL 27 | # Remember to set your email address here instead of nobody 28 | #SBATCH --mail-user=nobody 29 | 30 | echo "Date = $(date)" 31 | echo "Hostname = $(hostname -s)" 32 | echo "Working Directory = $(pwd)" 33 | echo "" 34 | echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" 35 | echo "Number of Tasks Allocated = $SLURM_NTASKS" 36 | echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK" 37 | echo "Array Job ID = $SLURM_ARRAY_JOB_ID" 38 | echo "Array Task ID = $SLURM_ARRAY_TASK_ID" 39 | echo "Cache = $TRANSFORMERS_CACHE" 40 | 41 | source env/bin/activate 42 | 43 | IDX=$SLURM_ARRAY_TASK_ID 44 | NGPU=$SLURM_GPUS_ON_NODE 45 | if [[ -z $SLURM_ARRAY_TASK_ID ]]; then 46 | IDX=0 47 | NGPU=1 48 | fi 49 | PORT=$(shuf -i 30000-65000 -n 1) 50 | echo "Port = $PORT" 51 | 52 | export OMP_NUM_THREADS=8 53 | 54 | TAG=v1 55 | 56 | CONFIGS=(recall_short.yaml rag_short.yaml longqa_short.yaml summ_short.yaml icl_short.yaml rerank_short.yaml cite_short.yaml) 57 | #CONFIGS=(${CONFIGS[8]}) 58 | SEED=42 59 | 60 | M_IDX=$IDX 61 | 62 | # Array for models larger than 13B (12 models) 63 | L_MODELS=( 64 | "Meta-Llama-3-70B-Theta8M" #0 65 | "Meta-Llama-3-70B-Instruct-Theta8M" #1 66 | "Meta-Llama-3.1-70B" #2 67 | "Meta-Llama-3.1-70B-Instruct" #3 68 | "Yi-34B-200K" #4 69 | "Qwen2-57B-A14B" #5 70 | "Qwen2-57B-A14B-Instruct" #6 71 | "c4ai-command-r-v01" #7 72 | "Jamba-v0.1" #8 73 | "AI21-Jamba-1.5-Mini" #9 74 | "gemma-2-27b" #10 75 | "gemma-2-27b-it" #11 76 | ) 77 | 78 | # Array for models 13B and smaller (36 models) 79 | S_MODELS=( 80 | "LLaMA-2-7B-32K" # 0 81 | "Llama-2-7B-32K-Instruct" # 1 82 | "llama-2-7b-80k-basefixed" # 2 83 | "Yarn-Llama-2-7b-64k" # 3 84 | "Yarn-Llama-2-7b-128k" # 4 85 | "Meta-Llama-3-8B" # 5 86 | "Meta-Llama-3-8B-Instruct" # 6 87 | "Meta-Llama-3-8B-Theta8M" # 7 88 | "Meta-Llama-3-8B-Instruct-Theta8M" # 8 89 | "Meta-Llama-3.1-8B" # 9 90 | "Meta-Llama-3.1-8B-Instruct" # 10 91 | "Mistral-7B-v0.1" # 11 92 | "Mistral-7B-Instruct-v0.1" # 12 93 | "Mistral-7B-Instruct-v0.2" # 13 94 | "Mistral-7B-v0.3" # 14 95 | "Mistral-7B-Instruct-v0.3" # 15 96 | "Yi-6B-200K" # 16 97 | "Yi-9B-200K" # 17 98 | "Yi-1.5-9B-32K" # 18 99 | "Phi-3-mini-128k-instruct" # 19 100 | "Phi-3-small-128k-instruct" # 20 101 | "Phi-3.5-mini-instruct" # 21 102 | "Qwen2-7B" # 22 103 | "Qwen2-7B-Instruct" # 23 104 | "gemma-2-9b" # 24 105 | "gemma-2-9b-it" # 25 106 | "prolong-64k-instruct" # 26 107 | "prolong-512k-instruct-20b-theta128m" # 27 108 | "Mistral-Nemo-Base-2407" # 28 109 | "Mistral-Nemo-Instruct-2407" # 29 110 | "Phi-3-medium-128k-instruct" # 30 111 | "MegaBeam-Mistral-7B-512k" #31 112 | "Llama-3.2-1B" # 32 113 | "Llama-3.2-1B-Instruct" # 33 114 | "Llama-3.2-3B" # 34 115 | "Llama-3.2-3B-Instruct" # 35 116 | ) 117 | MNAME="${S_MODELS[$M_IDX]}" 118 | 119 | OUTPUT_DIR="output/$MNAME" 120 | MODEL_NAME="/path/to/your/model/$MNAME" # CHANGE PATH HERE or you can change the array to load from HF 121 | 122 | shopt -s nocasematch 123 | chat_models=".*(chat|instruct|it$|nous|command|Jamba-1.5|MegaBeam).*" 124 | echo $MNAME 125 | if ! 
[[ $MNAME =~ $chat_models ]]; then 126 | OPTIONS="$OPTIONS --use_chat_template False" 127 | fi 128 | 129 | echo "Evaluation output dir = $OUTPUT_DIR" 130 | echo "Tag = $TAG" 131 | echo "Model name = $MODEL_NAME" 132 | echo "Options = $OPTIONS" 133 | 134 | for CONFIG in "${CONFIGS[@]}"; do 135 | echo "Config file: $CONFIG" 136 | 137 | python eval.py \ 138 | --config configs/$CONFIG \ 139 | --seed $SEED \ 140 | --output_dir $OUTPUT_DIR \ 141 | --tag $TAG \ 142 | --model_name_or_path $MODEL_NAME \ 143 | $OPTIONS 144 | done 145 | 146 | echo "finished with $?" 147 | 148 | wait; 149 | -------------------------------------------------------------------------------- /scripts/vllm-gaudi/build_image.sh: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | export TAG="helmet" 5 | echo "Building the vllm-gaudi docker images" 6 | git clone https://github.com/HabanaAI/vllm-fork.git 7 | cd ./vllm-fork 8 | git checkout v0.6.6.post1+Gaudi-1.20.0 #habana_main 9 | 10 | docker build --no-cache -f Dockerfile.hpu -t ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy 11 | if [ $? -ne 0 ]; then 12 | echo "vllm-gaudi failed" 13 | exit 1 14 | else 15 | echo "vllm-gaudi successful" 16 | fi 17 | 18 | 19 | -------------------------------------------------------------------------------- /scripts/vllm-gaudi/compose.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | services: 5 | vllm-gaudi-server: 6 | image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} 7 | container_name: vllm-gaudi-server 8 | ports: 9 | - ${LLM_ENDPOINT_PORT:-8008}:80 10 | volumes: 11 | - "${DATA_PATH:-./data}:/data" 12 | environment: 13 | no_proxy: ${no_proxy} 14 | http_proxy: ${http_proxy} 15 | https_proxy: ${https_proxy} 16 | HF_TOKEN: ${HF_TOKEN} 17 | HF_HOME: "/data" 18 | HABANA_VISIBLE_DEVICES: all 19 | OMPI_MCA_btl_vader_single_copy_mechanism: none 20 | PT_HPU_ENABLE_LAZY_COLLECTIVES: true 21 | LLM_MODEL_ID: ${LLM_MODEL_ID} 22 | VLLM_TORCH_PROFILER_DIR: "/mnt" 23 | host_ip: ${host_ip} 24 | LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} 25 | VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-true} 26 | VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1 27 | MAX_MODEL_LEN: ${MAX_MODEL_LEN:-131072} 28 | MAX_SEQ_LEN_TO_CAPTURE: ${MAX_MODEL_LEN:-131072} 29 | NUM_CARDS: ${NUM_CARDS:-1} 30 | runtime: habana 31 | cap_add: 32 | - SYS_NICE 33 | ipc: host 34 | healthcheck: 35 | test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] 36 | interval: 10s 37 | timeout: 10s 38 | retries: 150 39 | command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_MODEL_LEN} --max-model-len ${MAX_MODEL_LEN} 40 | -------------------------------------------------------------------------------- /scripts/vllm-gaudi/launch_container.sh: -------------------------------------------------------------------------------- 1 | export host_ip=$(hostname -I | awk '{print $1}') 2 | export LLM_ENDPOINT_PORT=8010 3 | export HF_TOKEN=${HF_TOKEN} 4 | export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1" 5 | export DATA_PATH="~/.cache/huggingface" 6 | export MAX_MODEL_LEN=131072 7 | 8 | # single node 9 | # export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" 
10 | # export NUM_CARDS=1 11 | 12 | # multiple nodes 13 | export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" 14 | export NUM_CARDS=8 15 | 16 | docker compose up -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adopted from https://github.com/princeton-nlp/DensePhrases/blob/main/densephrases/utils/eval_utils.py 3 | """ 4 | 5 | import os 6 | import string 7 | import re 8 | import unicodedata 9 | from collections import Counter 10 | 11 | from rouge_score import rouge_scorer 12 | 13 | import torch 14 | import pytrec_eval 15 | 16 | # import tensor_parallel as tp 17 | 18 | import logging 19 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 20 | datefmt='%m/%d/%Y %H:%M:%S') 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | 24 | 25 | def normalize_answer(s): 26 | 27 | def remove_articles(text): 28 | return re.sub(r'\b(a|an|the)\b', ' ', text) 29 | 30 | def white_space_fix(text): 31 | return ' '.join(text.split()) 32 | 33 | def remove_punc(text): 34 | exclude = set(string.punctuation) 35 | return ''.join(ch for ch in text if ch not in exclude) 36 | 37 | def lower(text): 38 | return text.lower() 39 | 40 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 41 | 42 | 43 | def remove_citations(sent): 44 | return re.sub(r"\[\d+", "", re.sub(r" \[\d+", "", sent)).replace(" |", "").replace("]", "") 45 | 46 | 47 | def f1_score(prediction, ground_truth): 48 | normalized_prediction = normalize_answer(prediction) 49 | normalized_ground_truth = normalize_answer(ground_truth) 50 | 51 | ZERO_METRIC = (0, 0, 0) 52 | 53 | if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: 54 | return ZERO_METRIC 55 | if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: 56 | return ZERO_METRIC 57 | 58 | prediction_tokens = normalized_prediction.split() 59 | ground_truth_tokens = normalized_ground_truth.split() 60 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 61 | num_same = sum(common.values()) 62 | if num_same == 0: 63 | return ZERO_METRIC 64 | precision = 1.0 * num_same / len(prediction_tokens) 65 | recall = 1.0 * num_same / len(ground_truth_tokens) 66 | f1 = (2 * precision * recall) / (precision + recall) 67 | return f1, precision, recall 68 | 69 | 70 | def drqa_normalize(text): 71 | """Resolve different types of unicode encodings.""" 72 | return unicodedata.normalize('NFD', text) 73 | 74 | 75 | def drqa_exact_match_score(prediction, ground_truth): 76 | """Check if the prediction is a (soft) exact match with the ground truth.""" 77 | return normalize_answer(prediction) == normalize_answer(ground_truth) 78 | 79 | 80 | def substring_exact_match_score(prediction, ground_truth): 81 | """Check if the ground truth is a (soft) exact match substring of the prediction.""" 82 | return normalize_answer(ground_truth) in normalize_answer(prediction) 83 | 84 | 85 | def drqa_metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 86 | """Given a prediction and multiple valid answers, return the score of 87 | the best prediction-answer_n pair given a metric function.
88 | """ 89 | # ground truth could be a string or a list of strings or a list of list of strings 90 | if isinstance(ground_truths, str): 91 | ground_truths = [ground_truths] 92 | elif isinstance(ground_truths[0], list): 93 | ground_truths = [ground_truth for ground_truths_list in ground_truths for ground_truth in ground_truths_list] 94 | 95 | scores_for_ground_truths = [] 96 | for ground_truth in ground_truths: 97 | score = metric_fn(prediction, ground_truth) 98 | scores_for_ground_truths.append(score) 99 | return max(scores_for_ground_truths) 100 | 101 | 102 | def get_max_memory(): 103 | """Get the maximum memory available for the current GPU for loading models.""" 104 | free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3) 105 | max_memory = f'{free_in_GB-6}GB' 106 | n_gpus = torch.cuda.device_count() 107 | max_memory = {i: max_memory for i in range(n_gpus)} 108 | return max_memory 109 | 110 | 111 | def get_top_tokens(logits, tokenizer, top_k=10): 112 | """Get the top tokens and their probabilities from the logits.""" 113 | top_tokens = [] 114 | for logit in logits: 115 | a, b = torch.topk(torch.softmax(logit, dim=-1), top_k, dim=-1) 116 | l = [(y, f"{x*100:.02f}") for x, y in zip(a[0], tokenizer.convert_ids_to_tokens(b[0]))] 117 | top_tokens.append(l) 118 | return top_tokens 119 | 120 | 121 | def parse_output(output, prefix="Answer:"): 122 | def lstrip_string(s, sub): 123 | return re.sub(f'^{re.escape(sub)}', '', s, flags=re.IGNORECASE) 124 | patterns = [re.compile(f"(?:{prefix})(.*)(?:\n|$)", flags=re.IGNORECASE), re.compile(r"(?:^)(.*)(?:\n|$)")] 125 | for pat in patterns: 126 | matches = pat.search(output) 127 | if matches is not None: 128 | return lstrip_string(matches[1].strip(), prefix).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix 129 | # if still not found, return None, but should actually never get this case... 130 | return None 131 | 132 | 133 | def parse_rankings(output): 134 | # when parsing the rankings, we want to do some preprocessing first 135 | # 1. remove the square brackets and ID: 136 | output = re.sub(r"[\[\]:]", "", output) 137 | output = output.lower().replace("id", "") 138 | 139 | # 2. 
parse the integer surrounded by >, since all IDs are integers 140 | pattern = r'(\d+)(?:\s*>\s*(\d+))*' 141 | match = re.finditer(pattern, output) 142 | # and take the longest match 143 | longest = "" 144 | for m in match: 145 | if len(m.group(0)) > len(longest): 146 | longest = m.group(0) 147 | 148 | if len(longest) > 0: 149 | number_string = longest 150 | # important to output a list of strings instead of ints, since the IDs are saved as strings (even though they are supposed to be integers) 151 | rankings = [num.strip() for num in number_string.split('>') if num.strip().isdigit()] 152 | else: 153 | # if we can't find any numbers, then we just return the whole string (unlikely to get any matches) 154 | rankings = [output] 155 | 156 | results = {} 157 | for i, rank in enumerate(rankings): 158 | if rank not in results: 159 | results[rank] = len(rankings) - i 160 | 161 | return results 162 | 163 | 164 | def calculate_metrics(prediction, answers): 165 | r_scorer = rouge_scorer.RougeScorer(['rougeL', 'rougeLsum'], use_stemmer=True) 166 | em = drqa_metric_max_over_ground_truths(drqa_exact_match_score, prediction, answers) 167 | f1 = drqa_metric_max_over_ground_truths(lambda x, y: f1_score(x, y)[0], prediction, answers) 168 | sub_em = drqa_metric_max_over_ground_truths(substring_exact_match_score, prediction, answers) 169 | 170 | if isinstance(answers, str): 171 | answers = [answers] 172 | elif isinstance(answers[0], list): 173 | answers = [ground_truth for ground_truths_list in answers for ground_truth in ground_truths_list] 174 | 175 | rouges = [r_scorer.score(target=a, prediction=prediction) for a in answers] 176 | rouge = {} 177 | for k in r_scorer.rouge_types: 178 | rouge[k + "_f1"] = max([r[k].fmeasure for r in rouges]) 179 | rouge[k + "_recall"] = max([r[k].recall for r in rouges]) 180 | 181 | return { 182 | "exact_match": em, 183 | "f1": f1, 184 | "substring_exact_match": sub_em, 185 | **rouge, 186 | } 187 | 188 | 189 | def calculate_retrieval_metrics(results, qrels, k_values=[1, 5, 10, 25, 50, 100], verbose=False): 190 | # https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/evaluation.py#L66 191 | # follow evaluation from BEIR, which is just using the trec eval 192 | ndcg = {} 193 | _map = {} 194 | recall = {} 195 | precision = {} 196 | mrr = {"MRR": 0} 197 | 198 | for k in k_values: 199 | ndcg[f"NDCG@{k}"] = 0.0 200 | _map[f"MAP@{k}"] = 0.0 201 | recall[f"Recall@{k}"] = 0.0 202 | precision[f"P@{k}"] = 0.0 203 | 204 | map_string = "map_cut." + ",".join([str(k) for k in k_values]) 205 | ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) 206 | recall_string = "recall." + ",".join([str(k) for k in k_values]) 207 | precision_string = "P."
+ ",".join([str(k) for k in k_values]) 208 | 209 | # https://github.com/cvangysel/pytrec_eval/blob/master/examples/simple_cut.py 210 | # qrels = {qid: {'pid': [0/1] (relevance label)}} 211 | # results = {qid: {'pid': float (retriever score)}} 212 | evaluator = pytrec_eval.RelevanceEvaluator(qrels, {map_string, ndcg_string, recall_string, precision_string, "recip_rank"}) 213 | scores = evaluator.evaluate(results) 214 | 215 | for query_id in scores.keys(): 216 | for k in k_values: 217 | ndcg[f"NDCG@{k}"] += scores[query_id]["ndcg_cut_" + str(k)] 218 | _map[f"MAP@{k}"] += scores[query_id]["map_cut_" + str(k)] 219 | recall[f"Recall@{k}"] += scores[query_id]["recall_" + str(k)] 220 | precision[f"P@{k}"] += scores[query_id]["P_"+ str(k)] 221 | mrr["MRR"] += scores[query_id]["recip_rank"] 222 | 223 | for k in k_values: 224 | ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"]/len(scores), 5) 225 | _map[f"MAP@{k}"] = round(_map[f"MAP@{k}"]/len(scores), 5) 226 | recall[f"Recall@{k}"] = round(recall[f"Recall@{k}"]/len(scores), 5) 227 | precision[f"P@{k}"] = round(precision[f"P@{k}"]/len(scores), 5) 228 | mrr["MRR"] = round(mrr["MRR"]/len(scores), 5) 229 | 230 | if verbose: 231 | for eval in [ndcg, _map, recall, precision, mrr]: 232 | logger.info("\n") 233 | for k in eval.keys(): 234 | logger.info("{}: {:.4f}".format(k, eval[k])) 235 | 236 | output = {**ndcg, **_map, **recall, **precision, **mrr} 237 | return output 238 | --------------------------------------------------------------------------------