├── .gitignore ├── .gitmodules ├── CHANGELOG.md ├── LICENSE ├── README.md ├── arguments.py ├── assets ├── benchmark_overview.png ├── logo.jpeg └── task_correlation.png ├── claude.tokenizer.json ├── configs ├── alce_nocite.yaml ├── alce_nocite_short.yaml ├── cite.yaml ├── cite_short.yaml ├── icl.yaml ├── icl_short.yaml ├── longqa.yaml ├── longqa_short.yaml ├── niah.yaml ├── niah_long.yaml ├── rag.yaml ├── rag_short.yaml ├── rag_vllm.yaml ├── recall.yaml ├── recall_demo.yaml ├── recall_short.yaml ├── recall_vllm.yaml ├── rerank.yaml ├── rerank_short.yaml ├── ruler.yaml ├── ruler_short.yaml ├── summ.yaml └── summ_short.yaml ├── data.py ├── eval.py ├── eval_alce.py ├── longproc_addon ├── README.md ├── __init__.py ├── configs │ ├── countdown.yaml │ ├── html_to_tsv.yaml │ ├── path_traversal.yaml │ ├── pseudo_to_code.yaml │ ├── tom_tracking.yaml │ └── travel_planning.yaml └── longproc_helmet_loader.py ├── model_utils.py ├── prompts ├── asqa_nocite.json ├── asqa_revised.json ├── qampari_nocite.json └── qampari_revised.json ├── requirements.txt ├── scripts ├── collect_results.py ├── download_data.sh ├── eval_gpt4_longqa.py ├── eval_gpt4_longqa.sh ├── eval_gpt4_summ.py ├── eval_gpt4_summ.sh ├── generate_configs.py ├── run_api.sh ├── run_eval.sh ├── run_eval_hf_endpoint.sh ├── run_eval_slurm.sh ├── run_eval_tgi.sh ├── run_eval_vllm_gaudi.sh ├── run_short_slurm.sh └── vllm-gaudi │ ├── build_image.sh │ ├── compose.yaml │ └── launch_container.sh ├── utils.py └── visualization.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | .*.swo 3 | output/ 4 | env/ 5 | *_env/ 6 | joblog/ 7 | 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | share/python-wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | MANIFEST 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | *.py,cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | cover/ 60 | 61 | # Translations 62 | *.mo 63 | *.pot 64 | 65 | # Django stuff: 66 | *.log 67 | local_settings.py 68 | db.sqlite3 69 | db.sqlite3-journal 70 | 71 | # Flask stuff: 72 | instance/ 73 | .webassets-cache 74 | 75 | # Scrapy stuff: 76 | .scrapy 77 | 78 | # Sphinx documentation 79 | docs/_build/ 80 | 81 | # PyBuilder 82 | .pybuilder/ 83 | target/ 84 | 85 | # Jupyter Notebook 86 | .ipynb_checkpoints 87 | 88 | # IPython 89 | profile_default/ 90 | ipython_config.py 91 | 92 | # pyenv 93 | # For a library or package, you might want to ignore these files since the code is 94 | # intended to run in multiple environments; otherwise, check them in: 95 | # .python-version 96 | 97 | # pipenv 98 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
99 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 100 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 101 | # install all needed dependencies. 102 | #Pipfile.lock 103 | 104 | # poetry 105 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 106 | # This is especially recommended for binary packages to ensure reproducibility, and is more 107 | # commonly ignored for libraries. 108 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 109 | #poetry.lock 110 | 111 | # pdm 112 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 113 | #pdm.lock 114 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 115 | # in version control. 116 | # https://pdm.fming.dev/#use-with-ide 117 | .pdm.toml 118 | 119 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 120 | __pypackages__/ 121 | 122 | # Celery stuff 123 | celerybeat-schedule 124 | celerybeat.pid 125 | 126 | # SageMath parsed files 127 | *.sage.py 128 | 129 | # Environments 130 | .env 131 | .venv 132 | env/ 133 | venv/ 134 | ENV/ 135 | env.bak/ 136 | venv.bak/ 137 | 138 | # Spyder project settings 139 | .spyderproject 140 | .spyproject 141 | 142 | # Rope project settings 143 | .ropeproject 144 | 145 | # mkdocs documentation 146 | /site 147 | 148 | # mypy 149 | .mypy_cache/ 150 | .dmypy.json 151 | dmypy.json 152 | 153 | # Pyre type checker 154 | .pyre/ 155 | 156 | # pytype static type analyzer 157 | .pytype/ 158 | 159 | # Cython debug symbols 160 | cython_debug/ 161 | 162 | # PyCharm 163 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 164 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 165 | # and can be added to the global gitignore or merged into this file. For a more nuclear 166 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 167 | #.idea/ 168 | 169 | data 170 | output 171 | configs/_* 172 | slurm 173 | 174 | *.ipynb 175 | gty* -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "longproc_addon/longproc"] 2 | path = longproc_addon/longproc 3 | url = https://github.com/princeton-pli/LongProc.git 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes will be documented in this file. 3 | 4 | ## 2025-02-25 5 | 6 | In this version, we make some significant improvements to reduce the cost of running the experiments. 7 | 8 | - Add support for batch evaluation 9 | - For OpenAI and Anthropic, we use their batch API to reduce the cost of API calls by 50% ([OpenAI documentation](https://cookbook.openai.com/examples/batch_processing), [Anthropic documentation](https://docs.anthropic.com/en/docs/build-with-claude/message-batches)). The model-based evaluation script have also been updated to reduce cost. 10 | - For other API providers, we use a simple multi-threading approach to parallelize the API calls 11 | - For open-source models, we use batching from the VLLM library for more speed-up. 
12 | - Changes to the datasets pre-processing — the paper will be updated in a future version. 13 | - ICL datasets now evaluate 500 samples instead of 100, use a different set of demonstrations for each test instance, and we balance the number of test labels—this is to make the evaluation more consistent and robust. 14 | - RAG, Re-ranking, and Citation use `hashlib` for consistent hashing 15 | - Visualization jupyter notebook for plotting results. 16 | - Support for SGLang, which can be faster for certain supported models. 17 | - Support for reasoning models, such as DeepSeek's R1 models, where we parse out the reasoning steps from the model's output. 18 | - Other minor changes, such as adding documentation. 19 | 20 | ## 2024-10-04 21 | 22 | Thanks to @8188zq and @chtmp223 for pointing out some issues in the current repo: some results are not fully reproducible due to random seeding problems. 23 | This affects the results for ICL, Re-reranking, and the RAG tasks, where the demo sample for each question may differ. 24 | We have updated the code to fix this issue, and will update the results on the paper and the spreadsheet soon to reflect the changes. 25 | Specifically, we make sure the seeding is consistent across runs and independent of system settings. 26 | 27 | Other minor changes: 28 | - Clean up `data.py` and remove unused code. 29 | - Update argument descriptions. 30 | - Log exceptions -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Princeton Natural Language Processing 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /arguments.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import yaml 3 | import ast 4 | import os 5 | 6 | import logging 7 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 8 | datefmt='%m/%d/%Y %H:%M:%S') 9 | logger = logging.getLogger(__name__) 10 | logger.setLevel(logging.INFO) 11 | 12 | 13 | def parse_arguments(): 14 | parser = argparse.ArgumentParser(description="evaluation on downstream tasks") 15 | parser.add_argument("--config", type=str, default=None, help="path to config file") 16 | parser.add_argument("--tag", type=str, default="eval", help="tag to add to the output file") 17 | 18 | # model setting 19 | parser.add_argument("--model_name_or_path", type=str, default=None) 20 | parser.add_argument("--use_vllm", action="store_true", help="whether to use vllm engine") 21 | parser.add_argument("--use_sglang", action="store_true", help="whether to use sglang engine") 22 | parser.add_argument("--use_vllm_serving", action="store_true", help="whether to use vllm serving engine") 23 | parser.add_argument("--use_tgi_serving", action="store_true", help="whether to use tgi serving engine") 24 | parser.add_argument("--endpoint_url", type=str,default="http://localhost:8080/v1/", help="endpoint url for tgi or vllm serving engine") 25 | parser.add_argument("--api_key", type=str, default="EMPTY", help="api key for model endpoint") 26 | 27 | # data settings 28 | parser.add_argument("--datasets", type=str, default=None, help="comma separated list of dataset names") 29 | parser.add_argument("--demo_files", type=str, default=None, help="comma separated list of demo files") 30 | parser.add_argument("--test_files", type=str, default=None, help="comma separated list of test files") 31 | parser.add_argument("--output_dir", type=str, default=None, help="path to save the predictions") 32 | parser.add_argument("--overwrite", action="store_true", help="whether to the saved file") 33 | parser.add_argument("--max_test_samples", type=int, default=None) 34 | parser.add_argument("--num_workers", type=int, default=4, help="number of workers for data loading") 35 | 36 | # dataset specific settings 37 | parser.add_argument("--popularity_threshold", type=int, default=3, help="popularity threshold for popqa, in log scale") 38 | 39 | # evaluation settings 40 | parser.add_argument("--shots", type=int, default=2, help="total number of ICL demos") 41 | parser.add_argument("--input_max_length", type=str, default='8192', help="the maximum number of tokens of the input, we truncate the end of the context; can be separated by comma to match the specified datasets") 42 | 43 | # generation settings 44 | parser.add_argument("--do_sample", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use sampling (false is greedy), overwrites temperature") 45 | parser.add_argument("--generation_max_length", type=str, default='10', help="max number of tokens to generate, can be separated by comma to match the specified datasets") 46 | parser.add_argument("--generation_min_length", type=int, default=0, help="min number of tokens to generate") 47 | parser.add_argument("--temperature", type=float, default=0.0, help="generation temperature") 48 | parser.add_argument("--top_p", type=float, default=1.0, help="top-p parameter for nucleus sampling") 49 | parser.add_argument("--stop_newline", type=ast.literal_eval, choices=[True, False], 
default=False, help="whether to stop generation at newline") 50 | parser.add_argument("--system_message", type=str, default=None, help="system message to add to the beginning of context") 51 | 52 | # model specific settings 53 | parser.add_argument("--seed", type=int, default=42, help="random seed") 54 | parser.add_argument("--no_cuda", action="store_true", help="disable cuda") 55 | parser.add_argument("--no_bf16", action="store_true", help="disable bf16 and use fp32") 56 | parser.add_argument("--no_torch_compile", action="store_true", help="disable torchcompile") 57 | parser.add_argument("--use_chat_template", type=ast.literal_eval, choices=[True, False], default=False, help="whether to use chat template") 58 | parser.add_argument("--rope_theta", type=int, default=None, help="override rope theta") 59 | parser.add_argument("--thinking", action="store_true", help="for reasoning models (e.g., Deepseek-r1), when this is set, we allow the model to generate an additional 32k tokens and exclude all texts between * from the output for evaluation") 60 | 61 | # misc 62 | parser.add_argument("--debug", action="store_true", help="for debugging") 63 | parser.add_argument("--count_tokens", action="store_true", help="instead of running generation, just count the number of tokens (only for HF models not API)") 64 | 65 | args = parser.parse_args() 66 | config = yaml.safe_load(open(args.config)) if args.config is not None else {} 67 | parser.set_defaults(**config) 68 | args = parser.parse_args() 69 | 70 | if args.output_dir is None: 71 | args.output_dir = f"output/{os.path.basename(args.model_name_or_path)}" 72 | 73 | if args.rope_theta is not None: 74 | args.output_dir = args.output_dir + f"-override-rope{args.rope_theta}" 75 | 76 | if not args.do_sample and args.temperature != 0.0: 77 | args.temperature = 0.0 78 | logger.info("overwriting temperature to 0.0 since do_sample is False") 79 | 80 | return args 81 | -------------------------------------------------------------------------------- /assets/benchmark_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/assets/benchmark_overview.png -------------------------------------------------------------------------------- /assets/logo.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/assets/logo.jpeg -------------------------------------------------------------------------------- /assets/task_correlation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/assets/task_correlation.png -------------------------------------------------------------------------------- /configs/alce_nocite.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: '131072' 2 | datasets: alce_asqa_nocite_700 3 | generation_max_length: 600 4 | test_files: data/alce/asqa_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_nocite.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/alce_nocite_short.yaml: 
-------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536 2 | datasets: alce_asqa_nocite_30,alce_asqa_nocite_75,alce_asqa_nocite_165,alce_asqa_nocite_345 3 | generation_max_length: 600 4 | test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_nocite.json,prompts/asqa_nocite.json,prompts/asqa_nocite.json,prompts/asqa_nocite.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/cite.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072 2 | datasets: alce_asqa_700,alce_qampari_700 3 | generation_max_length: 300,300 4 | test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_revised.json,prompts/qampari_revised.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/cite_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: alce_asqa_30,alce_asqa_75,alce_asqa_165,alce_asqa_345,alce_qampari_30,alce_qampari_75,alce_qampari_165,alce_qampari_345 3 | generation_max_length: 300,300,300,300,300,300,300,300 4 | test_files: data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/asqa_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json,data/alce/qampari_eval_gtr_top2000.json 5 | demo_files: prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/asqa_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json,prompts/qampari_revised.json 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/icl.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072,131072 2 | datasets: icl_trec_coarse_6600shot_balance,icl_trec_fine_6400shot_balance,icl_banking77_5900shot_balance,icl_clinic150_7050shot_balance,icl_nlu_8296shot_balance 3 | generation_max_length: 20,20,20,20,20 4 | test_files: ',,,,' 5 | demo_files: ',,,,' 6 | use_chat_template: false 7 | max_test_samples: 500 8 | shots: 0 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/icl_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: 
icl_trec_coarse_400shot_balance,icl_trec_coarse_800shot_balance,icl_trec_coarse_1600shot_balance,icl_trec_coarse_3300shot_balance,icl_trec_fine_400shot_balance,icl_trec_fine_800shot_balance,icl_trec_fine_1600shot_balance,icl_trec_fine_3200shot_balance,icl_banking77_360shot_balance,icl_banking77_720shot_balance,icl_banking77_1450shot_balance,icl_banking77_2900shot_balance,icl_clinic150_440shot_balance,icl_clinic150_880shot_balance,icl_clinic150_1750shot_balance,icl_clinic150_3525shot_balance,icl_nlu_510shot_balance,icl_nlu_1020shot_balance,icl_nlu_2040shot_balance,icl_nlu_4080shot_balance 3 | generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 4 | test_files: ',,,,,,,,,,,,,,,,,,,' 5 | demo_files: ',,,,,,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 500 8 | shots: 0 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/longqa.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072 2 | datasets: narrativeqa_130772,infbench_qa_eng_130862,infbench_choice_eng_130862 3 | generation_max_length: 100,10,10 4 | test_files: ',,' 5 | demo_files: ',,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/longqa_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: narrativeqa_7892,narrativeqa_16084,narrativeqa_32468,narrativeqa_65236,infbench_qa_eng_7982,infbench_qa_eng_16174,infbench_qa_eng_32558,infbench_qa_eng_65326,infbench_choice_eng_7982,infbench_choice_eng_16174,infbench_choice_eng_32558,infbench_choice_eng_65326 3 | generation_max_length: 100,100,100,100,10,10,10,10,10,10,10,10 4 | test_files: ',,,,,,,,,,,' 5 | demo_files: ',,,,,,,,,,,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/niah.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072 2 | datasets: ruler_niah_s_2 3 | generation_max_length: 50 4 | test_files: data/ruler/niah_single_2/validation_131072.jsonl 5 | demo_files: '' 6 | -------------------------------------------------------------------------------- /configs/niah_long.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072,65536,131072 2 | datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2 3 | generation_max_length: 50,50,50,50,50,50,50,50,50,50,100,100,100,100,50,50,100,100,50,50,50,50,50,50,50,50 4 | test_files: 
data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_65536.jsonl,data/ruler/qa_2/validation_131072.jsonl 5 | demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rag.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3 3 | generation_max_length: 20,20,20,20 4 | test_files: data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k1000_dep3.jsonl,data/kilt/popqa_test_1000_k1000_dep6.jsonl 5 | demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rag_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: kilt_nq,kilt_nq,kilt_nq,kilt_nq,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_triviaqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_hotpotqa,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3,kilt_popqa_3 3 | generation_max_length: 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 4 | test_files: 
data/kilt/nq-dev-multikilt_1000_k50_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k105_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k220_dep6.jsonl,data/kilt/nq-dev-multikilt_1000_k440_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k50_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k105_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k220_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k440_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k50_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k105_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k220_dep3.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k440_dep3.jsonl,data/kilt/popqa_test_1000_k50_dep6.jsonl,data/kilt/popqa_test_1000_k105_dep6.jsonl,data/kilt/popqa_test_1000_k220_dep6.jsonl,data/kilt/popqa_test_1000_k440_dep6.jsonl 5 | demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rag_vllm.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: kilt_nq,kilt_triviaqa,kilt_hotpotqa,kilt_popqa_3 3 | generation_max_length: 20,20,20,20 4 | test_files: data/kilt/nq-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/triviaqa-dev-multikilt_1000_k1000_dep6.jsonl,data/kilt/hotpotqa-dev-multikilt_1000_k1000_dep3.jsonl,data/kilt/popqa_test_1000_k1000_dep6.jsonl 5 | demo_files: data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl,data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl,data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl,data/kilt/popqa_test_1000_k3_dep6.jsonl 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.3-70B-Instruct 11 | output_dir: output/vllm-gaudi/Llama-3.3-70B-Instruct 12 | use_vllm_serving: true 13 | -------------------------------------------------------------------------------- /configs/recall.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv 3 | generation_max_length: 50,100,50,100 4 | test_files: data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/json_kv/test_k1800_dep6.jsonl 5 | demo_files: ',,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | 
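The comma-separated fields in these configs are positionally aligned: the i-th entries of `datasets`, `test_files`, `demo_files`, `input_max_length`, and `generation_max_length` together define the i-th run, and a single value is broadcast to every dataset. Below is a minimal illustrative sketch of that expansion, mirroring the splitting and zipping done in eval.py's main() (shown later in this dump); the helper names `expand_config` and `broadcast` are hypothetical and not part of the repo.

import yaml

def expand_config(path):
    # Hypothetical helper for illustration only; the actual logic lives in eval.py's main().
    with open(path) as f:
        cfg = yaml.safe_load(f)
    datasets = str(cfg["datasets"]).split(",")
    test_files = str(cfg["test_files"]).split(",")
    demo_files = str(cfg["demo_files"]).split(",")

    def broadcast(value):
        # A single value applies to every dataset; a comma-separated list is matched positionally.
        parts = str(value).split(",")
        return [int(parts[0])] * len(datasets) if len(parts) == 1 else [int(p) for p in parts]

    max_lengths = broadcast(cfg["input_max_length"])
    gen_lengths = broadcast(cfg["generation_max_length"])
    assert len(datasets) == len(test_files) == len(demo_files) == len(max_lengths) == len(gen_lengths)
    return list(zip(datasets, test_files, demo_files, max_lengths, gen_lengths))

For configs/recall.yaml above, this yields four runs, one per dataset, each with an input_max_length of 131072 and its own generation_max_length.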
-------------------------------------------------------------------------------- /configs/recall_demo.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192 2 | datasets: ruler_niah_mk_2 3 | generation_max_length: 50 4 | test_files: data/ruler/niah_multikey_2/validation_8192.jsonl 5 | demo_files: '' 6 | use_chat_template: true 7 | max_test_samples: 5 8 | shots: 2 9 | top_p: 0.95 # need to be >0 and <1 10 | stop_new_line: false 11 | model_name_or_path: tgi:meta-llama/Llama-3.2-1B-Instruct 12 | output_dir: output/tgi/meta-llama/Llama-3.2-1B-Instruct 13 | use_tgi_serving: true -------------------------------------------------------------------------------- /configs/recall_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,json_kv,json_kv,json_kv,json_kv 3 | generation_max_length: 50,50,50,50,100,100,100,100,50,50,50,50,100,100,100,100 4 | test_files: data/ruler/niah_multikey_2/validation_8192.jsonl,data/ruler/niah_multikey_2/validation_16384.jsonl,data/ruler/niah_multikey_2/validation_32768.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_8192.jsonl,data/ruler/niah_multikey_3/validation_16384.jsonl,data/ruler/niah_multikey_3/validation_32768.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multivalue/validation_8192.jsonl,data/ruler/niah_multivalue/validation_16384.jsonl,data/ruler/niah_multivalue/validation_32768.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/json_kv/test_k105_dep6.jsonl,data/json_kv/test_k220_dep6.jsonl,data/json_kv/test_k440_dep6.jsonl,data/json_kv/test_k900_dep6.jsonl 5 | demo_files: ',,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/recall_vllm.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072 2 | datasets: ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mv,json_kv 3 | generation_max_length: 50,100,50,100 4 | test_files: data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/json_kv/test_k1800_dep6.jsonl 5 | demo_files: ',,,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.3-70B-Instruct 11 | output_dir: output/vllm-gaudi/Llama-3.3-70B-Instruct 12 | use_vllm_serving: true 13 | -------------------------------------------------------------------------------- /configs/rerank.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: '131072' 2 | datasets: msmarco_rerank_psg 3 | generation_max_length: '200' 4 | test_files: data/msmarco/test_reranking_data_k1000_dep3.jsonl 5 | demo_files: data/msmarco/test_reranking_data_k10_dep3.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: 
meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/rerank_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536 2 | datasets: msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg,msmarco_rerank_psg 3 | generation_max_length: 200,200,200,200 4 | test_files: data/msmarco/test_reranking_data_k50_dep3.jsonl,data/msmarco/test_reranking_data_k130_dep3.jsonl,data/msmarco/test_reranking_data_k285_dep3.jsonl,data/msmarco/test_reranking_data_k600_dep3.jsonl 5 | demo_files: data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl,data/msmarco/test_reranking_data_k10_dep3.jsonl 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: true 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/ruler.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072,131072,131072,131072,131072,131072,131072,131072,131072,131072,131072,131072 2 | datasets: ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mv,ruler_cwe,ruler_fwe,ruler_vt,ruler_qa_1,ruler_qa_2 3 | generation_max_length: 50,50,50,50,50,100,100,50,100,50,50,50,50 4 | test_files: data/ruler/niah_single_1/validation_131072.jsonl,data/ruler/niah_single_2/validation_131072.jsonl,data/ruler/niah_single_3/validation_131072.jsonl,data/ruler/niah_multikey_1/validation_131072.jsonl,data/ruler/niah_multikey_2/validation_131072.jsonl,data/ruler/niah_multikey_3/validation_131072.jsonl,data/ruler/niah_multiquery/validation_131072.jsonl,data/ruler/niah_multivalue/validation_131072.jsonl,data/ruler/cwe/validation_131072.jsonl,data/ruler/fwe/validation_131072.jsonl,data/ruler/vt/validation_131072.jsonl,data/ruler/qa_1/validation_131072.jsonl,data/ruler/qa_2/validation_131072.jsonl 5 | demo_files: ',,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/ruler_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_1,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_2,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_s_3,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_1,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_2,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mk_3,ruler_niah_mq,ruler_niah_mq,ruler_niah_mq,ruler_niah_mq,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_niah_mv,ruler_cwe,ruler_cwe,ruler_cwe,ruler_cwe,ruler_fwe,ruler_fwe,ruler_fwe,ruler_fwe,ruler_vt,ruler_vt,ruler_vt,ruler_vt,ruler_qa_1,ruler_qa_1,ruler_qa_1,ruler_qa_1,ruler_qa_2,ruler_qa_2,ruler_qa_2,ruler_qa_2 3 | 
generation_max_length: 50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,100,100,100,100,100,100,100,100,50,50,50,50,100,100,100,100,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50,50 4 | test_files: data/ruler/niah_single_1/validation_8192.jsonl,data/ruler/niah_single_1/validation_16384.jsonl,data/ruler/niah_single_1/validation_32768.jsonl,data/ruler/niah_single_1/validation_65536.jsonl,data/ruler/niah_single_2/validation_8192.jsonl,data/ruler/niah_single_2/validation_16384.jsonl,data/ruler/niah_single_2/validation_32768.jsonl,data/ruler/niah_single_2/validation_65536.jsonl,data/ruler/niah_single_3/validation_8192.jsonl,data/ruler/niah_single_3/validation_16384.jsonl,data/ruler/niah_single_3/validation_32768.jsonl,data/ruler/niah_single_3/validation_65536.jsonl,data/ruler/niah_multikey_1/validation_8192.jsonl,data/ruler/niah_multikey_1/validation_16384.jsonl,data/ruler/niah_multikey_1/validation_32768.jsonl,data/ruler/niah_multikey_1/validation_65536.jsonl,data/ruler/niah_multikey_2/validation_8192.jsonl,data/ruler/niah_multikey_2/validation_16384.jsonl,data/ruler/niah_multikey_2/validation_32768.jsonl,data/ruler/niah_multikey_2/validation_65536.jsonl,data/ruler/niah_multikey_3/validation_8192.jsonl,data/ruler/niah_multikey_3/validation_16384.jsonl,data/ruler/niah_multikey_3/validation_32768.jsonl,data/ruler/niah_multikey_3/validation_65536.jsonl,data/ruler/niah_multiquery/validation_8192.jsonl,data/ruler/niah_multiquery/validation_16384.jsonl,data/ruler/niah_multiquery/validation_32768.jsonl,data/ruler/niah_multiquery/validation_65536.jsonl,data/ruler/niah_multivalue/validation_8192.jsonl,data/ruler/niah_multivalue/validation_16384.jsonl,data/ruler/niah_multivalue/validation_32768.jsonl,data/ruler/niah_multivalue/validation_65536.jsonl,data/ruler/cwe/validation_8192.jsonl,data/ruler/cwe/validation_16384.jsonl,data/ruler/cwe/validation_32768.jsonl,data/ruler/cwe/validation_65536.jsonl,data/ruler/fwe/validation_8192.jsonl,data/ruler/fwe/validation_16384.jsonl,data/ruler/fwe/validation_32768.jsonl,data/ruler/fwe/validation_65536.jsonl,data/ruler/vt/validation_8192.jsonl,data/ruler/vt/validation_16384.jsonl,data/ruler/vt/validation_32768.jsonl,data/ruler/vt/validation_65536.jsonl,data/ruler/qa_1/validation_8192.jsonl,data/ruler/qa_1/validation_16384.jsonl,data/ruler/qa_1/validation_32768.jsonl,data/ruler/qa_1/validation_65536.jsonl,data/ruler/qa_2/validation_8192.jsonl,data/ruler/qa_2/validation_16384.jsonl,data/ruler/qa_2/validation_32768.jsonl,data/ruler/qa_2/validation_65536.jsonl 5 | demo_files: ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,' 6 | use_chat_template: false 7 | max_test_samples: 100 8 | shots: 0 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/summ.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 131072,131072 2 | datasets: infbench_sum_eng_129672,multi_lexsum_130372 3 | generation_max_length: 1200,400 4 | test_files: ',' 5 | demo_files: ',' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /configs/summ_short.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8192,16384,32768,65536,8192,16384,32768,65536 2 | datasets: 
infbench_sum_eng_6792,infbench_sum_eng_14984,infbench_sum_eng_31368,infbench_sum_eng_64136,multi_lexsum_7492,multi_lexsum_15684,multi_lexsum_32068,multi_lexsum_64836 3 | generation_max_length: 1200,1200,1200,1200,400,400,400,400 4 | test_files: ',,,,,,,' 5 | demo_files: ',,,,,,,' 6 | use_chat_template: true 7 | max_test_samples: 100 8 | shots: 2 9 | stop_new_line: false 10 | model_name_or_path: meta-llama/Llama-3.1-8B-Instruct 11 | -------------------------------------------------------------------------------- /eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from collections import defaultdict 4 | import re 5 | import random 6 | import json 7 | import time 8 | 9 | from tqdm import tqdm 10 | import numpy as np 11 | import torch 12 | from torch.utils.data import DataLoader 13 | 14 | from arguments import parse_arguments 15 | from model_utils import load_LLM, OpenAIModel, AnthropicModel, TgiVllmModel 16 | 17 | from data import ( 18 | load_data, 19 | TestItemDataset, 20 | ) 21 | 22 | import logging 23 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 24 | datefmt='%m/%d/%Y %H:%M:%S') 25 | logger = logging.getLogger(__name__) 26 | logger.setLevel(logging.INFO) 27 | 28 | 29 | def run_test(args, model, dataset, test_file, demo_file): 30 | logger.info(f"running test on {dataset} with test {test_file} and demo {demo_file}") 31 | # dataset specific changes tag 32 | tag = args.tag 33 | if dataset == "popqa": 34 | tag += f"_pop{args.popularity_threshold}" 35 | 36 | test_name = os.path.splitext(os.path.basename(test_file))[0] 37 | output_path = os.path.join(args.output_dir, f"{dataset}_{tag}_{test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json") 38 | if os.path.exists(output_path) and not args.overwrite and not args.debug: 39 | logger.info(f"{output_path} already exists, skipping...") 40 | return output_path 41 | 42 | random.seed(args.seed) 43 | data = load_data(args, dataset, test_file, demo_file) 44 | logger.info(f"loaded {len(data['data'])} samples from {dataset}") 45 | 46 | dataloader = DataLoader( 47 | TestItemDataset(data, model, model.tokenizer), 48 | batch_size=1, 49 | shuffle=False, 50 | collate_fn=lambda x: x, 51 | num_workers=args.num_workers if not args.debug else 0, 52 | ) 53 | 54 | # we first prepare all inputs and then run the evaluation in batch 55 | # the dataloader is a bit of an overkill here, but it makes it easier to switch back to iterative instead of batch eval 56 | metrics = defaultdict(list) 57 | all_inputs = [] 58 | all_input_texts = [] 59 | for idx, inputs in enumerate(tqdm(dataloader, desc="Preparing inputs")): 60 | inputs, input_text = inputs[0] 61 | if args.count_tokens: 62 | # count_tokens is only available for models that tokenizes the input 63 | metrics['input_len'].append(inputs.input_ids.shape[1]) 64 | continue 65 | all_inputs.append(inputs) 66 | all_input_texts.append(input_text) 67 | 68 | # HY: for the thinking mode, we add additional 32k tokens to allow models to generate thinking process 69 | if args.thinking: 70 | args.generation_max_length += 32768 71 | args.input_max_length += 32768 72 | model.max_length = args.input_max_length 73 | model.generation_max_length = args.generation_max_length 74 | args.stop_newline = False 75 | logger.info(f"thinking mode, adding 32k tokens to 
generation and input max length, also disabling stop_newline") 76 | 77 | logger.info("Running generation...") 78 | start_time = time.time() 79 | # generate all outputs 80 | if (isinstance(model, OpenAIModel) or isinstance(model, AnthropicModel)) and (not isinstance(model, TgiVllmModel)): 81 | # using the batch API makes it cheaper and faster 82 | logger.info(f"Using the OpenAI/Anthropic batch API by default, if you want to use the iterative API, please change the code") 83 | all_outputs = model.generate_batch(all_inputs, batch_file=output_path+".batch") 84 | else: 85 | all_outputs = model.generate_batch(all_inputs) 86 | end_time = time.time() 87 | 88 | # then we do all the postprocessing + evaluation 89 | results = [] 90 | for idx, output in enumerate(all_outputs): 91 | test_item = data["data"][idx] 92 | input_text = all_input_texts[idx] 93 | 94 | if output is None: 95 | logger.info(f"skipping example {idx+1} because the model returned None") 96 | continue 97 | 98 | # If we do not use the chat template, then we are doing completion, and for the sake of parsing, we want to prepend the system prompt to the input. 99 | # For example, since we are autocompleting "Answer:"" in the input, then we should prepend the system prompt to the output as well. 100 | # This requires some coordination from the dataset preprocessing 101 | if not args.use_chat_template: 102 | prepend_text = data["system_template"].format(**test_item) 103 | output["output"] = prepend_text + output["output"] 104 | 105 | if args.thinking: 106 | matches = re.search(r"(.*)(.*)", output['output'], flags=re.DOTALL) 107 | if matches: 108 | output["output"] = matches.group(2).strip() 109 | output["thoughts"] = matches.group(1).strip() 110 | 111 | mets, others = data['post_process'](output, test_item) 112 | output.update({**others, **mets}) 113 | for k, v in mets.items(): 114 | metrics[k].append(v) 115 | 116 | metrics["input_len"].append(output["input_len"]) 117 | metrics["output_len"].append(output["output_len"]) 118 | result = {**test_item, **output} 119 | result.pop("context", None) 120 | result.pop("input_ids", None) 121 | if input_text is None: 122 | input_text = result['input_text'] 123 | results.append(result) 124 | 125 | # print out some examples, we also limit how much we print out since it can get really long 126 | if idx < 5 or args.debug: 127 | logger.info(f"Example {idx+1}: ") 128 | logger.info(f"Decoder inputs:\n{input_text}\n") 129 | 130 | logger.info(f"Input length: {output['input_len']}") 131 | # currently we hardcode somethings to print out, but you may change these to print out other things 132 | logger.info(f"Question: {test_item['question'] if 'question' in test_item else ''}") 133 | logger.info(f"Answer: {test_item['answer'] if 'answer' in test_item else ''}") 134 | logger.info(f"Output: {output['output']}") 135 | logger.info(f"Parsed output: {output['parsed_output']}") 136 | logger.info(f"Metrics: {mets}") 137 | 138 | if args.debug: 139 | import pdb; pdb.set_trace() 140 | 141 | if not args.no_cuda: 142 | mem_usage = sum([torch.cuda.max_memory_allocated(i) for i in range(torch.cuda.device_count())]) 143 | logger.info(f"Memory usage: {mem_usage/1000**3:.02f} GB") 144 | logger.info(f"Total time: {end_time - start_time:.02f} s") 145 | logger.info(f"Throughput: {len(results) / (end_time - start_time):.02f} samples/s") 146 | 147 | if args.count_tokens: 148 | logger.info(f"----{dataset}----\nAverage input length: {np.mean(metrics['input_len']):.02f}, std input length: {np.std(metrics['input_len']):.02f}, max input 
length: {max(metrics['input_len'])}, min input length: {min(metrics['input_len'])}\n----returning----") 149 | return output_path 150 | 151 | if len(results) == 0: 152 | logger.error("No results to evaluate, something went wrong, returning...") 153 | return output_path 154 | 155 | averaged_metrics = {k: np.mean(v)*(100 if "_len" not in k else 1) for k, v in metrics.items()} 156 | 157 | logger.info("Averaged metrics:") 158 | for k, v in averaged_metrics.items(): 159 | logger.info(f"{k}: {v:.02f}") 160 | 161 | output = { 162 | "args": args.__dict__, 163 | "data": results, 164 | "metrics": metrics, 165 | "averaged_metrics": averaged_metrics, 166 | "throughput": len(results) / (end_time - start_time), 167 | } 168 | if not args.no_cuda: 169 | output["memory_usage"] = mem_usage 170 | 171 | if args.output_dir is not None: 172 | with open(output_path, "w") as f: 173 | json.dump(output, f, indent=4) 174 | # this makes it easier to parse results, but alce uses a different evaluation script 175 | if not "alce" in dataset: 176 | with open(output_path + ".score", "w") as f: 177 | json.dump(output["averaged_metrics"], f, indent=4) 178 | logger.info(f"done, results are written to {output_path}") 179 | 180 | return output_path 181 | 182 | 183 | def main(): 184 | args = parse_arguments() 185 | 186 | logger.info(f"Arguments: {args}") 187 | assert args.model_name_or_path is not None 188 | os.makedirs(args.output_dir, exist_ok=True) 189 | 190 | datasets = args.datasets.split(",") 191 | test_files = args.test_files.split(",") 192 | demo_files = args.demo_files.split(",") 193 | max_lengths = ([int(args.input_max_length)] * len(datasets)) if isinstance(args.input_max_length, int) or len(args.input_max_length.split(",")) == 1 else [int(l) for l in args.input_max_length.split(",")] 194 | gen_lengths = ([int(args.generation_max_length)] * len(datasets)) if isinstance(args.generation_max_length, int) or len(args.generation_max_length.split(",")) == 1 else [int(l) for l in args.generation_max_length.split(",")] 195 | assert len(test_files) == len(demo_files) 196 | 197 | args.input_max_length = max(max_lengths) 198 | model = load_LLM(args) 199 | 200 | for dataset, test_file, demo_file, max_length, gen_length in zip(datasets, test_files, demo_files, max_lengths, gen_lengths): 201 | args.datasets = dataset 202 | args.test_files = test_file 203 | args.demo_files = demo_file 204 | args.input_max_length = max_length 205 | args.generation_max_length = gen_length 206 | model.max_length = max_length 207 | model.generation_max_length = gen_length 208 | 209 | try: 210 | output_path = run_test(args, model, dataset, test_file, demo_file) 211 | 212 | if "alce" in dataset and not args.count_tokens and (not os.path.exists(output_path+".score") or args.overwrite): 213 | import eval_alce 214 | logger.info("running eval_alce.py...") 215 | cli_args = ["--f", output_path] 216 | if not "nocite" in dataset: 217 | cli_args.append("--citations") 218 | # HY: If you want to run the full ALCE evaluation, you should uncomment the following lines 219 | # In HELMET, we don't use the MAUVE scores. 
220 | # if "asqa" in dataset: 221 | # cli_args.append("--mauve") 222 | # elif "eli5" in dataset: 223 | # cli_args += ["mauve", "--claims_nli"] 224 | eval_alce.main(cli_args) 225 | 226 | except Exception as e: 227 | # in case we run into some kind of error 228 | logger.exception(e) 229 | logger.error(f"Error in {dataset}, continuing...") 230 | if args.debug: 231 | raise e 232 | 233 | if __name__ == "__main__": 234 | main() 235 | 236 | -------------------------------------------------------------------------------- /eval_alce.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import collections 3 | import json 4 | import re 5 | import string 6 | import torch 7 | import copy 8 | 9 | from nltk import sent_tokenize 10 | import numpy as np 11 | from rouge_score import rouge_scorer, scoring 12 | from tqdm import tqdm 13 | import sys 14 | import logging 15 | from collections import defaultdict 16 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 17 | datefmt='%m/%d/%Y %H:%M:%S') 18 | logger = logging.getLogger(__name__) 19 | logger.setLevel(logging.INFO) 20 | 21 | from transformers import ( 22 | AutoModelForSeq2SeqLM, 23 | AutoTokenizer, 24 | pipeline 25 | ) 26 | 27 | from utils import normalize_answer, get_max_memory, remove_citations 28 | 29 | QA_MODEL="gaotianyu1350/roberta-large-squad" 30 | AUTOAIS_MODEL="google/t5_xxl_true_nli_mixture" 31 | 32 | global autoais_model, autoais_tokenizer 33 | autoais_model, autoais_tokenizer = None, None 34 | 35 | 36 | def compute_f1(a_gold, a_pred): 37 | """Compute F1 score between two strings.""" 38 | 39 | def _get_tokens(s): 40 | if not s: 41 | return [] 42 | return normalize_answer(s).split() 43 | 44 | gold_toks = _get_tokens(a_gold) 45 | pred_toks = _get_tokens(a_pred) 46 | 47 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 48 | num_same = sum(common.values()) 49 | 50 | if len(gold_toks) == 0 or len(pred_toks) == 0: 51 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 52 | return int(gold_toks == pred_toks) 53 | 54 | if num_same == 0: 55 | return 0 56 | 57 | precision = 1.0 * num_same / len(pred_toks) 58 | recall = 1.0 * num_same / len(gold_toks) 59 | f1 = (2 * precision * recall) / (precision + recall) 60 | 61 | return f1 62 | 63 | 64 | def compute_exact(a_gold, a_pred): 65 | """Check whether two strings are equal up to normalization.""" 66 | 67 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 68 | 69 | 70 | def exact_presence(short_answers, context): 71 | """Verify if any of the answers is present in the given context. 72 | Args: 73 | short_answers: list of short answers to look for in the context 74 | context: a paragraph to search for short answers 75 | Returns: 76 | true if any of the short answers is present in the context 77 | """ 78 | 79 | n_short_answers = [normalize_answer(sa) for sa in short_answers] 80 | n_context = normalize_answer(context) 81 | 82 | for ans in n_short_answers: 83 | if ans in n_context: 84 | return True 85 | 86 | return False 87 | 88 | 89 | def compute_rouge(data): 90 | """Main function for rouge scoring. 91 | If two references are provided, 92 | the best score is chosen for each instance. 
93 | Args: 94 | data: requires field `output` and `answer` (or `annotations` for ASQA) 95 | metrics: list of evaluation metrics 96 | Returns: 97 | dictionary representation of rouge scores 98 | """ 99 | def _rouge_calculation(hypotheses, 100 | references1, 101 | references2=[], 102 | metrics=['rougeLsum']): 103 | 104 | if references2 == []: 105 | references2 = references1 106 | 107 | scorer = rouge_scorer.RougeScorer(metrics, use_stemmer=True) 108 | aggregator = scoring.BootstrapAggregator() 109 | 110 | for i in range(len(hypotheses)): 111 | scores1 = scorer.score(references1[i], hypotheses[i]) 112 | scores2 = scorer.score(references2[i], hypotheses[i]) 113 | if scores1['rougeLsum'].fmeasure > scores2['rougeLsum'].fmeasure: 114 | aggregator.add_scores(scores1) 115 | else: 116 | aggregator.add_scores(scores2) 117 | 118 | scores = {m: [] for m in metrics} 119 | 120 | for m in metrics: 121 | fmeasure = aggregator.aggregate()[m].mid.fmeasure 122 | scores[m].append(fmeasure) 123 | 124 | for m in scores: 125 | scores[m] = 100 * sum(scores[m]) / len(scores[m]) 126 | 127 | return scores 128 | 129 | hypotheses = {} 130 | references1 = {} 131 | references2 = {} 132 | 133 | for idx, item in enumerate(data): 134 | hypotheses[idx] = item["output"] 135 | if "annotations" in item and item['annotations'] is not None: # For ASQA 136 | references1[idx] = item["annotations"][0]["long_answer"] 137 | references2[idx] = item["annotations"][1]["long_answer"] 138 | else: 139 | references1[idx] = item["answer"] 140 | references2[idx] = item["answer"] 141 | 142 | h, r1, r2 = [], [], [] 143 | 144 | for key in references1: 145 | h.append(hypotheses[key]) 146 | r1.append(references1[key]) 147 | 148 | if references2 is not None: 149 | r2.append(references2[key]) 150 | 151 | h = ['\n'.join(sent_tokenize(text.lower())) for text in h] 152 | r1 = ['\n'.join(sent_tokenize(text.lower())) for text in r1] 153 | r2 = ['\n'.join(sent_tokenize(text.lower())) for text in r2] 154 | scores = _rouge_calculation(h, r1, r2) 155 | 156 | return scores['rougeLsum'] 157 | 158 | 159 | def compute_str_em(data): 160 | """Compute STR-EM metric (only for ASQA) 161 | Args: 162 | data: requires field `qa_pairs/short_answers` and `output` 163 | Returns: 164 | STR-EM and STR-EM-HIT () 165 | """ 166 | 167 | if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None: 168 | return 0, 0 169 | 170 | acc = [] 171 | hit = [] 172 | 173 | for item in data: 174 | loc_acc = [] 175 | for qa_pair in item['qa_pairs']: 176 | loc_acc.append(exact_presence(qa_pair['short_answers'], item["output"])) 177 | acc.append(np.mean(loc_acc)) 178 | hit.append( int(np.mean(loc_acc) == 1) ) 179 | 180 | return 100 * np.mean(acc), 100 * np.mean(hit) 181 | 182 | 183 | def compute_len(data): 184 | """Compute average length of predictions.""" 185 | 186 | res, cntr = 0, 0 187 | for item in data: 188 | res += len(item["output"].split()) 189 | cntr += 1 190 | return res / cntr 191 | 192 | 193 | def compute_qa(data): 194 | """Compute QA-based accuracy. 
195 | Args: 196 | data: requires field `qa_pairs/short_answers` and `output` 197 | Returns: 198 | QA metrics (QA-EM, QA-F1, QA-Hit) 199 | """ 200 | 201 | if 'qa_pairs' not in data[0] or data[0]['qa_pairs'] is None: 202 | logger.warning("Warning: no QA pairs found in data") 203 | return { 204 | 'QA-EM': 0, 205 | 'QA-F1': 0, 206 | 'QA-Hit': 0, 207 | } 208 | 209 | # Load model 210 | logger.info("Loading the RoBERTa-large SQuAD model for QA-based accuracy...") 211 | qa_pipeline = pipeline("question-answering", model=QA_MODEL, device=0) 212 | logger.info("Done") 213 | 214 | # Get prediction 215 | logger.info("Computing the QA-based accuracy...") 216 | em, f1, bins = [], [], [] 217 | for item in tqdm(data): 218 | question = [qa_pair['question'] for qa_pair in item['qa_pairs']] 219 | context = item['output'] if len(item['output']) > 0 else " " 220 | results = qa_pipeline(question=question, context=context, handle_impossible_answer=True) 221 | loc_counter, loc_em, loc_f1 = 0, 0, 0 222 | 223 | for idx, res in enumerate(results): 224 | answers = item["qa_pairs"][idx]["short_answers"] 225 | prediction = res["answer"] 226 | 227 | loc_em += max([compute_exact(a, prediction) for a in answers]) 228 | loc_f1 += max([compute_f1(a, prediction) for a in answers]) 229 | loc_counter += 1 230 | 231 | em.append(loc_em / loc_counter) 232 | f1.append(loc_f1 / loc_counter) 233 | bins.append(loc_em == loc_counter) 234 | 235 | return { 236 | 'QA-EM': 100 * np.mean(em), 237 | 'QA-F1': 100 * np.mean(f1), 238 | 'QA-Hit': 100 * np.mean(bins) 239 | } 240 | 241 | 242 | def compute_mauve(data): 243 | """Compute Mauve score.""" 244 | 245 | logger.info("Computing MAUVE...") 246 | human_data = [] 247 | model_data = [] 248 | for item in data: 249 | # Remove ending punctuations 250 | # Remove any new lines 251 | # Truncate by 100 words 252 | human_data.append(' '.join((item['question'] + " " + item['answer'].strip()).split()[:100]).rstrip(string.punctuation)) 253 | model_data.append(' '.join((item['question'] + " " + item['output'].strip()).split()[:100]).rstrip(string.punctuation)) 254 | 255 | import mauve 256 | out = mauve.compute_mauve( 257 | p_text=human_data, 258 | q_text=model_data, 259 | device_id=0, 260 | max_text_length=512, 261 | verbose=True, 262 | batch_size=8, 263 | featurize_model_name="gpt2-large" 264 | ) 265 | return out.mauve * 100 266 | 267 | 268 | def _run_nli_autoais(passage, claim): 269 | """ 270 | Run inference for assessing AIS between a premise and hypothesis. 
271 | Adapted from https://github.com/google-research-datasets/Attributed-QA/blob/main/evaluation.py 272 | """ 273 | global autoais_model, autoais_tokenizer 274 | input_text = "premise: {} hypothesis: {}".format(passage, claim) 275 | input_ids = autoais_tokenizer(input_text, return_tensors="pt").input_ids.to(autoais_model.device) 276 | with torch.inference_mode(): 277 | outputs = autoais_model.generate(input_ids, max_new_tokens=10) 278 | result = autoais_tokenizer.decode(outputs[0], skip_special_tokens=True) 279 | inference = 1 if result == "1" else 0 280 | return inference 281 | 282 | 283 | def compute_claims(data): 284 | global autoais_model, autoais_tokenizer 285 | if autoais_model is None: 286 | logger.info("Loading AutoAIS model...") 287 | autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto") 288 | autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False) 289 | 290 | logger.info("Computing claims...") 291 | scores = [] 292 | for item in tqdm(data): 293 | normalized_output = remove_citations(item['output']) 294 | entail = 0 295 | claims = item["claims"] 296 | for claim in claims: 297 | entail += _run_nli_autoais(normalized_output, claim) 298 | scores.append(entail / len(claims)) 299 | return 100 * np.mean(scores) 300 | 301 | 302 | def compute_autoais(data, 303 | decontext=False, 304 | concat=False, 305 | qampari=False, 306 | at_most_citations=None,): 307 | """ 308 | Compute AutoAIS score. 309 | 310 | Args: 311 | data: requires field `output` and `docs` 312 | - docs should be a list of items with fields `title` and `text` (or `phrase` and `sent` for QA-extracted docs) 313 | citation: check citations and use the corresponding references. 314 | decontext: decontextualize the output 315 | """ 316 | 317 | global autoais_model, autoais_tokenizer 318 | if autoais_model is None: 319 | logger.info("Loading AutoAIS model...") 320 | autoais_model = AutoModelForSeq2SeqLM.from_pretrained(AUTOAIS_MODEL, torch_dtype=torch.bfloat16, max_memory=get_max_memory(), device_map="auto") 321 | autoais_tokenizer = AutoTokenizer.from_pretrained(AUTOAIS_MODEL, use_fast=False) 322 | 323 | logger.info(f"Running AutoAIS...") 324 | 325 | def _format_document(doc): 326 | """Format document for AutoAIS.""" 327 | 328 | if "sent" in doc: 329 | # QA-extracted docs 330 | return "Title: %s\n%s" % (doc['title'], doc['sent']) 331 | else: 332 | return "Title: %s\n%s" % (doc['title'], doc['text']) 333 | 334 | ais_scores = [] 335 | ais_scores_prec = [] 336 | 337 | sent_total = 0 338 | sent_mcite = 0 339 | sent_mcite_support = 0 340 | sent_mcite_overcite = 0 341 | autoais_log = [] 342 | citation_position_count = defaultdict(lambda: 0) 343 | for item in tqdm(data): 344 | # Get sentences by using NLTK 345 | if qampari: 346 | sents = [item['question'] + " " + x.strip() for x in item['output'].rstrip().rstrip(".").rstrip(",").split(",")] 347 | else: 348 | sents = sent_tokenize(item['output']) 349 | # we also ignore sentences that are < 5 characters long, they are unlikely to be meaningful 350 | # this resolves the case where the sentencizer takes "1." 
as a sentence 351 | sents = [x for x in sents if len(x.strip()) >= 5] 352 | if len(sents) == 0: 353 | continue 354 | 355 | target_sents = [remove_citations(sent).strip() for sent in sents] 356 | 357 | entail = 0 358 | entail_prec = 0 359 | total_citations = 0 360 | for sent_id, sent in enumerate(sents): 361 | target_sent = target_sents[sent_id] # Citation removed and (if opted for) decontextualized 362 | joint_entail = -1 # Undecided 363 | 364 | # Find references 365 | ref = [int(r[1:])-1 for r in re.findall(r"\[\d+", sent)] # In text citation id starts from 1 366 | for r in ref: 367 | citation_position_count[r] += 1 368 | logger.info(f"For `{sent}`, find citations {ref}") 369 | if len(ref) == 0: 370 | # No citations 371 | joint_entail = 0 372 | elif any([ref_id >= len(item['docs']) for ref_id in ref]): 373 | # Citations out of range 374 | joint_entail = 0 375 | else: 376 | if at_most_citations is not None: 377 | ref = ref[:at_most_citations] 378 | total_citations += len(ref) 379 | joint_passage = '\n'.join([_format_document(item['docs'][psgs_id]) for psgs_id in ref]) 380 | 381 | # If not directly rejected by citation format error, calculate the recall score 382 | if joint_entail == -1: 383 | joint_entail = _run_nli_autoais(joint_passage, target_sent) 384 | autoais_log.append({ 385 | "question": item['question'], 386 | "output": item['output'], 387 | "claim": sent, 388 | "passage": [joint_passage], 389 | "model_type": "NLI", 390 | "model_output": joint_entail, 391 | }) 392 | 393 | entail += joint_entail 394 | if len(ref) > 1: 395 | sent_mcite += 1 396 | 397 | # calculate the precision score if applicable 398 | if joint_entail and len(ref) > 1: 399 | sent_mcite_support += 1 400 | # Precision check: did the model cite any unnecessary documents? 401 | for psgs_id in ref: 402 | # condition A 403 | passage = _format_document(item['docs'][psgs_id]) 404 | nli_result = _run_nli_autoais(passage, target_sent) 405 | 406 | # condition B 407 | if not nli_result: 408 | subset_exclude = copy.deepcopy(ref) 409 | subset_exclude.remove(psgs_id) 410 | passage = '\n'.join([_format_document(item['docs'][pid]) for pid in subset_exclude]) 411 | nli_result = _run_nli_autoais(passage, target_sent) 412 | if nli_result: # psgs_id is not necessary 413 | flag = 0 414 | sent_mcite_overcite += 1 415 | else: 416 | entail_prec += 1 417 | else: 418 | entail_prec += 1 419 | else: 420 | entail_prec += joint_entail 421 | 422 | sent_total += len(sents) 423 | ais_scores.append(entail / len(sents)) 424 | ais_scores_prec.append(entail_prec / total_citations if total_citations > 0 else 0) # len(sents)) 425 | 426 | if sent_mcite > 0 and sent_mcite_support > 0: 427 | print("Among all sentences, %.2f%% have multiple citations, among which %.2f%% are supported by the joint set, among which %.2f%% overcite." 
% ( 428 | 100 * sent_mcite / sent_total, 429 | 100 * sent_mcite_support / sent_mcite, 430 | 100 * sent_mcite_overcite / sent_mcite_support 431 | )) 432 | 433 | return { 434 | "citation_rec": 100 * np.mean(ais_scores) if len(ais_scores) > 0 else 0, 435 | "citation_prec": 100 * np.mean(ais_scores_prec) if len(ais_scores_prec) > 0 else 0, 436 | "citation_positions": dict(citation_position_count), 437 | } 438 | 439 | 440 | def compute_qampari_f1(data, cot=False): 441 | prec = [] 442 | rec = [] 443 | rec_top5 = [] 444 | f1 = [] 445 | f1_top5 = [] 446 | 447 | num_preds = [] 448 | for item in data: 449 | if cot: 450 | if ":" in item['output']: 451 | o = ':'.join(item['output'].split(":")[1:]) # try to separate the COT part and the answer list part. 452 | else: 453 | o = "" 454 | else: 455 | o = item['output'] 456 | preds = [normalize_answer(x.strip()) for x in o.rstrip().rstrip(".").rstrip(",").split(",")] 457 | preds = [p for p in preds if len(p) > 0] # delete empty answers 458 | num_preds.append(len(preds)) 459 | answers = [[normalize_answer(x) for x in ans] for ans in item['answers']] 460 | flat_answers = [item for sublist in answers for item in sublist] 461 | 462 | prec.append(sum([p in flat_answers for p in preds]) / len(preds) if len(preds) > 0 else 0) 463 | rec.append(sum([any([x in preds for x in a]) for a in answers]) / len(answers)) 464 | rec_top5.append(min(5, sum([any([x in preds for x in a]) for a in answers])) / min(5, len(answers))) 465 | if (prec[-1] + rec[-1]) == 0: 466 | f1.append(0) 467 | else: 468 | f1.append(2 * prec[-1] * rec[-1] / (prec[-1] + rec[-1])) 469 | if (prec[-1] + rec_top5[-1]) == 0: 470 | f1_top5.append(0) 471 | else: 472 | f1_top5.append(2 * prec[-1] * rec_top5[-1] / (prec[-1] + rec_top5[-1])) 473 | 474 | return { 475 | "num_preds": np.mean(num_preds), 476 | "qampari_prec": 100 * np.mean(prec), 477 | "qampari_rec": 100 * np.mean(rec), 478 | "qampari_rec_top5": 100 * np.mean(rec_top5), 479 | "qampari_f1": 100 * np.mean(f1), 480 | "qampari_f1_top5": 100 * np.mean(f1_top5), 481 | } 482 | 483 | def main(args=None): 484 | parser = argparse.ArgumentParser() 485 | parser.add_argument("--f", type=str, required=True, help="Output file. 
Should have field `question`, `output`, (ROUGE) `answer`, \ 486 | (accuracy) `qa_pairs`, (AIS) `docs`") 487 | parser.add_argument("--no_rouge", action="store_true", help="Do not evaluate ROUGE score") 488 | parser.add_argument("--qa", action="store_true", help="Use the QA model") 489 | parser.add_argument("--mauve", action="store_true", help="Use the mauve score model") 490 | parser.add_argument("--citations", action="store_true", help="Evaluation with citation") 491 | parser.add_argument("--at_most_citations", type=int, default=3, help="At most take this many documents (mostly for precision)") 492 | parser.add_argument("--claims_nli", action="store_true", help="Use claims for ELI5") 493 | 494 | # QAMPARI 495 | parser.add_argument("--cot", action="store_true", help="For QAMPARI, try to find colon and separate the COT and answer listing") 496 | 497 | if args is None: 498 | args = parser.parse_args() 499 | else: 500 | args = parser.parse_args(args) 501 | 502 | with open(args.f) as f: 503 | data_with_config = json.load(f) 504 | data = data_with_config['data'] 505 | 506 | if "qampari" in args.f: 507 | args.no_rouge = True 508 | args.qa = False 509 | args.mauve = False 510 | args.decontext = False 511 | qampari = True 512 | else: 513 | qampari = False 514 | 515 | # Truncate by newline and remove on the fly search result 516 | # logger.warning("We remove all the pre/appended space/newlines and we truncate the answer by the first newline.") 517 | logger.warning("We remove all the pre/appended space/newlines and replace newlines with spaces.") 518 | logger.warning("We replace any on the fly search result to standard bracket citation format.") 519 | for i in range(len(data)): 520 | # data[i]['output'] = data[i]['output'].strip().split("\n")[0] 521 | data[i]['output'] = re.sub(r"\n+", " ", data[i]['output']) 522 | data[i]['output'] = data[i]['output'].replace("<|im_end|>", "") 523 | 524 | 525 | # Remove all citations for all non-AutoAIS evaluation 526 | normalized_data = copy.deepcopy(data) 527 | for i in range(len(normalized_data)): 528 | normalized_data[i]['output'] = remove_citations(normalized_data[i]['output']) 529 | 530 | result = {} 531 | result['length'] = compute_len(normalized_data) 532 | result['str_em'], result['str_hit'] = compute_str_em(normalized_data) 533 | if qampari: 534 | result.update(compute_qampari_f1(normalized_data, cot=args.cot)) 535 | if not args.no_rouge: 536 | result['rougeLsum'] = compute_rouge(normalized_data) 537 | if args.qa: 538 | result.update(compute_qa(normalized_data)) 539 | if args.mauve: 540 | result['mauve'] = compute_mauve(normalized_data) 541 | if args.citations: 542 | result.update(compute_autoais(data, qampari=qampari, at_most_citations=args.at_most_citations)) 543 | if args.claims_nli: 544 | result["claims_nli"] = compute_claims(normalized_data) 545 | 546 | print(result) 547 | with open(args.f + ".score", "w") as f: 548 | json.dump(result, f, indent=4) 549 | 550 | 551 | if __name__ == "__main__": 552 | main() 553 | -------------------------------------------------------------------------------- /longproc_addon/README.md: -------------------------------------------------------------------------------- 1 | ## LongProc Add-On on HELMET 2 | We integrated [LongProc](https://github.com/princeton-pli/LongProc) in HELMET to support convenient evaluation. 
3 | 4 | **Additional Setup** 5 | Pull the submodule from LongProc and add `__init__.py` files to make the import work: 6 | ```bash 7 | git submodule update --init --recursive 8 | touch longproc_addon/__init__.py 9 | touch longproc_addon/longproc/__init__.py 10 | ``` 11 | 12 | To quickly test if everything is working, you can try running the evaluations. 13 | 14 | **Running Evaluation** 15 | You can now run evaluation just as you would in HELMET. The config files are stored in `longproc_addon/configs`. 16 | 17 | For example: 18 | ```bash 19 | python eval.py --config longproc_addon/configs/html_to_tsv.yaml --model_name_or_path {local model path or huggingface model name} --output_dir {output directory, defaults to output/{model_name}} 20 | ``` 21 | -------------------------------------------------------------------------------- /longproc_addon/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/princeton-nlp/HELMET/87bbd6d8e316dd9ea9c5515b8f20e44bf2d768b1/longproc_addon/__init__.py -------------------------------------------------------------------------------- /longproc_addon/configs/countdown.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000,32000 2 | generation_max_length: 1024,3072,10240 3 | datasets: countdown_0.5k,countdown_2k,countdown_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/html_to_tsv.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 128000,128000,128000 2 | generation_max_length: 1024,3072,10240 3 | datasets: html_to_tsv_0.5k,html_to_tsv_2k,html_to_tsv_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/path_traversal.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000,32000 2 | generation_max_length: 1024,3072,10240 3 | datasets: path_traversal_0.5k,path_traversal_2k,path_traversal_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/pseudo_to_code.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 8000,8000 2 | generation_max_length: 1024,3072 3 | datasets: pseudo_to_code_0.5k,pseudo_to_code_2k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: "," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | 
model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/tom_tracking.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000,32000 2 | generation_max_length: 1024,3072,10240 3 | datasets: tom_tracking_0.5k,tom_tracking_2k,tom_tracking_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: ",," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/configs/travel_planning.yaml: -------------------------------------------------------------------------------- 1 | input_max_length: 32000,32000 2 | generation_max_length: 3072,10240 3 | datasets: travel_planning_2k,travel_planning_8k 4 | test_files: longproc_addon/longproc/data,longproc_addon/longproc/data 5 | demo_files: "," 6 | use_chat_template: true 7 | max_test_samples: 100 8 | temperature: 0.0 9 | shots: 0 10 | stop_new_line: false 11 | model_name_or_path: gpt-4o-mini-2024-07-18 12 | -------------------------------------------------------------------------------- /longproc_addon/longproc_helmet_loader.py: -------------------------------------------------------------------------------- 1 | from datasets import Dataset as HFDataset 2 | 3 | try: 4 | from .longproc.longproc.longproc_data import load_longproc_data 5 | except ImportError as e: 6 | raise ImportError("LongProc cannot be loaded.") 7 | 8 | 9 | def load_longproc_data_for_helmet(dataset: str, path="longproc_addon/longproc/data", max_test_samples=None, seed=42): 10 | # packed data: list of "input_prompt", "reference_output", "item" 11 | packed_data, eval_func = load_longproc_data(dataset, path) 12 | 13 | packed_data = HFDataset.from_list(packed_data) 14 | if max_test_samples is not None: 15 | packed_data = packed_data.shuffle(seed=seed).select(range(min(max_test_samples, len(packed_data)))) 16 | 17 | def helmet_eval_wrapper(output: dict, example: dict): 18 | predict = output["output"] 19 | return eval_func(predict, example) 20 | 21 | return { 22 | "data": packed_data, 23 | "prompt_template": "{input_prompt}", 24 | "user_template": "{input_prompt}", 25 | "post_process": helmet_eval_wrapper, 26 | } 27 | 28 | 29 | def _test_load_all(): 30 | def test_loading(dataset): 31 | data, eval_func = load_longproc_data(dataset, "longproc_addon/longproc/data") 32 | print(f"Dataset: {dataset}") 33 | print(f"N samples: {len(data)}") 34 | print(f"Eval func: {eval_func}") 35 | print(f"Max input chars: {max([len(d['input_prompt']) for d in data])}") 36 | print(f"Max output chars: {max([len(d['reference_output']) for d in data])}") 37 | 38 | [test_loading(d) for d in ["path_traversal_0.5k", "path_traversal_2k", "path_traversal_8k"]] 39 | 40 | [test_loading(d) for d in ["html_to_tsv_0.5k", "html_to_tsv_2k", "html_to_tsv_8k"]] 41 | 42 | [test_loading(d) for d in ["pseudo_to_code_0.5k", "pseudo_to_code_2k",]] 43 | 44 | [test_loading(d) for d in ["travel_planning_2k", "travel_planning_8k"]] 45 | 46 | [test_loading(d) for d in ["tom_tracking_0.5k", "tom_tracking_2k", "tom_tracking_8k"]] 47 | 48 | [test_loading(d) for d in ["countdown_0.5k", "countdown_2k", "countdown_8k"]] 49 | 50 | 51 | if __name__ == "__main__": 52 | _test_load_all() 53 | 54 | 
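For reference, here is a minimal sketch (not part of the repository) of how the dictionary returned by `load_longproc_data_for_helmet` above might be consumed directly. The dataset name, the tiny sample count, and the placeholder "model call" are illustrative assumptions; the field names (`input_prompt`, `reference_output`) and dictionary keys (`data`, `user_template`, `post_process`) come from the loader itself:

```python
# Illustrative sketch only -- the "model call" below is a placeholder, not HELMET's actual pipeline.
from longproc_addon.longproc_helmet_loader import load_longproc_data_for_helmet

# Dataset name taken from the configs above; max_test_samples kept tiny for a smoke test.
task = load_longproc_data_for_helmet("html_to_tsv_0.5k", max_test_samples=2)

for example in task["data"]:
    # Fill the prompt template with the packed input.
    prompt = task["user_template"].format(input_prompt=example["input_prompt"])
    # Placeholder "model": echo the reference output; replace with a real generation call.
    output = {"output": example["reference_output"]}
    # The wrapped LongProc scorer compares the prediction against the example.
    score = task["post_process"](output, example)
    print(score)
```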
-------------------------------------------------------------------------------- /prompts/asqa_nocite.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant). Use an unbiased and journalistic tone.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\n\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which is the most rainy place on earth?", 9 | "answer": "Several places on Earth claim to be the most rainy, such as Lloró, Colombia, which reported an average annual rainfall of 12,717 mm between 1952 and 1989, and López de Micay, Colombia, which reported an annual 12,892 mm between 1960 and 2012. However, the official record is held by Mawsynram, India with an average annual rainfall of 11,872 mm, although nearby town Sohra, India, also known as Cherrapunji, holds the record for most rain in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861.", 10 | "docs": [ 11 | { 12 | "title": "Cherrapunji", 13 | "text": "Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861, however: it received in" 14 | }, 15 | { 16 | "title": "Cherrapunji", 17 | "text": "Radio relay station known as Akashvani Cherrapunji. It broadcasts on FM frequencies. Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall" 18 | }, 19 | { 20 | "title": "Mawsynram", 21 | "text": "Mawsynram Mawsynram () is a village in the East Khasi Hills district of Meghalaya state in north-eastern India, 65 kilometres from Shillong. Mawsynram receives one of the highest rainfalls in India. It is reportedly the wettest place on Earth, with an average annual rainfall of 11,872 mm, but that claim is disputed by Lloró, Colombia, which reported an average yearly rainfall of 12,717 mm between 1952 and 1989 and López de Micay, also in Colombia, which reported an annual 12,892 mm per year between 1960 and 2012. According to the \"Guinness Book of World Records\", Mawsynram received of rainfall in 1985. 
Mawsynram is located at 25° 18′" 22 | }, 23 | { 24 | "title": "Earth rainfall climatology", 25 | "text": "Pacific Northwest, and the Sierra Nevada range are the wetter portions of the nation, with average rainfall exceeding per year. The drier areas are the Desert Southwest, Great Basin, valleys of northeast Arizona, eastern Utah, central Wyoming, eastern Oregon and Washington and the northeast of the Olympic Peninsula. The Big Bog on the island of Maui receives, on average, every year, making it the wettest location in the US, and all of Oceania. The annual average rainfall maxima across the continent lie across the northwest from northwest Brazil into northern Peru, Colombia, and Ecuador, then along the Atlantic coast of" 26 | }, 27 | { 28 | "title": "Going to Extremes", 29 | "text": "in the world. Oymyakon in Siberia, where the average winter temperature is −47 °F (− 44 °C). Arica in Chile, where there had been fourteen consecutive years without rain. Fog is the only local source of water. Mawsynram in India, where average annual rainfall is 14 meters, falling within a four-month period in the monsoon season. The rainfall is approximately equal to that of its neighbor Cherrapunji. Dallol in Ethiopia, known as the 'Hell-hole of creation' where the temperature averages 94 °F (34 °C) over the year. In his second series, Middleton visited places without permanent towns, locations where \"survival\"" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "When did the us break away from england?", 35 | "answer": "The United States took the first step towards gaining independence from Great Britain when it declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, 1776, the date when the Declaration of Independence was officially adopted by Congress). The Treaty of Paris was later signed on September 3, 1783, formally separating the United States from the British Empire.", 36 | "docs": [ 37 | { 38 | "title": "United States withdrawal from Saudi Arabia", 39 | "text": "United States withdrawal from Saudi Arabia Beginning during Operation Desert Shield in August 1990, while preparing for the Gulf War, the United States sent a large troop contingent to Saudi Arabia. After the war, remnant troops, primarily U.S. Air Force personnel, augmented by a smaller number of coordinating and training personnel from the U.S. Navy, U.S. Army and U.S. Marine Corps remained in Saudi Arabia under the aegis of Joint Task Force Southwest Asia (JTF-SWA), as part of Operation Southern Watch (OSW). The United Kingdom and France also maintained a small contingent of Royal Air Force and French Air Force" 40 | }, 41 | { 42 | "title": "Decolonization of the Americas", 43 | "text": "and France has fully \"integrated\" most of its former colonies as fully constituent \"departments\" of France. The United States of America declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, the date when the Declaration of Independence was officially adopted by Congress), in so doing becoming the first independent, foreign-recognized nation in the Americas and the first European colonial entity to break from its mother country. Britain formally acknowledged American independence in 1783 after its defeat in the American Revolutionary War. 
Although initially occupying only the land east of the Mississippi" 44 | }, 45 | { 46 | "title": "American Revolution", 47 | "text": "second British army at Yorktown in the fall of 1781, effectively ending the war. The Treaty of Paris was signed September 3, 1783, formally ending the conflict and confirming the new nation's complete separation from the British Empire. The United States took possession of nearly all the territory east of the Mississippi River and south of the Great Lakes, with the British retaining control of Canada and Spain taking Florida. Among the significant results of the revolution was the creation of the United States Constitution, establishing a relatively strong federal national government that included an executive, a national judiciary, and" 48 | }, 49 | { 50 | "title": "Decolonization", 51 | "text": "accelerate decolonialization and bring an end to the colonial empires of its Western allies, most importantly during the 1956 Suez Crisis, but American military bases were established around the world and direct and indirect interventions continued in Korea, Indochina, Latin America (\"inter alia\", the 1965 occupation of the Dominican Republic), Africa, and the Middle East to oppose Communist invasions and insurgencies. Since the dissolution of the Soviet Union, the United States has been far less active in the Americas, but invaded Afghanistan and Iraq following the September 11 attacks in 2001, establishing army and air bases in Central Asia. Before" 52 | }, 53 | { 54 | "title": "Decolonization", 55 | "text": "the responsibility of the United Kingdom (with a copy of the new constitution annexed), and finally, if approved, issuance of an Order of Council fixing the exact date of independence. After World War I, several former German and Ottoman territories in the Middle East, Africa, and the Pacific were governed by the UK as League of Nations mandates. Some were administered directly by the UK, and others by British dominions – Nauru and the Territory of New Guinea by Australia, South West Africa by the Union of South Africa, and Western Samoa by New Zealand. Egypt became independent in 1922," 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "Who set the record for longest field goal?", 61 | "answer": "The record for the longest field goal in an NFL game was set by Matt Prater at 64 yards, but the record for the longest field goal at any level was 69 yards, kicked by collegiate kicker Ove Johansson in a 1976 Abilene Christian University football game against East Texas State University.", 62 | "docs": [ 63 | { 64 | "title": "Field goal", 65 | "text": "toward its own end. The longest field goal kick in NFL history is 64 yards, a record set by Matt Prater on December 8, 2013. The previous record was 63, originally set by Tom Dempsey (1970) and then matched by Jason Elam (1998), Sebastian Janikowski (2011), David Akers (2012), and Graham Gano (2018). High school, college and most professional football leagues offer only a three-point field goal; however, some professional leagues have encouraged more rare kicks through \"four-point field goals\". NFL Europe encouraged long field goals of 50 yards or more by making those worth four points instead of three" 66 | }, 67 | { 68 | "title": "Field goal range", 69 | "text": "35 and 40 yard lines (closer in a crosswind) often will go for the more risky fourth down conversion rather than risk either the touchback or the missed field goal. 
The longest field goal in recorded football history was 69 yards, set by collegiate kicker Ove Johansson, who was born in Sweden, in a 1976 Abilene Christian University football game against East Texas State University (now Texas A&M Commerce) at Shotwell Stadium in Abilene. The longest successful field goal in the NFL was 64 yards and was completed by Matt Prater in 2013. The NCAA record is 67 yards held" 70 | }, 71 | { 72 | "title": "Field goal", 73 | "text": "both end zones) is only 66 yards. Scaccia, while playing indoor football, attempted a 64-yard kick that was inches short of success, hitting the crossbar. Longer field goals have been attempted at times; the longest attempt in the NFL, which was well short and was kicked into the wind, was 76 yards, attempted by Sebastian Janikowski of the Oakland Raiders, in a September 28, 2008 game against the San Diego Chargers. NFL Europe rewarded kickers that successfully kicked a field goal of longer than 50 yards with a bonus point, making such field goals worth 4 points instead of 3;" 74 | }, 75 | { 76 | "title": "Field goal", 77 | "text": "this accomplishment is not the official record. All of the above kicks were successful with the use of a kicking tee, which was banned by the NCAA after the 1988 season. The longest known drop-kicked field goal in college football was a 62-yard kick from Pat O'Dea, an Australian kicker who played on the Wisconsin Badgers football team. O'Dea's kick took place in a blizzard against Northwestern on November 15, 1898. The longest field goal in U Sports football history is 59 yards, by Niko Difonte of Calgary Dinos, playing against the UBC Thunderbirds on November 11, 2017. The field" 78 | }, 79 | { 80 | "title": "Field goal range", 81 | "text": "NFL and have been banned from NCAA since 1989) is 68 yards held by Fabrizio Scaccia, and the high school record 68 yards held by Dirk Borgognone; high school has wider goal posts and treats a field goal attempt that lands short in the field of play the same as a punt, making longer attempts much less risky. The indoor football record, with narrower and higher goal posts, is 63 yards (set by Aaron Mills), which is practically as long of a field goal as is possible in that variant of the sport, since the field in indoor football (including" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Who played galen in planet of the apes?", 87 | "answer": "In the 1968 film Planet of the Apes, Galen was played by Wright King. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall.", 88 | "docs": [ 89 | { 90 | "title": "Planet of the Apes", 91 | "text": "installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts." 92 | }, 93 | { 94 | "title": "Planet of the Apes (1968 film)", 95 | "text": "chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). 
While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as" 96 | }, 97 | { 98 | "title": "Planet of the Apes (1968 film)", 99 | "text": "Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the" 100 | }, 101 | { 102 | "title": "Planet of the Apes", 103 | "text": "Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination" 104 | }, 105 | { 106 | "title": "Planet of the Apes", 107 | "text": "film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the" 108 | } 109 | ] 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /prompts/asqa_revised.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Write an accurate, engaging, and concise answer for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Use an unbiased and journalistic tone. Always cite for any factual claim. When citing a document, surround its ID with square brackets, such as [x] to cite document x. To cite multiple documents, simply concatenate the citation markers; for example, use [x][y][z] to cite the documents with ID x, y, and z. Cite at least one document and at most three documents in each sentence. 
If multiple documents support the sentence, only cite a minimum sufficient subset of the documents.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\n\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which is the most rainy place on earth?", 9 | "answer": "Several places on Earth claim to be the most rainy, such as Lloró, Colombia, which reported an average annual rainfall of 12,717 mm between 1952 and 1989, and López de Micay, Colombia, which reported an annual 12,892 mm between 1960 and 2012 [3]. However, the official record is held by Mawsynram, India with an average annual rainfall of 11,872 mm [3], although nearby town Sohra, India, also known as Cherrapunji, holds the record for most rain in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861 [1].", 10 | "docs": [ 11 | { 12 | "title": "Cherrapunji", 13 | "text": "Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall in a calendar month for July 1861 and most rain in a year from August 1860 to July 1861, however: it received in" 14 | }, 15 | { 16 | "title": "Cherrapunji", 17 | "text": "Radio relay station known as Akashvani Cherrapunji. It broadcasts on FM frequencies. Cherrapunji Cherrapunji (; with the native name Sohra being more commonly used, and can also be spelled Cherrapunjee or Cherrapunji) is a subdivisional town in the East Khasi Hills district in the Indian state of Meghalaya. It is the traditional capital of aNongkhlaw \"hima\" (Khasi tribal chieftainship constituting a petty state), both known as Sohra or Churra. Cherrapunji has often been credited as being the wettest place on Earth, but for now nearby Mawsynram currently holds that distinction. Cherrapunji still holds the all-time record for the most rainfall" 18 | }, 19 | { 20 | "title": "Mawsynram", 21 | "text": "Mawsynram Mawsynram () is a village in the East Khasi Hills district of Meghalaya state in north-eastern India, 65 kilometres from Shillong. Mawsynram receives one of the highest rainfalls in India. It is reportedly the wettest place on Earth, with an average annual rainfall of 11,872 mm, but that claim is disputed by Lloró, Colombia, which reported an average yearly rainfall of 12,717 mm between 1952 and 1989 and López de Micay, also in Colombia, which reported an annual 12,892 mm per year between 1960 and 2012. According to the \"Guinness Book of World Records\", Mawsynram received of rainfall in 1985. Mawsynram is located at 25° 18′" 22 | }, 23 | { 24 | "title": "Earth rainfall climatology", 25 | "text": "Pacific Northwest, and the Sierra Nevada range are the wetter portions of the nation, with average rainfall exceeding per year. The drier areas are the Desert Southwest, Great Basin, valleys of northeast Arizona, eastern Utah, central Wyoming, eastern Oregon and Washington and the northeast of the Olympic Peninsula. 
The Big Bog on the island of Maui receives, on average, every year, making it the wettest location in the US, and all of Oceania. The annual average rainfall maxima across the continent lie across the northwest from northwest Brazil into northern Peru, Colombia, and Ecuador, then along the Atlantic coast of" 26 | }, 27 | { 28 | "title": "Going to Extremes", 29 | "text": "in the world. Oymyakon in Siberia, where the average winter temperature is −47 °F (− 44 °C). Arica in Chile, where there had been fourteen consecutive years without rain. Fog is the only local source of water. Mawsynram in India, where average annual rainfall is 14 meters, falling within a four-month period in the monsoon season. The rainfall is approximately equal to that of its neighbor Cherrapunji. Dallol in Ethiopia, known as the 'Hell-hole of creation' where the temperature averages 94 °F (34 °C) over the year. In his second series, Middleton visited places without permanent towns, locations where \"survival\"" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "When did the us break away from england?", 35 | "answer": "The United States took the first step towards gaining independence from Great Britain when it declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, 1776, the date when the Declaration of Independence was officially adopted by Congress) [2]. The Treaty of Paris was later signed on September 3, 1783, formally separating the United States from the British Empire [3].", 36 | "docs": [ 37 | { 38 | "title": "United States withdrawal from Saudi Arabia", 39 | "text": "United States withdrawal from Saudi Arabia Beginning during Operation Desert Shield in August 1990, while preparing for the Gulf War, the United States sent a large troop contingent to Saudi Arabia. After the war, remnant troops, primarily U.S. Air Force personnel, augmented by a smaller number of coordinating and training personnel from the U.S. Navy, U.S. Army and U.S. Marine Corps remained in Saudi Arabia under the aegis of Joint Task Force Southwest Asia (JTF-SWA), as part of Operation Southern Watch (OSW). The United Kingdom and France also maintained a small contingent of Royal Air Force and French Air Force" 40 | }, 41 | { 42 | "title": "Decolonization of the Americas", 43 | "text": "and France has fully \"integrated\" most of its former colonies as fully constituent \"departments\" of France. The United States of America declared independence from Great Britain on July 2, 1776 (although the event is now commemorated on July 4, the date when the Declaration of Independence was officially adopted by Congress), in so doing becoming the first independent, foreign-recognized nation in the Americas and the first European colonial entity to break from its mother country. Britain formally acknowledged American independence in 1783 after its defeat in the American Revolutionary War. Although initially occupying only the land east of the Mississippi" 44 | }, 45 | { 46 | "title": "American Revolution", 47 | "text": "second British army at Yorktown in the fall of 1781, effectively ending the war. The Treaty of Paris was signed September 3, 1783, formally ending the conflict and confirming the new nation's complete separation from the British Empire. The United States took possession of nearly all the territory east of the Mississippi River and south of the Great Lakes, with the British retaining control of Canada and Spain taking Florida. 
Among the significant results of the revolution was the creation of the United States Constitution, establishing a relatively strong federal national government that included an executive, a national judiciary, and" 48 | }, 49 | { 50 | "title": "Decolonization", 51 | "text": "accelerate decolonialization and bring an end to the colonial empires of its Western allies, most importantly during the 1956 Suez Crisis, but American military bases were established around the world and direct and indirect interventions continued in Korea, Indochina, Latin America (\"inter alia\", the 1965 occupation of the Dominican Republic), Africa, and the Middle East to oppose Communist invasions and insurgencies. Since the dissolution of the Soviet Union, the United States has been far less active in the Americas, but invaded Afghanistan and Iraq following the September 11 attacks in 2001, establishing army and air bases in Central Asia. Before" 52 | }, 53 | { 54 | "title": "Decolonization", 55 | "text": "the responsibility of the United Kingdom (with a copy of the new constitution annexed), and finally, if approved, issuance of an Order of Council fixing the exact date of independence. After World War I, several former German and Ottoman territories in the Middle East, Africa, and the Pacific were governed by the UK as League of Nations mandates. Some were administered directly by the UK, and others by British dominions – Nauru and the Territory of New Guinea by Australia, South West Africa by the Union of South Africa, and Western Samoa by New Zealand. Egypt became independent in 1922," 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "Who set the record for longest field goal?", 61 | "answer": "The record for the longest field goal in an NFL game was set by Matt Prater at 64 yards [1], but the record for the longest field goal at any level was 69 yards, kicked by collegiate kicker Ove Johansson in a 1976 Abilene Christian University football game against East Texas State University [2].", 62 | "docs": [ 63 | { 64 | "title": "Field goal", 65 | "text": "toward its own end. The longest field goal kick in NFL history is 64 yards, a record set by Matt Prater on December 8, 2013. The previous record was 63, originally set by Tom Dempsey (1970) and then matched by Jason Elam (1998), Sebastian Janikowski (2011), David Akers (2012), and Graham Gano (2018). High school, college and most professional football leagues offer only a three-point field goal; however, some professional leagues have encouraged more rare kicks through \"four-point field goals\". NFL Europe encouraged long field goals of 50 yards or more by making those worth four points instead of three" 66 | }, 67 | { 68 | "title": "Field goal range", 69 | "text": "35 and 40 yard lines (closer in a crosswind) often will go for the more risky fourth down conversion rather than risk either the touchback or the missed field goal. The longest field goal in recorded football history was 69 yards, set by collegiate kicker Ove Johansson, who was born in Sweden, in a 1976 Abilene Christian University football game against East Texas State University (now Texas A&M Commerce) at Shotwell Stadium in Abilene. The longest successful field goal in the NFL was 64 yards and was completed by Matt Prater in 2013. The NCAA record is 67 yards held" 70 | }, 71 | { 72 | "title": "Field goal", 73 | "text": "both end zones) is only 66 yards. Scaccia, while playing indoor football, attempted a 64-yard kick that was inches short of success, hitting the crossbar. 
Longer field goals have been attempted at times; the longest attempt in the NFL, which was well short and was kicked into the wind, was 76 yards, attempted by Sebastian Janikowski of the Oakland Raiders, in a September 28, 2008 game against the San Diego Chargers. NFL Europe rewarded kickers that successfully kicked a field goal of longer than 50 yards with a bonus point, making such field goals worth 4 points instead of 3;" 74 | }, 75 | { 76 | "title": "Field goal", 77 | "text": "this accomplishment is not the official record. All of the above kicks were successful with the use of a kicking tee, which was banned by the NCAA after the 1988 season. The longest known drop-kicked field goal in college football was a 62-yard kick from Pat O'Dea, an Australian kicker who played on the Wisconsin Badgers football team. O'Dea's kick took place in a blizzard against Northwestern on November 15, 1898. The longest field goal in U Sports football history is 59 yards, by Niko Difonte of Calgary Dinos, playing against the UBC Thunderbirds on November 11, 2017. The field" 78 | }, 79 | { 80 | "title": "Field goal range", 81 | "text": "NFL and have been banned from NCAA since 1989) is 68 yards held by Fabrizio Scaccia, and the high school record 68 yards held by Dirk Borgognone; high school has wider goal posts and treats a field goal attempt that lands short in the field of play the same as a punt, making longer attempts much less risky. The indoor football record, with narrower and higher goal posts, is 63 yards (set by Aaron Mills), which is practically as long of a field goal as is possible in that variant of the sport, since the field in indoor football (including" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Who played galen in planet of the apes?", 87 | "answer": "In the 1968 film Planet of the Apes, Galen was played by Wright King [2]. And in the tv series Planet of the Apes, Galen was played by Roddy McDowall [1].", 88 | "docs": [ 89 | { 90 | "title": "Planet of the Apes", 91 | "text": "installment. Jacobs died on June 27, 1973, bringing an end to the APJAC Productions era of the \"Planet of the Apes\" franchise. Former Fox executive Stan Hough took over as producer for the television project, titled \"Planet of the Apes\". CBS picked up the series for its 1974 autumn lineup. Ron Harper and James Naughton played Alan Virdon and Peter Burke, two 20th-century American astronauts who pass through a time warp to a future where apes subjugate humans (unlike the original film, the humans can speak). Roddy McDowall returned to the franchise as Galen, a chimpanzee who joins the astronauts." 92 | }, 93 | { 94 | "title": "Planet of the Apes (1968 film)", 95 | "text": "chimpanzees: animal psychologist Zira (Kim Hunter) and surgeon Galen (Wright King). While unable to speak as his throat wound is healing, called \"Bright Eyes\" by Zira and placed with one of the captive primitive humans he later names \"Nova\", Taylor observes the enhanced society of talking apes and in a strict caste system: the gorillas being the military police, hunters and workers; the orangutans overseeing the affairs of government, science, and religion; and intellectual chimpanzees being mostly scientists. While their society is a theocracy similar to the beginnings of the human Industrial Era, the apes consider the primitive humans as" 96 | }, 97 | { 98 | "title": "Planet of the Apes (1968 film)", 99 | "text": "Planet of the Apes (1968 film) Planet of the Apes is a 1968 American science fiction film directed by Franklin J. 
Schaffner. It stars Charlton Heston, Roddy McDowall, Kim Hunter, Maurice Evans, James Whitmore, James Daly and Linda Harrison. The screenplay by Michael Wilson and Rod Serling was loosely based on the 1963 French novel \"La Plan\u00e8te des Singes\" by Pierre Boulle. Jerry Goldsmith composed the groundbreaking avant-garde score. It was the first in a series of five films made between 1968 and 1973, all produced by Arthur P. Jacobs and released by 20th Century Fox. The film tells the" 100 | }, 101 | { 102 | "title": "Planet of the Apes", 103 | "text": "Rupert Wyatt. To portray ape characters realistically, the production avoided practical effects in favor of performance capture acting, partnering with New Zealand visual effects company Weta Digital. Wyatt cast James Franco as Will Rodman, while veteran performance capture actor Andy Serkis signed on to star as Caesar. \"Rise\" debuted on August 5, 2011. Critics reviewed it positively, especially praising the visual effects and Serkis's performance. It was a major box office hit, taking in $482 million globally, more than five times its $93 million budget. Weta's special effects earned the film two Visual Effects Society Awards and an Oscar nomination" 104 | }, 105 | { 106 | "title": "Planet of the Apes", 107 | "text": "film stars Mark Wahlberg as astronaut Leo Davidson, who accidentally travels through a wormhole to a distant planet where talking apes enslave humans. He leads a human revolt and upends ape civilization by discovering that the apes evolved from the normal earth primates who had accompanied his mission, and arrived years before. Helena Bonham Carter played chimpanzee Ari, while Tim Roth played the human-hating chimpanzee General Thade. The film received mixed reviews; most critics believed it failed to compare to the original. Much of the negative commentary focused on the confusing plot and twist ending, though many reviewers praised the" 108 | } 109 | ] 110 | } 111 | ] 112 | } -------------------------------------------------------------------------------- /prompts/qampari_nocite.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Provide a list of accurate answers for the given question using only the provided search results (some of which might be irrelevant). Separate answers by commas. For questions that have more than 5 answers, write at least 5 answers.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which books were written by Nevil Shute?", 9 | "answer": "Marazan, Stephen Morris, Beyond the Black Stump, Lonely Road, The Chequer Board, In the Wet, Trustee from the Toolroom, Round the Bend, No Highway, Ruined City, On the Beach.", 10 | "docs": [ 11 | { 12 | "title": "Nevil Shute", 13 | "text": "early stages. My congratulations.\" His celebrity as a writer caused the Ministry of Information to send him to the Normandy Landings on 6 June 1944 and later to Burma as a correspondent. He finished the war with the rank of lieutenant commander in the Royal Navy Volunteer Reserves (RNVR). Shute's first novel, \"Stephen Morris\", was written in 1923, but not published until 1961. His first published novel was \"Marazan\", which came out in 1926. 
After that he averaged one novel every two years through the 1950s, with the exception of a six-year hiatus while he was establishing his own aircraft" 14 | }, 15 | { 16 | "title": "Nevil Shute", 17 | "text": "theme is the bridging of social barriers such as class (\"Lonely Road\" and \"Landfall\"), race (\"The Chequer Board\"), or religion (\"Round the Bend\"). The Australian novels are individual hymns to that country, with subtle disparagement of the mores of the United States (\"Beyond the Black Stump\") and overt antipathy towards the post-World War II socialist government of Shute's native Britain (\"The Far Country\" and \"In the Wet\"). Shute's heroes tended to be like himself: middle class solicitors, doctors, accountants, bank managers, engineers, generally university graduates. However (as in \"Trustee from the Toolroom\"), Shute valued the honest artisans and their social" 18 | }, 19 | { 20 | "title": "Nevil Shute", 21 | "text": "construction company, Airspeed Ltd. His popularity grew slowly with each novel, but he became much more famous after the publication of \"On the Beach\" in 1957. Shute's novels are written in a simple, highly readable style, with clearly delineated plot lines. Where there is a romantic element, sex is referred to only obliquely. Many of the stories are introduced by a narrator who is not a character in the story. The most common theme in Shute's novels is the dignity of work, spanning all classes, whether an Eastern European bar \"hostess\" (\"Ruined City\") or brilliant boffin (\"No Highway\"). Another recurrent" 22 | }, 23 | { 24 | "title": "The Chequer Board", 25 | "text": "the Burmese people\", both of which are central to the book's story. Shute was concerned that sales of the book in the United States would be negatively impacted by the book's open-minded handling of racial issues; as it turned out, sales soared. Shute and his wife traveled the U.S. on Greyhound buses to \"\"get in touch with the man on the street,\"\" finding the experience refreshing. Afterwards he wrote \"\"Sincerity is the first attribute for making money in the business of writing novels.\"\" The Chequer Board The Chequer Board is a novel by Nevil Shute, first published in the United" 26 | }, 27 | { 28 | "title": "In the Wet", 29 | "text": "had used the idea of multiple votes for merit in his short story \"The Curious Republic of Gondour\". In the Wet In The Wet is a novel by Nevil Shute that was first published in the United Kingdom in 1953. It contains many of the typical elements of a hearty and adventurous Shute yarn such as flying, the future, mystic states, and ordinary people doing extraordinary things. The story is opened by its initial narrator \u2013 an Anglican priest in the Bush Brotherhood named Roger Hargreaves \u2013 who describes his ordinary circumstances in a large parish of the Australian outback" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "Which film has Gong Li as a member of its cast?", 35 | "answer": "The Story of Qiu Ju, Farewell My Concubine, Flirting Scholar, The Monkey King 2, Mulan, Saturday Fiction, Coming Home.", 36 | "docs": [ 37 | { 38 | "title": "Gong Li", 39 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". 
She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 40 | }, 41 | { 42 | "title": "Gong Li", 43 | "text": "making her realize that she has assisted the dark cynical system. In 1993, she received a New York Film Critics Circle award for her role in \"Farewell My Concubine\" (1993). Directed by Chen Kaige, the film was her first major role with a director other than Zhang Yimou. In the same year, she was awarded with the Berlinale Camera at the 43rd Berlin International Film Festival. \"Premiere\" magazine ranked her performance in \"Farewell My Concubine\" as the 89th greatest performance of all time. She also worked with renowned director Stephen Chow in comedy films \"\" (1991) and \"Flirting Scholar\" (1993)." 44 | }, 45 | { 46 | "title": "Gong Li", 47 | "text": "International Film Festival. Later that same year, she reunited with Zhang Yimou for the film \"Coming Home\", which is set during the throes of the Cultural Revolution; this film was their first collaboration since 2006. In 2016, Gong took on her first action role in \"The Monkey King 2\", playing the White Bone Demon. In 2018, Gong was cast in Lou Ye's period drama \"Saturday Fiction\", where she plays an actress who is working undercover gathering intelligence for the Allies. That year, she was also cast in the live-action adaptation of the 1998 Disney animated film \"Mulan\", as an unspecified" 48 | }, 49 | { 50 | "title": "Zhang Yimou", 51 | "text": "in Zhang's earlier films. \"Raise the Red Lantern\" was nominated in the Best Foreign Language Film category at the 1992 Academy Awards, becoming the second Chinese film to earn this distinction (after Zhang's \"Ju Dou\"). It eventually lost out to Gabriele Salvatores's \"Mediterraneo\". Zhang's next directorial work, \"The Story of Qiu Ju\", in 1992, once again starring Gong Li in the lead role. The film, which tells the tale of a peasant woman seeking justice for her husband after he was beaten by a village official, was a hit at film festivals and won the Golden Lion award at the" 52 | }, 53 | { 54 | "title": "Gong Li", 55 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "In which years did Patti LaBelle publish music?", 61 | "answer": "2006, 1977, 2004, 2005, 2000, 2006.", 62 | "docs": [ 63 | { 64 | "title": "The Gospel According to Patti LaBelle", 65 | "text": "The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November 2006. 
This project began three years ago when Patti's late musical director and close friend Budd Ellison told a skeptical LaBelle that \"it's now or never, Patti.\" The album is dedicated to his memory as he succumbed to prostate cancer before the album saw a release. The album was released on November 21, 2006 through indie label Umbrella/Bungalow Records, also home to Carl Thomas, Rodney Jerkins, Dean \"DC\" Charles, and other artists. \"The Gospel According" 66 | }, 67 | { 68 | "title": "Patti LaBelle (album)", 69 | "text": "scaled the high sixties on the \"Billboard\" R&B chart, it soon became one of her famous show-stoppers while performing the song. LaBelle performed the song at her first solo concert in London, getting a standing ovation, which helped to give LaBelle motivation to continue her career. The album, when released, performed successfully, reaching number 62 on the \"Billboard\" 200 and number 31 on the R&B albums chart, while critics hailed the album. Patti LaBelle (album) Patti LaBelle is the debut solo album by singer Patti LaBelle, released in 1977. The first album LaBelle recorded after sixteen years fronting the band" 70 | }, 71 | { 72 | "title": "Patti LaBelle", 73 | "text": "win. In 2000, LaBelle released her final MCA album, \"When a Woman Loves\", before signing with Def Soul Classics to release the 2004 album, \"Timeless Journey\". Following the release of her 2005 covers album, \"Classic Moments\", LaBelle engaged in a rivalry with Antonio \"L.A.\" Reid over the direction of her career, leading to her leaving the label.In the same year, the World Music Awards recognized her years in the music business by awarding her the Legend Award. In 2006, she released her first gospel album, \"The Gospel According to Patti LaBelle\" on the Bungalo label, the album later peaking at" 74 | }, 75 | { 76 | "title": "Patti LaBelle", 77 | "text": "Patti LaBelle Patti LaBelle (born Patricia Louise Holt; May 24, 1944) is an American singer, actress, and entrepreneur. LaBelle began her career in the early 1960s as lead singer and front woman of the vocal group, Patti LaBelle and the Bluebelles. Following the group's name change to Labelle in the early 1970s, they released the iconic disco song \"Lady Marmalade\" and the group later became the first African-American vocal group to land the cover of \"Rolling Stone\" magazine. After the group split in 1976, LaBelle began a successful solo career, starting with her critically acclaimed debut album, which included the" 78 | }, 79 | { 80 | "title": "The Gospel According to Patti LaBelle", 81 | "text": "Billboard's Top Gospel Albums chart for 17 weeks. \"Where Love Begins,\" a duet with Yolanda Adams was played frequently on R&B and gospel radio stations and debuted at #68 on Billboard's Hot R&B/Hip-Hop tracks. The second single \"Anything\" featuring Kanye West, Mary Mary and Consequence hit #64 on Billboards Hot R&B/Hip-Hop tracks. In 2008, the album was nominated for a Dove Award for Contemporary Gospel Album of the Year at the 39th GMA Dove Awards. 
The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Glenn Ford was a member of cast in which film?", 87 | "answer": "So Ends Our Night, Heaven with a Barbed Wire Fence, Happy Birthday to Me, The Greatest Gift, The Gift, The Brotherhood of the Bell.", 88 | "docs": [ 89 | { 90 | "title": "Glenn Ford", 91 | "text": "name came from his father's hometown of Glenford, Alberta. His first major movie part was in the 1939 film, \"Heaven with a Barbed Wire Fence\". Top Hollywood director John Cromwell was impressed enough with his work to borrow him from Columbia for the independently produced drama, \"So Ends Our Night\" (1941), where Ford delivered a poignant portrayal of a 19-year-old German exile on the run in Nazi-occupied Europe. Working with Academy Award-winning Fredric March and wooing (onscreen) 30-year-old Margaret Sullavan, recently nominated for an Oscar, Ford's shy, ardent young refugee riveted attention even in such stellar company. \"Glenn Ford, a" 92 | }, 93 | { 94 | "title": "Glenn Ford", 95 | "text": "were Westerns. He suggested doing a Western series, instead, which resulted in the \"modern-day Western\" series, \"Cade's County\". Ford played southwestern Sheriff Cade for one season (1971\u20131972) in a mix of police mystery and western drama. In \"The Family Holvak\" (1975\u20131976), Ford portrayed a Depression-era preacher in a family drama, reprising the same character he had played in the TV film, \"The Greatest Gift\". In 1978 Ford was host, presenter and narrator of the disaster documentary series 'When Havoc Struck'. In 1981, Ford co-starred with Melissa Sue Anderson in the slasher film \"Happy Birthday to Me\". In 1991, Ford agreed" 96 | }, 97 | { 98 | "title": "CBS Thursday Night Movie", 99 | "text": "Night Movie\" opened its fall schedule with the premiere of a low-budget, made-for-TV movie, rather than a proven Hollywood blockbuster guaranteed to lure mass viewership, it became CBS's way of declaring its commitment to product that, although cheaply manufactured, was nevertheless new and topical. In this case, the movie was \"The Brotherhood of the Bell\", and the film's star was Glenn Ford, a movie actor who had never appeared in a television-film. In fact, before shooting on the project even began, Ford had been warned by friends in the industry that he would hate the experience. Instead, the actor reported" 100 | }, 101 | { 102 | "title": "The Trouble with Girls (film) ", 103 | "text": "with Charlene, but when she refuses to give in, he deceives her and uses the local police force to be sure that she must leave on the train with the rest of the troupe. Cast notes In June 1959 it was announced that Don Mankiewicz would write a screenplay of an unpublished story by Mauri Grashin, Day Keene, and Dwight Babcock. By December 1960, with the project titled \"Chautauqua\", MGM was ready to make the film with Glenn Ford. Rumours circulating in Hollywood at the time stated that Presley would co-star with Ford, Hope Lange, and Arthur O'Connell, but nothing" 104 | }, 105 | { 106 | "title": "Trouble in the Glen", 107 | "text": "Mel Ferrer. It was Orson Welles' fifth British movie in six months. Filming started 15 December 1953. The film received very poor reviews. 
Trouble in the Glen Trouble in the Glen is a 1954 British comedy film directed by Herbert Wilcox and starring Margaret Lockwood, Orson Welles, Forrest Tucker and Victor McLaglen. It is loosely based on Maurice Walsh's 1950 novel of the same name. It was filmed in Trucolor for Republic Pictures. After moving from South America to the Scottish Highlands, millionaire Sanin Cejador y Mengues (Welles) reassumes the title of laird of Glen Easan, which he inherited from" 108 | } 109 | ] 110 | } 111 | ] 112 | } 113 | -------------------------------------------------------------------------------- /prompts/qampari_revised.json: -------------------------------------------------------------------------------- 1 | { 2 | "instruction": "Instruction: Provide a list of accurate answers for the given question using only the provided search results (some of which might be irrelevant) and cite them properly. Always cite one and only one document for each answer. When citing a document, surround its ID with square brackets, such as [x] to cite document x. Separate answers by commas. For questions that have more than 5 answers, write at least 5 answers.", 3 | "demo_sep": "\n\n\n", 4 | "demo_prompt": "{instruction}\n\nQuestion: {question}\n\n{context}\nAnswer: {answer}", 5 | "doc_prompt": "Document [{ID}](Title: {title}): {text}", 6 | "demos": [ 7 | { 8 | "question": "Which books were written by Nevil Shute?", 9 | "answer": "Marazan [1], Stephen Morris [1], Beyond the Black Stump [2], Lonely Road [2], The Chequer Board [2], In the Wet [2], Trustee from the Toolroom [2], Round the Bend [2], No Highway [3], Ruined City [3], On the Beach [3].", 10 | "docs": [ 11 | { 12 | "title": "Nevil Shute", 13 | "text": "early stages. My congratulations.\" His celebrity as a writer caused the Ministry of Information to send him to the Normandy Landings on 6 June 1944 and later to Burma as a correspondent. He finished the war with the rank of lieutenant commander in the Royal Navy Volunteer Reserves (RNVR). Shute's first novel, \"Stephen Morris\", was written in 1923, but not published until 1961. His first published novel was \"Marazan\", which came out in 1926. After that he averaged one novel every two years through the 1950s, with the exception of a six-year hiatus while he was establishing his own aircraft" 14 | }, 15 | { 16 | "title": "Nevil Shute", 17 | "text": "theme is the bridging of social barriers such as class (\"Lonely Road\" and \"Landfall\"), race (\"The Chequer Board\"), or religion (\"Round the Bend\"). The Australian novels are individual hymns to that country, with subtle disparagement of the mores of the United States (\"Beyond the Black Stump\") and overt antipathy towards the post-World War II socialist government of Shute's native Britain (\"The Far Country\" and \"In the Wet\"). Shute's heroes tended to be like himself: middle class solicitors, doctors, accountants, bank managers, engineers, generally university graduates. However (as in \"Trustee from the Toolroom\"), Shute valued the honest artisans and their social" 18 | }, 19 | { 20 | "title": "Nevil Shute", 21 | "text": "construction company, Airspeed Ltd. His popularity grew slowly with each novel, but he became much more famous after the publication of \"On the Beach\" in 1957. Shute's novels are written in a simple, highly readable style, with clearly delineated plot lines. Where there is a romantic element, sex is referred to only obliquely. Many of the stories are introduced by a narrator who is not a character in the story. 
The most common theme in Shute's novels is the dignity of work, spanning all classes, whether an Eastern European bar \"hostess\" (\"Ruined City\") or brilliant boffin (\"No Highway\"). Another recurrent" 22 | }, 23 | { 24 | "title": "The Chequer Board", 25 | "text": "the Burmese people\", both of which are central to the book's story. Shute was concerned that sales of the book in the United States would be negatively impacted by the book's open-minded handling of racial issues; as it turned out, sales soared. Shute and his wife traveled the U.S. on Greyhound buses to \"\"get in touch with the man on the street,\"\" finding the experience refreshing. Afterwards he wrote \"\"Sincerity is the first attribute for making money in the business of writing novels.\"\" The Chequer Board The Chequer Board is a novel by Nevil Shute, first published in the United" 26 | }, 27 | { 28 | "title": "In the Wet", 29 | "text": "had used the idea of multiple votes for merit in his short story \"The Curious Republic of Gondour\". In the Wet In The Wet is a novel by Nevil Shute that was first published in the United Kingdom in 1953. It contains many of the typical elements of a hearty and adventurous Shute yarn such as flying, the future, mystic states, and ordinary people doing extraordinary things. The story is opened by its initial narrator \u2013 an Anglican priest in the Bush Brotherhood named Roger Hargreaves \u2013 who describes his ordinary circumstances in a large parish of the Australian outback" 30 | } 31 | ] 32 | }, 33 | { 34 | "question": "Which film has Gong Li as a member of its cast?", 35 | "answer": "The Story of Qiu Ju [1], Farewell My Concubine [2], Flirting Scholar [2], The Monkey King 2 [3], Mulan [3], Saturday Fiction [3], Coming Home [3].", 36 | "docs": [ 37 | { 38 | "title": "Gong Li", 39 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 40 | }, 41 | { 42 | "title": "Gong Li", 43 | "text": "making her realize that she has assisted the dark cynical system. In 1993, she received a New York Film Critics Circle award for her role in \"Farewell My Concubine\" (1993). Directed by Chen Kaige, the film was her first major role with a director other than Zhang Yimou. In the same year, she was awarded with the Berlinale Camera at the 43rd Berlin International Film Festival. \"Premiere\" magazine ranked her performance in \"Farewell My Concubine\" as the 89th greatest performance of all time. She also worked with renowned director Stephen Chow in comedy films \"\" (1991) and \"Flirting Scholar\" (1993)." 44 | }, 45 | { 46 | "title": "Gong Li", 47 | "text": "International Film Festival. Later that same year, she reunited with Zhang Yimou for the film \"Coming Home\", which is set during the throes of the Cultural Revolution; this film was their first collaboration since 2006. In 2016, Gong took on her first action role in \"The Monkey King 2\", playing the White Bone Demon. 
In 2018, Gong was cast in Lou Ye's period drama \"Saturday Fiction\", where she plays an actress who is working undercover gathering intelligence for the Allies. That year, she was also cast in the live-action adaptation of the 1998 Disney animated film \"Mulan\", as an unspecified" 48 | }, 49 | { 50 | "title": "Zhang Yimou", 51 | "text": "in Zhang's earlier films. \"Raise the Red Lantern\" was nominated in the Best Foreign Language Film category at the 1992 Academy Awards, becoming the second Chinese film to earn this distinction (after Zhang's \"Ju Dou\"). It eventually lost out to Gabriele Salvatores's \"Mediterraneo\". Zhang's next directorial work, \"The Story of Qiu Ju\", in 1992, once again starring Gong Li in the lead role. The film, which tells the tale of a peasant woman seeking justice for her husband after he was beaten by a village official, was a hit at film festivals and won the Golden Lion award at the" 52 | }, 53 | { 54 | "title": "Gong Li", 55 | "text": "Gong Li Gong Li (born 31 December 1965) is a Chinese-born Singaporean film actress. She achieved international prominence through her close collaborations with Chinese director Zhang Yimou and won the Volpi Cup for Best Actress at Venice for her performance in his 1992 film \"The Story of Qiu Ju\". She has been credited with helping to bring Chinese cinema to prominence in Europe and the United States. In 2006, she was voted the most beautiful woman in China. Gong has won numerous accolades for her work as an actress; she won the New York Film Critics Circle Award for Best" 56 | } 57 | ] 58 | }, 59 | { 60 | "question": "In which years did Patti LaBelle publish music?", 61 | "answer": "2006 [1], 1977 [2], 2004 [3], 2005 [3], 2000 [3], 2006 [3].", 62 | "docs": [ 63 | { 64 | "title": "The Gospel According to Patti LaBelle", 65 | "text": "The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November 2006. This project began three years ago when Patti's late musical director and close friend Budd Ellison told a skeptical LaBelle that \"it's now or never, Patti.\" The album is dedicated to his memory as he succumbed to prostate cancer before the album saw a release. The album was released on November 21, 2006 through indie label Umbrella/Bungalow Records, also home to Carl Thomas, Rodney Jerkins, Dean \"DC\" Charles, and other artists. \"The Gospel According" 66 | }, 67 | { 68 | "title": "Patti LaBelle (album)", 69 | "text": "scaled the high sixties on the \"Billboard\" R&B chart, it soon became one of her famous show-stoppers while performing the song. LaBelle performed the song at her first solo concert in London, getting a standing ovation, which helped to give LaBelle motivation to continue her career. The album, when released, performed successfully, reaching number 62 on the \"Billboard\" 200 and number 31 on the R&B albums chart, while critics hailed the album. Patti LaBelle (album) Patti LaBelle is the debut solo album by singer Patti LaBelle, released in 1977. The first album LaBelle recorded after sixteen years fronting the band" 70 | }, 71 | { 72 | "title": "Patti LaBelle", 73 | "text": "win. In 2000, LaBelle released her final MCA album, \"When a Woman Loves\", before signing with Def Soul Classics to release the 2004 album, \"Timeless Journey\". 
Following the release of her 2005 covers album, \"Classic Moments\", LaBelle engaged in a rivalry with Antonio \"L.A.\" Reid over the direction of her career, leading to her leaving the label.In the same year, the World Music Awards recognized her years in the music business by awarding her the Legend Award. In 2006, she released her first gospel album, \"The Gospel According to Patti LaBelle\" on the Bungalo label, the album later peaking at" 74 | }, 75 | { 76 | "title": "Patti LaBelle", 77 | "text": "Patti LaBelle Patti LaBelle (born Patricia Louise Holt; May 24, 1944) is an American singer, actress, and entrepreneur. LaBelle began her career in the early 1960s as lead singer and front woman of the vocal group, Patti LaBelle and the Bluebelles. Following the group's name change to Labelle in the early 1970s, they released the iconic disco song \"Lady Marmalade\" and the group later became the first African-American vocal group to land the cover of \"Rolling Stone\" magazine. After the group split in 1976, LaBelle began a successful solo career, starting with her critically acclaimed debut album, which included the" 78 | }, 79 | { 80 | "title": "The Gospel According to Patti LaBelle", 81 | "text": "Billboard's Top Gospel Albums chart for 17 weeks. \"Where Love Begins,\" a duet with Yolanda Adams was played frequently on R&B and gospel radio stations and debuted at #68 on Billboard's Hot R&B/Hip-Hop tracks. The second single \"Anything\" featuring Kanye West, Mary Mary and Consequence hit #64 on Billboards Hot R&B/Hip-Hop tracks. In 2008, the album was nominated for a Dove Award for Contemporary Gospel Album of the Year at the 39th GMA Dove Awards. The Gospel According to Patti LaBelle The Gospel According to Patti LaBelle is the first gospel album released by singer Patti LaBelle, released in November" 82 | } 83 | ] 84 | }, 85 | { 86 | "question": "Glenn Ford was a member of cast in which film?", 87 | "answer": "So Ends Our Night [1], Heaven with a Barbed Wire Fence [1], Happy Birthday to Me [2], The Greatest Gift [2], The Gift [2], The Brotherhood of the Bell [3].", 88 | "docs": [ 89 | { 90 | "title": "Glenn Ford", 91 | "text": "name came from his father's hometown of Glenford, Alberta. His first major movie part was in the 1939 film, \"Heaven with a Barbed Wire Fence\". Top Hollywood director John Cromwell was impressed enough with his work to borrow him from Columbia for the independently produced drama, \"So Ends Our Night\" (1941), where Ford delivered a poignant portrayal of a 19-year-old German exile on the run in Nazi-occupied Europe. Working with Academy Award-winning Fredric March and wooing (onscreen) 30-year-old Margaret Sullavan, recently nominated for an Oscar, Ford's shy, ardent young refugee riveted attention even in such stellar company. \"Glenn Ford, a" 92 | }, 93 | { 94 | "title": "Glenn Ford", 95 | "text": "were Westerns. He suggested doing a Western series, instead, which resulted in the \"modern-day Western\" series, \"Cade's County\". Ford played southwestern Sheriff Cade for one season (1971\u20131972) in a mix of police mystery and western drama. In \"The Family Holvak\" (1975\u20131976), Ford portrayed a Depression-era preacher in a family drama, reprising the same character he had played in the TV film, \"The Greatest Gift\". In 1978 Ford was host, presenter and narrator of the disaster documentary series 'When Havoc Struck'. In 1981, Ford co-starred with Melissa Sue Anderson in the slasher film \"Happy Birthday to Me\". 
In 1991, Ford agreed" 96 | }, 97 | { 98 | "title": "CBS Thursday Night Movie", 99 | "text": "Night Movie\" opened its fall schedule with the premiere of a low-budget, made-for-TV movie, rather than a proven Hollywood blockbuster guaranteed to lure mass viewership, it became CBS's way of declaring its commitment to product that, although cheaply manufactured, was nevertheless new and topical. In this case, the movie was \"The Brotherhood of the Bell\", and the film's star was Glenn Ford, a movie actor who had never appeared in a television-film. In fact, before shooting on the project even began, Ford had been warned by friends in the industry that he would hate the experience. Instead, the actor reported" 100 | }, 101 | { 102 | "title": "The Trouble with Girls (film) ", 103 | "text": "with Charlene, but when she refuses to give in, he deceives her and uses the local police force to be sure that she must leave on the train with the rest of the troupe. Cast notes In June 1959 it was announced that Don Mankiewicz would write a screenplay of an unpublished story by Mauri Grashin, Day Keene, and Dwight Babcock. By December 1960, with the project titled \"Chautauqua\", MGM was ready to make the film with Glenn Ford. Rumours circulating in Hollywood at the time stated that Presley would co-star with Ford, Hope Lange, and Arthur O'Connell, but nothing" 104 | }, 105 | { 106 | "title": "Trouble in the Glen", 107 | "text": "Mel Ferrer. It was Orson Welles' fifth British movie in six months. Filming started 15 December 1953. The film received very poor reviews. Trouble in the Glen Trouble in the Glen is a 1954 British comedy film directed by Herbert Wilcox and starring Margaret Lockwood, Orson Welles, Forrest Tucker and Victor McLaglen. It is loosely based on Maurice Walsh's 1950 novel of the same name. It was filmed in Trucolor for Republic Pictures. 
After moving from South America to the Scottish Highlands, millionaire Sanin Cejador y Mengues (Welles) reassumes the title of laird of Glen Easan, which he inherited from" 108 | } 109 | ] 110 | } 111 | ] 112 | } 113 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | wheel 2 | ninja 3 | packaging 4 | torch 5 | datasets 6 | transformers 7 | accelerate 8 | sentencepiece 9 | pytrec_eval 10 | rouge_score 11 | openai 12 | -------------------------------------------------------------------------------- /scripts/collect_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import pandas as pd 5 | import yaml 6 | from dataclasses import dataclass, asdict 7 | from tqdm import tqdm 8 | 9 | dataset_to_metrics = { 10 | "json_kv": "substring_exact_match", 11 | "nq": "substring_exact_match", 12 | "popqa": "substring_exact_match", 13 | "triviaqa": "substring_exact_match", 14 | "hotpotqa": "substring_exact_match", 15 | 16 | "narrativeqa": ["gpt-4-score"], 17 | "msmarco_rerank_psg": "NDCG@10", 18 | 19 | "trec_coarse": "exact_match", 20 | "trec_fine": "exact_match", 21 | "banking77": "exact_match", 22 | "clinic150": "exact_match", 23 | "nlu": "exact_match", 24 | 25 | "qmsum": "rougeL_recall", 26 | "multi_lexsum": ["gpt-4-f1"], 27 | 28 | "ruler_niah_s_1": "ruler_recall", 29 | "ruler_niah_s_2": "ruler_recall", 30 | "ruler_niah_s_3": "ruler_recall", 31 | "ruler_niah_mk_1": "ruler_recall", 32 | "ruler_niah_mk_2": "ruler_recall", 33 | "ruler_niah_mk_3": "ruler_recall", 34 | "ruler_niah_mq": "ruler_recall", 35 | "ruler_niah_mv": "ruler_recall", 36 | "ruler_fwe": "ruler_recall", 37 | "ruler_cwe": "ruler_recall", 38 | "ruler_vt": "ruler_recall", 39 | "ruler_qa_1": "substring_exact_match", 40 | "ruler_qa_2": "substring_exact_match", 41 | 42 | "infbench_qa": ["rougeL_f1"], 43 | "infbench_choice": ["exact_match"], 44 | "infbench_sum": ["gpt-4-f1"], 45 | 46 | "alce_asqa": ["str_em", "citation_rec", "citation_prec"], 47 | "alce_qampari": ["qampari_rec_top5", "citation_rec", "citation_prec"], 48 | } 49 | 50 | dataset_to_metrics = {k: [v] if isinstance(v, str) else v for k, v in dataset_to_metrics.items()} 51 | custom_avgs = { 52 | "Recall": ["json_kv substring_exact_match", "ruler_niah_mk_2 ruler_recall", "ruler_niah_mk_3 ruler_recall", "ruler_niah_mv ruler_recall"], 53 | "RAG": ['nq substring_exact_match', 'hotpotqa substring_exact_match', 'popqa substring_exact_match', 'triviaqa substring_exact_match',], 54 | "ICL": ['trec_coarse exact_match', 'trec_fine exact_match', 'banking77 exact_match', 'clinic150 exact_match', 'nlu exact_match'], 55 | "Cite": ['alce_asqa str_em', 'alce_asqa citation_rec', 'alce_asqa citation_prec', 'alce_qampari qampari_rec_top5', 'alce_qampari citation_rec', 'alce_qampari citation_prec', ], 56 | "Re-rank": ['msmarco_rerank_psg NDCG@10', ], 57 | "LongQA": ['narrativeqa gpt-4-score', 'infbench_qa rougeL_f1', 'infbench_choice exact_match', ], 58 | "Summ": ['infbench_sum gpt-4-f1', 'multi_lexsum gpt-4-f1', ], 59 | # "RULER": ['ruler_niah_s_1 ruler_recall', 'ruler_niah_s_2 ruler_recall', 'ruler_niah_s_3 ruler_recall', 'ruler_niah_mk_1 ruler_recall', 'ruler_niah_mk_2 ruler_recall', 'ruler_niah_mk_3 ruler_recall', 'ruler_niah_mq ruler_recall', 'ruler_niah_mv ruler_recall', 'ruler_cwe ruler_recall', 'ruler_fwe ruler_recall', 'ruler_vt ruler_recall', 'ruler_qa_1 
substring_exact_match', 'ruler_qa_2 substring_exact_match'], 60 | "Ours": ['Recall', 'RAG', 'ICL', 'Cite', 'Re-rank', 'LongQA', 'Summ'], 61 | } 62 | 63 | @dataclass 64 | class arguments: 65 | tag: str = "v1" 66 | input_max_length: int = 131072 67 | generation_max_length: int = 100 68 | generation_min_length: int = 0 69 | max_test_samples: int = 100 70 | shots: int = 2 71 | do_sample: bool = False 72 | temperature: float = 0.0 73 | top_p: float = 1.0 74 | use_chat_template: bool = False 75 | seed: int = 42 76 | test_name: str = "" 77 | dataset: str = "nq" 78 | output_dir: str = "output" 79 | popularity_threshold: float = 3 80 | 81 | category: str = "synthetic" 82 | 83 | def update(self, new): 84 | for key, value in new.items(): 85 | if hasattr(self, key): 86 | setattr(self, key, value) 87 | 88 | def get_path(self): 89 | tag = self.tag 90 | path = os.path.join(self.output_dir, "{args.dataset}_{tag}_{args.test_name}_in{args.input_max_length}_size{args.max_test_samples}_shots{args.shots}_samp{args.do_sample}max{args.generation_max_length}min{args.generation_min_length}t{args.temperature}p{args.top_p}_chat{args.use_chat_template}_{args.seed}.json".format(args=self, tag=tag)) 91 | 92 | if os.path.exists(path.replace(".json", "-gpt4eval_o.json")): 93 | return path.replace(".json", "-gpt4eval_o.json") 94 | if "alce" in self.dataset: 95 | return path.replace(".json", ".json.score") 96 | 97 | if os.path.exists(path + ".score"): 98 | return path + ".score" 99 | return path 100 | 101 | def get_metric_name(self): 102 | for d, m in dataset_to_metrics.items(): 103 | if d in self.dataset: 104 | return d, m 105 | return None 106 | 107 | def get_averaged_metric(self): 108 | path = self.get_path() 109 | print(path) 110 | if not os.path.exists(path): 111 | print("path doesn't exist") 112 | return None 113 | with open(path) as f: 114 | results = json.load(f) 115 | 116 | _, metric = self.get_metric_name() 117 | if path.endswith(".score"): 118 | if any([m not in results for m in metric]): 119 | print("metric doesn't exist") 120 | return None 121 | s = {m: results[m] for m in metric} 122 | else: 123 | if any([m not in results["averaged_metrics"] for m in metric]): 124 | print("metric doesn't exist") 125 | return None 126 | s = {m: results['averaged_metrics'][m] for m in metric} 127 | 128 | s = {m : v * (100 if m == "gpt-4-f1" else 1) * (100/3 if m == "gpt-4-score" else 1) for m, v in s.items()} 129 | print("found scores:", s) 130 | return s 131 | 132 | def get_metric_by_depth(self): 133 | path = self.get_path() 134 | path = path.replace(".score", '') 135 | print(path) 136 | if not os.path.exists(path): 137 | return None 138 | with open(path) as f: 139 | results = json.load(f) 140 | 141 | output = [] 142 | _, metric = self.get_metric_name() 143 | metric = metric[0] 144 | keys = ["depth", "k", metric] 145 | for d in results["data"]: 146 | o = {} 147 | for key in keys: 148 | if key == "k" and "ctxs" in d: 149 | d["k"] = len(d['ctxs']) 150 | if key not in d: 151 | print("no", key) 152 | return None 153 | o[key] = d[key] 154 | o["metric"] = o.pop(metric) 155 | output.append(o) 156 | 157 | df = pd.DataFrame(output) 158 | dfs = df.groupby(list(output[0].keys())[:-1]).mean().reset_index() 159 | 160 | return dfs.to_dict("records") 161 | 162 | if __name__ == "__main__": 163 | # comment out the models you don't want to include, or add the new ones 164 | models_configs = [ 165 | {"model": "gpt-4-0125-preview", "use_chat_template": True, "training_length": 128000}, 166 | {"model": "gpt-4o-mini-2024-07-18", 
"use_chat_template": True, "training_length": 128000}, 167 | {"model": "gpt-4o-2024-05-13", "use_chat_template": True, "training_length": 128000}, 168 | {"model": "gpt-4o-2024-08-06", "use_chat_template": True, "training_length": 128000}, 169 | {"model": "claude-3-5-sonnet-20240620", "use_chat_template": True, "training_length": 200000}, 170 | {"model": "gemini-1.5-flash-001", "use_chat_template": True, "training_length": 1048576}, 171 | {"model": "gemini-1.5-pro-001", "use_chat_template": True, "training_length": 2097152}, 172 | 173 | # llama 2 based models 174 | {"model": "Llama-2-7B-32K", "use_chat_template": False, "training_length": 32768}, 175 | {"model": "Llama-2-7B-32K-Instruct", "training_length": 32768}, 176 | {"model": "llama-2-7b-80k", "use_chat_template": False, "training_length": 80000}, 177 | {"model": "Yarn-Llama-2-7b-64k", "use_chat_template": False, "training_length": 65536}, 178 | {"model": "Yarn-Llama-2-7b-128k", "use_chat_template": False, "training_length": 131072}, 179 | 180 | # llama 3 models 181 | {"model": "Meta-Llama-3-8B", "use_chat_template": False, "training_length": 8192}, 182 | {"model": "Meta-Llama-3-8B-Instruct", "training_length": 8192}, 183 | {"model": "Meta-Llama-3-8B-Theta16M", "use_chat_template": False, "training_length": 8192}, 184 | {"model": "Meta-Llama-3-8B-Instruct-Theta16M", "training_length": 8192}, 185 | {"model": "Meta-Llama-3-70B-Theta16M", "use_chat_template": False, "training_length": 8192}, 186 | {"model": "Meta-Llama-3-70B-Instruct-Theta16M", "training_length": 8192}, 187 | 188 | {"model": "Llama-3.1-8B", "use_chat_template": False, "training_length": 131072}, 189 | {"model": "Llama-3.1-8B-Instruct", "training_length": 131072}, 190 | {"model": "Llama-3.1-70B", "use_chat_template": False, "training_length": 131072}, 191 | {"model": "Llama-3.1-70B-Instruct", "training_length": 131072}, 192 | {"model": "Llama-3.3-70B-Instruct", "training_length": 131072}, 193 | 194 | {"model": "Llama-3.2-1B", "use_chat_template": False, "training_length": 131072}, 195 | {"model": "Llama-3.2-1B-Instruct", "training_length": 131072}, 196 | {"model": "Llama-3.2-3B", "use_chat_template": False, "training_length": 131072}, 197 | {"model": "Llama-3.2-3B-Instruct", "training_length": 131072}, 198 | 199 | # mistral models 200 | {"model": "Mistral-7B-v0.1", "use_chat_template": False, "training_length": 8192}, 201 | {"model": "Mistral-7B-Instruct-v0.1", "training_length": 8192}, 202 | {"model": "Mistral-7B-Instruct-v0.2", "training_length": 32768}, 203 | {"model": "Mistral-7B-v0.3", "use_chat_template": False, "training_length": 32768}, 204 | {"model": "Mistral-7B-Instruct-v0.3", "training_length": 32768}, 205 | {"model": "Ministral-8B-Instruct-2410", "training_length": 131072}, 206 | 207 | {"model": "Mistral-Nemo-Base-2407", "use_chat_template": False, "training_length": 128000}, 208 | {"model": "Mistral-Nemo-Instruct-2407", "training_length": 128000}, 209 | {"model": "MegaBeam-Mistral-7B-512k", "training_length": 524288}, 210 | 211 | # yi models 212 | {"model": "Yi-6B-200K", "use_chat_template": False, "training_length": 200000}, 213 | {"model": "Yi-9B-200K", "use_chat_template": False, "training_length": 200000}, 214 | {"model": "Yi-34B-200K", "use_chat_template": False, "training_length": 200000}, 215 | {"model": "Yi-1.5-9B-32K", "use_chat_template": False, "training_length": 32768}, 216 | 217 | # phi models 218 | {"model": "Phi-3-mini-128k-instruct", "training_length": 131072}, 219 | {"model": "Phi-3-small-128k-instruct", "training_length": 131072}, 220 | 
{"model": "Phi-3-medium-128k-instruct", "training_length": 131072}, 221 | {"model": "Phi-3.5-mini-instruct", "training_length": 131072}, 222 | 223 | # qwen models 224 | {"model": "Qwen2-7B", "use_chat_template": False, "training_length": 32768}, 225 | {"model": "Qwen2-7B-Instruct", "training_length": 32768}, 226 | {"model": "Qwen2-57B-A14B", "use_chat_template": False, "training_length": 32768}, 227 | {"model": "Qwen2-57B-A14B-Instruct", "training_length": 32768}, 228 | {"model": "Qwen2.5-1.5B", "use_chat_template": False, "training_length": 32768}, 229 | {"model": "Qwen2.5-1.5B-Instruct", "training_length": 32768}, 230 | {"model": "Qwen2.5-3B", "use_chat_template": False, "training_length": 32768}, 231 | {"model": "Qwen2.5-3B-Instruct", "training_length": 32768}, 232 | {"model": "Qwen2.5-7B", "use_chat_template": False, "training_length": 131072}, 233 | {"model": "Qwen2.5-7B-Instruct", "training_length": 131072}, 234 | {"model": "Qwen2.5-72B-Instruct", "training_length": 131072}, 235 | 236 | # prolong 237 | {"model": "Llama-3-8B-ProLong-512k-Instruct", "training_length": 524288}, 238 | 239 | # gemma 2 models 240 | {"model": "gemma-2-9b", "use_chat_template": False, "training_length": 8192}, 241 | {"model": "gemma-2-9b-it", "training_length": 8192}, 242 | {"model": "gemma-2-9b-it-Theta320K", "training_length": 8192}, 243 | 244 | {"model": "gemma-2-27b", "use_chat_template": False, "training_length": 8192}, 245 | {"model": "gemma-2-27b-it", "training_length": 8192}, 246 | {"model": "gemma-2-27b-it-Theta320K", "training_length": 8192}, 247 | 248 | # others 249 | {"model": "c4ai-command-r-v01", "training_length": 131072}, 250 | {"model": "Jamba-v0.1", "use_chat_template": False, "training_length": 262144}, 251 | {"model": "AI21-Jamba-1.5-Mini", "training_length": 262144}, 252 | ] 253 | 254 | 255 | models_configs = [ 256 | {"model": "Llama-3.1-8B", "use_chat_template": False, "training_length": 131072}, 257 | {"model": "Llama-3.1-8B-Instruct", "training_length": 131072}, 258 | {"model": "DeepSeek-R1-Distill-Llama-8B", "training_length": 131072, "do_sample": True, "temperature": 0.6}, 259 | {"model": "Qwen2-7B", "use_chat_template": False, "training_length": 32768}, 260 | {"model": "Qwen2-7B-Instruct", "training_length": 32768}, 261 | {"model": "DeepSeek-R1-Distill-Qwen-7B", "training_length": 131072, "do_sample": True, "temperature": 0.6}, 262 | ] 263 | 264 | # set your configs here, only include the ones that you ran 265 | config_files = [ 266 | "configs/recall.yaml", "configs/recall_short.yaml", 267 | "configs/rag.yaml", "configs/rag_short.yaml", 268 | "configs/longqa.yaml", "configs/longqa_short.yaml", 269 | "configs/summ.yaml", "configs/summ_short.yaml", 270 | "configs/rerank.yaml", "configs/rerank_short.yaml", 271 | "configs/icl.yaml", "configs/icl_short.yaml", 272 | "configs/cite.yaml", "configs/cite_short.yaml", 273 | "configs/ruler.yaml", "configs/ruler_short.yaml", 274 | ] 275 | 276 | dataset_configs = [] 277 | for file in config_files: 278 | c = yaml.safe_load(open(file)) 279 | 280 | if isinstance(c["generation_max_length"], int): 281 | c["generation_max_length"] = ",".join([str(c["generation_max_length"])] * len(c["datasets"].split(","))) 282 | for d, t, l, g in zip(c['datasets'].split(','), c['test_files'].split(','), c['input_max_length'].split(','), c['generation_max_length'].split(',')): 283 | dataset_configs.append({"dataset": d, "test_name": os.path.basename(os.path.splitext(t)[0]), "input_max_length": int(l), "generation_max_length": int(g), "max_test_samples": 
c['max_test_samples'], 'use_chat_template': c['use_chat_template'], 'shots': c['shots']}) 284 | print(dataset_configs) 285 | 286 | failed_paths = [] 287 | df = [] 288 | for model in tqdm(models_configs): 289 | args = arguments() 290 | args.tag = "v1" # SET YOUR TAG HERE 291 | args.output_dir = f"output/{model['model']}" 292 | 293 | for dataset in dataset_configs: 294 | args.update(dataset) 295 | args.update(model) 296 | 297 | metric = args.get_averaged_metric() 298 | dsimple, mnames = args.get_metric_name() 299 | 300 | if metric is None: 301 | failed_paths.append(args.get_path()) 302 | continue 303 | 304 | for k, m in metric.items(): 305 | df.append({**asdict(args), **model, 306 | "metric name": k, "metric": m, 307 | "dataset_simple": dsimple + " " + k, "test_data": f"{args.dataset}-{args.test_name}-{args.input_max_length}" 308 | }) 309 | 310 | all_df = pd.DataFrame(df) 311 | lf_df = all_df.pivot_table(index=["input_max_length", "model", ], columns="dataset_simple", values="metric", sort=False) 312 | lf_df = lf_df.reset_index() 313 | 314 | for k, v in custom_avgs.items(): 315 | lf_df[k] = lf_df[v].mean(axis=1) 316 | 317 | print(lf_df.to_csv(index=False)) 318 | 319 | print("Warning, failed to get the following paths, make sure that these are correct or the printed results will not be accurate:", failed_paths) 320 | # import pdb; pdb.set_trace() -------------------------------------------------------------------------------- /scripts/download_data.sh: -------------------------------------------------------------------------------- 1 | wget -c https://huggingface.co/datasets/princeton-nlp/HELMET/resolve/main/data.tar.gz 2 | tar -xvzf data.tar.gz 3 | -------------------------------------------------------------------------------- /scripts/eval_gpt4_longqa.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | import sys 5 | import re 6 | from tqdm import tqdm 7 | import glob 8 | 9 | # Get the parent directory path 10 | parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..')) 11 | # Add the parent directory to the Python path 12 | sys.path.append(parent_dir) 13 | 14 | from model_utils import OpenAIModel 15 | 16 | def parse_output(output, prefix="Answer:"): 17 | output = output.replace("\n", " ") 18 | 19 | def lstrip_string(s, sub): 20 | return re.sub(f'^{re.escape(sub)}', '', s, flags=re.IGNORECASE) 21 | patterns = [re.compile(f"(?:{prefix})(.*)(?:\n|$)", flags=re.IGNORECASE), re.compile(r"(?:^)(.*)(?:\n|$)")] 22 | for pat in patterns: 23 | matches = pat.search(output) 24 | if matches is not None: 25 | return lstrip_string(matches[1].strip(), prefix).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix 26 | # if still not found, return None, but should actually never get this case... 27 | return None 28 | 29 | 30 | # prompts inspired by https://www.databricks.com/blog/LLM-auto-eval-best-practices-RAG 31 | judge_prompt = """Please act as an impartial judge and evaluate the quality of the provided answer which attempts to answer the provided question based on a provided context. 32 | Although you are not given the context, you will be given a set of correct answers that achieves full scores on all metrics, and you need to assess the provided answers using the correct answers. 
33 | 34 | Below is your grading rubric: 35 | 36 | Fluency: 37 | - Score 0 (incoherent, repetitive, or incomplete): Incoherent sentences, repetitive sentences (even if not by exact words), incomplete answers, or gibberish. Note that even if the answer is coherent, if it is repetitive or incomplete, it should be given a score of 0. 38 | - Score 1 (coherent, non-repetitive answer): Coherent, non-repetitive, fluent, grammatically correct answers. 39 | 40 | Correctness: 41 | - Score 0 (Incorrect): The answer does not agree with the provided correct answers at all. 42 | - Score 1 (partly correct): Partly agrees with one of the provided correct answers (for example, the question asks for a date and a person; the answer gets the date right but the person wrong). 43 | - Score 2 (correct but not fully relevant): Fully agrees with one of the provided correct answers but mentions other completely irrelevant information. Note that extra details provided in the answer, even if not mentioned in the correct answers, should NOT be seen as irrelevant as long as they are relevant to the question to a reasonable extent. 44 | - Score 3 (correct and relevant): Fully agrees with one of the provided correct answers and only provides information relevant to the question. Note that if the answer is longer than the correct answer, as long as everything in the answer is relevant to the question, it should still be given score 3. For example, if the correct answer is "the North Pole" and the answer is "They are headed for the North Pole", it should still be given a score of 3. 45 | 46 | Now, read the following question, answer, and correct answers. First think step-by-step and provide your reasoning and assessment on the answer. Then output your score in the following json format: {{"fluency": 0, "correctness": 1}}. 47 | 48 | Question: {question} 49 | Correct answers: {correct_answers} 50 | Answer: {parsed_output} 51 | """ 52 | 53 | def parse_json(text): 54 | matches = re.findall(r"\{.*?\}", text, re.DOTALL) 55 | if len(matches) > 0: 56 | try: 57 | r = json.loads(matches[-1]) 58 | except: 59 | return None 60 | return r 61 | return None 62 | 63 | def check_metrics(model, results_file, output_file): 64 | with open(results_file, "r") as f: 65 | results = json.load(f) 66 | 67 | sum_score = 0 68 | count_score = 0 69 | 70 | all_inputs = [] 71 | for d in results["data"]: 72 | p = judge_prompt.format(question=d['question'], correct_answers=d['answer'], parsed_output=parse_output(d['output'])) 73 | all_inputs.append(p) 74 | 75 | outputs = model.generate_batch(prompt=all_inputs, batch_file=output_file+".batch") 76 | for idx, o in enumerate(outputs): 77 | d = results["data"][idx] 78 | s = None 79 | 80 | if o is not None: 81 | scores = parse_json(o["output"]) 82 | if scores is not None and "correctness" in scores and "fluency" in scores: 83 | s = scores 84 | else: 85 | print("Warning! 
Couldn't get a score") 86 | print(f"GPT-4 output: {o['output']}") 87 | 88 | if scores is not None: 89 | sum_score += scores["fluency"] * scores["correctness"] 90 | count_score += 1 91 | 92 | d["gpt-4-scores"] = s 93 | 94 | if idx < 10: 95 | print("=====================================") 96 | print(f"Prompt: {all_inputs[idx]}") 97 | print(f"Output: {o['output']}") 98 | print(f"Final score: {s}") 99 | 100 | results["averaged_metrics"]["gpt-4-score"] = sum_score / count_score 101 | with open(output_file, "w") as f: 102 | json.dump(results, f, indent=4) 103 | 104 | return results 105 | 106 | if __name__ == "__main__": 107 | model = OpenAIModel("gpt-4o-2024-05-13", temperature=0.1) 108 | parser = argparse.ArgumentParser() 109 | parser.add_argument("--num_shards", type=int, default=1) 110 | parser.add_argument("--shard_idx", type=int, default=0) 111 | parser.add_argument("--model_to_check", nargs="+", default=[]) 112 | parser.add_argument("--tag", type=str, default="v1") 113 | args = parser.parse_args() 114 | num_shards = args.num_shards 115 | shard_idx = args.shard_idx 116 | 117 | if len(args.model_to_check) > 0: 118 | model_to_check = args.model_to_check 119 | else: 120 | # all models 121 | model_to_check = ['gpt-4-0125-preview','gpt-4o-mini-2024-07-18','gpt-4o-2024-05-13','gpt-4o-2024-08-06','claude-3-5-sonnet-20240620','gemini-1.5-flash-001','gemini-1.5-pro-001','Llama-2-7B-32K','Llama-2-7B-32K-Instruct','llama-2-7b-80k','Yarn-Llama-2-7b-64k','Yarn-Llama-2-7b-128k','Meta-Llama-3-8B','Meta-Llama-3-8B-Instruct','Meta-Llama-3-8B-Theta16M','Meta-Llama-3-8B-Instruct-Theta16M','Meta-Llama-3-70B-Theta16M','Meta-Llama-3-70B-Instruct-Theta16M','Llama-3.1-8B','Llama-3.1-8B-Instruct','Llama-3.1-70B','Llama-3.1-70B-Instruct','Llama-3.3-70B-Instruct','Llama-3.2-1B','Llama-3.2-1B-Instruct','Llama-3.2-3B','Llama-3.2-3B-Instruct','Mistral-7B-v0.1','Mistral-7B-Instruct-v0.1','Mistral-7B-Instruct-v0.2','Mistral-7B-v0.3','Mistral-7B-Instruct-v0.3','Ministral-8B-Instruct-2410','Mistral-Nemo-Base-2407','Mistral-Nemo-Instruct-2407','MegaBeam-Mistral-7B-512k','Yi-6B-200K','Yi-9B-200K','Yi-34B-200K','Yi-1.5-9B-32K','Phi-3-mini-128k-instruct','Phi-3-small-128k-instruct','Phi-3-medium-128k-instruct','Phi-3.5-mini-instruct','Qwen2-7B','Qwen2-7B-Instruct','Qwen2-57B-A14B','Qwen2-57B-A14B-Instruct','Qwen2.5-1.5B','Qwen2.5-1.5B-Instruct','Qwen2.5-3B','Qwen2.5-3B-Instruct','Qwen2.5-7B','Qwen2.5-7B-Instruct','Qwen2.5-7B-Instruct-1M','Qwen2.5-14B-Instruct-1M','Qwen2.5-72B-Instruct','Llama-3-8B-ProLong-512k-Instruct','gemma-2-9b','gemma-2-9b-it','gemma-2-9b-it-Theta320K','gemma-2-27b','gemma-2-27b-it','gemma-2-27b-it-Theta320K','c4ai-command-r-v01','Jamba-v0.1','AI21-Jamba-1.5-Mini', "DeepSeek-R1-Distill-Llama-8B", "DeepSeek-R1-Distill-Qwen-7B"] 122 | 123 | all_paths = [glob.glob(f"output/{m}/narrativeqa_*_{args.tag}_*.json") for m in model_to_check] 124 | all_paths = [item for sublist in all_paths for item in sublist] 125 | all_paths = [p for p in all_paths if not os.path.exists(p.replace(".json", "-gpt4eval_o.json"))] 126 | all_paths = [p for p in all_paths if not p.endswith("-gpt4eval_o.json")] 127 | all_paths = all_paths[shard_idx::num_shards] 128 | print(f"Found {len(all_paths)} path") 129 | 130 | for p in all_paths: 131 | newp = p.replace(".json", "-gpt4eval_o.json") 132 | print("evaluating path:", p) 133 | check_metrics(model, p, newp) 134 | -------------------------------------------------------------------------------- /scripts/eval_gpt4_longqa.sh: 
-------------------------------------------------------------------------------- 1 | shards=10; for i in $(seq 0 $((shards-1))); do python scripts/eval_gpt4_longqa.py --num_shards $shards --shard_idx $i & done 2 | -------------------------------------------------------------------------------- /scripts/eval_gpt4_summ.sh: -------------------------------------------------------------------------------- 1 | shards=10; for i in $(seq 0 $((shards-1))); do python scripts/eval_gpt4_summ.py --num_shards $shards --shard_idx $i & done 2 | -------------------------------------------------------------------------------- /scripts/generate_configs.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | 3 | # settings that cannot be shared across tasks: use_chat_template, shots, and stop_new_line 4 | 5 | lengths_mapping = {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072} 6 | master_mapping = { 7 | # ruler tasks, shots: 0, use_chat_template: False, and stop_new_line: False 8 | "ruler_niah_s_1": { # NIAH Repeat 9 | k: { 10 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_1/validation_{v}.jsonl" 11 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 12 | }, 13 | "ruler_niah_s_2": { # NIAH 14 | k: { 15 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_2/validation_{v}.jsonl" 16 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 17 | }, 18 | "ruler_niah_s_3": { # NIAH UUID 19 | k: { 20 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_single_3/validation_{v}.jsonl" 21 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 22 | }, 23 | "ruler_niah_mk_1": { # NIAH MK Essay 24 | k: { 25 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multikey_1/validation_{v}.jsonl" 26 | } for k, v in {"4k": 4096, "8k": 8192, "16k": 16384, "32k": 32768, "64k": 65536, "128k": 131072}.items() 27 | }, 28 | "ruler_niah_mk_2": { # NIAH MK Needle 29 | k: { 30 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multikey_2/validation_{v}.jsonl" 31 | } for k, v in lengths_mapping.items() 32 | }, 33 | "ruler_niah_mk_3": { # NIAH MK UUID 34 | k: { 35 | "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/niah_multikey_3/validation_{v}.jsonl" 36 | } for k, v in lengths_mapping.items() 37 | }, 38 | "ruler_niah_mq": { # NIAH MQ 39 | k: { 40 | "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/niah_multiquery/validation_{v}.jsonl" 41 | } for k, v in lengths_mapping.items() 42 | }, 43 | "ruler_niah_mv": { # NIAH MV 44 | k: { 45 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/niah_multivalue/validation_{v}.jsonl" 46 | } for k, v in lengths_mapping.items() 47 | }, 48 | "ruler_cwe": { # RULER CWE 49 | k: { 50 | "input_length": v, "generation_max_length": 100, "test_files": f"data/ruler/cwe/validation_{v}.jsonl" 51 | } for k, v in lengths_mapping.items() 52 | }, 53 | "ruler_fwe": { # RULER FWE 54 | k: { 55 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/fwe/validation_{v}.jsonl" 56 | } for k, v in lengths_mapping.items() 57 | }, 58 | "ruler_vt": { # RULER VT 59 | k: { 60 | "input_length": v, "generation_max_length": 50, "test_files": 
f"data/ruler/vt/validation_{v}.jsonl" 61 | } for k, v in lengths_mapping.items() 62 | }, 63 | "ruler_qa_1": { # SQuAD 64 | k: { 65 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_1/validation_{v}.jsonl" 66 | } for k, v in lengths_mapping.items() 67 | }, 68 | "ruler_qa_2": { # HotpotQA 69 | k: { 70 | "input_length": v, "generation_max_length": 50, "test_files": f"data/ruler/qa_2/validation_{v}.jsonl" 71 | } for k, v in lengths_mapping.items() 72 | }, 73 | 74 | "json_kv": { 75 | k: { 76 | "input_length": v, "generation_max_length": 100, "test_files": f"data/json_kv/test_k" + ["50", "105", "220", "440", "900", "1800"][i] + "_dep6.jsonl", "demo_files": "" 77 | } for i, (k, v) in enumerate(lengths_mapping.items()) 78 | }, 79 | 80 | # generation with citations -- alce 81 | "alce_asqa": { # ASQA 82 | k: { 83 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/asqa_eval_gtr_top2000.json", "demo_files": f"prompts/asqa_revised.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 84 | } for i, (k, v) in enumerate(lengths_mapping.items()) 85 | }, 86 | "alce_qampari": { # QAMPARI 87 | k: { 88 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/qampari_eval_gtr_top2000.json", "demo_files": f"prompts/qampari_revised.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 89 | } for i, (k, v) in enumerate(lengths_mapping.items()) 90 | }, 91 | 92 | "alce_asqa_nocite": { # ASQA 93 | k: { 94 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/asqa_eval_gtr_top2000.json", "demo_files": f"prompts/asqa_nocite.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 95 | } for i, (k, v) in enumerate(lengths_mapping.items()) 96 | }, 97 | "alce_qampari_nocite": { # QAMPARI 98 | k: { 99 | "input_length": v, "generation_max_length": 300, "test_files": f"data/alce/qampari_eval_gtr_top2000.json", "demo_files": f"prompts/qampari_nocite.json", "name_postfix": ["_8", "_30", "_75", "_165", "_345", "_700"][i] 100 | } for i, (k, v) in enumerate(lengths_mapping.items()) 101 | }, 102 | 103 | # RAG tasks, using KILT's datasets and retrieval corpus 104 | "kilt_nq": { 105 | k: { 106 | "input_length": v, "generation_max_length": 20, 107 | "test_files": "data/kilt/nq-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", 108 | "demo_files": "data/kilt/nq-train-multikilt_1000_k3_dep6.jsonl" 109 | } for i, (k, v) in enumerate(lengths_mapping.items()) 110 | }, 111 | "kilt_triviaqa": { 112 | k: { 113 | "input_length": v, "generation_max_length": 20, 114 | "test_files": "data/kilt/triviaqa-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", 115 | "demo_files": "data/kilt/triviaqa-train-multikilt_1000_k3_dep6.jsonl" 116 | } for i, (k, v) in enumerate(lengths_mapping.items()) 117 | }, 118 | "kilt_hotpotqa": { 119 | k: { 120 | "input_length": v, "generation_max_length": 20, 121 | "test_files": "data/kilt/hotpotqa-dev-multikilt_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep3.jsonl", 122 | "demo_files": "data/kilt/hotpotqa-train-multikilt_1000_k3_dep3.jsonl" 123 | } for i, (k, v) in enumerate(lengths_mapping.items()) 124 | }, 125 | "kilt_popqa": { 126 | k: { 127 | "input_length": v, "generation_max_length": 20, "name_postfix": "_3", 128 | "test_files": "data/kilt/popqa_test_1000_k" + ["20", "50", "105", "220", "440", "1000"][i] + "_dep6.jsonl", 129 | "demo_files": "data/kilt/popqa_test_1000_k3_dep6.jsonl" 130 
| } for i, (k, v) in enumerate(lengths_mapping.items()) 131 | }, 132 | 133 | # for longqa, we truncate by the length - 200 - the generation length 134 | "narrativeqa": { 135 | k: { 136 | "input_length": v, "generation_max_length": 100, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 100}" 137 | } for k, v in lengths_mapping.items() 138 | }, 139 | "infbench_qa_eng": { 140 | k: { 141 | "input_length": v, "generation_max_length": 10, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 10}" 142 | } for k, v in lengths_mapping.items() 143 | }, 144 | "infbench_choice_eng": { 145 | k: { 146 | "input_length": v, "generation_max_length": 10, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 10}" 147 | } for k, v in lengths_mapping.items() 148 | }, 149 | 150 | "infbench_sum_eng": { 151 | k: { 152 | "input_length": v, "generation_max_length": 1200, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 200 - 1200}" 153 | } for k, v in lengths_mapping.items() 154 | }, 155 | # for multi lexsum, we truncate by the length - 300 (prompt and buffer) - 400 (generation) 156 | "multi_lexsum": { 157 | k: { 158 | "input_length": v, "generation_max_length": 400, "test_files": "", "demo_files": "", "name_postfix": f"_{v - 300 - 400}" 159 | } for k, v in lengths_mapping.items() 160 | }, 161 | 162 | "msmarco_rerank_psg": { 163 | k: { 164 | "input_length": v, "generation_max_length": 200, 165 | "test_files": "data/msmarco/test_reranking_data_k" + ["14", "50", "130", "285", "600", "1000"][i] + "_dep3.jsonl", 166 | "demo_files": "data/msmarco/test_reranking_data_k10_dep3.jsonl" 167 | } for i, (k, v) in enumerate(lengths_mapping.items()) 168 | }, 169 | 170 | "icl_trec_coarse": { 171 | k: { 172 | "input_length": v, "generation_max_length": 20, 173 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["200", "400", "800", "1600", "3300", "6600"][i] + "shot_balance" 174 | } for i, (k, v) in enumerate(lengths_mapping.items()) 175 | }, 176 | "icl_trec_fine": { 177 | k: { 178 | "input_length": v, "generation_max_length": 20, 179 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["200", "400", "800", "1600", "3200", "6400"][i] + "shot_balance" 180 | } for i, (k, v) in enumerate(lengths_mapping.items()) 181 | }, 182 | "icl_banking77": { 183 | k: { 184 | "input_length": v, "generation_max_length": 20, 185 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["180", "360", "720", "1450", "2900", "5900"][i] + "shot_balance" 186 | } for i, (k, v) in enumerate(lengths_mapping.items()) 187 | }, 188 | "icl_clinic150": { 189 | k: { 190 | "input_length": v, "generation_max_length": 20, 191 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["220", "440", "880", "1750", "3525", "7050"][i] + "shot_balance" 192 | } for i, (k, v) in enumerate(lengths_mapping.items()) 193 | }, 194 | "icl_nlu": { 195 | k: { 196 | "input_length": v, "generation_max_length": 20, 197 | "test_files": "", "demo_files": "", "name_postfix": "_" + ["250", "510", "1020", "2040", "4080", "8296"][i] + "shot_balance" 198 | } for i, (k, v) in enumerate(lengths_mapping.items()) 199 | }, 200 | } 201 | 202 | def process_configs(config_name, datasets, input_lengths, **kwargs): 203 | configs = [] 204 | for i, d in enumerate(datasets): 205 | con = master_mapping[d] 206 | print(d) 207 | for l in input_lengths: 208 | c = con[l] 209 | print(c) 210 | configs.append({ 211 | "input_max_length": c['input_length'], 212 | "datasets": d + c.get("name_postfix", ""), 213 | "generation_max_length": 
c['generation_max_length'], 214 | "test_files": c.get("test_files", ""), 215 | "demo_files": c.get("demo_files", ""), 216 | }) 217 | out_config = {k: ",".join([str(c[k]) for c in configs]) for k in configs[0]} 218 | # llama 3 by default but you can change it to anything else 219 | out_config.update({ 220 | **kwargs, 221 | "model_name_or_path": "meta-llama/Llama-3.1-8B-Instruct", 222 | }) 223 | with open(config_name, "w") as f: 224 | yaml.dump(out_config, f, sort_keys=False) 225 | 226 | def helmet_configs(input_lengths = ["128k"], fname_postfix = ""): 227 | synthetic = ["ruler_niah_mk_2", "ruler_niah_mk_3", "ruler_niah_mv", "json_kv"] 228 | # ruler actually doesn't support demos so it defaults to 0, json kv uses 2 229 | process_configs( 230 | f"configs/recall{fname_postfix}.yaml", synthetic, input_lengths, 231 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=False 232 | ) 233 | 234 | rag = ['kilt_nq', 'kilt_triviaqa', 'kilt_hotpotqa', 'kilt_popqa'] 235 | process_configs( 236 | f"configs/rag{fname_postfix}.yaml", rag, input_lengths, 237 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True # could be false but set to true so it runs faster 238 | ) 239 | 240 | longqa = ['narrativeqa', 'infbench_qa_eng', 'infbench_choice_eng'] 241 | process_configs( 242 | f"configs/longqa{fname_postfix}.yaml", longqa, input_lengths, 243 | use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False 244 | ) 245 | 246 | summ = ['infbench_sum_eng', 'multi_lexsum'] 247 | process_configs( 248 | f"configs/summ{fname_postfix}.yaml", summ, input_lengths, 249 | use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False 250 | ) 251 | 252 | icl = ['icl_trec_coarse', 'icl_trec_fine', 'icl_banking77', 'icl_clinic150', 'icl_nlu'] 253 | process_configs( 254 | f"configs/icl{fname_postfix}.yaml", icl, input_lengths, 255 | use_chat_template=False, max_test_samples=500, shots=0, stop_new_line=True 256 | ) 257 | 258 | rerank = ["msmarco_rerank_psg"] 259 | process_configs( 260 | f"configs/rerank{fname_postfix}.yaml", rerank, input_lengths, 261 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True 262 | ) 263 | 264 | cite = ["alce_asqa", "alce_qampari"] 265 | process_configs( 266 | f"configs/cite{fname_postfix}.yaml", cite, input_lengths, 267 | use_chat_template=True, max_test_samples=100, shots=2, stop_new_line=False 268 | ) 269 | 270 | nocite = ["alce_asqa_nocite"] 271 | process_configs( 272 | f"configs/alce_nocite{fname_postfix}.yaml", nocite, input_lengths, 273 | use_chat_template=True, max_test_samples=100, shots=0, stop_new_line=False, generation_max_length=600, 274 | ) 275 | 276 | ruler = ["ruler_niah_s_1", "ruler_niah_s_2", "ruler_niah_s_3", "ruler_niah_mk_1", "ruler_niah_mk_2", "ruler_niah_mk_3", "ruler_niah_mq", "ruler_niah_mv", "ruler_cwe", "ruler_fwe", "ruler_vt", "ruler_qa_1", "ruler_qa_2"] 277 | process_configs( 278 | f"configs/ruler{fname_postfix}.yaml", ruler, input_lengths, 279 | use_chat_template=False, max_test_samples=100, shots=0, stop_new_line=False 280 | ) 281 | 282 | def separate_configs(input_lengths = ["128k"], fname_postfix = ""): 283 | # separate rag and icl configs into individual files 284 | for name in ['kilt_nq', 'kilt_triviaqa', 'kilt_hotpotqa', 'kilt_popqa']: 285 | process_configs( 286 | f"configs/rag/{name}{fname_postfix}.yaml", [name], input_lengths, 287 | use_chat_template=False, max_test_samples=100, shots=2, stop_new_line=True 288 | ) 289 | 290 | for name in ['icl_trec_coarse', 'icl_trec_fine', 
'icl_banking77', 'icl_clinic150', 'icl_nlu']: 291 | process_configs( 292 | f"configs/icl/{name}{fname_postfix}.yaml", [name], input_lengths, 293 | use_chat_template=False, max_test_samples=500, shots=0, stop_new_line=True 294 | ) 295 | 296 | 297 | def niah_configs(): 298 | input_lengths = [8192, 16384, 32768, 65536, 131072] 299 | dataset=["ruler_niah_s_2"] 300 | gen_lengths = [50] 301 | for i, l in enumerate(input_lengths): 302 | config = { 303 | "input_max_length": l, 304 | "datasets": dataset[0], 305 | "generation_max_length": gen_lengths[0], 306 | "test_files": f'data/ruler/{dataset[0].replace("ruler_", "").replace("_s_", "_single_")}/validation_{l}.jsonl', 307 | "demo_files": "", 308 | } 309 | with open(f"configs/niah.yaml", "w") as f: 310 | yaml.dump(config, f, sort_keys=False) 311 | 312 | 313 | if __name__ == "__main__": 314 | helmet_configs() 315 | helmet_configs(input_lengths=["8k", "16k", "32k", "64k"], fname_postfix="_short") 316 | niah_configs() 317 | separate_configs() 318 | separate_configs(input_lengths=["8k", "16k", "32k", "64k"], fname_postfix="_short") -------------------------------------------------------------------------------- /scripts/run_api.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | ############################## 4 | # Job blueprint # 5 | ############################## 6 | 7 | # Give your job a name, so you can recognize it in the queue overview 8 | #SBATCH --job-name=api ## CHANGE JOBNAME HERE 9 | #SBATCH --array=0 10 | 11 | # Remove one # to uncommment 12 | #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout 13 | #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr 14 | 15 | # Define, how many nodes you need. Here, we ask for 1 node. 16 | #SBATCH -N 1 ##nodes 17 | #SBATCH -n 1 ##tasks 18 | #SBATCH --cpus-per-task=8 19 | #SBATCH --mem=32G 20 | #SBATCH --time=0-3:00:00 21 | #SBATCH --gres=gpu:0 --ntasks-per-node=1 -N 1 22 | # Turn on mail notification. There are many possible self-explaining values: 23 | # NONE, BEGIN, END, FAIL, ALL (including all aforementioned) 24 | # For more values, check "man sbatch" 25 | #SBATCH --mail-type=ALL 26 | # Remember to set your email address here instead of nobody 27 | #SBATCH --mail-user=nobody 28 | 29 | echo "Date = $(date)" 30 | echo "Hostname = $(hostname -s)" 31 | echo "Working Directory = $(pwd)" 32 | echo "" 33 | echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" 34 | echo "Number of Tasks Allocated = $SLURM_NTASKS" 35 | echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK" 36 | echo "Array Job ID = $SLURM_ARRAY_JOB_ID" 37 | echo "Array Task ID = $SLURM_ARRAY_TASK_ID" 38 | echo "Cache = $TRANSFORMERS_CACHE" 39 | 40 | source env/bin/activate 41 | 42 | export OMP_NUM_THREADS=8 43 | IDX=$SLURM_ARRAY_TASK_ID 44 | if [[ -z $SLURM_ARRAY_TASK_ID ]]; then 45 | IDX=0 46 | fi 47 | 48 | 49 | TAG=v1 50 | 51 | CONFIGS=(recall.yaml rag.yaml longqa.yaml summ.yaml icl.yaml rerank.yaml cite.yaml) 52 | #CONFIGS=(${CONFIGS[7]}) # you may want to run only one config 53 | SEED=42 54 | 55 | # azure vs. 
non-azure makes no difference, just use whichever you prefer 56 | OD=( 57 | azure/gpt-4-0125-preview # 0 58 | azure/gpt-4o-2024-05-13 # 1 59 | gpt-4o-2024-08-06 # 2 60 | azure/gpt-4o-mini-2024-07-18 # 3 61 | claude-3-5-sonnet-20240620 # 4 62 | gemini-1.5-flash-001 # 5 63 | gemini-1.5-pro-001 # 6 64 | ) 65 | MODEL_NAME="${OD[$IDX]}" 66 | OUTPUT_DIR="output/$(basename $MODEL_NAME)" 67 | 68 | # for the API models we always use use_chat_template=True 69 | OPTIONS="--use_chat_template True --stop_newline False" 70 | 71 | echo "Evaluation output dir = $OUTPUT_DIR" 72 | echo "Tag = $TAG" 73 | echo "Model name = $MODEL_NAME" 74 | echo "Options = $OPTIONS" 75 | 76 | for CONFIG in "${CONFIGS[@]}"; do 77 | echo "Config file: $CONFIG" 78 | 79 | python eval.py \ 80 | --config configs/$CONFIG \ 81 | --seed $SEED \ 82 | --output_dir $OUTPUT_DIR \ 83 | --tag $TAG \ 84 | --model_name_or_path $MODEL_NAME \ 85 | $OPTIONS 86 | done 87 | 88 | echo "finished with $?" 89 | 90 | wait; 91 | -------------------------------------------------------------------------------- /scripts/run_eval.sh: -------------------------------------------------------------------------------- 1 | for task in "recall" "rag" "longqa" "summ" "icl" "rerank" "cite"; do 2 | python eval.py --config configs/${task}.yaml 3 | done 4 | 5 | # this will run the 8k to 64k versions 6 | for task in "recall" "rag" "longqa" "summ" "icl" "rerank" "cite"; do 7 | python eval.py --config configs/${task}_short.yaml 8 | done -------------------------------------------------------------------------------- /scripts/run_eval_hf_endpoint.sh: -------------------------------------------------------------------------------- 1 | 2 | LLM_ENDPOINT="https://${hf_inference_point_url}/v1" # fill in your endpoint url 3 | API_KEY=$HF_TOKEN 4 | 5 | python eval.py --config configs/recall_demo.yaml --endpoint_url $LLM_ENDPOINT --api_key $API_KEY -------------------------------------------------------------------------------- /scripts/run_eval_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | ############################## 4 | # Job blueprint # 5 | ############################## 6 | 7 | # Give your job a name, so you can recognize it in the queue overview 8 | #SBATCH --job-name=helmet ## CHANGE JOBNAME HERE 9 | #SBATCH --array=0-35 10 | 11 | # Remove one # to uncomment 12 | #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout 13 | #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr 14 | 15 | # Define, how many nodes you need. Here, we ask for 1 node. 16 | #SBATCH -N 1 ##nodes 17 | #SBATCH -n 1 ##tasks 18 | #SBATCH --cpus-per-task=8 19 | #SBATCH --mem=100G 20 | #SBATCH --time=0-24:00:00 21 | #SBATCH --gres=gpu:1 --ntasks-per-node=1 -N 1 22 | #SBATCH --constraint=gpu80 23 | # Turn on mail notification.
There are many possible self-explaining values: 24 | # NONE, BEGIN, END, FAIL, ALL (including all aforementioned) 25 | # For more values, check "man sbatch" 26 | #SBATCH --mail-type=ALL 27 | # Remember to set your email address here instead of nobody 28 | #SBATCH --mail-user=nobody 29 | 30 | echo "Date = $(date)" 31 | echo "Hostname = $(hostname -s)" 32 | echo "Working Directory = $(pwd)" 33 | echo "" 34 | echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" 35 | echo "Number of Tasks Allocated = $SLURM_NTASKS" 36 | echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK" 37 | echo "Array Job ID = $SLURM_ARRAY_JOB_ID" 38 | echo "Array Task ID = $SLURM_ARRAY_TASK_ID" 39 | echo "Cache = $TRANSFORMERS_CACHE" 40 | 41 | source env/bin/activate 42 | 43 | IDX=$SLURM_ARRAY_TASK_ID 44 | NGPU=$SLURM_GPUS_ON_NODE 45 | if [[ -z $SLURM_ARRAY_TASK_ID ]]; then 46 | IDX=31 47 | NGPU=1 48 | fi 49 | export OMP_NUM_THREADS=8 50 | 51 | # change the tag to distinguish different runs 52 | TAG=v1 53 | 54 | CONFIGS=(recall.yaml rag.yaml longqa.yaml summ.yaml icl.yaml rerank.yaml cite.yaml) 55 | SEED=42 56 | 57 | OPTIONS="" 58 | 59 | M_IDX=$IDX 60 | 61 | # Array for models larger than 13B (12 models) 62 | L_MODELS=( 63 | "Meta-Llama-3-70B-Theta8M" 64 | "Meta-Llama-3-70B-Instruct-Theta8M" 65 | "Meta-Llama-3.1-70B" 66 | "Meta-Llama-3.1-70B-Instruct" 67 | "Yi-34B-200K" 68 | "Qwen2-57B-A14B" 69 | "Qwen2-57B-A14B-Instruct" 70 | "c4ai-command-r-v01" 71 | "Jamba-v0.1" 72 | "AI21-Jamba-1.5-Mini" 73 | "gemma-2-27b" 74 | "gemma-2-27b-it" 75 | ) 76 | 77 | # Array for models 13B and smaller (36 models) 78 | S_MODELS=( 79 | "LLaMA-2-7B-32K" 80 | "Llama-2-7B-32K-Instruct" 81 | "llama-2-7b-80k-basefixed" 82 | "Yarn-Llama-2-7b-64k" 83 | "Yarn-Llama-2-7b-128k" 84 | "Meta-Llama-3-8B" 85 | "Meta-Llama-3-8B-Instruct" 86 | "Meta-Llama-3-8B-Theta8M" 87 | "Meta-Llama-3-8B-Instruct-Theta8M" 88 | "Meta-Llama-3.1-8B" 89 | "Meta-Llama-3.1-8B-Instruct" 90 | "Mistral-7B-v0.1" 91 | "Mistral-7B-Instruct-v0.1" 92 | "Mistral-7B-Instruct-v0.2" 93 | "Mistral-7B-v0.3" 94 | "Mistral-7B-Instruct-v0.3" 95 | "Yi-6B-200K" 96 | "Yi-9B-200K" 97 | "Yi-1.5-9B-32K" 98 | "Phi-3-mini-128k-instruct" 99 | "Phi-3-small-128k-instruct" 100 | "Phi-3.5-mini-instruct" 101 | "Qwen2-7B" 102 | "Qwen2-7B-Instruct" 103 | "gemma-2-9b" 104 | "gemma-2-9b-it" 105 | "prolong-64k-instruct" 106 | "prolong-512k-instruct-20b-theta128m" 107 | "Mistral-Nemo-Base-2407" 108 | "Mistral-Nemo-Instruct-2407" 109 | "Phi-3-medium-128k-instruct" 110 | "MegaBeam-Mistral-7B-512k" #31 111 | "Llama-3.2-1B" # 32 112 | "Llama-3.2-1B-Instruct" # 33 113 | "Llama-3.2-3B" # 34 114 | "Llama-3.2-3B-Instruct" # 35 115 | ) 116 | MNAME="${S_MODELS[$M_IDX]}" 117 | 118 | OUTPUT_DIR="output/$MNAME" 119 | MODEL_NAME="/path/to/your/model/$MNAME" # CHANGE PATH HERE or you can change the array to load from HF 120 | 121 | shopt -s nocasematch 122 | chat_models=".*(chat|instruct|it$|nous|command|Jamba-1.5|MegaBeam).*" 123 | echo $MNAME 124 | if ! 
[[ $MNAME =~ $chat_models ]]; then 125 | # for the base models we always use use_chat_template=False 126 | OPTIONS="$OPTIONS --use_chat_template False" 127 | fi 128 | 129 | 130 | echo "Evaluation output dir = $OUTPUT_DIR" 131 | echo "Tag = $TAG" 132 | echo "Model name = $MODEL_NAME" 133 | echo "Options = $OPTIONS" 134 | 135 | 136 | for CONFIG in "${CONFIGS[@]}"; do 137 | echo "Config file: $CONFIG" 138 | 139 | python eval.py \ 140 | --config configs/$CONFIG \ 141 | --seed $SEED \ 142 | --output_dir $OUTPUT_DIR \ 143 | --tag $TAG \ 144 | --model_name_or_path $MODEL_NAME \ 145 | $OPTIONS 146 | done 147 | 148 | echo "finished with $?" 149 | 150 | wait; 151 | 152 | #echo "done, check $OUTPUT_DIR for outputs" 153 | 154 | #exit 0 155 | 156 | -------------------------------------------------------------------------------- /scripts/run_eval_tgi.sh: -------------------------------------------------------------------------------- 1 | export host_ip=$(hostname -I | awk '{print $1}') 2 | export LLM_ENDPOINT_PORT=8085 # change this to the port you want to use 3 | export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1" 4 | 5 | python eval.py --config configs/recall_demo.yaml --endpoint_url $LLM_ENDPOINT 6 | -------------------------------------------------------------------------------- /scripts/run_eval_vllm_gaudi.sh: -------------------------------------------------------------------------------- 1 | export host_ip=$(hostname -I | awk '{print $1}') 2 | export LLM_ENDPOINT_PORT=8010 3 | export DATA_PATH="~/.cache/huggingface" 4 | export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1" 5 | export HF_HOME=$DATA_PATH 6 | 7 | for task in "recall" "rag"; do 8 | python eval.py --config configs/${task}_vllm.yaml --endpoint_url $LLM_ENDPOINT --overwrite --no_cuda 9 | done -------------------------------------------------------------------------------- /scripts/run_short_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash -l 2 | 3 | ############################## 4 | # Job blueprint # 5 | ############################## 6 | 7 | # Give your job a name, so you can recognize it in the queue overview 8 | #SBATCH --job-name=helmet_short ## CHANGE JOBNAME HERE 9 | #SBATCH --array=0 10 | 11 | # Remove one # to uncommment 12 | #SBATCH --output=./joblog/%x-%A_%a.out ## Stdout 13 | #SBATCH --error=./joblog/%x-%A_%a.err ## Stderr 14 | 15 | # Define, how many nodes you need. Here, we ask for 1 node. 16 | #SBATCH -N 1 ##nodes 17 | #SBATCH -n 1 ##tasks 18 | #SBATCH --cpus-per-task=8 19 | #SBATCH --mem=150G 20 | #SBATCH --time=0-4:00:00 21 | #SBATCH --gres=gpu:1 --ntasks-per-node=1 -N 1 22 | #SBATCH --constraint=gpu80 23 | # Turn on mail notification. 
There are many possible self-explaining values: 24 | # NONE, BEGIN, END, FAIL, ALL (including all aforementioned) 25 | # For more values, check "man sbatch" 26 | #SBATCH --mail-type=ALL 27 | # Remember to set your email address here instead of nobody 28 | #SBATCH --mail-user=nobody 29 | 30 | echo "Date = $(date)" 31 | echo "Hostname = $(hostname -s)" 32 | echo "Working Directory = $(pwd)" 33 | echo "" 34 | echo "Number of Nodes Allocated = $SLURM_JOB_NUM_NODES" 35 | echo "Number of Tasks Allocated = $SLURM_NTASKS" 36 | echo "Number of Cores/Task Allocated = $SLURM_CPUS_PER_TASK" 37 | echo "Array Job ID = $SLURM_ARRAY_JOB_ID" 38 | echo "Array Task ID = $SLURM_ARRAY_TASK_ID" 39 | echo "Cache = $TRANSFORMERS_CACHE" 40 | 41 | source env/bin/activate 42 | 43 | IDX=$SLURM_ARRAY_TASK_ID 44 | NGPU=$SLURM_GPUS_ON_NODE 45 | if [[ -z $SLURM_ARRAY_TASK_ID ]]; then 46 | IDX=0 47 | NGPU=1 48 | fi 49 | PORT=$(shuf -i 30000-65000 -n 1) 50 | echo "Port = $PORT" 51 | 52 | export OMP_NUM_THREADS=8 53 | 54 | TAG=v1 55 | 56 | CONFIGS=(recall_short.yaml rag_short.yaml longqa_short.yaml summ_short.yaml icl_short.yaml rerank_short.yaml cite_short.yaml) 57 | #CONFIGS=(${CONFIGS[8]}) 58 | SEED=42 59 | 60 | M_IDX=$IDX 61 | 62 | # Array for models larger than 13B (12 models) 63 | L_MODELS=( 64 | "Meta-Llama-3-70B-Theta8M" #0 65 | "Meta-Llama-3-70B-Instruct-Theta8M" #1 66 | "Meta-Llama-3.1-70B" #2 67 | "Meta-Llama-3.1-70B-Instruct" #3 68 | "Yi-34B-200K" #4 69 | "Qwen2-57B-A14B" #5 70 | "Qwen2-57B-A14B-Instruct" #6 71 | "c4ai-command-r-v01" #7 72 | "Jamba-v0.1" #8 73 | "AI21-Jamba-1.5-Mini" #9 74 | "gemma-2-27b" #10 75 | "gemma-2-27b-it" #11 76 | ) 77 | 78 | # Array for models 13B and smaller (36 models) 79 | S_MODELS=( 80 | "LLaMA-2-7B-32K" # 0 81 | "Llama-2-7B-32K-Instruct" # 1 82 | "llama-2-7b-80k-basefixed" # 2 83 | "Yarn-Llama-2-7b-64k" # 3 84 | "Yarn-Llama-2-7b-128k" # 4 85 | "Meta-Llama-3-8B" # 5 86 | "Meta-Llama-3-8B-Instruct" # 6 87 | "Meta-Llama-3-8B-Theta8M" # 7 88 | "Meta-Llama-3-8B-Instruct-Theta8M" # 8 89 | "Meta-Llama-3.1-8B" # 9 90 | "Meta-Llama-3.1-8B-Instruct" # 10 91 | "Mistral-7B-v0.1" # 11 92 | "Mistral-7B-Instruct-v0.1" # 12 93 | "Mistral-7B-Instruct-v0.2" # 13 94 | "Mistral-7B-v0.3" # 14 95 | "Mistral-7B-Instruct-v0.3" # 15 96 | "Yi-6B-200K" # 16 97 | "Yi-9B-200K" # 17 98 | "Yi-1.5-9B-32K" # 18 99 | "Phi-3-mini-128k-instruct" # 19 100 | "Phi-3-small-128k-instruct" # 20 101 | "Phi-3.5-mini-instruct" # 21 102 | "Qwen2-7B" # 22 103 | "Qwen2-7B-Instruct" # 23 104 | "gemma-2-9b" # 24 105 | "gemma-2-9b-it" # 25 106 | "prolong-64k-instruct" # 26 107 | "prolong-512k-instruct-20b-theta128m" # 27 108 | "Mistral-Nemo-Base-2407" # 28 109 | "Mistral-Nemo-Instruct-2407" # 29 110 | "Phi-3-medium-128k-instruct" # 30 111 | "MegaBeam-Mistral-7B-512k" #31 112 | "Llama-3.2-1B" # 32 113 | "Llama-3.2-1B-Instruct" # 33 114 | "Llama-3.2-3B" # 34 115 | "Llama-3.2-3B-Instruct" # 35 116 | ) 117 | MNAME="${S_MODELS[$M_IDX]}" 118 | 119 | OUTPUT_DIR="output/$MNAME" 120 | MODEL_NAME="/path/to/your/model/$MNAME" # CHANGE PATH HERE or you can change the array to load from HF 121 | 122 | shopt -s nocasematch 123 | chat_models=".*(chat|instruct|it$|nous|command|Jamba-1.5|MegaBeam).*" 124 | echo $MNAME 125 | if ! 
[[ $MNAME =~ $chat_models ]]; then 126 | OPTIONS="$OPTIONS --use_chat_template False" 127 | fi 128 | 129 | echo "Evaluation output dir = $OUTPUT_DIR" 130 | echo "Tag = $TAG" 131 | echo "Model name = $MODEL_NAME" 132 | echo "Options = $OPTIONS" 133 | 134 | for CONFIG in "${CONFIGS[@]}"; do 135 | echo "Config file: $CONFIG" 136 | 137 | python eval.py \ 138 | --config configs/$CONFIG \ 139 | --seed $SEED \ 140 | --output_dir $OUTPUT_DIR \ 141 | --tag $TAG \ 142 | --model_name_or_path $MODEL_NAME \ 143 | $OPTIONS 144 | done 145 | 146 | echo "finished with $?" 147 | 148 | wait; 149 | -------------------------------------------------------------------------------- /scripts/vllm-gaudi/build_image.sh: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | export TAG="helmet" 5 | echo "Building the vllm-gaudi docker images" 6 | git clone https://github.com/HabanaAI/vllm-fork.git 7 | cd ./vllm-fork 8 | git checkout v0.6.6.post1+Gaudi-1.20.0 #habana_main 9 | 10 | docker build --no-cache -f Dockerfile.hpu -t ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} --shm-size=128g . --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy 11 | if [ $? -ne 0 ]; then 12 | echo "vllm-gaudi failed" 13 | exit 1 14 | else 15 | echo "vllm-gaudi successful" 16 | fi 17 | 18 | 19 | -------------------------------------------------------------------------------- /scripts/vllm-gaudi/compose.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2024 Intel Corporation 2 | # SPDX-License-Identifier: Apache-2.0 3 | 4 | services: 5 | vllm-gaudi-server: 6 | image: ${REGISTRY:-opea}/vllm-gaudi:${TAG:-latest} 7 | container_name: vllm-gaudi-server 8 | ports: 9 | - ${LLM_ENDPOINT_PORT:-8008}:80 10 | volumes: 11 | - "${DATA_PATH:-./data}:/data" 12 | environment: 13 | no_proxy: ${no_proxy} 14 | http_proxy: ${http_proxy} 15 | https_proxy: ${https_proxy} 16 | HF_TOKEN: ${HF_TOKEN} 17 | HF_HOME: "/data" 18 | HABANA_VISIBLE_DEVICES: all 19 | OMPI_MCA_btl_vader_single_copy_mechanism: none 20 | PT_HPU_ENABLE_LAZY_COLLECTIVES: true 21 | LLM_MODEL_ID: ${LLM_MODEL_ID} 22 | VLLM_TORCH_PROFILER_DIR: "/mnt" 23 | host_ip: ${host_ip} 24 | LLM_ENDPOINT_PORT: ${LLM_ENDPOINT_PORT} 25 | VLLM_SKIP_WARMUP: ${VLLM_SKIP_WARMUP:-true} 26 | VLLM_ALLOW_LONG_MAX_MODEL_LEN: 1 27 | MAX_MODEL_LEN: ${MAX_MODEL_LEN:-131072} 28 | MAX_SEQ_LEN_TO_CAPTURE: ${MAX_MODEL_LEN:-131072} 29 | NUM_CARDS: ${NUM_CARDS:-1} 30 | runtime: habana 31 | cap_add: 32 | - SYS_NICE 33 | ipc: host 34 | healthcheck: 35 | test: ["CMD-SHELL", "curl -f http://${host_ip}:${LLM_ENDPOINT_PORT}/health || exit 1"] 36 | interval: 10s 37 | timeout: 10s 38 | retries: 150 39 | command: --model $LLM_MODEL_ID --tensor-parallel-size ${NUM_CARDS} --host 0.0.0.0 --port 80 --block-size 128 --max-num-seqs 256 --max-seq-len-to-capture ${MAX_MODEL_LEN} --max-model-len ${MAX_MODEL_LEN} 40 | -------------------------------------------------------------------------------- /scripts/vllm-gaudi/launch_container.sh: -------------------------------------------------------------------------------- 1 | export host_ip=$(hostname -I | awk '{print $1}') 2 | export LLM_ENDPOINT_PORT=8010 3 | export HF_TOKEN=${HF_TOKEN} 4 | export LLM_ENDPOINT="http://${host_ip}:${LLM_ENDPOINT_PORT}/v1" 5 | export DATA_PATH="~/.cache/huggingface" 6 | export MAX_MODEL_LEN=131072 7 | 8 | # single node 9 | # export LLM_MODEL_ID="meta-llama/Meta-Llama-3-8B-Instruct" 
10 | # export NUM_CARDS=1 11 | 12 | # multiple nodes 13 | export LLM_MODEL_ID="meta-llama/Llama-3.3-70B-Instruct" 14 | export NUM_CARDS=8 15 | 16 | docker compose up -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Adopted from https://github.com/princeton-nlp/DensePhrases/blob/main/densephrases/utils/eval_utils.py 3 | """ 4 | 5 | import os 6 | import string 7 | import re 8 | import unicodedata 9 | from collections import Counter 10 | 11 | from rouge_score import rouge_scorer 12 | 13 | import torch 14 | import pytrec_eval 15 | 16 | # import tensor_parallel as tp 17 | 18 | import logging 19 | logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 20 | datefmt='%m/%d/%Y %H:%M:%S') 21 | logger = logging.getLogger(__name__) 22 | logger.setLevel(logging.INFO) 23 | 24 | 25 | def normalize_answer(s): 26 | 27 | def remove_articles(text): 28 | return re.sub(r'\b(a|an|the)\b', ' ', text) 29 | 30 | def white_space_fix(text): 31 | return ' '.join(text.split()) 32 | 33 | def remove_punc(text): 34 | exclude = set(string.punctuation) 35 | return ''.join(ch for ch in text if ch not in exclude) 36 | 37 | def lower(text): 38 | return text.lower() 39 | 40 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 41 | 42 | 43 | def remove_citations(sent): 44 | return re.sub(r"\[\d+", "", re.sub(r" \[\d+", "", sent)).replace(" |", "").replace("]", "") 45 | 46 | 47 | def f1_score(prediction, ground_truth): 48 | normalized_prediction = normalize_answer(prediction) 49 | normalized_ground_truth = normalize_answer(ground_truth) 50 | 51 | ZERO_METRIC = (0, 0, 0) 52 | 53 | if normalized_prediction in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: 54 | return ZERO_METRIC 55 | if normalized_ground_truth in ['yes', 'no', 'noanswer'] and normalized_prediction != normalized_ground_truth: 56 | return ZERO_METRIC 57 | 58 | prediction_tokens = normalized_prediction.split() 59 | ground_truth_tokens = normalized_ground_truth.split() 60 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 61 | num_same = sum(common.values()) 62 | if num_same == 0: 63 | return ZERO_METRIC 64 | precision = 1.0 * num_same / len(prediction_tokens) 65 | recall = 1.0 * num_same / len(ground_truth_tokens) 66 | f1 = (2 * precision * recall) / (precision + recall) 67 | return f1, precision, recall 68 | 69 | 70 | def drqa_normalize(text): 71 | """Resolve different types of unicode encodings.""" 72 | return unicodedata.normalize('NFD', text) 73 | 74 | 75 | def drqa_exact_match_score(prediction, ground_truth): 76 | """Check if the prediction is a (soft) exact match with the ground truth.""" 77 | return normalize_answer(prediction) == normalize_answer(ground_truth) 78 | 79 | 80 | def substring_exact_match_score(prediction, ground_truth): 81 | """Check if the ground truth is a (soft) exact match substring of the prediction.""" 82 | return normalize_answer(ground_truth) in normalize_answer(prediction) 83 | 84 | 85 | def drqa_metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 86 | """Given a prediction and multiple valid answers, return the score of 87 | the best prediction-answer_n pair given a metric function.
88 | """ 89 | # ground truth could be a string or a list of strings or a list of list of strings 90 | if isinstance(ground_truths, str): 91 | ground_truths = [ground_truths] 92 | elif isinstance(ground_truths[0], list): 93 | ground_truths = [ground_truth for ground_truths_list in ground_truths for ground_truth in ground_truths_list] 94 | 95 | scores_for_ground_truths = [] 96 | for ground_truth in ground_truths: 97 | score = metric_fn(prediction, ground_truth) 98 | scores_for_ground_truths.append(score) 99 | return max(scores_for_ground_truths) 100 | 101 | 102 | def get_max_memory(): 103 | """Get the maximum memory available for the current GPU for loading models.""" 104 | free_in_GB = int(torch.cuda.mem_get_info()[0]/1024**3) 105 | max_memory = f'{free_in_GB-6}GB' 106 | n_gpus = torch.cuda.device_count() 107 | max_memory = {i: max_memory for i in range(n_gpus)} 108 | return max_memory 109 | 110 | 111 | def get_top_tokens(logits, tokenizer, top_k=10): 112 | """Get the top tokens and their probabilities from the logits.""" 113 | top_tokens = [] 114 | for logit in logits: 115 | a, b = torch.topk(torch.softmax(logit, dim=-1), top_k, dim=-1) 116 | l = [(y, f"{x*100:.02f}") for x, y in zip(a[0], tokenizer.convert_ids_to_tokens(b[0]))] 117 | top_tokens.append(l) 118 | return top_tokens 119 | 120 | 121 | def parse_output(output, prefix="Answer:"): 122 | def lstrip_string(s, sub): 123 | return re.sub(f'^{re.escape(sub)}', '', s, flags=re.IGNORECASE) 124 | patterns = [re.compile(f"(?:{prefix})(.*)(?:\n|$)", flags=re.IGNORECASE), re.compile(r"(?:^)(.*)(?:\n|$)")] 125 | for pat in patterns: 126 | matches = pat.search(output) 127 | if matches is not None: 128 | return lstrip_string(matches[1].strip(), prefix).strip() # 0 index includes the non-capturing group # lstrip again because for chat models sometimes it will repeat the prefix 129 | # if still not found, return None, but should actually never get this case... 130 | return None 131 | 132 | 133 | def parse_rankings(output): 134 | # when parsing the rankings, we want to do some preprocessing first 135 | # 1. remove the square brackets and ID: 136 | output = re.sub(r"[\[\]:]", "", output) 137 | output = output.lower().replace("id", "") 138 | 139 | # 2. 
parse the integer surrounded by >, since all IDs are integers 140 | pattern = r'(\d+)(?:\s*>\s*(\d+))*' 141 | match = re.finditer(pattern, output) 142 | # and take the longest match 143 | longest = "" 144 | for m in match: 145 | if len(m.group(0)) > len(longest): 146 | longest = m.group(0) 147 | 148 | if len(longest) > 0: 149 | number_string = longest 150 | # important to output a list of strings instead of ints, since the IDs are saved as strings (even though they are supposed to be integers) 151 | rankings = [num.strip() for num in number_string.split('>') if num.strip().isdigit()] 152 | else: 153 | # if we can't find any numbers, then we just return the whole string (unlikely to get any matches) 154 | rankings = [output] 155 | 156 | results = {} 157 | for i, rank in enumerate(rankings): 158 | if rank not in results: 159 | results[rank] = len(rankings) - i 160 | 161 | return results 162 | 163 | 164 | def calculate_metrics(prediction, answers): 165 | r_scorer = rouge_scorer.RougeScorer(['rougeL', 'rougeLsum'], use_stemmer=True) 166 | em = drqa_metric_max_over_ground_truths(drqa_exact_match_score, prediction, answers) 167 | f1 = drqa_metric_max_over_ground_truths(lambda x, y: f1_score(x, y)[0], prediction, answers) 168 | sub_em = drqa_metric_max_over_ground_truths(substring_exact_match_score, prediction, answers) 169 | 170 | if isinstance(answers, str): 171 | answers = [answers] 172 | elif isinstance(answers[0], list): 173 | answers = [ground_truth for ground_truths_list in answers for ground_truth in ground_truths_list] 174 | 175 | rouges = [r_scorer.score(target=a, prediction=prediction) for a in answers] 176 | rouge = {} 177 | for k in r_scorer.rouge_types: 178 | rouge[k + "_f1"] = max([r[k].fmeasure for r in rouges]) 179 | rouge[k + "_recall"] = max([r[k].recall for r in rouges]) 180 | 181 | return { 182 | "exact_match": em, 183 | "f1": f1, 184 | "substring_exact_match": sub_em, 185 | **rouge, 186 | } 187 | 188 | 189 | def calculate_retrieval_metrics(results, qrels, k_values=[1, 5, 10, 25, 50, 100], verbose=False): 190 | # https://github.com/beir-cellar/beir/blob/f062f038c4bfd19a8ca942a9910b1e0d218759d4/beir/retrieval/evaluation.py#L66 191 | # follow evaluation from BEIR, which is just using the trec eval 192 | ndcg = {} 193 | _map = {} 194 | recall = {} 195 | precision = {} 196 | mrr = {"MRR": 0} 197 | 198 | for k in k_values: 199 | ndcg[f"NDCG@{k}"] = 0.0 200 | _map[f"MAP@{k}"] = 0.0 201 | recall[f"Recall@{k}"] = 0.0 202 | precision[f"P@{k}"] = 0.0 203 | 204 | map_string = "map_cut." + ",".join([str(k) for k in k_values]) 205 | ndcg_string = "ndcg_cut." + ",".join([str(k) for k in k_values]) 206 | recall_string = "recall." + ",".join([str(k) for k in k_values]) 207 | precision_string = "P."
+ ",".join([str(k) for k in k_values]) 208 | 209 | # https://github.com/cvangysel/pytrec_eval/blob/master/examples/simple_cut.py 210 | # qrels = {qid: {'pid': [0/1] (relevance label)}} 211 | # results = {qid: {'pid': float (retriever score)}} 212 | evaluator = pytrec_eval.RelevanceEvaluator(qrels, {map_string, ndcg_string, recall_string, precision_string, "recip_rank"}) 213 | scores = evaluator.evaluate(results) 214 | 215 | for query_id in scores.keys(): 216 | for k in k_values: 217 | ndcg[f"NDCG@{k}"] += scores[query_id]["ndcg_cut_" + str(k)] 218 | _map[f"MAP@{k}"] += scores[query_id]["map_cut_" + str(k)] 219 | recall[f"Recall@{k}"] += scores[query_id]["recall_" + str(k)] 220 | precision[f"P@{k}"] += scores[query_id]["P_"+ str(k)] 221 | mrr["MRR"] += scores[query_id]["recip_rank"] 222 | 223 | for k in k_values: 224 | ndcg[f"NDCG@{k}"] = round(ndcg[f"NDCG@{k}"]/len(scores), 5) 225 | _map[f"MAP@{k}"] = round(_map[f"MAP@{k}"]/len(scores), 5) 226 | recall[f"Recall@{k}"] = round(recall[f"Recall@{k}"]/len(scores), 5) 227 | precision[f"P@{k}"] = round(precision[f"P@{k}"]/len(scores), 5) 228 | mrr["MRR"] = round(mrr["MRR"]/len(scores), 5) 229 | 230 | if verbose: 231 | for eval in [ndcg, _map, recall, precision, mrr]: 232 | logger.info("\n") 233 | for k in eval.keys(): 234 | logger.info("{}: {:.4f}".format(k, eval[k])) 235 | 236 | output = {**ndcg, **_map, **recall, **precision, **mrr} 237 | return output 238 | --------------------------------------------------------------------------------