├── .gitattributes ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── github_overview.png ├── requirements.txt ├── setup.py ├── tigerscore ├── __init__.py ├── candidates_generation │ ├── _generate_candidates.sh │ ├── downmodel.py │ ├── engine.py │ ├── eval_candidates.py │ ├── eval_candidates.sh │ ├── finetune_base_model.py │ ├── finetune_base_model.sh │ ├── generate_candidates.py │ ├── generate_candidates.sh │ ├── generate_candidates_by_gpt.py │ ├── generate_candidates_by_gpt.sh │ ├── generate_candidates_series.sh │ ├── generate_ref_by_gpt4.py │ └── model_utils.py ├── common │ ├── InstructScore.py │ ├── README.md │ ├── __init__.py │ ├── bart_score.py │ ├── cor_eval.py │ ├── datasets_config.py │ ├── download.sh │ ├── evaluation.py │ ├── flan_score.py │ ├── prism.py │ ├── requirements.txt │ └── utils.py ├── download_dataset │ ├── bartscore_data_process.py │ ├── datasets_scripts │ │ └── fetaqa.sh │ ├── download_bartscore_data.sh │ ├── download_general_datasets.py │ ├── download_general_datasets.sh │ ├── preprocess_utils_totto.py │ └── utils.py ├── eval_scripts │ ├── bs_analysis.py │ ├── bs_utils.py │ ├── check_data.py │ ├── check_data.sh │ ├── check_responses.py │ ├── check_responses.sh │ ├── eval_baseline.py │ ├── eval_baseline.sh │ ├── generate_distill_data.py │ ├── generate_distill_data.sh │ ├── generate_inst_synthetic_data.py │ ├── generate_inst_synthetic_data.sh │ ├── generate_synthesis_distill_data.py │ ├── generate_synthesis_distill_data.sh │ ├── get_systhesis_ref_data.sh │ ├── lfqa_gpt_rate.py │ ├── lfqa_gpt_rate.sh │ ├── mathqa_rate.py │ ├── test_ref_diff.py │ ├── test_xgptscore.py │ ├── test_xgptscore.sh │ └── utils.py ├── finetune │ ├── ds_llama_config.json │ ├── finetune_llama.sh │ ├── finetune_mistral.sh │ ├── format_data_v2.py │ ├── format_data_v2.sh │ ├── format_distill_data.py │ ├── format_distill_data.sh │ ├── format_synthesis_distill_data.py │ ├── format_synthesis_distill_data.sh │ ├── ft_llama_lora.sh │ ├── test_llama.py │ ├── test_llama.sh │ ├── test_llama_vllm.py │ ├── test_llama_vllm.sh │ ├── test_llama_vllm_distance.py │ ├── test_llama_vllm_vanilla.py │ ├── train.py │ ├── trainer.py │ └── utils.py ├── get_error_types │ ├── error_types │ │ └── error_types.json │ └── get_error_types.py ├── scorer │ ├── __init__.py │ └── tigerscore.py └── xgptscore │ ├── README.md │ ├── constants.py │ ├── example.json │ ├── example_result.json │ ├── mode_configs │ ├── align_score.json │ ├── default.json │ ├── kb_txt.json │ └── wmt_mqm.json │ ├── openai_utils.py │ ├── openai_utils_azure.py │ ├── openai_utils_curl.py │ ├── openai_utils_openAI.py │ ├── process.py │ ├── process_utils.py │ ├── templates.py │ └── xgptscore.py └── tigerscore_example_usage.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | data/evaluation/instruct/mixinstruct/test_data_prepared.json filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are 
written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | tigerscore/xgptscore/cache/ 162 | jobs/ 163 | /tigerscore/common/models 164 | 165 | /TigerScore.zip 166 | /hf_space 167 | !/hf_space/TIGERScore 168 | /hf_evaluate 169 | /raw_datasets 170 | 171 | /data/real-world/summarization/summeval/cnndm 172 | /data/real-world/summarization/summeval/M* 173 | /data/**/train_data.json 174 | /data/**/**/train_data.json 175 | /data/**/**/**/train_data.json 176 | /data/clean_real_world_data 177 | /data/clean_real_world 178 | /test.ipynb 179 | /data/synthesis/synthesis 180 | /data/*.json 181 | /data/*.jsonl 182 | /data/*.ipynb 183 | /tigerscore/xgptscore/cache 184 | /test.sh 185 | /tigerscore/eval_scripts/check_data_private.sh 186 | /tigerscore/finetune/wandb/ 187 | /data/additional 188 | /tigerscore/eval_scripts/eval_inst_baseline.sh 189 | /data/evaluation/translation 190 | /data/evaluation 191 | /data/data_dist 192 | /data/evaluation/pair_cmp 193 | /test* 194 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "hf_space/TIGERScore"] 2 | path = hf_space/TIGERScore 3 | url = https://huggingface.co/spaces/TIGER-Lab/TIGERScore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 TIGER Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /github_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/github_overview.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | torch 4 | accelerate 5 | wget 6 | pycocoevalcap 7 | spacy 8 | evaluate 9 | prettytable 10 | gdcm 11 | pydicom 12 | bitsandbytes 13 | openai 14 | nltk 15 | scipy 16 | json5 17 | peft 18 | fire 19 | gradio 20 | sentencepiece 21 | tiktoken 22 | dacite 23 | wandb 24 | bs4 25 | py7zr 26 | gdown 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | description = """ 4 | TIGERScore, a Trained metric that follows Instruction Guidance to perform Explainable, and Reference-free evaluation over a wide spectrum of text generation tasks. 5 | Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text. 6 | """ 7 | 8 | setup( 9 | name='tigerscore', 10 | version='0.0.1', 11 | description=description, 12 | author='Dongfu Jiang', 13 | author_email='dongfu.jiang@uwaterloo.ca', 14 | packages=find_packages(), 15 | url='https://tiger-ai-lab.github.io/TIGERScore/', 16 | install_requires=[ 17 | 'torch', 18 | 'transformers', 19 | 'datasets', 20 | 'accelerate', 21 | 'gradio', 22 | 'tiktoken', 23 | 'llama-cpp-python', 24 | 'protobuf', 25 | 'sentencepiece', 26 | 'accelerate' 27 | ], 28 | ) 29 | -------------------------------------------------------------------------------- /tigerscore/__init__.py: -------------------------------------------------------------------------------- 1 | from tigerscore.scorer.tigerscore import TIGERScorer -------------------------------------------------------------------------------- /tigerscore/candidates_generation/_generate_candidates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=30:00:00 3 | #SBATCH --job-name=generate_candidates 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --hint=memory_bound 6 | #SBATCH --mem=60G 7 | #SBATCH --gres=gpu:a6000:2 8 | #SBATCH --qos=normal 9 | #SBATCH -n 1 10 | 11 | nvidia-smi 12 | # candidates will be saved in ../../data/${dataset}/candidates/${decoding_method}/${model}.json 13 | dataset=$1 14 | set=$2 15 | model_type=$3 16 | model=$4 17 | output_max_length=$5 18 | no_instruction=$6 19 | input_max_length=$7 20 | decoding_method=$8 21 | image2text=$9 22 | start_idx=${10} 23 | end_idx=${11} 24 | data_dir="../../data" 25 | dtype="float16" 26 | num_candidates=5 27 | num_beams=$num_candidates 28 | num_beam_groups=$num_candidates 29 | overwrite=False 30 | inference_bs=1 31 | 32 | 33 | if [ -z "$start_idx" ] && [ -z "$end_idx" ]; then 34 | echo "start_idx and end_idx are not provided, set to None" 35 | else 36 | echo "start_idx: $start_idx" 37 | echo "end_idx: $end_idx" 38 | fi 39 | if [ -z "$output_max_length" ]; then 40 | output_max_length=300 41 | echo "output_max_length is not provided, set to $output_max_length" 42 | else 43 | echo 
"output_max_length: $output_max_length" 44 | fi 45 | 46 | if [ -z "$input_max_length" ]; then 47 | input_max_length=300 48 | echo "input_max_length is not provided, set to $input_max_length" 49 | else 50 | echo "input_max_length: $input_max_length" 51 | fi 52 | 53 | if [ -z "$image2text" ]; then 54 | image2text=False 55 | echo "image2text is not provided, set to $image2text" 56 | else 57 | echo "image2text: $image2text" 58 | fi 59 | if [ -z "$no_instruction" ]; then 60 | no_instruction=False 61 | echo "no_instruction is not provided, set to $no_instruction" 62 | else 63 | echo "no_instruction: $no_instruction" 64 | fi 65 | if [ -z "$decoding_method" ]; then 66 | decoding_method="top_p_sampling" 67 | echo "decoding_method is not provided, set to $decoding_method" 68 | else 69 | echo "decoding_method: $decoding_method" 70 | fi 71 | python ./generate_candidates.py \ 72 | --model_type $model_type \ 73 | --model $model \ 74 | --data_dir $data_dir \ 75 | --dataset $dataset \ 76 | --set $set \ 77 | --num_return_sequences $num_candidates \ 78 | --decoding_method $decoding_method \ 79 | --inference_bs $inference_bs \ 80 | --prompt_max_length $input_max_length \ 81 | --output_max_length $output_max_length \ 82 | --dtype $dtype \ 83 | --num_beams $num_beams \ 84 | --num_beam_groups $num_beam_groups \ 85 | --no_repeat_ngram_size 3 \ 86 | --start_idx "$start_idx" \ 87 | --end_idx "$end_idx" \ 88 | --overwrite $overwrite \ 89 | --image2text "$image2text" \ 90 | --no_instruction "$no_instruction" \ -------------------------------------------------------------------------------- /tigerscore/candidates_generation/downmodel.py: -------------------------------------------------------------------------------- 1 | # The task in slurm connot support long time download,so just download in shell. 2 | from model_utils import build_model, build_tokenizer 3 | import os 4 | from pathlib import Path 5 | import fire 6 | 7 | 8 | def main( models: str = None, model_type: str = None, cache_dir: str = None): 9 | models = models 10 | model_type = model_type 11 | cache_dir = ( 12 | cache_dir or Path(os.path.abspath(__file__)).parent.parent.parent / "hf_models" 13 | ) 14 | for model in models.split(","): 15 | tokenizer = build_tokenizer( 16 | model, 17 | cache_dir=cache_dir, 18 | resume_download=True, 19 | trust_remote_code=True, 20 | ) 21 | model = build_model( 22 | model_type, 23 | model, 24 | cache_dir=cache_dir, 25 | resume_download=True, 26 | trust_remote_code=True, 27 | ) 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(main) -------------------------------------------------------------------------------- /tigerscore/candidates_generation/engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is taken from This file is modified based on: 3 | https://github.com/Ravoxsg/SummaReranker-ACL-22-/blob/main/src/candidate_generation/engine.py 4 | We thank the authors for sharing their code. 
5 | """ 6 | import gc 7 | import torch 8 | import torch.nn.functional as F 9 | from typing import Dict 10 | 11 | 12 | def beam_search_step(inputs: Dict, tokenizer, base_model, args, **kwargs): 13 | """beam search step 14 | 15 | Args: 16 | inputs (dict): settings for beam search 17 | tokenizer (transformers tokenizer): Tokenizer 18 | base_model (transformers model): Model 19 | args (dict): settings for beam search 20 | 21 | Returns: 22 | dict: generated candidates and their logprobs in batch 23 | """ 24 | kwargs['return_dict_in_generate'] = True 25 | kwargs['output_scores'] = True 26 | # 1 - beam search 27 | if args.decoding_method == "beam_search": 28 | outputs = base_model.generate( 29 | **inputs, 30 | num_beams=args.num_beams, 31 | num_return_sequences=args.num_return_sequences, 32 | max_new_tokens=args.output_max_length, 33 | repetition_penalty=args.repetition_penalty, 34 | length_penalty=args.length_penalty, 35 | no_repeat_ngram_size=args.no_repeat_ngram_size, 36 | use_cache=True, 37 | early_stopping=True, 38 | temperature=args.temperature, 39 | **kwargs 40 | ) 41 | # 2 - diverse beam search 42 | if args.decoding_method == "diverse_beam_search": 43 | outputs = base_model.generate( 44 | **inputs, 45 | num_beams=args.num_beams, 46 | num_beam_groups=args.num_beam_groups, 47 | num_return_sequences=args.num_return_sequences, 48 | max_new_tokens=args.output_max_length, 49 | diversity_penalty=args.diversity_penalty, 50 | repetition_penalty=args.repetition_penalty, 51 | length_penalty=args.length_penalty, 52 | no_repeat_ngram_size=args.no_repeat_ngram_size, 53 | use_cache=True, 54 | early_stopping=True, 55 | temperature=args.temperature, 56 | **kwargs 57 | ) 58 | # 3 - top-p sampling 59 | if args.decoding_method == "top_p_sampling": 60 | outputs = base_model.generate( 61 | **inputs, 62 | num_beams=1, 63 | do_sample=True, 64 | top_p=args.top_p, 65 | num_return_sequences=args.num_return_sequences, 66 | max_new_tokens=args.output_max_length, 67 | repetition_penalty=args.repetition_penalty, 68 | length_penalty=args.length_penalty, 69 | no_repeat_ngram_size=args.no_repeat_ngram_size, 70 | use_cache=True, 71 | early_stopping=True, 72 | temperature=args.temperature, 73 | **kwargs 74 | ) 75 | # 4 - top-k sampling 76 | if args.decoding_method == "top_k_sampling": 77 | outputs = base_model.generate( 78 | **inputs, 79 | num_beams=1, 80 | do_sample=True, 81 | top_k=args.top_k, 82 | num_return_sequences=args.num_return_sequences, 83 | max_new_tokens=args.output_max_length, 84 | repetition_penalty=args.repetition_penalty, 85 | length_penalty=args.length_penalty, 86 | no_repeat_ngram_size=args.no_repeat_ngram_size, 87 | use_cache=True, 88 | early_stopping=True, 89 | temperature=args.temperature, 90 | **kwargs 91 | ) 92 | # for top-p and top-k sampling, some scores will be masked as -inf. These scores are not processed by softmax and logrithm. 
93 | masked_logits = torch.stack(outputs.scores, dim=0) 94 | masked_logits = F.log_softmax(masked_logits, dim=1) 95 | summary_ids = outputs.sequences 96 | logprobs = [] 97 | # Different process for decoder-only models and encoder-decoder models 98 | if "input_ids" in inputs and \ 99 | summary_ids.shape[1] == inputs['input_ids'].shape[1] + masked_logits.shape[0]: 100 | # for decoder-only models 101 | # remove input_ids 102 | summary_ids = summary_ids[:, inputs['input_ids'].shape[1]:] 103 | for i in range(summary_ids.shape[0]): 104 | logprobs.append([]) 105 | for j in range(summary_ids.shape[1]): # token_idx 106 | if summary_ids[i][j] == tokenizer.eos_token_id: 107 | break 108 | logprobs[i].append( 109 | masked_logits[j, i, summary_ids[i][j]].item()) 110 | else: 111 | # for encoder-decoder models 112 | for i in range(summary_ids.shape[0]): 113 | logprobs.append([]) 114 | # shift of decoder because of the additional bos_token 115 | for j in range(summary_ids.shape[1] - 1): # token_idx 116 | if summary_ids[i][j + 1] == tokenizer.eos_token_id: 117 | break 118 | logprobs[i].append( 119 | masked_logits[j, i, summary_ids[i][j + 1]].item()) 120 | 121 | logprobs = [sum(_probs) for _probs in logprobs] 122 | generated = tokenizer.batch_decode( 123 | summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) 124 | del summary_ids 125 | gc.collect() 126 | 127 | batch_generated = [] 128 | batch_logprobs = [] 129 | bz = list(inputs.values())[0].shape[0] 130 | for i in range(bz): 131 | batch_generated.append( 132 | generated[i * args.num_return_sequences:(i + 1) * args.num_return_sequences]) 133 | batch_logprobs.append( 134 | logprobs[i * args.num_return_sequences:(i + 1) * args.num_return_sequences]) 135 | return { 136 | "generated": batch_generated, 137 | "logprobs": batch_logprobs 138 | } 139 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/eval_candidates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=24:00:00 3 | #SBATCH --job-name=eval_candidates 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --gres=gpu:2080:1 6 | #SBATCH --nodes=1 7 | #SBATCH -n 2 8 | 9 | data_dir="../../data" 10 | # dataset="samsum,xsum,newsroom" # summarization 11 | # dataset="wmt16/cs-en,wmt16/de-en,wmt16/tr-en,wmt17/fi-en,wmt18/zh-en" # translation 12 | # dataset="totto,kasnerz/wikitabletext" # data2text 13 | dataset="din0s/asqa,DongfuTingle/FeTaQA,cosmos_qa,eli5" # long-form QA 14 | # dataset="databricks/databricks-dolly-15k" 15 | # dataset="gsm8k:main,math_qa" 16 | 17 | # dataset="common_gen,vicgalle/alpaca-gpt4,xnli/en,knkarthick/dialogsum" 18 | set="test" 19 | num_workers=1 20 | metrics="bleu,rouge,bart_score,bart_score_cnn" 21 | overwrite="True" 22 | echo "dataset: $dataset" 23 | echo "set: $set" 24 | python eval_candidates.py \ 25 | --data_dir $data_dir \ 26 | --dataset $dataset \ 27 | --set $set \ 28 | --num_workers $num_workers \ 29 | --metrics $metrics \ 30 | --overwrite $overwrite -------------------------------------------------------------------------------- /tigerscore/candidates_generation/finetune_base_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is to finetune basic models for candidates generation. 3 | Code based on Huggingface Turorial. 
4 | """ 5 | from common.evaluation import overall_eval 6 | from model_utils import ( 7 | build_model, 8 | build_tokenizer, 9 | ) 10 | from typing import Optional, Sequence, Dict, List 11 | from generate_candidates import get_model_size, get_torch_dtype 12 | from dataclasses import dataclass, field 13 | from transformers import ( 14 | TrainingArguments, 15 | Seq2SeqTrainer, 16 | Seq2SeqTrainingArguments 17 | ) 18 | import numpy as np 19 | import logging 20 | import transformers 21 | import torch 22 | import json 23 | import os 24 | import sys 25 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 26 | sys.path.append("..") 27 | IGNORE_INDEX = -100 28 | 29 | 30 | @dataclass 31 | class ModelArguments: 32 | model_type: str 33 | model_name_or_path: str 34 | dtype: str = "float32" 35 | cache_dir: Optional[str] = None 36 | 37 | 38 | @dataclass 39 | class DataArguments: 40 | data_dir: str 41 | train_file: str 42 | eval_file: str = None 43 | eval_metrics: List[str] = field(default_factory=lambda: ["bleu", "rouge"]) 44 | input_max_length: int = 512 45 | output_max_length: int = 128 46 | with_instruction: bool = False 47 | 48 | 49 | def load_dataset(data_args): 50 | with open(data_args.train_file, 'r') as f: 51 | train_data = json.load(f) 52 | if data_args.eval_file: 53 | with open(data_args.eval_file, 'r') as f: 54 | eval_data = json.load(f) 55 | else: 56 | eval_data = None 57 | 58 | return train_data, eval_data 59 | 60 | 61 | class SupervisedDataset(torch.utils.data.Dataset): 62 | def __init__(self, encodings): 63 | self.encodings = encodings 64 | 65 | def __getitem__(self, idx): 66 | return {key: val[idx] for key, val in self.encodings.items()} 67 | 68 | def __len__(self): 69 | return len(self.encodings["input_ids"]) 70 | 71 | 72 | def preprocess_function(examples, tokenizer, data_args): 73 | if data_args.with_instruction: 74 | inputs = [x["instruction"] + "\n" + x["input"] for x in examples] 75 | else: 76 | inputs = [x["input"] for x in examples] 77 | inputs = [x.strip(' \n') for x in inputs] 78 | outputs = [x["output"] for x in examples] 79 | 80 | logging.warning("# of examples: {}".format(len(inputs))) 81 | logging.warning("Example of inputs:") 82 | print(inputs[0]) 83 | logging.warning("Example of outputs:") 84 | print(outputs[0]) 85 | 86 | model_inputs = tokenizer( 87 | inputs, max_length=data_args.input_max_length, truncation=True) 88 | # Setup the tokenizer for targets 89 | with tokenizer.as_target_tokenizer(): 90 | labels = tokenizer( 91 | outputs, max_length=data_args.output_max_length, truncation=True) 92 | 93 | logging.warning("Example of model inputs:") 94 | print("input_ids", model_inputs['input_ids'][0]) 95 | print("attention_mask", model_inputs['attention_mask'][0]) 96 | logging.warning("Example of labels:") 97 | print(labels['input_ids'][0]) 98 | labels["input_ids"] = [ 99 | [(_l if _l != tokenizer.pad_token_id else IGNORE_INDEX) for _l in label] for label in labels["input_ids"] 100 | ] 101 | model_inputs["labels"] = labels["input_ids"] 102 | return SupervisedDataset(model_inputs) 103 | 104 | 105 | @dataclass 106 | class DataCollatorForSupervisedDataset(object): 107 | """Collate examples for supervised fine-tuning.""" 108 | 109 | tokenizer: transformers.PreTrainedTokenizer 110 | 111 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 112 | input_ids, labels = tuple([torch.tensor( 113 | instance[key]) for instance in instances] for key in ("input_ids", "labels")) 114 | input_ids = torch.nn.utils.rnn.pad_sequence( 115 | input_ids, batch_first=True, 
padding_value=self.tokenizer.pad_token_id 116 | ) 117 | labels = torch.nn.utils.rnn.pad_sequence( 118 | labels, batch_first=True, padding_value=IGNORE_INDEX) 119 | # print(self.tokenizer.batch_decode(input_ids)) 120 | # print(self.tokenizer.batch_decode(labels.masked_fill(labels == IGNORE_INDEX, self.tokenizer.pad_token_id))) 121 | # print("##" * 30) 122 | return dict( 123 | input_ids=input_ids, 124 | labels=labels, 125 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id), 126 | ) 127 | 128 | 129 | def main( 130 | model_args: ModelArguments, 131 | data_args: DataArguments, 132 | training_args: TrainingArguments, 133 | ): 134 | 135 | model = build_model( 136 | model_args.model_type, 137 | model_args.model_name_or_path, 138 | torch_dtype=get_torch_dtype(model_args.dtype), 139 | device_map="auto", 140 | cache_dir=model_args.cache_dir, resume_download=True) 141 | n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 142 | logging.warning("The {} has {} trainable parameters".format( 143 | model_args.model_name_or_path, get_model_size(n_params))) 144 | tokenizer = build_tokenizer( 145 | model_args.model_name_or_path, 146 | cache_dir=model_args.cache_dir, resume_download=True) 147 | logging.warning("Loading dataset...") 148 | 149 | train_data, eval_data = load_dataset(data_args) 150 | logging.warning("Dataset loaded.") 151 | logging.warning("Preprocessing dataset...") 152 | train_dataset = preprocess_function(train_data, tokenizer, data_args) 153 | eval_dataset = preprocess_function(eval_data, tokenizer, data_args) 154 | logging.warning("Dataset preprocessed.") 155 | logging.warning("Loading data collator...") 156 | data_collator = DataCollatorForSupervisedDataset(tokenizer) 157 | logging.warning("Data collator loaded.") 158 | logging.warning("Loading trainer...") 159 | 160 | def compute_metrics(eval_pred): 161 | 162 | logits, labels = eval_pred 163 | labels[labels == IGNORE_INDEX] = tokenizer.pad_token_id 164 | logits[logits == IGNORE_INDEX] = tokenizer.pad_token_id 165 | predictions = tokenizer.batch_decode(logits, skip_special_tokens=True) 166 | labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 167 | logging.warning("Example of predictions:") 168 | print(predictions[:3]) 169 | logging.warning("Example of labels:") 170 | print(labels[:3]) 171 | scores = overall_eval(predictions, labels, 172 | metrics=data_args.eval_metrics) 173 | return { 174 | key: np.mean(value) for key, value in scores.items() 175 | } 176 | 177 | training_args.evaluation_strategy = "epoch" 178 | training_args.weight_decay = 0.01 179 | training_args.save_total_limit = 5 180 | training_args.predict_with_generate = True 181 | training_args.generation_num_beams = 4 182 | training_args.generation_max_length = data_args.output_max_length 183 | training_args.load_best_model_at_end = True 184 | logging.warning("Training arguments:") 185 | print(training_args) 186 | trainer = Seq2SeqTrainer( 187 | model=model, 188 | args=training_args, 189 | tokenizer=tokenizer, 190 | train_dataset=train_dataset, 191 | eval_dataset=eval_dataset, 192 | data_collator=data_collator, 193 | compute_metrics=compute_metrics, 194 | ) 195 | logging.warning("Trainer loaded.") 196 | logging.warning("Training...") 197 | trainer.train() 198 | logging.warning("Training finished.") 199 | logging.warning("Saving model...") 200 | trainer.save_model(output_dir=os.path.join( 201 | training_args.output_dir, "checkpoint-best")) 202 | logging.warning("Model saved.") 203 | 204 | 205 | if __name__ == "__main__": 206 | parser = 
transformers.HfArgumentParser( 207 | (ModelArguments, DataArguments, Seq2SeqTrainingArguments)) 208 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 209 | main(model_args, data_args, training_args) 210 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/finetune_base_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=24:00:00 3 | #SBATCH --job-name=finetune 4 | #SBATCH --output ../../jobs/finetune_base_models/%j.out 5 | #SBATCH --gres=gpu:2080:1 6 | #SBATCH --nodes=1 7 | #SBATCH -n 1 8 | 9 | model_type="t5" 10 | model_name_or_path="google/flan-t5-large" 11 | data_dir="../../data" 12 | dataset="cosmos_qa" 13 | train_file="${data_dir}/${dataset}/finetune_data.json" 14 | eval_file="${data_dir}/${dataset}/validation_data.json" 15 | with_instruction=True 16 | run_name="ft_${dataset}" 17 | learning_rate=1e-4 18 | num_train_epochs=10 19 | per_device_train_batch_size=2 20 | per_device_eval_batch_size=8 21 | gradient_accumulation_steps=16 22 | max_grad_norm=1 23 | input_max_length=512 24 | output_max_length=256 25 | optim="adafactor" 26 | lr_scheduler_type="linear" 27 | warmup_ratio=0.1 28 | fp16=False 29 | output_dir="../../finetuned_models/${model_name_or_path}/${run_name}" 30 | cache_dir="../../hf_models" 31 | localhost=$RANDOM # random port number 32 | n_gpu=1 33 | torchrun \ 34 | --rdzv_backend=c10d \ 35 | --rdzv_endpoint="localhost:${localhost}" \ 36 | --nnodes 1 \ 37 | --nproc_per_node ${n_gpu} \ 38 | finetune_base_model.py \ 39 | --model_type $model_type \ 40 | --model_name_or_path $model_name_or_path \ 41 | --data_dir $data_dir \ 42 | --train_file $train_file \ 43 | --eval_file $eval_file \ 44 | --with_instruction $with_instruction \ 45 | --run_name $run_name \ 46 | --learning_rate $learning_rate \ 47 | --optim $optim \ 48 | --fp16 $fp16 \ 49 | --lr_scheduler_type $lr_scheduler_type \ 50 | --num_train_epochs $num_train_epochs \ 51 | --per_device_train_batch_size $per_device_train_batch_size \ 52 | --per_device_eval_batch_size $per_device_eval_batch_size \ 53 | --gradient_accumulation_steps $gradient_accumulation_steps \ 54 | --max_grad_norm $max_grad_norm \ 55 | --input_max_length $input_max_length \ 56 | --output_max_length $output_max_length \ 57 | --output_dir $output_dir \ 58 | --cache_dir $cache_dir \ 59 | --report_to "wandb" \ 60 | --logging_steps 2 \ 61 | 62 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_candidates_by_gpt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | Gererate candidates by GPT-3.5 or GPT-4. 
4 | """ 5 | from xgptscore.process_utils import XPGTItem 6 | from xgptscore.xgptscore import xgptscore 7 | import json 8 | import random 9 | import logging 10 | import sys 11 | import fire 12 | from pathlib import Path 13 | sys.path.append(str(Path(__file__).parent.parent)) 14 | logging.basicConfig(level=logging.WARNING) 15 | 16 | 17 | def main( 18 | task: str, 19 | data_path: str, 20 | dataset: str, 21 | output_file: str = None, 22 | xgptscore_mode: str = "instruction", 23 | model_name: str = "ChatGPT", 24 | overwrite: bool = False, 25 | max_size: int = None, 26 | seed: int = 42, 27 | shuffle_file: bool = False, 28 | source_max_length: int = None, 29 | ref_max_length: int = None, 30 | hypo_max_length: int = None, 31 | dataset_split: str = "test", 32 | ): 33 | """Generate candidates by GPT-3.5 or GPT-4. 34 | 35 | Args: 36 | task (str): Task name. 37 | data_path (str): Path to the data. 38 | dataset (str): Dataset name. 39 | output_file (str, optional): Defaults to None. 40 | xgptscore_mode (str, optional): Defaults to "instruction". 41 | model_name (str, optional): Defaults to "ChatGPT". 42 | overwrite (bool, optional): Defaults to False. 43 | max_size (int, optional): Defaults to None. 44 | seed (int, optional): Defaults to 42. 45 | shuffle_file (bool, optional): Defaults to False. 46 | source_max_length (int, optional): Defaults to None. 47 | ref_max_length (int, optional): Defaults to None. 48 | hypo_max_length (int, optional): Defaults to None. 49 | dataset_split (str, optional): Defaults to "test". 50 | """ 51 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4))) 52 | # load data 53 | data_path = Path(data_path) 54 | input_file = data_path / dataset / (dataset_split + "_data.json") 55 | 56 | input_file = Path(input_file) 57 | if not output_file: 58 | output_file = data_path / dataset / "candidates" / \ 59 | dataset_split / "top_p_sampling" / f"{model_name}.json" 60 | if not output_file.parent.parent.exists(): 61 | output_file.parent.parent.mkdir(parents=True) 62 | if not output_file.parent.exists(): 63 | output_file.parent.mkdir() 64 | else: 65 | output_file = Path(output_file) 66 | with open(input_file, "r") as f: 67 | items = json.load(f) 68 | logging.warning("Loaded {} items from {}".format( 69 | len(items), input_file)) 70 | logging.warning("Preparing writing to {}...".format(output_file)) 71 | 72 | random.seed(seed) 73 | logging.warning("Set seed to {}".format(seed)) 74 | if shuffle_file: 75 | random.shuffle(items) 76 | logging.warning("Shuffled {} items".format(len(items))) 77 | if isinstance(max_size, int) and max_size > 0: 78 | items = items[:max_size] 79 | logging.warning("Truncated to {} items".format(len(items))) 80 | 81 | xgptitems = [] 82 | for item in items: 83 | xgptitems.append(XPGTItem( 84 | task=task, 85 | instruction=item['instruction'], 86 | input=item['input'], 87 | ref_output=item['output'] if "output" in item else item['refs'], 88 | hypo_output=None, 89 | )) 90 | if "candidates" in item: 91 | del item["candidates"] 92 | 93 | if not output_file.exists() or overwrite: 94 | logging.warning("Running xgptscore") 95 | # run xgptscore 96 | xgptscore_params = { 97 | "max_lengths": { 98 | "input": source_max_length, 99 | "hypo_output": hypo_max_length, 100 | "ref_output": ref_max_length, 101 | }, 102 | } 103 | result = xgptscore(xgptitems, mode=xgptscore_mode, 104 | model_name=model_name, **xgptscore_params) 105 | for i, item in enumerate(items): 106 | item['responses'] = result['round_completions'][i] 107 | item['messages_records'] =
result['messages_records'][i] 108 | item['candidates'] = [ 109 | {"text": result['round_completions'][i][0], 110 | "scores": {} 111 | }] 112 | # print(items) 113 | with open(output_file, "w") as f: 114 | json.dump(items, f, indent=4, ensure_ascii=False) 115 | logging.warning("Saved to {}".format(output_file)) 116 | else: 117 | logging.warning("Loading from {}".format(output_file)) 118 | with open(output_file, "r") as f: 119 | items = json.load(f) 120 | 121 | 122 | if __name__ == "__main__": 123 | fire.Fire(main) 124 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_candidates_by_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=generate_candidates_by_gpt 3 | #SBATCH --time=24:00:00 4 | #SBATCH --output=../../jobs/%j.out 5 | 6 | 7 | # datasets=("GAIR/lima" "tatsu-lab/alpaca_farm:alpaca_instructions" "HuggingFaceH4/oasst1_en" "JosephusCheung/GuanacoDataset" "databricks/databricks-dolly-15k") 8 | dataset=$1 9 | task=$2 10 | data_path="" 11 | python generate_candidates_by_gpt.py \ 12 | --task $task \ 13 | --data_path $data_path \ 14 | --dataset $dataset \ 15 | --source_max_length 512 \ 16 | --overwrite "False" -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_candidates_series.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=12:00:00 3 | #SBATCH --job-name=generate_candidates 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --qos=normal 7 | #SBATCH -n 1 8 | 9 | # This script is used to generate candidates via GPT-3.5 and local models. 10 | 11 | CMD="sbatch" 12 | 13 | # models=("google/flan-t5-small" "google/flan-t5-base" "google/flan-t5-large" "google/flan-t5-xl" "google/flan-t5-xxl") 14 | # models=("lmsys/vicuna-33b-v1.3" "lmsys/vicuna-13b-v1.3" "lmsys/vicuna-7b-v1.3") # vicuna 15 | models=("lmsys/vicuna-33b-v1.3") # vicuna-33b-v1.3 need two gpus 16 | # models=("lmsys/vicuna-13b-v1.3" "lmsys/vicuna-7b-v1.3") # vicuna 17 | # model_type="t5" 18 | model_type="llama" 19 | dataset="din0s/asqa" 20 | dataset="DongfuTingle/FeTaQA" 21 | # dataset="cosmos_qa" 22 | # dataset="eli5" 23 | set="test" 24 | output_max_length=512 25 | for model in "${models[@]}"; do 26 | ${CMD} _generate_candidates.sh "$dataset" "$set" "$model_type" "$model" "$output_max_length" 27 | done 28 | # data_path="" 29 | # python generate_candidates_by_gpt.py \ 30 | # --task "long-form QA" \ 31 | # --data_path $data_path \ 32 | # --dataset $dataset \ -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_ref_by_gpt4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | Gererate candidates by GPT-3.5 or GPT-4. 
4 | """ 5 | 6 | import json 7 | import random 8 | import logging 9 | import sys 10 | import fire 11 | from pathlib import Path 12 | sys.path.append(str(Path(__file__).parent.parent)) 13 | from xgptscore.process_utils import XPGTItem 14 | from xgptscore.xgptscore import xgptscore 15 | logging.basicConfig(level=logging.WARNING) 16 | 17 | 18 | def main( 19 | task: str, 20 | data_path: str, 21 | xgptscore_mode: str = "instruction", 22 | model_name: str = "gpt-4", 23 | overwrite: bool = False, 24 | max_size: int = None, 25 | seed: int = 42, 26 | shuffle_file: bool = False, 27 | source_max_length: int = None, 28 | ref_max_length: int = None, 29 | hypo_max_length: int = None, 30 | dataset_split: str = "test", 31 | ): 32 | """Generate reference outputs by GPT-3.5 or GPT-4. 33 | 34 | Args: 35 | task (str): Task name. 36 | data_path (str): Path to the data. 37 | dataset (str): Dataset name. 38 | output_file (str, optional): Defaults to None. 39 | xgptscore_mode (str, optional): Defaults to "instruction". 40 | model_name (str, optional): Defaults to "gpt-4". 41 | overwrite (bool, optional): Defaults to False. 42 | max_size (int, optional): Defaults to None. 43 | seed (int, optional): Defaults to 42. 44 | shuffle_file (bool, optional): Defaults to False. 45 | source_max_length (int, optional): Defaults to None. 46 | ref_max_length (int, optional): Defaults to None. 47 | hypo_max_length (int, optional): Defaults to None. 48 | dataset_split (str, optional): Defaults to "test". 49 | """ 50 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4))) 51 | # load data 52 | data_path = Path(data_path) 53 | input_file = data_path 54 | 55 | input_file = Path(input_file) 56 | output_file = input_file 57 | with open(input_file, "r") as f: 58 | items = json.load(f) 59 | logging.warning("Loaded {} items from {}".format( 60 | len(items), input_file)) 61 | logging.warning("Preparing writing to {}...".format(output_file)) 62 | 63 | random.seed(seed) 64 | logging.warning("Set seed to {}".format(seed)) 65 | if shuffle_file: 66 | random.shuffle(items) 67 | logging.warning("Shuffled {} items".format(len(items))) 68 | if isinstance(max_size, int) and max_size > 0: 69 | items = items[:max_size] 70 | logging.warning("Truncated to {} items".format(len(items))) 71 | 72 | xgptitems = [] 73 | for item in items: 74 | xgptitems.append(XPGTItem( 75 | task=task, 76 | instruction=item['instruction'], 77 | input=item['input'], 78 | ref_output=item['output'] if "output" in item else item['refs'], 79 | hypo_output=None, 80 | )) 81 | 82 | if not output_file.exists() or overwrite: 83 | logging.warning("Running xgptscore") 84 | # run xgptscore 85 | xgptscore_params = { 86 | "max_lengths": { 87 | "input": source_max_length, 88 | "hypo_output": hypo_max_length, 89 | "ref_output": ref_max_length, 90 | }, 91 | } 92 | result = xgptscore(xgptitems, mode=xgptscore_mode, 93 | model_name=model_name, num_workers=5, **xgptscore_params) 94 | for i, item in enumerate(items): 95 | item['responses'] = result['round_completions'][i] 96 | item['messages_records'] = result['messages_records'][i] 97 | if item["output"] is not None: 98 | item["output"] = result['round_completions'][i][0] 99 | # print(items) 100 | with open(output_file, "w") as f: 101 | json.dump(items, f, indent=4, ensure_ascii=False) 102 | logging.warning("Saved to {}".format(output_file)) 103 | else: 104 | logging.warning("Loading from {}".format(output_file)) 105 | with open(output_file, "r") as f: 106 | items = json.load(f) 107 | 108 | 109 | if __name__ == "__main__": 110 |
fire.Fire(main) 111 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/model_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | AutoTokenizer, 3 | AutoModelForSeq2SeqLM, 4 | AutoModelForCausalLM, 5 | AutoModel, 6 | VisionEncoderDecoderModel, 7 | ViTImageProcessor, 8 | ) 9 | decoder_only_models = ["alpaca", "llama", "opt", "bloom", 10 | "gpt", "vicuna", "koala", "Wizard", "stablelm"] 11 | 12 | 13 | def build_model(model_type, model_name, **kwargs): 14 | """ 15 | Build the model from the model name 16 | """ 17 | if any([x in model_type for x in decoder_only_models]) or any([x in model_name for x in decoder_only_models]): 18 | model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) 19 | elif model_type in ["vit"]: 20 | model = VisionEncoderDecoderModel.from_pretrained(model_name) 21 | elif model_type in ["bart", "t5", "mbart", "m2m100", "nllb", "opus_mt", "unifiedqa", "opus-mt", "pegasus"]: 22 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **kwargs) 23 | else: 24 | model = AutoModel.from_pretrained(model_name, **kwargs) 25 | 26 | return model 27 | 28 | 29 | def build_tokenizer(model_name, **kwargs): 30 | """ 31 | Build the tokenizer from the model name 32 | """ 33 | 34 | if "vicuna" in model_name: 35 | tokenizer = AutoTokenizer.from_pretrained( 36 | model_name, padding_side="left", use_fast=False, **kwargs) 37 | # elif "Wizard" in model_name: 38 | # tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", return_token_type_ids=False, **kwargs) 39 | elif any([x in model_name for x in decoder_only_models]): 40 | # padding left 41 | tokenizer = AutoTokenizer.from_pretrained( 42 | model_name, padding_side="left", **kwargs) 43 | else: 44 | tokenizer = AutoTokenizer.from_pretrained( 45 | model_name, **kwargs) # , use_fast=False) 46 | if tokenizer.pad_token is None: 47 | tokenizer.pad_token = tokenizer.eos_token 48 | tokenizer.pad_token_id = tokenizer.eos_token_id 49 | return tokenizer 50 | 51 | 52 | def build_processor(model_type, model_name, **kwargs): 53 | """ 54 | Build the processor from the model name 55 | """ 56 | if model_type in ["vit"]: 57 | processor = ViTImageProcessor.from_pretrained(model_name, **kwargs) 58 | else: 59 | raise NotImplementedError 60 | return processor 61 | -------------------------------------------------------------------------------- /tigerscore/common/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | To reproduce our experimental results, first create a `tigerscore_baseline` environment: 3 | ```bash 4 | conda create -n tigerscore_baseline python=3.9 5 | conda activate tigerscore_baseline 6 | pip install -r requirements.txt 7 | pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp39-cp39-manylinux1_x86_64.whl 8 | pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118 9 | ``` 10 | -------------------------------------------------------------------------------- /tigerscore/common/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | cur_folder = os.path.dirname(os.path.abspath(__file__)) 4 | if cur_folder not in sys.path: 5 | sys.path.append(cur_folder) 6 | -------------------------------------------------------------------------------- /tigerscore/common/bart_score.py:
-------------------------------------------------------------------------------- 1 | # %% 2 | """ 3 | From https://github.com/neulab/BARTScore 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import traceback 8 | from transformers import BartTokenizer, BartForConditionalGeneration 9 | 10 | 11 | class BARTScorer: 12 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'): 13 | # Set up model 14 | self.device = device 15 | self.max_length = max_length 16 | self.tokenizer = BartTokenizer.from_pretrained(checkpoint) 17 | self.model = BartForConditionalGeneration.from_pretrained(checkpoint) 18 | self.model.eval() 19 | self.model.to(device) 20 | 21 | # Set up loss 22 | self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id) 23 | self.lsm = nn.LogSoftmax(dim=1) 24 | 25 | def load(self, path='./models/bart.pth'): 26 | """ Load model from paraphrase finetuning """ 27 | self.model.load_state_dict(torch.load(path, map_location=self.device)) 28 | 29 | def score(self, srcs, tgts, batch_size): 30 | """ Score a batch of examples """ 31 | score_list = [] 32 | for i in range(0, len(srcs), batch_size): 33 | src_list = srcs[i: i + batch_size] 34 | tgt_list = tgts[i: i + batch_size] 35 | try: 36 | with torch.no_grad(): 37 | encoded_src = self.tokenizer( 38 | src_list, 39 | max_length=self.max_length, 40 | truncation=True, 41 | padding=True, 42 | return_tensors='pt' 43 | ) 44 | encoded_tgt = self.tokenizer( 45 | tgt_list, 46 | max_length=self.max_length, 47 | truncation=True, 48 | padding=True, 49 | return_tensors='pt' 50 | ) 51 | src_tokens = encoded_src['input_ids'].to(self.device) 52 | src_mask = encoded_src['attention_mask'].to(self.device) 53 | 54 | tgt_tokens = encoded_tgt['input_ids'].to(self.device) 55 | tgt_mask = encoded_tgt['attention_mask'] 56 | tgt_len = tgt_mask.sum(dim=1).to(self.device) 57 | 58 | output = self.model( 59 | input_ids=src_tokens, 60 | attention_mask=src_mask, 61 | labels=tgt_tokens 62 | ) 63 | logits = output.logits.view(-1, self.model.config.vocab_size) 64 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1)) 65 | loss = loss.view(tgt_tokens.shape[0], -1) 66 | loss = loss.sum(dim=1) / tgt_len 67 | curr_score_list = [-x.item() for x in loss] 68 | score_list += curr_score_list 69 | 70 | except RuntimeError: 71 | traceback.print_exc() 72 | print(f'source: {src_list}') 73 | print(f'target: {tgt_list}') 74 | exit(0) 75 | return score_list 76 | -------------------------------------------------------------------------------- /tigerscore/common/cor_eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | 4 | 5 | def cor_pearson(hypo_scores, ref_scores): 6 | """ 7 | Args: 8 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 9 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 10 | returns: 11 | cor: float, the mean correlation coefficient 12 | """ 13 | if isinstance(hypo_scores, list): 14 | hypo_scores = np.array(hypo_scores) 15 | if isinstance(ref_scores, list): 16 | ref_scores = np.array(ref_scores) 17 | assert hypo_scores.shape == ref_scores.shape 18 | bz, c = hypo_scores.shape 19 | hypo_scores = hypo_scores.reshape(bz, c).T 20 | ref_scores = ref_scores.reshape(bz, c).T 21 | cor = 0 22 | for i in range(c): 23 | cor += np.corrcoef(hypo_scores[i], ref_scores[i])[0, 1] 24 | cor /= c 25 | return cor 26 | 27 | 28 | def 
cor_spearman(hypo_scores, ref_scores): 29 | """ 30 | Args: 31 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 32 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 33 | returns: 34 | cor: float, the Spearman correlation between hypo and ref scores, averaged over the c candidates 35 | """ 36 | if isinstance(hypo_scores, list): 37 | hypo_scores = np.array(hypo_scores) 38 | if isinstance(ref_scores, list): 39 | ref_scores = np.array(ref_scores) 40 | assert hypo_scores.shape == ref_scores.shape 41 | bz, c = hypo_scores.shape 42 | hypo_scores = hypo_scores.reshape(bz, c).T 43 | ref_scores = ref_scores.reshape(bz, c).T 44 | cor = 0 45 | for i in range(c): 46 | cor += scipy.stats.spearmanr(hypo_scores[i], ref_scores[i]).correlation 47 | cor /= c 48 | return cor 49 | 50 | 51 | def cor_spearman_footrule(hypo_scores, ref_scores): 52 | """ 53 | Args: 54 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 55 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 56 | returns: 57 | cor: float, the Spearman footrule distance: absolute score differences summed over candidates, averaged over samples 58 | """ 59 | if isinstance(hypo_scores, list): 60 | hypo_scores = np.array(hypo_scores) 61 | if isinstance(ref_scores, list): 62 | ref_scores = np.array(ref_scores) 63 | assert hypo_scores.shape == ref_scores.shape 64 | bz, c = hypo_scores.shape 65 | hypo_scores = hypo_scores.reshape(bz, c) 66 | ref_scores = ref_scores.reshape(bz, c) 67 | return np.abs(hypo_scores - ref_scores).sum(axis=-1).mean() 68 | -------------------------------------------------------------------------------- /tigerscore/common/download.sh: -------------------------------------------------------------------------------- 1 | # Download BLEURT 2 | wget https://storage.googleapis.com/bleurt-oss/bleurt-large-512.zip .
3 | unzip bleurt-large-512.zip 4 | mv bleurt-large-512 models/ 5 | rm bleurt-large-512.zip 6 | 7 | # Download PRISM 8 | wget http://data.statmt.org/prism/m39v1.tar 9 | tar xf m39v1.tar 10 | mv m39v1 models/ 11 | rm m39v1.tar -------------------------------------------------------------------------------- /tigerscore/common/flan_score.py: -------------------------------------------------------------------------------- 1 | # %% 2 | """ 3 | From https://github.com/xu1998hz/SEScore3 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import traceback 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | 10 | 11 | class FLANScorer: 12 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='google/flan-t5-base'): 13 | # Set up model 14 | self.device = device 15 | self.max_length = max_length 16 | self.tokenizer = AutoTokenizer.from_pretrained(checkpoint) 17 | self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) 18 | self.model.eval() 19 | self.model.to(device) 20 | # Set up loss 21 | self.loss_fct = nn.NLLLoss( 22 | reduction='none', ignore_index=self.model.config.pad_token_id) 23 | self.lsm = nn.LogSoftmax(dim=1) 24 | 25 | def load(self): 26 | """ Load model from paraphrase finetuning """ 27 | self.model.load_state_dict(torch.load( 28 | 'models/bart.pth', map_location=self.device)) 29 | 30 | def score(self, srcs, tgts, batch_size): 31 | """ Score a batch of examples """ 32 | score_list = [] 33 | for i in range(0, len(srcs), batch_size): 34 | src_list = srcs[i: i + batch_size] 35 | tgt_list = tgts[i: i + batch_size] 36 | if i < 1: 37 | pass 38 | # print('src_list: ',src_list) 39 | # print('tgt_list: ', tgt_list) 40 | try: 41 | with torch.no_grad(): 42 | encoded_src = self.tokenizer( 43 | src_list, 44 | max_length=self.max_length, 45 | truncation=True, 46 | padding=True, 47 | return_tensors='pt' 48 | ) 49 | encoded_tgt = self.tokenizer( 50 | tgt_list, 51 | max_length=self.max_length, 52 | truncation=True, 53 | padding=True, 54 | return_tensors='pt' 55 | ) 56 | src_tokens = encoded_src['input_ids'].to(self.device) 57 | src_mask = encoded_src['attention_mask'].to(self.device) 58 | tgt_tokens = encoded_tgt['input_ids'].to(self.device) 59 | tgt_mask = encoded_tgt['attention_mask'] 60 | tgt_len = tgt_mask.sum(dim=1).to(self.device) 61 | 62 | output = self.model( 63 | input_ids=src_tokens, 64 | attention_mask=src_mask, 65 | labels=tgt_tokens 66 | ) 67 | logits = output.logits.view(-1, 68 | self.model.config.vocab_size) 69 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1)) 70 | loss = loss.view(tgt_tokens.shape[0], -1) 71 | loss = loss.sum(dim=1) / tgt_len 72 | curr_score_list = [-x.item() for x in loss] 73 | score_list += curr_score_list 74 | 75 | except RuntimeError: 76 | traceback.print_exc() 77 | print(f'source: {src_list}') 78 | print(f'target: {tgt_list}') 79 | exit(0) 80 | return score_list 81 | -------------------------------------------------------------------------------- /tigerscore/common/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | git+https://github.com/Unbabel/COMET.git 4 | git+https://github.com/jdf-prog/UniEval.git 5 | nltk 6 | git+https://github.com/google-research/bleurt.git 7 | fire 8 | rouge_score 9 | bert_score 10 | git+https://github.com/huggingface/evaluate@18932858570b9fa97ac478e1e6e709438e4d093b 11 | pycocoevalcap 12 | spacy 13 | git+https://github.com/google-research/mt-metrics-eval.git 14 | prettytable 15 | psutil 16 | sacrebleu 17 | 
mosestokenizer 18 | pytorch-lightning==2.0.0 -------------------------------------------------------------------------------- /tigerscore/common/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import numpy as np 4 | import torch 5 | import argparse 6 | import hashlib 7 | import requests 8 | import time 9 | from io import BytesIO 10 | from tqdm import tqdm 11 | from PIL import Image 12 | from concurrent.futures import ThreadPoolExecutor 13 | from functools import partial 14 | from datasets.utils.file_utils import get_datasets_user_agent 15 | 16 | USER_AGENT = get_datasets_user_agent() 17 | 18 | 19 | def seed_everything(seed=42): 20 | """ 21 | Seed everything for reproducibility 22 | """ 23 | random.seed(seed) 24 | os.environ['PYTHONHASHSEED'] = str(seed) 25 | np.random.seed(seed) 26 | torch.manual_seed(seed) 27 | torch.cuda.manual_seed(seed) 28 | torch.backends.cudnn.deterministic = True 29 | 30 | 31 | def str2bool(v): 32 | """ 33 | Convert string to boolean 34 | """ 35 | if isinstance(v, bool): 36 | return v 37 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 38 | return True 39 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 40 | return False 41 | else: 42 | raise argparse.ArgumentTypeError('Boolean value expected.') 43 | 44 | 45 | def empty2None(x): 46 | if x == '': 47 | return None 48 | elif isinstance(x, str): 49 | return x 50 | else: 51 | raise argparse.ArgumentTypeError('String value expected.') 52 | 53 | 54 | def empty2Noneint(x): 55 | if x == '': 56 | return None 57 | elif isinstance(x, int): 58 | return x 59 | elif isinstance(x, str): 60 | return int(x) 61 | else: 62 | raise argparse.ArgumentTypeError('Integer value expected.') 63 | 64 | 65 | def empty2zero(x): 66 | if x == '': 67 | return 0 68 | elif isinstance(x, int): 69 | return x 70 | elif isinstance(x, str): 71 | return int(x) 72 | else: 73 | raise argparse.ArgumentTypeError('Integer value expected.') 74 | 75 | 76 | def generate_hash_code(text): 77 | if text is None: 78 | return None 79 | # Convert the text to bytes and create a hash object 80 | hash_object = hashlib.sha256(text.encode()) 81 | 82 | # Get the hexadecimal representation of the hash code 83 | hex_code = hash_object.hexdigest() 84 | 85 | # Return the first 16 digits of the hexadecimal code 86 | return hex_code[:16] 87 | 88 | 89 | def fetch_single_image(image_url, timeout=None, retries=2): 90 | """ 91 | Fetch a single image from a URL. 92 | """ 93 | if os.path.exists(image_url): 94 | # fetch from local 95 | try: 96 | image = Image.open(image_url).convert("RGB") 97 | except Exception: 98 | if retries > 0: 99 | time.sleep(3) 100 | return fetch_single_image(image_url, timeout=timeout, retries=retries - 1) 101 | else: 102 | # fetch from url 103 | try: 104 | r = requests.get(image_url, timeout=timeout, 105 | stream=True, headers={"User-Agent": USER_AGENT}) 106 | r.raise_for_status() 107 | image = Image.open(BytesIO(r.content)).convert("RGB") 108 | except Exception as e: 109 | if retries > 0: 110 | time.sleep(3) # Wait 3 seconds before retrying 111 | return fetch_single_image(image_url, timeout=timeout, retries=retries - 1) 112 | else: 113 | print( 114 | f"Failed to fetch image from {image_url} after {retries} retries") 115 | raise e 116 | return image 117 | 118 | 119 | def fetch_images(image_urls, num_threads, timeout=None, retries=2): 120 | """ 121 | Fetch images from a list of URLs in parallel. 122 | Args: 123 | image_urls (list): List of image URLs. 
124 | num_threads (int): Number of threads to use. 125 | timeout (int, optional): Timeout for the request. Defaults to None. 126 | retries (int, optional): Number of retries. Defaults to 0. 127 | Returns: 128 | list: List of PIL images. 129 | """ 130 | fetch_single_image_with_args = partial( 131 | fetch_single_image, timeout=timeout, retries=retries) 132 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 133 | images = list( 134 | tqdm( 135 | executor.map(fetch_single_image_with_args, image_urls), 136 | total=len(image_urls), 137 | desc="Fetching images") 138 | ) 139 | print("Fetched {} images".format(len(images))) 140 | return images 141 | -------------------------------------------------------------------------------- /tigerscore/download_dataset/bartscore_data_process.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unzip the data files and convert them to json format. 3 | """ 4 | import os 5 | import json 6 | import argparse 7 | import pickle 8 | from pathlib import Path 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--data_dir', type=str, required=True) 13 | parser.add_argument('--task', type=str, required=True) 14 | parser.add_argument('--rm_old', action='store_true') 15 | 16 | args = parser.parse_args() 17 | data_dir = args.data_dir 18 | 19 | task_dir = Path(data_dir) / args.task 20 | for data_file in os.listdir(task_dir): 21 | if not data_file.endswith('.pkl'): 22 | continue 23 | print("Data file: ", data_file) 24 | data_path = task_dir / data_file 25 | with open(data_path, 'rb') as f: 26 | data = pickle.load(f) 27 | print("# of data: ", len(data)) 28 | if isinstance(data, dict): 29 | print("Data Example: ", data[list(data.keys())[0]]) 30 | elif isinstance(data, list): 31 | print("Data example: ", data[0]) 32 | with open(data_path.with_suffix('.json'), 'w') as f: 33 | json.dump(data, f, indent=4) 34 | if args.rm_old: 35 | data_path.unlink() 36 | -------------------------------------------------------------------------------- /tigerscore/download_dataset/datasets_scripts/fetaqa.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/tigerscore/download_dataset/datasets_scripts/fetaqa.sh -------------------------------------------------------------------------------- /tigerscore/download_dataset/download_bartscore_data.sh: -------------------------------------------------------------------------------- 1 | # Download the BARTScore used system outputs and references 2 | scripts_dir=$(pwd) 3 | data_dir="../../data/bartscore_data" 4 | mkdir -p $data_dir 5 | 6 | # Summarization 7 | cd $data_dir 8 | datasets=("Newsroom" "QAGS_CNN" "QAGS_XSUM" "REALSumm" "Rank19" "SummEval") 9 | mkdir -p summarization 10 | for dataset in ${datasets[@]}; do 11 | wget "https://github.com/neulab/BARTScore/raw/main/SUM/${dataset}/data.pkl" -O "summarization/${dataset}.pkl" 12 | done 13 | cd $scripts_dir 14 | python bartscore_data_process.py --data_dir "$data_dir" --task "summarization" 15 | 16 | 17 | # Translation 18 | cd $data_dir 19 | datasets=("de-en" "fi-en" "gu-en" "kk-en" "lt-en" "ru-en" "zh-en") 20 | mkdir -p translation 21 | for dataset in ${datasets[@]}; do 22 | wget "https://github.com/neulab/BARTScore/raw/main/WMT/${dataset}/data.pkl" -O "translation/${dataset}.pkl" 23 | done 24 | cd $scripts_dir 25 | python bartscore_data_process.py --data_dir "$data_dir" 
--task "translation" 26 | 27 | # Data2Text 28 | cd $data_dir 29 | datasets=("BAGEL" "SFHOT" "SFRES") 30 | mkdir -p data2text 31 | for dataset in ${datasets[@]}; do 32 | wget "https://github.com/neulab/BARTScore/raw/main/D2T/${dataset}/data.pkl" -O "data2text/${dataset}.pkl" 33 | done 34 | cd $scripts_dir 35 | python bartscore_data_process.py --data_dir "$data_dir" --task "data2text" -------------------------------------------------------------------------------- /tigerscore/download_dataset/download_general_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=18:00:00 3 | #SBATCH --job-name=downloading_general_datasets 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --nodelist=ink-gary 6 | #SBATCH -n 1 7 | 8 | python download_general_datasets.py --task "mathQA" --overwrite False 9 | python download_general_datasets.py --task "summarization" --overwrite False 10 | python download_general_datasets.py --task "translation" --overwrite False 11 | python download_general_datasets.py --task "data2text" --overwrite False 12 | python download_general_datasets.py --task "long-form QA" --overwrite False 13 | python download_general_datasets.py --task "instruction-following" --overwrite False 14 | # python download_general_datasets.py --task "story_generation" 15 | # python download_general_datasets.py --task "image_captioning" 16 | python download_general_datasets.py --task "code" -------------------------------------------------------------------------------- /tigerscore/download_dataset/preprocess_utils_totto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Baseline preprocessing utilities.""" 16 | import copy 17 | 18 | 19 | def _add_adjusted_col_offsets(table): 20 | """Add adjusted column offsets to take into account multi-column cells.""" 21 | adjusted_table = [] 22 | for row in table: 23 | real_col_index = 0 24 | adjusted_row = [] 25 | for cell in row: 26 | adjusted_cell = copy.deepcopy(cell) 27 | adjusted_cell["adjusted_col_start"] = real_col_index 28 | adjusted_cell["adjusted_col_end"] = ( 29 | adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"]) 30 | real_col_index += adjusted_cell["column_span"] 31 | adjusted_row.append(adjusted_cell) 32 | adjusted_table.append(adjusted_row) 33 | return adjusted_table 34 | 35 | 36 | def _get_heuristic_row_headers(adjusted_table, row_index, col_index): 37 | """Heuristic to find row headers.""" 38 | row_headers = [] 39 | row = adjusted_table[row_index] 40 | for i in range(0, col_index): 41 | if row[i]["is_header"]: 42 | row_headers.append(row[i]) 43 | return row_headers 44 | 45 | 46 | def _get_heuristic_col_headers(adjusted_table, row_index, col_index): 47 | """Heuristic to find column headers.""" 48 | adjusted_cell = adjusted_table[row_index][col_index] 49 | adjusted_col_start = adjusted_cell["adjusted_col_start"] 50 | adjusted_col_end = adjusted_cell["adjusted_col_end"] 51 | col_headers = [] 52 | for r in range(0, row_index): 53 | row = adjusted_table[r] 54 | for cell in row: 55 | if (cell["adjusted_col_start"] < adjusted_col_end and 56 | cell["adjusted_col_end"] > adjusted_col_start): 57 | if cell["is_header"]: 58 | col_headers.append(cell) 59 | 60 | return col_headers 61 | 62 | 63 | def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False): 64 | """Extract out the highlighted part of a table.""" 65 | highlighted_table = [] 66 | 67 | adjusted_table = _add_adjusted_col_offsets(table) 68 | 69 | for (row_index, col_index) in cell_indices: 70 | cell = table[row_index][col_index] 71 | if with_heuristic_headers: 72 | row_headers = _get_heuristic_row_headers(adjusted_table, row_index, 73 | col_index) 74 | col_headers = _get_heuristic_col_headers(adjusted_table, row_index, 75 | col_index) 76 | else: 77 | row_headers = [] 78 | col_headers = [] 79 | 80 | highlighted_cell = { 81 | "cell": cell, 82 | "row_headers": row_headers, 83 | "col_headers": col_headers 84 | } 85 | highlighted_table.append(highlighted_cell) 86 | 87 | return highlighted_table 88 | 89 | 90 | def linearize_full_table(table, cell_indices, table_page_title, 91 | table_section_title): 92 | """Linearize full table with localized headers and return a string.""" 93 | table_str = "" 94 | if table_page_title: 95 | table_str += " " + table_page_title + " " 96 | if table_section_title: 97 | table_str += " " + table_section_title + " " 98 | 99 | table_str += " " 100 | adjusted_table = _add_adjusted_col_offsets(table) 101 | for r_index, row in enumerate(table): 102 | row_str = " " 103 | for c_index, col in enumerate(row): 104 | 105 | row_headers = _get_heuristic_row_headers( 106 | adjusted_table, r_index, c_index) 107 | col_headers = _get_heuristic_col_headers( 108 | adjusted_table, r_index, c_index) 109 | 110 | # Distinguish between highlighted and non-highlighted cells. 111 | if [r_index, c_index] in cell_indices: 112 | start_cell_marker = " " 113 | end_cell_marker = " " 114 | else: 115 | start_cell_marker = " " 116 | end_cell_marker = " " 117 | 118 | # The value of the cell. 119 | item_str = start_cell_marker + col["value"] + " " 120 | 121 | # All the column headers associated with this cell. 
122 | for col_header in col_headers: 123 | item_str += " " + \ 124 | col_header["value"] + " " 125 | 126 | # All the row headers associated with this cell. 127 | for row_header in row_headers: 128 | item_str += " " + \ 129 | row_header["value"] + " " 130 | 131 | item_str += end_cell_marker 132 | row_str += item_str 133 | 134 | row_str += " " 135 | table_str += row_str 136 | 137 | table_str += "
" 138 | if cell_indices: 139 | assert "" in table_str 140 | return table_str 141 | 142 | 143 | def linearize_subtable(subtable, table_page_title, table_section_title): 144 | """Linearize the highlighted subtable and return a string of its contents.""" 145 | table_str = "" 146 | if table_page_title: 147 | table_str += " " + table_page_title + " " 148 | if table_section_title: 149 | table_str += " " + table_section_title + " " 150 | table_str += " " 151 | 152 | for item in subtable: 153 | cell = item["cell"] 154 | row_headers = item["row_headers"] 155 | col_headers = item["col_headers"] 156 | 157 | # The value of the cell. 158 | item_str = " " + cell["value"] + " " 159 | 160 | # All the column headers associated with this cell. 161 | for col_header in col_headers: 162 | item_str += " " + \ 163 | col_header["value"] + " " 164 | 165 | # All the row headers associated with this cell. 166 | for row_header in row_headers: 167 | item_str += " " + \ 168 | row_header["value"] + " " 169 | 170 | item_str += " " 171 | table_str += item_str 172 | 173 | table_str += "
" 174 | return table_str 175 | -------------------------------------------------------------------------------- /tigerscore/download_dataset/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import argparse 3 | 4 | 5 | def generate_hash_code(text): 6 | # Convert the text to bytes and create a hash object 7 | hash_object = hashlib.sha256(text.encode()) 8 | 9 | # Get the hexadecimal representation of the hash code 10 | hex_code = hash_object.hexdigest() 11 | 12 | # Return the first 16 digits of the hexadecimal code 13 | return hex_code[:16] 14 | 15 | 16 | def str2bool(v): 17 | if isinstance(v, bool): 18 | return v 19 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 20 | return True 21 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 22 | return False 23 | else: 24 | raise argparse.ArgumentTypeError('Boolean value expected.') 25 | 26 | 27 | def empty2None(x): 28 | if x == '': 29 | return None 30 | else: 31 | return x 32 | 33 | 34 | def empty2zero(x): 35 | if x == '': 36 | return 0 37 | elif isinstance(x, int): 38 | return x 39 | elif isinstance(x, str): 40 | return int(x) 41 | else: 42 | raise argparse.ArgumentTypeError('Integer value expected.') 43 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/check_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 4 | import fire 5 | import json 6 | import random 7 | from string import Template 8 | 9 | 10 | template = """ 11 | ${instruction} 12 | ${input} 13 | 14 | Model-generated output: 15 | ${output} 16 | 17 | An error analysis provided: 18 | ${error_analysis} 19 | 20 | Is the error analysis reasonable? 
Answer me "yes" or "no" only.\ 21 | """ 22 | 23 | def main(input_file, output_file, model_name="gpt-4", num_samples=None, num_procs=5): 24 | with open(input_file, "r") as f: 25 | if input_file.endswith(".jsonl"): 26 | input_data = [json.loads(line) for line in f] 27 | elif input_file.endswith(".json"): 28 | input_data = json.load(f) 29 | if num_samples is None: 30 | num_samples = len(input_data) 31 | print(num_samples) 32 | input_data = input_data[:num_samples] 33 | 34 | def process_data(item): 35 | prompt = Template(template=template).substitute( 36 | instruction=item["instruction"], 37 | input=item["input_context"], 38 | output=item["hypo_output"], 39 | error_analysis=item["errors"] 40 | ) 41 | message = [{ 42 | "role": "user", 43 | "content": prompt 44 | }] 45 | chatml_prompt = _chatml_to_prompt(message) 46 | return chatml_prompt 47 | 48 | prompts = list(map(process_data, input_data)) 49 | print(prompts[0]) 50 | completions = openai_completions(prompts, model_name=model_name, num_procs=num_procs, use_cache=False) 51 | print(f"Finished generating {len(completions['completions'])} completions.") 52 | print(f"Total prices: {sum(completions['price_per_example'])}") 53 | for i, completion in enumerate(completions['completions']): 54 | input_data[i]["completion"] = completion 55 | with open(output_file, "w") as f: 56 | if output_file.endswith(".jsonl"): 57 | for item in input_data: 58 | json.dump(item, f) 59 | f.write("\n") 60 | elif output_file.endswith(".json"): 61 | json.dump(input_data, f) 62 | 63 | if __name__ == "__main__": 64 | fire.Fire(main) 65 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/check_data.sh: -------------------------------------------------------------------------------- 1 | # python check_data.py \ 2 | # --input_file "../../data/new_std_400s_m_200s_l_1100s_i3-32k.json" \ 3 | # --output_file "../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.json" \ 4 | # --model_name "gpt-4" \ 5 | # --num_procs 5 6 | 7 | 8 | # python check_data.py \ 9 | # --input_file "../../data/train_mix.jsonl" \ 10 | # --output_file "../../data/train_mix.check_ChatGPT.jsonl" \ 11 | # --model_name "ChatGPT" 12 | 13 | python check_data.py \ 14 | --input_file "../../data/good.jsonl" \ 15 | --output_file "../../data/good.check.json" \ 16 | --model_name "ChatGPT" \ 17 | --num_procs 5 -------------------------------------------------------------------------------- /tigerscore/eval_scripts/check_responses.sh: -------------------------------------------------------------------------------- 1 | model_name="gpt-4" 2 | if [ ${model_name} == "gpt-4" ]; then 3 | export OPENAI_API_KEY= 4 | export OPENAI_API_BASE="" 5 | export OPENAI_API_TYPE="azure" 6 | export OPENAI_API_VERSION="2023-07-01-preview" 7 | fi 8 | 9 | 10 | # python check_responses.py \ 11 | # --input_file "/home//WorkSpace/ExplainableGPTScore/data/wmt/zh-en/train_data.wmt_mqm.distil_new_wmt_mqm_200.json" \ 12 | # --output_file "/home//WorkSpace/ExplainableGPTScore/data/wmt/zh-en/train_data.wmt_mqm.distil_new_wmt_mqm_200.check.json" \ 13 | # --model_name ${model_name} \ 14 | 15 | # python check_responses.py \ 16 | # --input_file "/home//WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.json" \ 17 | # --output_file "/home//WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.check.json" \ 18 | # --model_name ${model_name} \ 19 | 20 | 21 | python check_responses.py \ 22 | --input_file 
"../../data/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.json" \ 23 | --output_file "../../data/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.${model_name}.check.json" \ 24 | --model_name ${model_name} \ -------------------------------------------------------------------------------- /tigerscore/eval_scripts/eval_baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_baseline 3 | #SBATCH -c 3 4 | #SBATCH --partition=a100 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem=50G 8 | #SBATCH --output=../../jobs/%x/%j.out 9 | metrics=("bleu" "rouge" "bertscore" "bleurt" "comet_da" "bart_score_cnn" "bart_score_para" "bart_score_cnn_src_hypo" "bart_score_para_src_hypo" "unieval_sum" "cometkiwi_da") 10 | 11 | # # summarization 12 | # input_file="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.json" 13 | # output_file="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.eval.json" 14 | # human_score_names="coherence,consistency,fluency,relevance" 15 | # cp -u $input_file $output_file 16 | # for metric in "${metrics[@]}"; do 17 | # echo "Evaluating $metric" 18 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 19 | # --human_score_names "$human_score_names" 20 | # done 21 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 22 | # --human_score_names "$human_score_names" --print_results True 23 | 24 | # # data2text 25 | # input_file="../../data_bak/webnlg/webnlg2020_gen_with_scores.json" 26 | # output_file="../../data_bak/webnlg/webnlg2020_gen_with_scores.eval.json" 27 | # input_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json" 28 | # output_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.eval.json" 29 | # human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure" 30 | # cp -u $input_file $output_file 31 | # metrics=("${metrics[@]}" "instructscore_d2t" "gptscore_flan_d2t" "gptscore_flan_d2t_src_hypo") 32 | # for metric in "${metrics[@]}"; do 33 | # echo "Evaluating $metric" 34 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 35 | # --human_score_names "$human_score_names" 36 | # done 37 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 38 | # --human_score_names "$human_score_names" --print_results True 39 | 40 | # # # long_form_QA 41 | # input_file="../../data_bak/lfqa/test.gpt-4.rank.json" 42 | # output_file="../../data_bak/lfqa/test.gpt-4.rank.eval.json" 43 | # human_score_names="rank" 44 | # cp -u $input_file $output_file 45 | # for metric in "${metrics[@]}"; do 46 | # echo "Evaluating $metric" 47 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 48 | # --human_score_names "$human_score_names" 49 | # done 50 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 51 | # --human_score_names "$human_score_names" --print_results True 52 | 53 | # # instruction-following 54 | # input_file="../../data_bak/llm-blender/mix-instruct/test_data_prepared_300.json" 55 | # output_file="../../data_bak/llm-blender/mix-instruct/test_data_prepared_300.eval.json" 56 | # input_file="../../data/evaluation/instruct/mixinstruct/test_data_prepared.json" 57 | # output_file="../../data/evaluation/instruct/mixinstruct/test_data_prepared.eval.json" 58 | # 
human_score_names="gpt_rank_score" 59 | # # cp -u $input_file $output_file 60 | # metrics=("tigerscore") 61 | # # for metric in "${metrics[@]}"; do 62 | # # echo "Evaluating $metric" 63 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 64 | # # --human_score_names "$human_score_names" 65 | # # done 66 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 67 | # --human_score_names "$human_score_names" --print_results True --average_by "sys" --as_rank "True" 68 | 69 | # mathqa 70 | # input_file="../../data_bak/mathqa/gsm8k_test_output_prepared.json" 71 | # output_file="../../data_bak/mathqa/gsm8k_test_output_prepared.eval.json" 72 | # input_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json" 73 | # output_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.eval.json" 74 | # human_score_names="accuracy" 75 | # metrics=("instructscore") 76 | # cp -u $input_file $output_file 77 | # for metric in "${metrics[@]}"; do 78 | # echo "Evaluating $metric" 79 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 80 | # --human_score_names "$human_score_names" 81 | # done 82 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 83 | # --human_score_names "$human_score_names" --print_results True 84 | 85 | 86 | # # # story_gen 87 | # input_file="../../data/evaluation/storygen/test_data_prepared.json" 88 | # output_file="../../data/evaluation/storygen/test_data_prepared_eval.json" 89 | # metrics=("instructscore") 90 | # human_score_names="human" 91 | # cp -u $input_file $output_file 92 | # # for metric in "${metrics[@]}"; do 93 | # # echo "Evaluating $metric" 94 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 95 | # # --human_score_names "$human_score_names" 96 | # # done 97 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 98 | # --human_score_names "$human_score_names" --print_results True 99 | 100 | # translation 101 | # input_file="../../data/evaluation/translation/wmt22/zh-en/eval_data.json" 102 | # output_file="../../data/evaluation/translation/wmt22/zh-en/eval_data.eval.json" 103 | # human_score_names="mqm" 104 | # metrics=("instructscore_mt_zh-en") 105 | # cp -u $input_file $output_file 106 | # # for metric in "${metrics[@]}"; do 107 | # # echo "Evaluating $metric" 108 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 109 | # # --human_score_names "$human_score_names" 110 | # # done 111 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 112 | # --human_score_names "$human_score_names" --print_results True 113 | 114 | # input_file="../../data/evaluation/hhh_alignment/hhh_alignment.json" 115 | # output_file="../../data/evaluation/hhh_alignment/hhh_alignment.eval.json" 116 | # human_score_names="human_preference" 117 | # metrics=("bart_score_para_src_hypo") 118 | # cp -u $input_file $output_file 119 | # for metric in "${metrics[@]}"; do 120 | # echo "Evaluating $metric" 121 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 122 | # --human_score_names "$human_score_names" 123 | # done 124 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 125 | # 
--human_score_names "$human_score_names" --add_aggrement True --print_results True 126 | 127 | # input_file="../../data/evaluation/mtbench/mt_bench_human_judgments.json" 128 | # output_file="../../data/evaluation/mtbench/mt_bench_human_judgments.eval.json" 129 | # human_score_names="human_preference" 130 | # metrics=("bart_score_para_src_hypo") 131 | # cp -u $input_file $output_file 132 | # for metric in "${metrics[@]}"; do 133 | # echo "Evaluating $metric" 134 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 135 | # --human_score_names "$human_score_names" 136 | # done 137 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 138 | # --human_score_names "$human_score_names" --add_aggrement True --print_results True 139 | 140 | 141 | # input_file="../../data/evaluation/pair_cmp/test_data_prepared.json" 142 | # output_file="../../data/evaluation/pair_cmp/test_data_prepared.eval.json" 143 | # human_score_names="gpt_rank_score" 144 | # cp -u $input_file $output_file 145 | # # metrics=("bleu" "rouge" "bertscore" "bleurt" "comet_da" "bart_score_cnn" "unieval_sum" "cometkiwi_da") 146 | # metrics=("unieval_sum") 147 | # for metric in "${metrics[@]}"; do 148 | # echo "Evaluating $metric" 149 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 150 | # --human_score_names "$human_score_names" 151 | # done 152 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 153 | # --human_score_names "$human_score_names" --print_results True --average_by "sys" --as_rank "True" 154 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_distill_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | import json 5 | import random 6 | import logging 7 | import sys 8 | import fire 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).parent.parent)) 11 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json_per_aspect 12 | from xgptscore.xgptscore import xgptscore 13 | from xgptscore.constants import EVAL_ASPECTS 14 | logging.basicConfig(level=logging.warning) 15 | 16 | 17 | def main( 18 | task: str, 19 | xgptscore_mode: str, 20 | model_name: str, 21 | input_file: str, 22 | version_key: str = None, 23 | overwrite: bool = False, 24 | max_size: int = None, 25 | seed: int = 42, 26 | shuffle: bool = False, 27 | ): 28 | 29 | logging.warning("Loading from {}".format(input_file)) 30 | with open(input_file, "r") as f: 31 | items = json.load(f) 32 | if shuffle: 33 | random.seed(seed) 34 | random.shuffle(items) 35 | suffix = f".{xgptscore_mode}.{model_name}" 36 | if version_key: 37 | suffix += f".{version_key}" 38 | if isinstance(max_size, int) and max_size > 0: 39 | items = items[:max_size] 40 | suffix += f".{max_size}" 41 | output_file = Path(input_file).with_suffix(f"{suffix}.json") 42 | 43 | xgptitems = [] 44 | for item in items: 45 | for cand in item['candidates']: 46 | xgptitems.append(XPGTItem( 47 | task=task, 48 | instruction=item['instruction'], 49 | input=item['input'], 50 | ref_output=item['refs'] if 'refs' in item else item['output'], 51 | hypo_output=cand['text'] 52 | )) 53 | 54 | if not output_file.exists() or overwrite: 55 | logging.warning("Running xgptscore") 56 | # run xgptscore 57 | result = xgptscore(xgptitems, mode=xgptscore_mode, 58 | 
model_name=model_name, num_workers=5) 59 | idx = 0 60 | aspects = EVAL_ASPECTS[task].keys() 61 | score_dict = {"xgptscore_" + aspect: 0 for aspect in aspects} 62 | for item in items: 63 | for cand in item['candidates']: 64 | cand['responses'] = result['round_completions'][idx] 65 | cand['messages_records'] = result['messages_records'][idx] 66 | xgptscore_ans = get_xgptscore_from_json_per_aspect( 67 | cand['responses'][-1]) 68 | if xgptscore_ans is None: 69 | logging.info(f"XGPTScore failed for {cand['text']}") 70 | # cand['scores']['xgptscore'] = None 71 | else: 72 | cand['scores'].update(score_dict) 73 | cand['scores'].update(xgptscore_ans) 74 | idx += 1 75 | with open(output_file, "w") as f: 76 | json.dump(items, f, indent=4, ensure_ascii=False) 77 | logging.info("Saved to {}".format(output_file)) 78 | else: 79 | logging.warning("Found existing {}".format(output_file)) 80 | logging.warning("Skipping xgptscore") 81 | 82 | 83 | if __name__ == "__main__": 84 | logging.basicConfig(level=logging.warning) 85 | fire.Fire(main) 86 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_distill_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=generate_distill_data 3 | #SBATCH -c 2 4 | #SBATCH --time=24:00:00 5 | #SBATCH --mem=10G 6 | #SBATCH --output=../../jobs/%x/%j.out 7 | 8 | version_key="distill" 9 | overwrite=True 10 | model_name="ChatGPT" 11 | if [ ${model_name} == "gpt-4" ]; then 12 | export OPENAI_API_KEY= 13 | export OPENAI_API_BASE="" 14 | export OPENAI_API_TYPE="azure" 15 | export OPENAI_API_VERSION="2023-07-01-preview" 16 | fi 17 | 18 | # task='translation' 19 | # xgptscore_mode="wmt_mqm" 20 | # input_file="../../data/synthesis_min/translation/train_data.kb_txt.distill.syn_cand.json" 21 | # python generate_distill_data.py \ 22 | # --task ${task} \ 23 | # --input_file ${input_file} \ 24 | # --xgptscore_mode ${xgptscore_mode} \ 25 | # --version_key ${version_key} \ 26 | # --model_name ${model_name} \ 27 | # --overwrite ${overwrite} \ 28 | 29 | # task='summarization' 30 | # xgptscore_mode="align_score" 31 | # input_file="../../data/synthesis_min/summarization/train_data.kb_txt.distill.syn_cand.json" 32 | # python generate_distill_data.py \ 33 | # --task ${task} \ 34 | # --input_file ${input_file} \ 35 | # --xgptscore_mode ${xgptscore_mode} \ 36 | # --version_key ${version_key} \ 37 | # --model_name ${model_name} \ 38 | # --overwrite ${overwrite} \ 39 | 40 | # task='data2text' 41 | # xgptscore_mode="d2t" 42 | # input_file="../../data/synthesis_min/data2text/train_data.kb_txt.distill.syn_cand.json" 43 | # python generate_distill_data.py \ 44 | # --task ${task} \ 45 | # --input_file ${input_file} \ 46 | # --xgptscore_mode ${xgptscore_mode} \ 47 | # --version_key ${version_key} \ 48 | # --model_name ${model_name} \ 49 | # --overwrite ${overwrite} \ 50 | 51 | # task='instruction-following' 52 | # xgptscore_mode="instruction_following" 53 | # input_file="../../data/synthesis_min/instruction-following/train_data.kb_txt.distill.syn_cand.json" 54 | # python generate_distill_data.py \ 55 | # --task ${task} \ 56 | # --input_file ${input_file} \ 57 | # --xgptscore_mode ${xgptscore_mode} \ 58 | # --version_key ${version_key} \ 59 | # --model_name ${model_name} \ 60 | # --overwrite ${overwrite} \ 61 | 62 | task='long-form QA' 63 | xgptscore_mode="longform_qa" 64 | input_file="../../data/synthesis_min/long-form QA/train_data.kb_txt.distill.syn_cand.json" 65 | 
python generate_distill_data.py \ 66 | --task "${task}" \ 67 | --input_file "${input_file}" \ 68 | --xgptscore_mode ${xgptscore_mode} \ 69 | --version_key ${version_key} \ 70 | --model_name ${model_name} \ 71 | --overwrite ${overwrite} \ 72 | 73 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_inst_synthetic_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 4 | import fire 5 | import json 6 | import random 7 | from string import Template 8 | 9 | 10 | template = """ 11 | Instruction: 12 | ${instruction} 13 | ${input} 14 | 15 | A ground-truth response: 16 | ${output} 17 | 18 | A model will be asked to respond to this instruction. However, that response might contain errors in various aspects. 19 | 20 | Please first output 5 possible error aspects if a model is asked to generate a response for the above instruction. The error aspects don't have to be one of the above aspects and can be any aspect that you think is reasonable for this instruction. 21 | 22 | Then generate an incorrect response contains up to ${num_errors} errors of these aspects. Each error corresponds to one of the aspect. 23 | The incorrect response should mimic style the real-generation of a model. 24 | 25 | Then give an analysis of these errors. For each error, give me the 26 | - error location (the substring that is wrong in the generated incorrect output) 27 | - error aspect 28 | - explanation (the generic error type description, why it's an error, and the correction suggestions) 29 | - severity ("major" or "minor") 30 | - score reduction (an integer between 1 to 5 given the severity of the error) 31 | 32 | Output format: 33 | Generated incorrect output: 34 | 35 | Error location 1: 36 | Error aspect 1: 37 | Explanation 1: 38 | Severity 1: 39 | Score reduction 1: 40 | ... 41 | """ 42 | 43 | math_template = """ 44 | Question: 45 | ${instruction} 46 | ${input} 47 | 48 | A ground-truth answer: 49 | ${output} 50 | 51 | A model will be asked to answer this math question. However, that response might contain errors in various aspects such as Problem Understanding, Problem Formulation, Computing Accuracy, Solution Interpretation, etc. 52 | 53 | Please first output a few possible error aspects if a model is asked to generate a response for the above instruction. The error aspects don't have to be one of the above aspects and can be any aspect that you think is reasonable for this instruction. 54 | 55 | Then generate an incorrect response contains up to ${num_errors} errors of these aspects. Each error corresponds to one of the aspect. 56 | The incorrect response should mimic style the real-generation of a model. 57 | 58 | Then give an analysis of these errors. For each error, give me the 59 | - error location (the substring that is wrong in the generated incorrect output) 60 | - error aspect 61 | - explanation (the generic error type description, why it's an error, and the correction suggestions) 62 | - severity ("major" or "minor") 63 | - score reduction (an integer between 0.5 to 5 given the severity of the error) 64 | 65 | Output format: 66 | Generated incorrect output: 67 | 68 | Error location 1: 69 | Error aspect 1: 70 | Explanation 1: 71 | Severity 1: 72 | Score reduction 1: 73 | ... 
74 | """ 75 | 76 | def main( 77 | input_file, output_file, 78 | model_name="gpt-4", num_samples=None, 79 | num_procs=5, seed=42, 80 | task='inst-fol'): 81 | random.seed(seed) 82 | with open(input_file, "r") as f: 83 | if input_file.endswith(".jsonl"): 84 | input_data = [json.loads(line) for line in f] 85 | elif input_file.endswith(".json"): 86 | input_data = json.load(f) 87 | if num_samples is None: 88 | num_samples = len(input_data) 89 | print(num_samples) 90 | input_data = input_data[:num_samples] 91 | 92 | def process_data(item): 93 | if task == 'math': 94 | _template = math_template 95 | else: 96 | _template = template 97 | prompt = Template(template=_template).substitute( 98 | instruction=item["instruction"], 99 | input=item["input"], 100 | output=item["output"], 101 | num_errors=random.randint(1, 5) 102 | ) 103 | message = [{ 104 | "role": "user", 105 | "content": prompt 106 | }] 107 | chatml_prompt = _chatml_to_prompt(message) 108 | return chatml_prompt 109 | 110 | prompts = list(map(process_data, input_data)) 111 | print(prompts[0]) 112 | completions = openai_completions(prompts, model_name=model_name, num_procs=num_procs, use_cache=True) 113 | print(f"Finished generating {len(completions['completions'])} completions.") 114 | print(f"Total prices: {sum(completions['price_per_example'])}") 115 | for i, completion in enumerate(completions['completions']): 116 | input_data[i]["completion"] = completion 117 | with open(output_file, "w") as f: 118 | if output_file.endswith(".jsonl"): 119 | for item in input_data: 120 | json.dump(item, f) 121 | f.write("\n") 122 | elif output_file.endswith(".json"): 123 | json.dump(input_data, f) 124 | 125 | if __name__ == "__main__": 126 | fire.Fire(main) 127 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_inst_synthetic_data.sh: -------------------------------------------------------------------------------- 1 | # python generate_inst_synthetic_data.py \ 2 | # --input_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.10k.jsonl" \ 3 | # --output_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.10k.gen.jsonl" \ 4 | # --model_name "gpt-4" \ 5 | # --num_samples 8000 6 | 7 | python generate_inst_synthetic_data.py \ 8 | --input_file "../../data/additional/metamath/metamath.8k.jsonl" \ 9 | --output_file "../../data/additional/metamath/metamath.8k.gen.jsonl" \ 10 | --model_name "gpt-4" \ 11 | --num_samples 10 12 | 13 | # python generate_inst_synthetic_data.py \ 14 | # --input_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.jsonl" \ 15 | # --output_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.jsonl" \ 16 | # --model_name "gpt-4" \ -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_synthesis_distill_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate synthesis distillation data from a json file. 
3 | """ 4 | import json 5 | import random 6 | import logging 7 | import sys 8 | import fire 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).parent.parent)) 11 | from xgptscore.process_utils import XPGTItem 12 | from xgptscore.xgptscore import xgptscore 13 | logging.basicConfig(level=logging.warning) 14 | 15 | 16 | def main( 17 | task: str, 18 | input_file: str, 19 | output_file: str = None, 20 | xgptscore_mode: str = "kb_txt", 21 | model_name: str = "gpt-4", 22 | version_key: str = "default", 23 | overwrite: bool = False, 24 | max_size: int = None, 25 | seed: int = 42, 26 | shuffle_file: bool = False, 27 | source_max_length: int = None, 28 | ref_max_length: int = None, 29 | hypo_max_length: int = None, 30 | ): 31 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4))) 32 | # params 33 | if isinstance(max_size, int) and max_size > 0: 34 | version_key = f"{version_key}_{max_size}" 35 | # load data 36 | input_file = Path(input_file) 37 | if not output_file: 38 | output_file = input_file.with_suffix( 39 | f".{xgptscore_mode}.{version_key}.json") 40 | else: 41 | output_file = Path(output_file) 42 | with open(input_file, "r") as f: 43 | items = json.load(f) 44 | logging.warning("Loaded {} items from {}".format( 45 | len(items), input_file)) 46 | logging.warning("Preparing writing to {}...".format(output_file)) 47 | 48 | random.seed(seed) 49 | logging.warning("Set seed to {}".format(seed)) 50 | if shuffle_file: 51 | random.shuffle(items) 52 | logging.warning("Shuffled {} items".format(len(items))) 53 | if isinstance(max_size, int) and max_size > 0: 54 | items = items[:max_size] 55 | logging.warning("Truncated to {} items".format(len(items))) 56 | elif isinstance(max_size, float) and max_size > 0 and max_size < 1: 57 | items = random.sample(items, int(len(items) * max_size)) 58 | logging.warning("Sampled to {} items".format(len(items))) 59 | 60 | xgptitems = [] 61 | for item in items: 62 | xgptitems.append(XPGTItem( 63 | task=task, 64 | instruction=item['instruction'], 65 | input=item['input'], 66 | ref_output=item['output'] if "output" in item else item['refs'], 67 | hypo_output=None, 68 | )) 69 | if "candidates" in item: 70 | del item["candidates"] 71 | 72 | if not output_file.exists() or overwrite: 73 | logging.warning("Running xgptscore") 74 | # run xgptscore 75 | xgptscore_params = { 76 | "max_lengths": { 77 | "input": source_max_length, 78 | "hypo_output": hypo_max_length, 79 | "ref_output": ref_max_length, 80 | }, 81 | } 82 | result = xgptscore(xgptitems, mode=xgptscore_mode, 83 | model_name=model_name, **xgptscore_params) 84 | for i, item in enumerate(items): 85 | item['responses'] = result['round_completions'][i] 86 | item['messages_records'] = result['messages_records'][i] 87 | with open(output_file, "w") as f: 88 | json.dump(items, f, indent=4, ensure_ascii=False) 89 | logging.warning("Saved to {}".format(output_file)) 90 | else: 91 | logging.warning("Loading from {}".format(output_file)) 92 | with open(output_file, "r") as f: 93 | items = json.load(f) 94 | 95 | 96 | if __name__ == "__main__": 97 | fire.Fire(main) 98 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_synthesis_distill_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=synthesis_distill_data 3 | #SBATCH --time=48:00:00 4 | #SBATCH --output=../../jobs/synthesis_distill_data/%j.out 5 | 6 | xgptscore_mode="kb_txt" 7 | version_key="distill" 8 | 
model_name="gpt-4" 9 | if [ ${model_name} == "gpt-4" ]; then 10 | export OPENAI_API_KEY= 11 | export OPENAI_API_BASE="" 12 | export OPENAI_API_TYPE="azure" 13 | export OPENAI_API_VERSION="2023-07-01-preview" 14 | fi 15 | 16 | IFS=$'\n' 17 | tasks=("translation" "long-form QA" "summarization" "data2text" "mathQA" "instruction-following") 18 | for task in ${tasks[@]}; do 19 | input_file="/home//WorkSpace/ExplainableGPTScore/data/synthesis/${task}/train_data.json" 20 | echo task: $task 21 | python generate_synthesis_distill_data.py \ 22 | --task $task \ 23 | --xgptscore_mode $xgptscore_mode \ 24 | --version_key $version_key \ 25 | --model_name $model_name \ 26 | --input_file $input_file \ 27 | --source_max_length 512 \ 28 | --overwrite "False" \ 29 | 30 | done -------------------------------------------------------------------------------- /tigerscore/eval_scripts/get_systhesis_ref_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=synthesis_distill_data 3 | #SBATCH --time=48:00:00 4 | #SBATCH --output=../../jobs/synthesis_distill_data/%j.out 5 | 6 | xgptscore_mode="paraphrase" 7 | version_key="distill" 8 | model_name="gpt-4" 9 | if [ ${model_name} == "gpt-4" ]; then 10 | export OPENAI_API_KEY= 11 | export OPENAI_API_BASE="" 12 | export OPENAI_API_TYPE="azure" 13 | export OPENAI_API_VERSION="2023-07-01-preview" 14 | fi 15 | 16 | IFS=$'\n' 17 | # tasks=("translation" "long-form QA" "summarization" "data2text" "mathQA" "instruction-following") 18 | tasks=("translation") 19 | for task in ${tasks[@]}; do 20 | input_file="../../data/synthesis/${task}/train_data.json" 21 | echo task: $task 22 | python generate_synthesis_distill_data.py \ 23 | --task $task \ 24 | --xgptscore_mode $xgptscore_mode \ 25 | --version_key $version_key \ 26 | --model_name $model_name \ 27 | --input_file $input_file \ 28 | --source_max_length 512 \ 29 | --overwrite "False" \ 30 | --shuffle_file True \ 31 | --max_size 0.15 \ 32 | 33 | done -------------------------------------------------------------------------------- /tigerscore/eval_scripts/lfqa_gpt_rate.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file isn't used in our final version. 3 | """ 4 | import sys 5 | import fire 6 | import json 7 | import logging 8 | import regex as re 9 | import random 10 | sys.path.append("..") 11 | from collections import Counter, defaultdict 12 | from string import Template 13 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 14 | logging.basicConfig(level=logging.WARNING) 15 | 16 | rank_template = """ 17 | 4 different models are asked to follow a given instruction to generate an answer based on a given source input. 18 | The instruction is: ${instruction} 19 | The source input is: ${source} 20 | The generated output of model 1 is: ${model1_generated} 21 | The generated output of model 2 is: ${model2_generated} 22 | The generated output of model 3 is: ${model3_generated} 23 | The generated output of model 4 is: ${model4_generated} 24 | The reference output is: ${reference} 25 | 26 | Now Please rank the 4 model's outputs from best to worst. 27 | Please first output the rank results in the following format: 28 | [best] [second best] [third best] [worst] (e.g. 1 2 3 4) 29 | Then give your brief comments on why you rank the outputs in this way. 
30 | """ 31 | 32 | 33 | def get_rank_prompts( 34 | item: dict 35 | ): 36 | random.shuffle(item['candidates']) 37 | rank_prompt = Template(rank_template).substitute( 38 | instruction=item['instruction'], 39 | source=item['input'], 40 | model1_generated=item['candidates'][0]['text'], 41 | model2_generated=item['candidates'][1]['text'], 42 | model3_generated=item['candidates'][2]['text'], 43 | model4_generated=item['candidates'][3]['text'], 44 | reference=item.get('output') or item.get("refs")[0], 45 | ) 46 | return rank_prompt 47 | 48 | 49 | def main( 50 | input_file: str, 51 | output_file: str, 52 | seed: int = 42, 53 | model_name: str = "ChatGPT", 54 | ): 55 | random.seed(seed) 56 | with open(input_file, "r") as f: 57 | data = json.load(f) 58 | 59 | rank_prompts = list(map(get_rank_prompts, data)) 60 | chatmls = [[{"role": "system", "content": "You are an helpful AI assistant to help user find information."}, 61 | {"role": "user", "content": prompt}] for prompt in rank_prompts] 62 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls] 63 | 64 | decoding_kwargs = { 65 | # "max_tokens": 1024, 66 | "temperature": 0, 67 | "top_p": 1.0, 68 | "timeout": 30, 69 | "request_timeout": 30 70 | } 71 | results = openai_completions( 72 | chatml_prompts, model_name=model_name, **decoding_kwargs) 73 | logging.warning("Total price: {:.4f}$".format( 74 | sum(results['price_per_example']))) 75 | completions = results['completions'] 76 | 77 | best_model_idxs = [] 78 | model_ranks = defaultdict(list) 79 | for i, item in enumerate(data): 80 | item['rank_prompt'] = rank_prompts[i] 81 | item['rank_response'] = completions[i] 82 | try: 83 | first_digit_idx = re.search(r"\d", item['rank_response']).start() 84 | item['ranks'] = re.search( 85 | r"(\d)[\n ](\d)[\n ](\d)[\n ](\d)", item['rank_response']) 86 | if not item['ranks']: 87 | item['ranks'] = re.search( 88 | "\[best\] (\d) \[second best\] (\d) \[third best\] (\d) \[worst\] (\d)", item['rank_response']) 89 | if not item['ranks']: 90 | item['ranks'] = re.search( 91 | "\[best\] Model (\d)[\n ]\[second best\] Model (\d)[\n ]\[third best\] Model (\d)[\n ]\[worst\] Model (\d)", item['rank_response']) 92 | # item['ranks'] = item['rank_response'][first_digit_idx:item['rank_response'].index("\n")].split(" ") 93 | item['ranks'] = [int(rank) for rank in item['ranks'].groups()] 94 | except Exception: 95 | print(item['ranks']) 96 | for j, cand in enumerate(item['candidates']): 97 | cand['scores']['gpt_rank_{}'.format( 98 | model_name)] = - item['ranks'][j] 99 | model_ranks[cand['source']].append(item['ranks'][j]) 100 | best_model_idxs.append(item['ranks'][0]) 101 | 102 | print(Counter(best_model_idxs)) 103 | for model, ranks in model_ranks.items(): 104 | c = Counter(ranks) 105 | print(model, sorted(c.items(), key=lambda x: x[0])) 106 | with open(output_file, "w") as f: 107 | json.dump(data, f, indent=4, ensure_ascii=False) 108 | logging.warning(f"Saved to {output_file}") 109 | 110 | 111 | if __name__ == "__main__": 112 | fire.Fire(main) 113 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/lfqa_gpt_rate.sh: -------------------------------------------------------------------------------- 1 | model_name="gpt-4" 2 | if [ ${model_name} == "gpt-4" ]; then 3 | export OPENAI_API_KEY= 4 | export OPENAI_API_BASE="" 5 | export OPENAI_API_TYPE="azure" 6 | export OPENAI_API_VERSION="2023-07-01-preview" 7 | fi 8 | 9 | python lfqa_gpt_rate.py \ 10 | --input_file "../../data_bak/lfqa/test.json" \ 11 | --output_file 
"../../data_bak/lfqa/test.${model_name}.rank.json" \ 12 | --model_name ${model_name} \ -------------------------------------------------------------------------------- /tigerscore/eval_scripts/mathqa_rate.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file isn't used in our final version. 3 | """ 4 | import sys 5 | import fire 6 | import json 7 | import logging 8 | import regex as re 9 | import copy 10 | import random 11 | sys.path.append("..") 12 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 13 | from typing import List, Dict 14 | from string import Template 15 | from collections import Counter, defaultdict 16 | logging.basicConfig(level=logging.WARNING) 17 | 18 | template = """ 19 | ${instruction} 20 | ${source} 21 | 22 | A correct output is: 23 | ${reference} 24 | 25 | A model generated output is: 26 | ${model1_generated} 27 | 28 | Now please evaluate the errors in the model-generated outputs 29 | For each error associated with problem understanding, problem formulation, computing accuracy, and solution interpretation, reduce 1 or 2 score. 30 | Finally give me a total reductions of score as the evaluation of this model-generated output starting with "Total Score Reduction: ". 31 | """ 32 | 33 | 34 | def get_prompts( 35 | item: dict 36 | ): 37 | prompts = [] 38 | random.shuffle(item['candidates']) 39 | for cand in item['candidates']: 40 | prompt = Template(template).substitute( 41 | instruction=item['instruction'].strip("\n "), 42 | source=item['input'].strip("\n "), 43 | reference=(item.get('output') or item.get("refs")[0]).strip("\n "), 44 | model1_generated=cand['text'].strip("\n "), 45 | ) 46 | prompts.append(prompt) 47 | return prompts 48 | 49 | def main( 50 | input_file: str, 51 | output_file: str, 52 | seed: int = 42, 53 | model_name: str = "ChatGPT", 54 | ): 55 | random.seed(seed) 56 | with open(input_file, "r") as f: 57 | data = json.load(f) 58 | 59 | prompts = list(map(get_prompts, data)) 60 | flatten_prompts = [prompt for prompts_ in prompts for prompt in prompts_] 61 | chatmls = [[{"role":"system","content":"You are an helpful AI assistant to help user find information."}, 62 | {"role":"user","content": prompt}] for prompt in flatten_prompts] 63 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls] 64 | 65 | decoding_kwargs = { 66 | # "max_tokens": 1024, 67 | "temperature": 0, 68 | "top_p": 1.0, 69 | "timeout": 30, 70 | "request_timeout": 30 71 | } 72 | results = openai_completions(chatml_prompts, model_name=model_name, **decoding_kwargs) 73 | logging.warning("Total price: {:.4f}$".format(sum(results['price_per_example']))) 74 | completions = results['completions'] 75 | 76 | idx = 0 77 | for i, item in enumerate(data): 78 | for j, cand in enumerate(item['candidates']): 79 | total_score_reduction = re.search("Total Score Reduction: (\d+)", completions[idx]) 80 | if not total_score_reduction: 81 | total_score_reduction = re.search("Total Score Reduction: -(\d+)", completions[idx]) 82 | if not total_score_reduction: 83 | total_score_reduction = re.search("Total Score Reduction is (\d+)", completions[idx]) 84 | if not total_score_reduction: 85 | total_score_reduction = re.search("Total Score Reduction is -(\d+)", completions[idx]) 86 | if total_score_reduction: 87 | cand['scores']['gpt_score_reduction'] = - abs(int(total_score_reduction.groups()[0])) 88 | else: 89 | pass 90 | cand['scores']['gpt_score_reduction'] = 0 91 | cand['gpt_score_output'] = completions[idx] 92 | idx += 1 93 
| with open(output_file, "w") as f: 94 | json.dump(data, f, indent=4, ensure_ascii=False) 95 | logging.warning(f"Saved to {output_file}") 96 | 97 | if __name__ == "__main__": 98 | fire.Fire(main) -------------------------------------------------------------------------------- /tigerscore/eval_scripts/test_ref_diff.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file isn't used in our final version. 3 | Calculate the distance between our score and the reference score. 4 | Maybe Pearson is better. Or we can draw a QQ plot. 5 | """ 6 | import json 7 | import random 8 | import logging 9 | import sys 10 | import numpy as np 11 | import pickle 12 | from pathlib import Path 13 | from utils import MyCorrelation 14 | sys.path.append(str(Path(__file__).parent.parent)) 15 | from xgptscore.xgptscore import xgptscore 16 | from itertools import chain 17 | from collections import Counter 18 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json_per_aspect 19 | from xgptscore.constants import EVAL_ASPECTS 20 | logging.basicConfig(level=logging.INFO) 21 | 22 | # params 23 | task='data2text' 24 | bart_version="D2T" 25 | dataset="SFHOT" 26 | data_dir="../../BARTScore" 27 | xgptscore_mode="d2t" 28 | version_key=f"{xgptscore_mode}.ref.end_1_5" 29 | our_score_name="xgptscore" 30 | model_name="ChatGPT" 31 | overwrite=False 32 | max_size=200 # set to None to use all examples 33 | num_sys=2 34 | if isinstance(max_size, int) and max_size > 0: 35 | version_key = f"{version_key}_{max_size}" 36 | 37 | # load data 38 | input_file=Path(f"{data_dir}/{bart_version}/{dataset}/final_p_with_xgptscore.json") 39 | if version_key: 40 | output_file = input_file.with_suffix(f".{version_key}.json") 41 | else: 42 | output_file = input_file.with_suffix(f".default.json") 43 | 44 | if not output_file.exists() or overwrite: 45 | # Load and shuffle data 46 | logging.info("Loading from {}".format(input_file)) 47 | with open(input_file, "r") as f: 48 | items = json.load(f) 49 | if isinstance(max_size, int) and max_size > 0: 50 | items = items[:max_size] 51 | # random will cause wrong results 52 | 53 | # Data processing 54 | xgptitems = [] 55 | for item in items: 56 | item['candidates'] = [ 57 | { 58 | "model": "reference", 59 | "decoding_method": "greedy", 60 | "text": item['output'] if isinstance(item['output'], str) else item['output'][0], 61 | "scores": {}, 62 | } 63 | ] 64 | xgptitems.append(XPGTItem( 65 | task=task, 66 | instruction=item['instruction'], 67 | input=item['input'], 68 | # ref_output=item['output'], 69 | ref_output="N/A", 70 | hypo_output=item['output'] if isinstance(item['output'], str) else item['output'][0], 71 | )) 72 | # Run xgptscore 73 | result = xgptscore(xgptitems, mode=xgptscore_mode, model_name=model_name,num_workers=5) 74 | idx = 0 75 | aspects = EVAL_ASPECTS[task].keys() 76 | score_dict = {"xgptscore_"+aspect: 0 for aspect in aspects} 77 | for item in items: 78 | for cand in item['candidates']: 79 | cand['responses'] = result['round_completions'][idx] 80 | cand['messages_records'] = result['messages_records'][idx] 81 | xgptscore_ans = get_xgptscore_from_json_per_aspect(cand['responses'][-1]) 82 | if xgptscore_ans is None: 83 | logging.info(f"XGPTScore failed for {cand['text']}") 84 | # cand['scores']['xgptscore'] = None 85 | else: 86 | cand['scores'].update(score_dict) 87 | cand['scores'].update(xgptscore_ans) 88 | idx += 1 89 | 90 | # Save results 91 | with open(output_file, "w") as f: 92 | json.dump(items, f, indent=4, 
ensure_ascii=False) 93 | logging.info("Saved to {}".format(output_file)) 94 | else: 95 | logging.info("Loading existing results from {}".format(output_file)) 96 | with open(output_file, "r") as f: 97 | items = json.load(f) 98 | 99 | 100 | # by system 101 | # Compute bias 102 | xgptscores = [] 103 | for item in items: 104 | for cand in item['candidates']: 105 | if our_score_name in cand['scores']: 106 | xgptscores.append(cand['scores'][our_score_name]) 107 | 108 | print(f"Mean: {np.mean(xgptscores)}") 109 | print(f"Distribution: {Counter(xgptscores)}") 110 | print(f"Std: {np.std(xgptscores)}") 111 | print(f"Max: {np.min(xgptscores)}") -------------------------------------------------------------------------------- /tigerscore/eval_scripts/test_xgptscore.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to test xgptscore for prompt engineering. 3 | """ 4 | 5 | from common import str2bool 6 | from xgptscore.xgptscore import xgptscore 7 | from itertools import chain 8 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json 9 | import json 10 | import logging 11 | import sys 12 | import numpy as np 13 | import fire 14 | from pathlib import Path 15 | from utils import MyCorrelation 16 | sys.path.append(str(Path(__file__).parent.parent)) 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def main(input_file: str, task: str, model_name: str, output_file: str, xgptscore_mode: str = "prompt", max_size: int = None, overwrite: str = "false"): 21 | overwrite = str2bool(overwrite) 22 | if output_file is None: 23 | output_file = Path(input_file).parent / \ 24 | (Path(input_file).stem + "." + xgptscore_mode + ".json") 25 | if not output_file.exists() or overwrite: 26 | logging.info("Loading from {}".format(input_file)) 27 | with open(input_file, "r") as f: 28 | items = json.load(f) 29 | np.random.seed(42) 30 | np.random.shuffle(items) 31 | if isinstance(max_size, int) and max_size > 0: 32 | items = items[:max_size] 33 | 34 | # Data processing 35 | xgptitems = [] 36 | for item in items: 37 | for cand in item['candidates']: 38 | xgptitems.append(XPGTItem( 39 | task=task, 40 | instruction=item['instruction'], 41 | input=item['input'], 42 | ref_output=item['output'], 43 | hypo_output=cand['text'] 44 | )) 45 | # Run xgptscore 46 | result = xgptscore(xgptitems, mode=xgptscore_mode, 47 | model_name=model_name, num_workers=5) 48 | idx = 0 49 | for item in items: 50 | for cand in item['candidates']: 51 | cand['responses'] = result['round_completions'][idx] 52 | cand['messages_records'] = result['messages_records'][idx] 53 | cand['scores']['xgptscore'] = get_xgptscore_from_json( 54 | cand['responses'][-1]) 55 | idx += 1 56 | 57 | # Save results 58 | with open(output_file, "w") as f: 59 | json.dump(items, f, indent=4, ensure_ascii=False) 60 | logging.info("Saved to {}".format(output_file)) 61 | else: 62 | logging.info("Loading existing results from {}".format(output_file)) 63 | with open(output_file, "r") as f: 64 | items = json.load(f) 65 | 66 | # evaluate system 67 | 68 | num_cands = len(items[0]['candidates']) 69 | human_scores = [[cand['scores']["rank"] 70 | for cand in item['candidates']] for item in items] 71 | human_scores = list(chain(*zip(*human_scores))) # transpose and flatten 72 | metrics = ["xgptscore", "bleu", "rouge1", "rouge2", 73 | "rougeL", "rougeLsum", "bart_score", "bart_score_cnn"] 74 | # metrics = ["xgptscore"] 75 | 76 | Pearson_corr = {} 77 | Spearman_corr = {} 78 | Kendall_corr = {} 79 | for metric in 
metrics: 80 | metric_scores = [[cand['scores'][metric] 81 | for cand in item['candidates']] for item in items] 82 | metric_scores = list(chain(*zip(*metric_scores)) 83 | ) # transpose and flatten 84 | metric_corr = MyCorrelation(num_cands, human_scores, metric_scores) 85 | Pearson_corr[metric] = metric_corr.Pearson() 86 | Spearman_corr[metric] = metric_corr.Spearman() 87 | Kendall_corr[metric] = metric_corr.Kendall() 88 | 89 | # sort Corr 90 | Pearson_corr = {k: v for k, v in sorted( 91 | Pearson_corr.items(), key=lambda item: item[1][0], reverse=True)} 92 | Spearman_corr = {k: v for k, v in sorted( 93 | Spearman_corr.items(), key=lambda item: item[1][0], reverse=True)} 94 | Kendall_corr = {k: v for k, v in sorted( 95 | Kendall_corr.items(), key=lambda item: item[1][0], reverse=True)} 96 | Corr_record = { 97 | "Pearson": Pearson_corr, 98 | "Spearman": Spearman_corr, 99 | "Kendall": Kendall_corr, 100 | } 101 | # Save correlation results 102 | corr_results_file = Path("./eval_results/") / \ 103 | (output_file.stem + ".corr.json") 104 | corr_results_file.parent.mkdir(parents=True, exist_ok=True) 105 | with open(corr_results_file, "w") as f: 106 | json.dump(Corr_record, f, indent=4, ensure_ascii=False) 107 | logging.info("Saved to {}".format(corr_results_file)) 108 | # save to another location 109 | corr_results_file = output_file.parent / \ 110 | "eval_results" / (output_file.stem + ".corr.json") 111 | corr_results_file.parent.mkdir(parents=True, exist_ok=True) 112 | with open(corr_results_file, "w") as f: 113 | json.dump(Corr_record, f, indent=4, ensure_ascii=False) 114 | logging.info("Saved to {}".format(corr_results_file)) 115 | # print("Correlation results:") 116 | # print(json.dumps(Corr_record, indent=4, ensure_ascii=False)) 117 | 118 | 119 | if __name__ == "__main__": 120 | fire.Fire(main) 121 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/test_xgptscore.sh: -------------------------------------------------------------------------------- 1 | model_name="chatgpt" 2 | 3 | ## Summarization ## 4 | input_file="../../data/evaluation/summarization/summeval/test_data_prepared.json" 5 | python ./test_xgptscore.py \ 6 | --input_file $input_file \ 7 | --task "summarization" \ 8 | --model_name $model_name 9 | 10 | ## Translation ## 11 | input_file="../../data/evaluation/translation/test_data_prepared.json" 12 | python ./test_xgptscore.py \ 13 | --input_file $input_file \ 14 | --task "translation" \ 15 | --model_name $model_name 16 | 17 | ## Data2Text ## 18 | input_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json" 19 | python ./test_xgptscore.py \ 20 | --input_file $input_file \ 21 | --task "data2text" \ 22 | --model_name $model_name 23 | 24 | ## Instructions ## 25 | input_file="../../data/evaluation/instructions/just-eval-instruct/test_data_prepared.json" 26 | python ./test_xgptscore.py \ 27 | --input_file $input_file \ 28 | --task "instructions" \ 29 | --model_name $model_name 30 | 31 | ## Long Form QA ## 32 | input_file="../../data/evaluation/lfqa/test_data_prepared.json" 33 | python ./test_xgptscore.py \ 34 | --input_file $input_file \ 35 | --task "long-form QA" \ 36 | --model_name $model_name 37 | 38 | ## Math QA ## 39 | input_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json" 40 | python ./test_xgptscore.py \ 41 | --input_file $input_file \ 42 | --task "mathQA" \ 43 | --model_name $model_name 44 | 45 | ## Story Generation ## 46 | input_file="../../data/evaluation/storygen/test_data_prepared.json" 47 
| python ./test_xgptscore.py \ 48 | --input_file $input_file \ 49 | --task "story_generation" \ 50 | --model_name $model_name -------------------------------------------------------------------------------- /tigerscore/eval_scripts/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from mt_metrics_eval.stats import Correlation 3 | from typing import List 4 | 5 | 6 | class MyCorrelation(Correlation): 7 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 8 | # remove nan in metrics scores 9 | none_metric_scores_idxs = [idx for idx, 10 | x in enumerate(metric_scores) if x is None] 11 | logging.info("Remove {} nan scores from {} scores".format( 12 | len(none_metric_scores_idxs), 13 | len(metric_scores) 14 | )) 15 | gold_scores = gold_scores.copy() 16 | # set gold scores to None if metric scores are None 17 | for idx in none_metric_scores_idxs[::-1]: 18 | gold_scores[idx] = None 19 | super().__init__(num_sys, gold_scores, metric_scores) 20 | -------------------------------------------------------------------------------- /tigerscore/finetune/ds_llama_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "scheduler": { 6 | "type": "WarmupLR", 7 | "params": { 8 | "warmup_min_lr": "auto", 9 | "warmup_max_lr": "auto", 10 | "warmup_num_steps": "auto" 11 | } 12 | }, 13 | "zero_optimization": { 14 | "stage": 3, 15 | "overlap_comm": true, 16 | "contiguous_gradients": true, 17 | "sub_group_size": 1e9, 18 | "reduce_bucket_size": "auto", 19 | "stage3_prefetch_bucket_size": "auto", 20 | "stage3_param_persistence_threshold": "auto", 21 | "stage3_max_live_parameters": 1e9, 22 | "stage3_max_reuse_distance": 1e9, 23 | "stage3_gather_16bit_weights_on_model_save": true 24 | }, 25 | "gradient_accumulation_steps": "auto", 26 | "gradient_clipping": "auto", 27 | "steps_per_print": 1, 28 | "train_batch_size": "auto", 29 | "train_micro_batch_size_per_gpu": "auto", 30 | "wall_clock_breakdown": false 31 | } -------------------------------------------------------------------------------- /tigerscore/finetune/finetune_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=llama_finetune 3 | #SBATCH -c 10 4 | #SBATCH --partition=a100 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem=100G 8 | #SBATCH --output=../../jobs/%x/%j.out 9 | 10 | nvidia-smi 11 | MASTER_PORT=4637 12 | MODEL_DIR="meta-llama/Llama-2-7b-hf" # 13b 13 | run_name="llama.train_mix.check.clean.mathQA" # change this every time you run a new experiment 14 | 15 | output_dir="../../outputs/${MODEL_DIR}/${run_name}" 16 | 17 | train_data_path="../../data/train_mix.check.clean.mathQA.format_v2.json" # 18 | 19 | mkdir -p ${output_dir} 20 | 21 | # slurm system gpus can't connect to each other by default 22 | # set the following environment variables to enable nccl 23 | export NCCL_IB_DISABLE=1; 24 | export NCCL_P2P_DISABLE=1; 25 | 26 | export NCCL_DEBUG=INFO; 27 | export NCCL_SOCKET_IFNAME=en,eth,em,bond; 28 | export CXX=g++; 29 | 30 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128 31 | # epoch size: alpaca using 3 epochs for 52k data 32 | # epoch size: translation data size, only 8k 33 | # epoch szie: sum, data2text, trans, 30k, epoch_size = 4 34 | 35 | # deepspeed \ 36 | # --num_gpus 4 \ 37 | # --num_nodes 1 \ 38 | # --master_port ${MASTER_PORT} \ 39 | 
# train.py \ 40 | # --model_name_or_path ${MODEL_DIR} \ 41 | # --train_data_path ${train_data_path} \ 42 | # --bf16 True \ 43 | # --output_dir ${output_dir} \ 44 | # --num_train_epochs 3 \ 45 | # --per_device_train_batch_size 2 \ 46 | # --per_device_eval_batch_size 2 \ 47 | # --gradient_accumulation_steps 16 \ 48 | # --model_max_length 1024 \ 49 | # --evaluation_strategy "no" \ 50 | # --save_strategy "epoch" \ 51 | # --save_steps 200 \ 52 | # --save_total_limit 1 \ 53 | # --learning_rate 2e-5 \ 54 | # --weight_decay 0. \ 55 | # --warmup_ratio 0.1 \ 56 | # --lr_scheduler_type "cosine" \ 57 | # --logging_steps 2 \ 58 | # --tf32 True \ 59 | # --deepspeed ds_llama_config.json \ 60 | # --run_name ${run_name} \ 61 | # --seed 42 \ 62 | # --is_lora False \ 63 | 64 | CUDA_VISIBLE_DEVICES=0,1,2,3 deepspeed \ 65 | --num_gpus 4 \ 66 | --num_nodes 1 \ 67 | --master_port ${MASTER_PORT} \ 68 | train.py \ 69 | --model_name_or_path ${MODEL_DIR} \ 70 | --train_data_path ${train_data_path} \ 71 | --bf16 True \ 72 | --output_dir ${output_dir} \ 73 | --num_train_epochs 3 \ 74 | --per_device_train_batch_size 1 \ 75 | --per_device_eval_batch_size 2 \ 76 | --gradient_accumulation_steps 32 \ 77 | --model_max_length 1024 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "epoch" \ 80 | --save_steps 64 \ 81 | --save_total_limit 6 \ 82 | --learning_rate 2e-5 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.1 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 2 \ 87 | --tf32 True \ 88 | --deepspeed ds_llama_config.json \ 89 | --run_name ${run_name} \ 90 | --seed 42 \ 91 | --is_lora False \ 92 | 93 | # # LIMA config 94 | # deepspeed \ 95 | # --num_gpus 4 \ 96 | # --num_nodes 1 \ 97 | # --master_port ${MASTER_PORT} \ 98 | # train.py \ 99 | # --model_name_or_path ${MODEL_DIR} \ 100 | # --train_data_path ${train_data_path} \ 101 | # --bf16 True \ 102 | # --output_dir ${output_dir} \ 103 | # --num_train_epochs 15 \ 104 | # --per_device_train_batch_size 1 \ 105 | # --per_device_eval_batch_size 2 \ 106 | # --gradient_accumulation_steps 32 \ 107 | # --model_max_length 1024 \ 108 | # --evaluation_strategy "no" \ 109 | # --save_strategy "epoch" \ 110 | # --save_steps 200 \ 111 | # --save_total_limit 1 \ 112 | # --learning_rate 1e-5 \ 113 | # --adam_beta1 0.9 \ 114 | # --adam_beta2 0.95 \ 115 | # --weight_decay 0.1 \ 116 | # --warmup_ratio 0. 
\ 117 | # --lr_scheduler_type "linear" \ 118 | # --logging_steps 2 \ 119 | # --tf32 True \ 120 | # --deepspeed ds_llama_config.json \ 121 | # --run_name ${run_name} \ 122 | # --seed 42 \ 123 | # --is_lora False \ -------------------------------------------------------------------------------- /tigerscore/finetune/finetune_mistral.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=llama_finetune 3 | #SBATCH -c 10 4 | #SBATCH --partition=a100 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem=100G 8 | #SBATCH --output=../../jobs/%x/%j.out 9 | 10 | nvidia-smi 11 | MASTER_PORT=4637 12 | MODEL_DIR="mistralai/Mistral-7B-v0.1" # 13b 13 | run_name="train_mix.check_ChatGPT.clean" # change this every time you run a new experiment 14 | 15 | output_dir="../../outputs/${MODEL_DIR}/${run_name}" 16 | train_data_path="../../data/train_mix.check_ChatGPT.clean.format_v2.json" # 17 | 18 | mkdir -p ${output_dir} 19 | 20 | # slurm system gpus can't connect to each other by default 21 | # set the following environment variables to enable nccl 22 | export NCCL_IB_DISABLE=1; 23 | export NCCL_P2P_DISABLE=1; 24 | 25 | export NCCL_DEBUG=INFO; 26 | export NCCL_SOCKET_IFNAME=en,eth,em,bond; 27 | export CXX=g++; 28 | 29 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128 30 | # epoch size: alpaca using 3 epochs for 52k data 31 | # epoch size: translation data size, only 8k 32 | # epoch szie: sum, data2text, trans, 30k, epoch_size = 4 33 | 34 | CUDA_VISIBLE_DEVICES="0,1,2,3" deepspeed \ 35 | --num_gpus 4 \ 36 | --num_nodes 1 \ 37 | --master_port ${MASTER_PORT} \ 38 | train.py \ 39 | --model_name_or_path ${MODEL_DIR} \ 40 | --train_data_path ${train_data_path} \ 41 | --bf16 True \ 42 | --output_dir ${output_dir} \ 43 | --num_train_epochs 3 \ 44 | --per_device_train_batch_size 1 \ 45 | --per_device_eval_batch_size 2 \ 46 | --gradient_accumulation_steps 32 \ 47 | --model_max_length 1024 \ 48 | --evaluation_strategy "no" \ 49 | --save_strategy "epoch" \ 50 | --save_steps 64 \ 51 | --save_total_limit 6 \ 52 | --learning_rate 2e-5 \ 53 | --weight_decay 0. \ 54 | --warmup_ratio 0.1 \ 55 | --lr_scheduler_type "cosine" \ 56 | --logging_steps 2 \ 57 | --tf32 True \ 58 | --deepspeed ds_llama_config.json \ 59 | --run_name ${run_name} \ 60 | --seed 42 \ 61 | --is_lora False \ -------------------------------------------------------------------------------- /tigerscore/finetune/format_data_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: transforms the xgptscore data format into the alpaca data format for finetuning. 3 | 4 | """ 5 | import sys 6 | import os 7 | sys.path.append("../") 8 | templates_path = os.path.join(os.path.dirname(__file__), "..") 9 | sys.path.append(templates_path) 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer 12 | from common.datasets_config import DATASETS_CONFIG 13 | from pathlib import Path 14 | from string import Template 15 | import json 16 | import logging 17 | import fire 18 | import regex as re 19 | import numpy as np 20 | from collections import Counter 21 | from itertools import chain 22 | 23 | 24 | # FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task." 
25 | # FINETUNE_INPUT = """\ 26 | # Task instruction: ${generation_instruction} 27 | # Source: ${input_context} 28 | # Model-generated Output: ${hypothesis_output} 29 | 30 | # Based on the given task instruction and source, identify the major and minor errors in this model-generated output. 31 | # Note that Major errors refer to actual errors that affects the task severely, and Minor errors refer to small imperfections, and purely subjective opinions about the output. 32 | # For each error you give in the response, please also elaborate the following information: 33 | # - error location (the words that are wrong in the output) 34 | # - error aspect it belongs to. 35 | # - explanation why it's an error, and the correction suggestions. 36 | # - severity of the error ("Major" or "Minor"). 37 | # - reduction of score (between 0.5 and 5) 38 | 39 | # Your evaluation output in the json format: 40 | # """ 41 | INST = "You are evaluating errors in a model-generated output for a given instruction." 42 | TEMPLATE = """\ 43 | Instruction: 44 | ${generation_instruction} 45 | ${input_context} 46 | 47 | Model-generated Output: 48 | ${hypothesis_output} 49 | 50 | For each error you give in the response, please also elaborate the following information: 51 | - error location (the words that are wrong in the output) 52 | - error aspect it belongs to. 53 | - explanation why it's an error, and the correction suggestions. 54 | - severity of the error ("Major" or "Minor"). 55 | - reduction of score (between 0.5 and 5 given the severity of the error) 56 | 57 | Your evaluation output:\ 58 | """ 59 | 60 | def main( 61 | seed: int = 42, 62 | input_file: str = None, 63 | output_file: str = None, 64 | overwrite: bool = False, 65 | max_eval_input_length: int = None, 66 | max_eval_hyp_length: int = None, 67 | max_eval_output_length: int = None, 68 | ): 69 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") 70 | 71 | with open(input_file, 'r') as f: 72 | if input_file.endswith(".json"): 73 | data = json.load(f) 74 | elif input_file.endswith(".jsonl"): 75 | data = [json.loads(line) for line in f] 76 | formatted_data = [] 77 | for item in data: 78 | inst = INST 79 | input_ = Template(TEMPLATE).substitute( 80 | generation_instruction=item['instruction'], 81 | input_context=item['input_context'], 82 | hypothesis_output=item['hypo_output'] 83 | ) 84 | output_ = item['errors'] 85 | formatted_data.append({ 86 | "instruction": inst, 87 | "input": input_, 88 | "output": output_, 89 | }) 90 | 91 | with open(output_file, 'w') as f: 92 | json.dump(formatted_data, f, indent=4, ensure_ascii=False) 93 | logging.info(f"Saved to {output_file}") 94 | 95 | # count the dataset statistics 96 | dataset_statistics = {} 97 | dataset_statistics["#total"] = len(formatted_data) 98 | dataset_statistics["#unique input"] = len( 99 | set([item["input"] for item in formatted_data])) 100 | input_lens = [len(tokenizer.encode(item["input"])) 101 | for item in tqdm(formatted_data, desc="Counting input length")] 102 | output_lens = [len(tokenizer.encode(item["output"])) 103 | for item in tqdm(formatted_data, desc="Counting output length")] 104 | total_lens = [x + y for x, y in zip(input_lens, output_lens)] 105 | dataset_statistics["input_length"] = {} 106 | dataset_statistics["input_length"]["mean"] = np.mean(input_lens).item() 107 | dataset_statistics["input_length"]["percentile"] = np.percentile( 108 | input_lens, [0, 25, 50, 90, 100]).tolist() 109 | dataset_statistics["input_length"]["max"] = max(input_lens) 110 | 
dataset_statistics["input_length"]["min"] = min(input_lens) 111 | dataset_statistics["output_length"] = {} 112 | dataset_statistics["output_length"]["mean"] = np.mean(output_lens).item() 113 | dataset_statistics["output_length"]["percentile"] = np.percentile( 114 | output_lens, [0, 25, 50, 90, 100]).tolist() 115 | dataset_statistics["output_length"]["max"] = max(output_lens) 116 | dataset_statistics["output_length"]["min"] = min(output_lens) 117 | dataset_statistics["total_length"] = {} 118 | dataset_statistics["total_length"]["mean"] = np.mean(total_lens).item() 119 | dataset_statistics["total_length"]["percentile"] = np.percentile( 120 | total_lens, [0, 25, 50, 90, 100]).tolist() 121 | dataset_statistics["total_length"]["max"] = max(total_lens) 122 | dataset_statistics["total_length"]["min"] = min(total_lens) 123 | error_aspects = [re.findall( 124 | r'(?<=Error aspect \d+: )[ \w]+', item['output']) for item in formatted_data] 125 | error_aspects = list(chain(*error_aspects)) 126 | dataset_statistics["error_aspects_distribution"] = Counter(error_aspects) 127 | 128 | num_errors = [len(re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', 129 | item['output'])) for item in formatted_data] 130 | dataset_statistics["num_errors_distribution"] = Counter(num_errors) 131 | # severity distributions 132 | severities = [re.findall( 133 | r'(?<=Severity \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 134 | severities = list(chain(*severities)) 135 | dataset_statistics["severity_distribution"] = Counter(severities) 136 | # score reduction distributions 137 | score_reductions = [re.findall( 138 | r'(?<=Score reduction \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 139 | score_reductions = list(chain(*score_reductions)) 140 | score_reductions = [abs(float(x.replace(" ", ""))) 141 | for x in score_reductions] 142 | dataset_statistics["score_reduction_distribution"] = Counter( 143 | score_reductions) 144 | 145 | print(dataset_statistics) 146 | output_file = Path(output_file).with_suffix(".statistics.json") 147 | with open(output_file, "w") as f: 148 | json.dump(dataset_statistics, f, indent=4, ensure_ascii=False) 149 | logging.info(f"Saved statistics to {output_file}") 150 | 151 | 152 | if __name__ == "__main__": 153 | logging.basicConfig(level=logging.INFO) 154 | fire.Fire(main) 155 | -------------------------------------------------------------------------------- /tigerscore/finetune/format_data_v2.sh: -------------------------------------------------------------------------------- 1 | # INPUT_FILE="../../data/train_mix.check.clean.jsonl" 2 | # OUTPUT_FILE="../../data/train_mix.check.clean.format_v2.json" 3 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 4 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 5 | 6 | # INPUT_FILE="../../data/train_mix.jsonl" 7 | # OUTPUT_FILE="../../data/train_mix.format_v2.json" 8 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 9 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 10 | 11 | # tasks=('data2text' 'instruction-following' 'long-form QA' 'mathQA' 'summarization' 'translation') 12 | # for task in "${tasks[@]}"; do 13 | # INPUT_FILE="../../data/train_mix.${task}.jsonl" 14 | # OUTPUT_FILE="../../data/train_mix.${task}.format_v2.json" 15 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 16 | # --max_eval_input_length 600 --max_eval_hyp_length 
400 --max_eval_output_length 400 17 | # done 18 | 19 | INPUT_FILE="../../data/train_mix.check.clean.mathQA.jsonl" 20 | OUTPUT_FILE="../../data/train_mix.check.clean.mathQA.format_v2.json" 21 | python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 22 | --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 23 | 24 | # INPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.8k.gen.jsonl" 25 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.8k.gen.format_v2.json" 26 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 27 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 28 | 29 | # INPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.2k.jsonl" 30 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.2k.format_v2.json" 31 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 32 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 33 | 34 | # INPUT_FILE="../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.clean.jsonl" 35 | # OUTPUT_FILE="../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.clean.format_v2.jsonl" 36 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 37 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 38 | 39 | # INPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.ref.extracted.jsonl" 40 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.ref.extracted.format_v2.jsonl" 41 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 42 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 43 | 44 | # INPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.extracted.jsonl" 45 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.extracted.format_v2.jsonl" 46 | # # INPUT_FILE="TIGERScore/data/32k_final.json" 47 | # # OUTPUT_FILE="TIGERScore/data/32k_final_distill.json" 48 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 49 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 50 | 51 | # INPUT_FILE="../../data/additional/metamath/metamath.1k.ref.extracted.jsonl" 52 | # OUTPUT_FILE="../../data/additional/metamath/metamath.1k.ref.extracted.format_v2.jsonl" 53 | # # INPUT_FILE="TIGERScore/data/32k_final.json" 54 | # # OUTPUT_FILE="TIGERScore/data/32k_final_distill.json" 55 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 56 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 -------------------------------------------------------------------------------- /tigerscore/finetune/format_distill_data.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR="../../data" 2 | 3 | # # transllation 4 | # INPUT_FILE="${DATA_DIR}/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.json" 5 | # OUTPUT_FILE="${DATA_DIR}/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.format_txt.json" 6 | # python format_distill_data.py --task "translation" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 7 | 8 | # # summarization 9 | # 
INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.json" 10 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.format_txt.json" 11 | # python format_distill_data.py --task "summarization" \ 12 | # --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 13 | # --max_eval_input_length 400 --max_eval_hyp_length 300 --max_eval_output_length 400 \ 14 | 15 | # # data2text 16 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/d2t/train_data.d2t.filter_v1.json" 17 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/d2t/train_data.d2t.filter_v1.format_txt.json" 18 | # python format_distill_data.py --task "data2text" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 19 | # --max_eval_input_length 400 --max_eval_hyp_length 400 --max_eval_output_length 400 \ 20 | # # long-form QA 21 | 22 | # # SEScore3 zh-en debug 23 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore/data/sescore3/sescore3_zh_en_llama_formatted_data.json" 24 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore/data/sescore3/sescore3_zh_en_llama_formatted_data.format_txt.json" 25 | # python format_distill_data.py --task "translation" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 26 | # # --max_eval_input_length 400 --max_eval_hyp_length 400 --max_eval_output_length 400 \ 27 | 28 | # # summarization v3 29 | # INPUT_FILE="../../data/sum/train_data.align_score.filter_v3.json" 30 | # OUTPUT_FILE="../../data/sum/train_data.align_score.filter_v3.format_txt.json" 31 | # python format_distill_data.py --task "summarization" \ 32 | # --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 33 | # --max_eval_input_length 400 --max_eval_hyp_length 300 --max_eval_output_length 400 \ 34 | 35 | 36 | IFS=$'\n' 37 | tasks=("translation" "long-form QA" "summarization" "data2text" "instruction-following") 38 | for task in ${tasks[@]}; do 39 | INPUT_FILE="../../data/real_world/${task}.json" 40 | OUTPUT_FILE="../../data/real_world/${task}.format_txt.json" 41 | python format_distill_data.py --task ${task} \ 42 | --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 43 | --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 44 | done 45 | -------------------------------------------------------------------------------- /tigerscore/finetune/format_synthesis_distill_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: transforms the xgptscore data format into the alpaca data format for finetuning. 3 | 4 | """ 5 | import sys 6 | import os 7 | sys.path.append("../") 8 | templates_path = os.path.join(os.path.dirname(__file__), "..") 9 | sys.path.append(templates_path) 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer 12 | from common.datasets_config import DATASETS_CONFIG 13 | from pathlib import Path 14 | from string import Template 15 | import json 16 | import logging 17 | import fire 18 | import regex as re 19 | import numpy as np 20 | from collections import Counter 21 | from itertools import chain 22 | 23 | 24 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a given instruction." 
25 | FINETUNE_INPUT = """\ 26 | Instruction: 27 | ${generation_instruction} 28 | ${input_context} 29 | 30 | Model-generated Output: 31 | ${hypothesis_output} 32 | 33 | For each error you give in the response, please also elaborate the following information: 34 | - error location (the words that are wrong in the output) 35 | - error aspect it belongs to. 36 | - explanation why it's an error, and the correction suggestions. 37 | - severity of the error ("Major" or "Minor"). 38 | - reduction of score (between 0.5 and 5 given the severity of the error) 39 | 40 | Your evaluation output:\ 41 | """ 42 | 43 | 44 | def main( 45 | task: str, 46 | seed: int = 42, 47 | input_file: str = None, 48 | output_file: str = None, 49 | overwrite: bool = False, 50 | max_eval_input_length: int = None, 51 | max_eval_hyp_length: int = None, 52 | max_eval_output_length: int = None, 53 | ): 54 | assert task in DATASETS_CONFIG.keys() 55 | 56 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") 57 | 58 | with open(input_file, 'r') as f: 59 | data = json.load(f) 60 | formatted_data = [] 61 | for item in data: 62 | syn_output = item['responses'][-1] 63 | syn_output = syn_output.replace(": \n", ": ") 64 | # decode the synthesis outputs 65 | try: 66 | start_pos = syn_output.index( 67 | "Generated incorrect output: ") + len("Generated incorrect output: ") 68 | end_pos = syn_output.index("\nError location 1") 69 | hyp = syn_output[start_pos:end_pos].strip('\n ') 70 | assert len(hyp) > 0 71 | except Exception: 72 | logging.warning( 73 | "Failed to parse the synthesis output: {}".format(syn_output)) 74 | continue 75 | inst = Template(FINETUNE_INST).substitute(task=task) 76 | input_context_ids = tokenizer.encode( 77 | item['input'], add_special_tokens=False) 78 | hyp_ids = tokenizer.encode(hyp, add_special_tokens=False) 79 | if max_eval_input_length is not None and len(input_context_ids) > max_eval_input_length: 80 | input_context = tokenizer.decode( 81 | input_context_ids[:max_eval_input_length]) + "..." 82 | else: 83 | input_context = item['input'] 84 | if max_eval_hyp_length is not None and len(hyp_ids) > max_eval_hyp_length: 85 | hypothesis_output = tokenizer.decode( 86 | hyp_ids[:max_eval_hyp_length]) + "..." 
87 | else: 88 | hypothesis_output = hyp 89 | input_ = Template(FINETUNE_INPUT).substitute( 90 | generation_instruction=item['instruction'], 91 | input_context=input_context, 92 | hypothesis_output=hypothesis_output, 93 | ) 94 | try: 95 | error_locations = re.findall( 96 | r'(?<=Error location \d+: ).*(?=\n|$)', syn_output) 97 | error_aspects = re.findall( 98 | r'(?<=Error aspect \d+: ).*(?=\n|$)', syn_output) 99 | explanations = re.findall( 100 | r'(?<=Explanation \d+: ).*(?=\n|$)', syn_output) 101 | severities = re.findall( 102 | r'(?<=Severity \d+: ).*(?=\n|$)', syn_output) 103 | score_reductions = re.findall( 104 | r'(?<=Score reduction \d+: ).*(?=\n|$)', syn_output) 105 | score_reductions = [abs(int(x.replace(" ", ""))) 106 | for x in score_reductions] 107 | except Exception: 108 | logging.warning( 109 | "Failed to parse the synthesis output: {}".format(syn_output)) 110 | continue 111 | 112 | if not len(error_locations) == len(error_aspects) == len(explanations) == len(severities) == len(score_reductions): 113 | logging.warning( 114 | "The number of errors properties does not match!: {}".format(syn_output)) 115 | continue 116 | 117 | txt_output = "The model-generated output contains {} errors, with a total score reduction of {}.".format( 118 | len(error_locations), 119 | sum([int(score) for score in score_reductions]), 120 | ) 121 | for i in range(len(error_locations)): 122 | txt_output += "\nError location {}: {}\n".format( 123 | i + 1, error_locations[i]) 124 | txt_output += "Error aspect {}: {}\n".format( 125 | i + 1, error_aspects[i]) 126 | txt_output += "Explanation {}: {}\n".format(i + 1, explanations[i]) 127 | txt_output += "Severity {}: {}\n".format(i + 1, severities[i]) 128 | txt_output += "Score reduction {}: {}".format( 129 | i + 1, score_reductions[i]) 130 | output_ = txt_output.strip(' \n') 131 | formatted_data.append({ 132 | "instruction": inst, 133 | "input": input_, 134 | "output": output_, 135 | "task": task, 136 | }) 137 | 138 | # # append 20% non-error examples 139 | # for item in data: 140 | # if random.random() < 0.2: 141 | # inst = Template(FINETUNE_INST).substitute(task=task) 142 | # input_context_ids = tokenizer.encode(item['input'], add_special_tokens=False) 143 | # if max_eval_input_length is not None and len(input_context_ids) > max_eval_input_length: 144 | # input_context = tokenizer.decode(input_context_ids[:max_eval_input_length]) + "..." 145 | # else: 146 | # input_context = item['input'] 147 | # input_ = Template(FINETUNE_INPUT).substitute( 148 | # generation_instruction=item['instruction'], 149 | # input_context=input_context, 150 | # hypothesis_output=item['output'], 151 | # ) 152 | # output_ = "The model-generated output contains 0 errors, with a total score reduction of 0." 
153 | # formatted_data.append({ 154 | # "instruction": inst, 155 | # "input": input_, 156 | # "output": output_, 157 | # "task": task, 158 | # }) 159 | 160 | with open(output_file, 'w') as f: 161 | json.dump(formatted_data, f, indent=4, ensure_ascii=False) 162 | logging.info(f"Saved to {output_file}") 163 | 164 | # count the dataset statistics 165 | dataset_statistics = {} 166 | dataset_statistics["#total"] = len(formatted_data) 167 | dataset_statistics["#unique input"] = len( 168 | set([item["input"] for item in formatted_data])) 169 | input_lens = [len(tokenizer.encode(item["input"])) 170 | for item in tqdm(formatted_data, desc="Counting input length")] 171 | output_lens = [len(tokenizer.encode(item["output"])) 172 | for item in tqdm(formatted_data, desc="Counting output length")] 173 | total_lens = [x + y for x, y in zip(input_lens, output_lens)] 174 | dataset_statistics["input_length"] = {} 175 | dataset_statistics["input_length"]["mean"] = np.mean(input_lens).item() 176 | dataset_statistics["input_length"]["percentile"] = np.percentile( 177 | input_lens, [0, 25, 50, 90, 100]).tolist() 178 | dataset_statistics["input_length"]["max"] = max(input_lens) 179 | dataset_statistics["input_length"]["min"] = min(input_lens) 180 | dataset_statistics["output_length"] = {} 181 | dataset_statistics["output_length"]["mean"] = np.mean(output_lens).item() 182 | dataset_statistics["output_length"]["percentile"] = np.percentile( 183 | output_lens, [0, 25, 50, 90, 100]).tolist() 184 | dataset_statistics["output_length"]["max"] = max(output_lens) 185 | dataset_statistics["output_length"]["min"] = min(output_lens) 186 | dataset_statistics["total_length"] = {} 187 | dataset_statistics["total_length"]["mean"] = np.mean(total_lens).item() 188 | dataset_statistics["total_length"]["percentile"] = np.percentile( 189 | total_lens, [0, 25, 50, 90, 100]).tolist() 190 | dataset_statistics["total_length"]["max"] = max(total_lens) 191 | dataset_statistics["total_length"]["min"] = min(total_lens) 192 | error_aspects = [re.findall( 193 | r'(?<=Error aspect \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 194 | error_aspects = list(chain(*error_aspects)) 195 | dataset_statistics["error_aspects_distribution"] = Counter(error_aspects) 196 | # number of errors distributions 197 | num_errors = [len(re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', 198 | item['output'])) for item in formatted_data] 199 | dataset_statistics["num_errors_distribution"] = Counter(num_errors) 200 | # severity distributions 201 | severities = [re.findall( 202 | r'(?<=Severity \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 203 | severities = list(chain(*severities)) 204 | dataset_statistics["severity_distribution"] = Counter(severities) 205 | # score reduction distributions 206 | score_reductions = [re.findall( 207 | r'(?<=Score reduction \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 208 | score_reductions = list(chain(*score_reductions)) 209 | score_reductions = [abs(int(x.replace(" ", ""))) for x in score_reductions] 210 | dataset_statistics["score_reduction_distribution"] = Counter( 211 | score_reductions) 212 | 213 | print(dataset_statistics) 214 | output_file = Path(output_file).with_suffix(".statistics.json") 215 | with open(output_file, "w") as f: 216 | json.dump(dataset_statistics, f, indent=4, ensure_ascii=False) 217 | logging.info(f"Saved statistics to {output_file}") 218 | 219 | 220 | if __name__ == "__main__": 221 | logging.basicConfig(level=logging.INFO) 222 | fire.Fire(main) 223 | 
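The serialized error format produced above ("Error location i / Error aspect i / Explanation i / Severity i / Score reduction i") is the same plain-text format that the test scripts later in this directory (e.g. test_llama_vllm.py) parse back into a single number: they regex-extract every score reduction and report the negative sum as the metric value. The snippet below is a minimal, self-contained sketch of that round trip; it is not a file in the repository, and sample_eval_output is an invented model response used purely for illustration. The lookbehind patterns are variable-length, which is why these scripts import the third-party regex package as re rather than the standard-library re module.

import regex as re  # third-party `regex` package, as used throughout the repo (supports variable-length lookbehind)

# Invented model response following the serialized error format shown above (illustration only).
sample_eval_output = (
    "The model-generated output contains 2 errors, with a total score reduction of 4.5.\n"
    "Error location 1: \"in 1995\"\n"
    "Error aspect 1: Factual accuracy\n"
    "Explanation 1: The source says the event happened in 1997, so \"1995\" should be \"1997\".\n"
    "Severity 1: Major\n"
    "Score reduction 1: 4\n"
    "Error location 2: \"recieve\"\n"
    "Error aspect 2: Spelling\n"
    "Explanation 2: \"recieve\" is misspelled and should be \"receive\".\n"
    "Severity 2: Minor\n"
    "Score reduction 2: 0.5"
)

# The same lookbehind patterns used in format_synthesis_distill_data.py and test_llama_vllm.py.
locations = re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', sample_eval_output)
severities = re.findall(r'(?<=Severity \d+: ).*(?=\n|$)', sample_eval_output)
reductions = re.findall(r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", sample_eval_output)

# Final score: the negative sum of all score reductions (an output with no detected errors scores 0).
score = -sum(map(float, reductions))
print(len(locations), severities, score)  # -> 2 ['Major', 'Minor'] -4.5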
-------------------------------------------------------------------------------- /tigerscore/finetune/ft_llama_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ft_llama_lora 3 | #SBATCH --gres=gpu:a6000:1 4 | #SBATCH --time=24:00:00 5 | #SBATCH --qos=general 6 | #SBATCH --output=../../jobs/llama_finetune/%j.out 7 | 8 | MASTER_PORT=4635 9 | MODEL_DIR="meta-llama/Llama-2-7b-hf" # 13b 10 | run_name="model_len_1024_lora_debug" # change this every time you run a new experiment 11 | output_dir="../../outputs/${MODEL_DIR}/${run_name}" 12 | # train_data_path="../../data/wmt/train_data.wmt_mqm.distill.format.json" 13 | train_data_path="../../WorkSpace/ExplainableGPTScore/finetune_data/translation/train.json" 14 | # train_data_path="../../WorkSpace/ExplainableGPTScore/finetune_data/translation/train/wmt18_zh-en.json" 15 | mkdir -p ${output_dir} 16 | 17 | # slurm system gpus can't connect to each other by default 18 | # set the following environment variables to enable nccl 19 | export NCCL_IB_DISABLE=1; 20 | export NCCL_P2P_DISABLE=1; 21 | 22 | export NCCL_DEBUG=INFO; 23 | export NCCL_SOCKET_IFNAME=en,eth,em,bond; 24 | export CXX=g++; 25 | 26 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128 27 | # epoch size: alpaca using 3 epochs for 52k data 28 | # epoch size: translation data size, only 8k 29 | 30 | ../../.conda/envs/llm_reranker/bin/deepspeed \ 31 | --num_gpus 1 \ 32 | --num_nodes 1 \ 33 | --master_port ${MASTER_PORT} \ 34 | train.py \ 35 | --model_name_or_path ${MODEL_DIR} \ 36 | --train_data_path ${train_data_path} \ 37 | --bf16 True \ 38 | --output_dir ${output_dir} \ 39 | --num_train_epochs 3 \ 40 | --per_device_train_batch_size 4 \ 41 | --per_device_eval_batch_size 2 \ 42 | --gradient_accumulation_steps 32 \ 43 | --model_max_length 1024 \ 44 | --evaluation_strategy "no" \ 45 | --save_strategy "epoch" \ 46 | --save_steps 200 \ 47 | --save_total_limit 3 \ 48 | --learning_rate 3e-4 \ 49 | --weight_decay 0. 
\ 50 | --warmup_ratio 0.1 \ 51 | --lr_scheduler_type "linear" \ 52 | --logging_steps 2 \ 53 | --tf32 True \ 54 | --deepspeed ds_llama_config.json \ 55 | --run_name ${run_name} \ 56 | --seed 42 \ 57 | --is_lora True \ 58 | 59 | # lora Config 60 | # lr: 3e-4 -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=test_llama 3 | #SBATCH --gres=gpu:6000:1 4 | #SBATCH --time=24:00:00 5 | #SBATCH --output=../../jobs/test_llama/%j.out 6 | nvidia-smi 7 | 8 | model_name="meta-llama/Llama-2-7b-hf" 9 | outputs_dir="" 10 | 11 | # outputs_dir="../../outputs" 12 | checkpoint_name="model_len_1024_mix_v2" 13 | checkpoint_path="${outputs_dir}/${model_name}/${checkpoint_name}/checkpoint-best" 14 | # task="translation" 15 | # # finetune test 16 | # data_path="/home//WorkSpace/ExplainableGPTScore/finetune_data/${task}/test.json" 17 | 18 | # BARTScore test 19 | # data_path="/home//WorkSpace/ExplainableGPTScore/BARTScore/WMT/zh-en/final_p_with_xgptscore.test_llama_new.json" 20 | 21 | # mtme test mqm 22 | # task="translation" 23 | # human_score_names="mqm,da" 24 | # data_path="../../data/wmt22/zh-en/eval_data.random_2.json" 25 | 26 | # sum test relevance 27 | # task="summarization" 28 | # human_score_names="coherence,consistency,fluency,relevance" 29 | # data_path="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.json" 30 | 31 | # d2t test Correctness 32 | # task="data2text" 33 | # human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure" 34 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/webnlg/webnlg2020_gen_with_scores.json" 35 | 36 | # instruction-following 37 | # rank 38 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/databricks/databricks-dolly-15k/rank_eval_mid.json" 39 | 40 | # task="instruction-following" 41 | # human_score_names="gpt_rank_score" 42 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/llm-blender/mix-instruct/test_data_prepared_300.json" 43 | 44 | # long-form QA 45 | ### ATTENTION the space in the task name is not allowed,you need use --task "long-form QA" instead of --task ${task} 46 | # task="long-form QA" 47 | # human_score_names="rank" 48 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/lfqa/test.json" 49 | 50 | # Math QA 51 | # accuracy 52 | # task="mathQA" 53 | # human_score_names="accuracy" 54 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/gsm8k-ScRel/data/test_acc.json" 55 | 56 | output_path="${data_path}.llama_2_7b_${checkpoint_name}.output" 57 | 58 | # seems batch_size=1 is faster than batch_size=2 or higher 59 | python test_llama.py \ 60 | --model_name_or_path ${checkpoint_path} \ 61 | --task ${task} \ 62 | --data_path ${data_path} \ 63 | --output_path ${output_path} \ 64 | --torch_dtype "bfloat16" \ 65 | --batch_size 1 \ 66 | --human_score_names ${human_score_names} \ 67 | --model_max_length 1024 \ 68 | --max_eval_input_length 512 \ 69 | --max_eval_hyp_length 512 \ 70 | --max_eval_output_length 1024 \ 71 | --overwrite True \ -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama_vllm.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import json 4 | import torch 5 | import logging 6 | import sys 7 | import regex as re 8 | from pathlib import Path 9 | sys.path.append(str(Path(__file__).parent.parent)) 10 
| from vllm import LLM, SamplingParams 11 | from typing import List 12 | from string import Template 13 | from mt_metrics_eval.stats import Correlation 14 | 15 | 16 | MAX_INT = sys.maxsize 17 | 18 | IGNORE_INDEX = -100 19 | DEFAULT_PAD_TOKEN = "[PAD]" 20 | DEFAULT_EOS_TOKEN = "" 21 | DEFAULT_BOS_TOKEN = "" 22 | DEFAULT_UNK_TOKEN = "" 23 | PROMPT_DICT = { 24 | "prompt_input": ( 25 | "Below is an instruction that describes a task, paired with an input that provides further context. " 26 | "Write a response that appropriately completes the request.\n\n" 27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 28 | ), 29 | "prompt_no_input": ( 30 | "Below is an instruction that describes a task. " 31 | "Write a response that appropriately completes the request.\n\n" 32 | "### Instruction:\n{instruction}\n\n### Response:" 33 | ), 34 | } 35 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a given instruction." 36 | FINETUNE_INPUT = """\ 37 | Instruction: 38 | ${generation_instruction} 39 | ${input_context} 40 | 41 | Model-generated Output: 42 | ${hypothesis_output} 43 | 44 | For each error you give in the response, please also elaborate the following information: 45 | - error location (the words that are wrong in the output) 46 | - error aspect it belongs to. 47 | - explanation why it's an error, and the correction suggestions. 48 | - severity of the error ("Major" or "Minor"). 49 | - reduction of score (between 0.5 and 5 given the severity of the error) 50 | 51 | Your evaluation output:\ 52 | """ 53 | 54 | 55 | def get_sum_penalties(eval_output: dict): 56 | """ 57 | Args: 58 | eval_output: dict, the json output of the eval function 59 | 60 | Returns: 61 | """ 62 | try: 63 | penalty_score = 0 64 | for aspect in eval_output: 65 | for penalty_point in eval_output[aspect]["penalty_points"]: 66 | penalty_score += penalty_point["score_reduction"] 67 | return - penalty_score 68 | except Exception: 69 | return None 70 | 71 | 72 | def get_torch_dtype(dtype_str): 73 | """ 74 | Get the torch dtype from a string 75 | """ 76 | if dtype_str == "float32": 77 | return torch.float32 78 | elif dtype_str == "float16": 79 | return torch.float16 80 | elif dtype_str == "bfloat16": 81 | return torch.bfloat16 82 | elif dtype_str == "int8": 83 | return torch.int8 84 | else: 85 | raise ValueError("Invalid dtype {}".format(dtype_str)) 86 | 87 | 88 | def batch_data(data_list, batch_size=1): 89 | n = len(data_list) // batch_size 90 | batch_data = [] 91 | for i in range(n - 1): 92 | start = i * batch_size 93 | end = (i + 1) * batch_size 94 | batch_data.append(data_list[start:end]) 95 | 96 | last_start = (n - 1) * batch_size 97 | last_end = MAX_INT 98 | batch_data.append(data_list[last_start:last_end]) 99 | return batch_data 100 | 101 | 102 | class MyCorrelation(Correlation): 103 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 104 | # remove nan in metrics scores 105 | none_metric_scores_idxs = [idx for idx, 106 | x in enumerate(metric_scores) if x is None] 107 | logging.info("Remove {} nan scores from {} scores".format( 108 | len(none_metric_scores_idxs), 109 | len(metric_scores) 110 | )) 111 | gold_scores = gold_scores.copy() 112 | # set gold scores to None if metric scores are None 113 | for idx in none_metric_scores_idxs[::-1]: 114 | gold_scores[idx] = None 115 | super().__init__(num_sys, gold_scores, metric_scores) 116 | 117 | 118 | def main(args): 119 | 120 | if args.output_path is not None: 121 | output_file = 
Path(args.output_path) 122 | else: 123 | output_file = Path(args.data_path).with_suffix( 124 | '.xgptscore.output.json') 125 | if not output_file.exists() or args.overwrite: 126 | logging.info("Loading model...") 127 | sampling_params = SamplingParams( 128 | temperature=0, top_p=1, max_tokens=1024) 129 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1) 130 | logging.info("Model loaded from {}".format(args.model_name_or_path)) 131 | 132 | eval_outputs = [] 133 | 134 | logging.info("Load input data from {}".format(args.data_path)) 135 | with open(args.data_path, "r") as f: 136 | input_data = json.load(f) 137 | formatted_data = [] 138 | for item in input_data: 139 | for cand in item['candidates']: 140 | inst = Template(FINETUNE_INST).substitute(task=args.task) 141 | input_ = Template(FINETUNE_INPUT).substitute( 142 | task=args.task, 143 | generation_instruction=item['instruction'], 144 | input_context=item['input'], 145 | hypothesis_output=cand['text'], 146 | ) 147 | formatted_data.append({ 148 | "instruction": inst, 149 | "input": input_, 150 | }) 151 | prompt_sources = [example['instruction'] + '\n' + 152 | example['input'] for example in formatted_data] 153 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources] 154 | 155 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size) 156 | 157 | for idx, batch_prompt in enumerate(batch_prompts): 158 | if isinstance(batch_prompt, list): 159 | pass 160 | else: 161 | batch_prompt = [batch_prompt] 162 | 163 | completions = llm.generate(batch_prompt, sampling_params) 164 | for output in completions: 165 | generated_text = output.outputs[0].text 166 | eval_outputs.append(generated_text) 167 | 168 | cand_idx = 0 169 | for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)): 170 | for cand in item['candidates']: 171 | cand['eval_output'] = eval_outputs[cand_idx] 172 | score_reductions = re.findall( 173 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_outputs[cand_idx]) 174 | cand['xgptscore'] = -sum(map(float, score_reductions)) 175 | cand_idx += 1 176 | 177 | with open(output_file, 'w') as f: 178 | json.dump(input_data, f, indent=4, ensure_ascii=False) 179 | logging.info("Saved eval results to {}".format(output_file)) 180 | else: 181 | with open(output_file, 'r') as f: 182 | input_data = json.load(f) 183 | for ex in input_data: 184 | for cand in ex['candidates']: 185 | score_reductions = re.findall( 186 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", cand['eval_output']) 187 | cand['xgptscore'] = -sum(map(float, score_reductions)) 188 | with open(output_file, 'w') as f: 189 | json.dump(input_data, f, indent=4, ensure_ascii=False) 190 | logging.info("Loaded eval results from {}".format(output_file)) 191 | # Compute correlation 192 | human_score_names = args.human_score_names.split(',') 193 | 194 | for h_name in human_score_names: 195 | human_scores = [] 196 | xgptscores = [] 197 | for item in input_data: 198 | for cand in item['candidates']: 199 | for s_name, score in cand['scores'].items(): 200 | if s_name == h_name: 201 | xgptscores.append(cand['xgptscore']) 202 | human_scores.append(score) 203 | break 204 | corr = MyCorrelation(1, human_scores, xgptscores) 205 | logging.info("Human score: {}".format(h_name)) 206 | logging.info("Pearson correlation: {}".format(corr.Pearson())) 207 | logging.info("Spearman correlation: {}".format(corr.Spearman())) 208 | logging.info("Kendall correlation: {}".format(corr.Kendall())) 209 | 210 | 211 | if __name__ == "__main__": 212 | 213 | 
logging.basicConfig(level=logging.INFO) 214 | parser = argparse.ArgumentParser() 215 | parser.add_argument("--model_name_or_path", type=str, default=None) 216 | parser.add_argument("--data_path", type=str, default=None) 217 | parser.add_argument("--output_path", type=str, default=None) 218 | parser.add_argument("--overwrite", action="store_true") 219 | parser.add_argument("--task", type=str, default="summarization") 220 | parser.add_argument("--batch_size", type=int, default=1) 221 | parser.add_argument("--human_score_names", type=str, default="score") 222 | args = parser.parse_args() 223 | main(args) 224 | -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama_vllm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=test_llama 3 | #SBATCH --gres=gpu:a6000:1 4 | #SBATCH --time=24:00:00 5 | #SBATCH --output=../../jobs/test_llama/%j.out 6 | nvidia-smi 7 | 8 | 9 | ## Note 10 | # please download the data in the working directory as indicated in the Data Preparation section in the read me 11 | # quick command: gdown https://drive.google.com/uc?id=1DAjvig-A_57CuBvENLg8A2PycOaz9ZkT 12 | ## 13 | 14 | model_name="meta-llama/Llama-2-7b-hf" 15 | outputs_dir="" 16 | 17 | # outputs_dir="../../outputs" 18 | checkpoint_name="ref" 19 | # checkpoint_path="${outputs_dir}/${model_name}/${checkpoint_name}/checkpoint-532" 20 | checkpoint_path="TIGER-Lab/TIGERScore-13B" 21 | 22 | human_score_names="gpt_rank_score" 23 | data_path="../../data/evaluation/lfqa/test_data_prepared.json" 24 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 25 | python test_llama_vllm.py \ 26 | --model_name_or_path ${checkpoint_path} \ 27 | --task "long-form QA" \ 28 | --data_path ${data_path} \ 29 | --output_path ${output_path} \ 30 | --batch_size 60 \ 31 | --human_score_names ${human_score_names} \ 32 | --overwrite 33 | 34 | task="instruction-following" 35 | human_score_names="gpt_rank_score" 36 | data_path="../../data/evaluation/instruct/just-eval-instruct/test_data_prepared.json" 37 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 38 | python test_llama_vllm.py \ 39 | --model_name_or_path ${checkpoint_path} \ 40 | --task ${task} \ 41 | --data_path ${data_path} \ 42 | --output_path ${output_path} \ 43 | --batch_size 60 \ 44 | --human_score_names ${human_score_names} \ 45 | --overwrite 46 | 47 | task="mathQA" 48 | human_score_names="accuracy" 49 | data_path="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json" 50 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 51 | python test_llama_vllm.py \ 52 | --model_name_or_path ${checkpoint_path} \ 53 | --task ${task} \ 54 | --data_path ${data_path} \ 55 | --output_path ${output_path} \ 56 | --batch_size 60 \ 57 | --human_score_names ${human_score_names} \ 58 | --overwrite 59 | 60 | 61 | # mtme test mqm 62 | task="translation" 63 | human_score_names="mqm" 64 | data_path="../../data/evaluation/translation/wmt22/zh-en/eval_data.json" 65 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 66 | python test_llama_vllm.py \ 67 | --model_name_or_path ${checkpoint_path} \ 68 | --task ${task} \ 69 | --data_path ${data_path} \ 70 | --output_path ${output_path} \ 71 | --batch_size 60 \ 72 | --human_score_names ${human_score_names} \ 73 | --overwrite 74 | 75 | # sum test relevance 76 | task="summarization" 77 | human_score_names="coherence,consistency,fluency,relevance" 78 | 
data_path="../../data/evaluation/summarization/summeval/test_data_prepared.json" 79 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 80 | python test_llama_vllm.py \ 81 | --model_name_or_path ${checkpoint_path} \ 82 | --task ${task} \ 83 | --data_path ${data_path} \ 84 | --output_path ${output_path} \ 85 | --batch_size 60 \ 86 | --human_score_names ${human_score_names} \ 87 | --overwrite 88 | 89 | # d2t test Correctness 90 | task="data2text" 91 | human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure" 92 | data_path="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json" 93 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 94 | python test_llama_vllm.py \ 95 | --model_name_or_path ${checkpoint_path} \ 96 | --task ${task} \ 97 | --data_path ${data_path} \ 98 | --output_path ${output_path} \ 99 | --batch_size 60 \ 100 | --human_score_names ${human_score_names} \ 101 | --overwrite 102 | 103 | 104 | # storygen test human 105 | task="storygen" 106 | human_score_names="human" 107 | data_path="../../data/evaluation/storygen/test_data_prepared.json" 108 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 109 | python test_llama_vllm.py \ 110 | --model_name_or_path ${checkpoint_path} \ 111 | --task ${task} \ 112 | --data_path ${data_path} \ 113 | --output_path ${output_path} \ 114 | --batch_size 60 \ 115 | --human_score_names ${human_score_names} \ 116 | --overwrite 117 | -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama_vllm_distance.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import json 4 | import torch 5 | import logging 6 | import sys 7 | import regex as re 8 | import numpy as np 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).parent.parent)) 11 | from vllm import LLM, SamplingParams 12 | from typing import List 13 | from string import Template 14 | from mt_metrics_eval.stats import Correlation 15 | 16 | 17 | MAX_INT = sys.maxsize 18 | 19 | IGNORE_INDEX = -100 20 | DEFAULT_PAD_TOKEN = "[PAD]" 21 | DEFAULT_EOS_TOKEN = "" 22 | DEFAULT_BOS_TOKEN = "" 23 | DEFAULT_UNK_TOKEN = "" 24 | PROMPT_DICT = { 25 | "prompt_input": ( 26 | "Below is an instruction that describes a task, paired with an input that provides further context. " 27 | "Write a response that appropriately completes the request.\n\n" 28 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 29 | ), 30 | "prompt_no_input": ( 31 | "Below is an instruction that describes a task. " 32 | "Write a response that appropriately completes the request.\n\n" 33 | "### Instruction:\n{instruction}\n\n### Response:" 34 | ), 35 | } 36 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task." 37 | FINETUNE_INPUT = """\ 38 | Task instruction: ${generation_instruction} 39 | Source: ${input_context} 40 | Model-generated Output: ${hypothesis_output} 41 | 42 | Based on the given task instruction and source, identify errors in this model-generated output. 43 | For each error you give in the response, please also elaborate the following information: 44 | - error location (the words that are wrong in the output) 45 | - error aspect it belongs to. 46 | - explanation why it's an error, and the correction suggestions. 47 | - severity of the error ("Major" or "Minor"). 
48 | - reduction of score (an interger between 0.5 and 5 given the severity of the error) 49 | 50 | Your evaluation output: 51 | """ 52 | 53 | 54 | def get_sum_penalties(eval_output: dict): 55 | """ 56 | Args: 57 | eval_output: dict, the json output of the eval function 58 | 59 | Returns: 60 | """ 61 | try: 62 | penalty_score = 0 63 | for aspect in eval_output: 64 | for penalty_point in eval_output[aspect]["penalty_points"]: 65 | penalty_score += penalty_point["score_reduction"] 66 | return - penalty_score 67 | except Exception: 68 | return None 69 | 70 | 71 | def get_torch_dtype(dtype_str): 72 | """ 73 | Get the torch dtype from a string 74 | """ 75 | if dtype_str == "float32": 76 | return torch.float32 77 | elif dtype_str == "float16": 78 | return torch.float16 79 | elif dtype_str == "bfloat16": 80 | return torch.bfloat16 81 | elif dtype_str == "int8": 82 | return torch.int8 83 | else: 84 | raise ValueError("Invalid dtype {}".format(dtype_str)) 85 | 86 | 87 | def batch_data(data_list, batch_size=1): 88 | n = len(data_list) // batch_size 89 | batch_data = [] 90 | for i in range(n - 1): 91 | start = i * batch_size 92 | end = (i + 1) * batch_size 93 | batch_data.append(data_list[start:end]) 94 | 95 | last_start = (n - 1) * batch_size 96 | last_end = MAX_INT 97 | batch_data.append(data_list[last_start:last_end]) 98 | return batch_data 99 | 100 | 101 | class MyCorrelation(Correlation): 102 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 103 | # remove nan in metrics scores 104 | none_metric_scores_idxs = [idx for idx, 105 | x in enumerate(metric_scores) if x is None] 106 | logging.info("Remove {} nan scores from {} scores".format( 107 | len(none_metric_scores_idxs), 108 | len(metric_scores) 109 | )) 110 | gold_scores = gold_scores.copy() 111 | # set gold scores to None if metric scores are None 112 | for idx in none_metric_scores_idxs[::-1]: 113 | gold_scores[idx] = None 114 | super().__init__(num_sys, gold_scores, metric_scores) 115 | 116 | 117 | def main(args): 118 | 119 | if args.output_path is not None: 120 | output_file = Path(args.output_path) 121 | else: 122 | output_file = Path(args.data_path).with_suffix( 123 | '.xgptscore.output.json') 124 | if not output_file.exists() or args.overwrite: 125 | logging.info("Loading model...") 126 | sampling_params = SamplingParams( 127 | temperature=0, top_p=1, max_tokens=1024) 128 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1) 129 | logging.info("Model loaded from {}".format(args.model_name_or_path)) 130 | 131 | eval_outputs = [] 132 | 133 | logging.info("Load input data from {}".format(args.data_path)) 134 | with open(args.data_path, "r") as f: 135 | input_data = json.load(f) 136 | formatted_data = [] 137 | for item in input_data: 138 | inst = Template(FINETUNE_INST).substitute(task=args.task) 139 | refs = item['output'] if "output" in item else item["refs"] 140 | item["candidates"] = [] 141 | if isinstance(refs,list): 142 | for ref in refs: 143 | item["candidates"].append( 144 | { 145 | "text":ref, 146 | "source":"unknown", 147 | "scores":{} 148 | } 149 | ) 150 | else: 151 | item["candidates"].append( 152 | { 153 | "text":refs, 154 | "source":"unknown", 155 | "scores":{} 156 | } 157 | ) 158 | for cand in item['candidates']: 159 | inst = Template(FINETUNE_INST).substitute(task=args.task) 160 | input_ = Template(FINETUNE_INPUT).substitute( 161 | task=args.task, 162 | generation_instruction=item['instruction'], 163 | input_context=item['input'], 164 | hypothesis_output=cand['text'], 165 | ) 
166 |                 formatted_data.append({
167 |                     "instruction": inst,
168 |                     "input": input_,
169 |                 })
170 |     prompt_sources = [example['instruction'] + '\n' +
171 |                       example['input'] for example in formatted_data]
172 |     prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources]
173 | 
174 |     batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size)
175 | 
176 |     for idx, batch_prompt in enumerate(batch_prompts):
177 |         if isinstance(batch_prompt, list):
178 |             pass
179 |         else:
180 |             batch_prompt = [batch_prompt]
181 | 
182 |         completions = llm.generate(batch_prompt, sampling_params)
183 |         for output in completions:
184 |             generated_text = output.outputs[0].text
185 |             eval_outputs.append(generated_text)
186 | 
187 |     cand_idx = 0
188 |     for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
189 |         for cand in item['candidates']:
190 |             cand['eval_output'] = eval_outputs[cand_idx]
191 |             score_reductions = re.findall(
192 |                 r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_outputs[cand_idx])
193 |             cand['xgptscore'] = -sum(map(float, score_reductions))
194 |             cand_idx += 1
195 | 
196 |     with open(output_file, 'w') as f:
197 |         json.dump(input_data, f, indent=4, ensure_ascii=False)
198 |     logging.info("Saved eval results to {}".format(output_file))
199 |     else:
200 |         with open(output_file, 'r') as f:
201 |             input_data = json.load(f)
202 |         for ex in input_data:
203 |             for cand in ex['candidates']:
204 |                 score_reductions = re.findall(
205 |                     r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", cand['eval_output'])
206 |                 cand['xgptscore'] = -sum(map(float, score_reductions))
207 |         with open(output_file, 'w') as f:
208 |             json.dump(input_data, f, indent=4, ensure_ascii=False)
209 |         logging.info("Loaded eval results from {}".format(output_file))
210 |     # Compute correlation
211 |     xgptscores = []
212 |     for item in input_data:
213 |         xgptscores.extend(cand['xgptscore'] for cand in item['candidates'])  # scores are stored per candidate
214 |     print("Absolute score sum: {}".format(abs(sum(xgptscores))))
215 |     print("Average score: {}".format(sum(xgptscores) / len(xgptscores)))
216 |     print("Median score: {}".format(np.median(xgptscores)))
217 |     print("Standard deviation: {}".format(np.std(list(map(abs, xgptscores)))))
218 | 
219 | 
220 | 
221 | if __name__ == "__main__":
222 | 
223 |     logging.basicConfig(level=logging.INFO)
224 |     parser = argparse.ArgumentParser()
225 |     parser.add_argument("--model_name_or_path", type=str, default=None)
226 |     parser.add_argument("--data_path", type=str, default=None)
227 |     parser.add_argument("--output_path", type=str, default=None)
228 |     parser.add_argument("--overwrite", action="store_true")
229 |     parser.add_argument("--task", type=str, default="summarization")
230 |     parser.add_argument("--batch_size", type=int, default=1)
231 |     parser.add_argument("--human_score_names", type=str, default="score")
232 |     args = parser.parse_args()
233 |     main(args)
234 | 
--------------------------------------------------------------------------------
/tigerscore/finetune/test_llama_vllm_vanilla.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import json
4 | import torch
5 | import logging
6 | import sys
7 | import regex as re
8 | from pathlib import Path
9 | sys.path.append(str(Path(__file__).parent.parent))
10 | from vllm import LLM, SamplingParams
11 | from typing import List
12 | from string import Template
13 | from mt_metrics_eval.stats import Correlation
14 | 
15 | 
16 | MAX_INT = sys.maxsize
17 | 
18 | IGNORE_INDEX = -100
19 | DEFAULT_PAD_TOKEN = "[PAD]"
20 | DEFAULT_EOS_TOKEN = "</s>"
21 | DEFAULT_BOS_TOKEN = "<s>"
22 | DEFAULT_UNK_TOKEN = "<unk>"
23 | PROMPT_DICT = {
24 |     "prompt_input": (
25 |         "Below is an instruction that describes a task, paired with an input that provides further context. "
26 |         "Write a response that appropriately completes the request.\n\n"
27 |         "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
28 |     ),
29 |     "prompt_no_input": (
30 |         "Below is an instruction that describes a task. "
31 |         "Write a response that appropriately completes the request.\n\n"
32 |         "### Instruction:\n{instruction}\n\n### Response:"
33 |     ),
34 | }
35 | # FINETUNE_INST = """"""
36 | # FINETUNE_INPUT = """\
37 | # ${generation_instruction}
38 | # ${input_context}
39 | 
40 | # Model-generated Output:
41 | # ${hypothesis_output}
42 | 
43 | 
44 | # You should rate Model-generated Output on a scale from 0.5 (worst) to 10 (best).
45 | # Rating: \
46 | # """
47 | FINETUNE_INST = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n"""
48 | FINETUNE_INPUT = """\
49 | ${generation_instruction}
50 | ${input_context}
51 | 
52 | Model-generated Output:
53 | ${hypothesis_output}
54 | 
55 | 
56 | You should rate Model-generated Output on a scale from 0.5 (worst) to 10 (best). [/INST] Rating: \
57 | """
58 | # FINETUNE_INPUT = """\
59 | # USER:You are evaluating errors in a model-generated output for a(an) ${task} task.
60 | # Task instruction: ${generation_instruction}
61 | # Source: ${input_context}
62 | # Model-generated Output: ${hypothesis_output}
63 | 
64 | # Based on the given task instruction and source, identify errors in this model-generated output.
65 | # For each error you give in the response, please also elaborate the following information:
66 | # - error location (the words that are wrong in the output)
67 | # - error aspect it belongs to.
68 | # - explanation why it's an error, and the correction suggestions.
69 | # - severity of the error ("Major" or "Minor").
70 | # - reduction of score (between 0.5 and 5 given the severity of the error)
71 | 
72 | # Please give a summary of the errors you found in the output, and the total score reduction.
73 | # The model-generated output contains {num_errors} errors, with a total score reduction of {total_score_reduction}.
74 | 75 | # Your evaluation output: ASSISTANT:\ 76 | # """ 77 | def find_first_float(s): 78 | match = re.search(r"[-+]?\d*\.\d+|\d+", s) 79 | return float(match.group()) if match else None 80 | 81 | def get_sum_penalties(eval_output: dict): 82 | """ 83 | Args: 84 | eval_output: dict, the json output of the eval function 85 | 86 | Returns: 87 | """ 88 | try: 89 | penalty_score = 0 90 | for aspect in eval_output: 91 | for penalty_point in eval_output[aspect]["penalty_points"]: 92 | penalty_score += penalty_point["score_reduction"] 93 | return - penalty_score 94 | except Exception: 95 | return None 96 | 97 | 98 | def get_torch_dtype(dtype_str): 99 | """ 100 | Get the torch dtype from a string 101 | """ 102 | if dtype_str == "float32": 103 | return torch.float32 104 | elif dtype_str == "float16": 105 | return torch.float16 106 | elif dtype_str == "bfloat16": 107 | return torch.bfloat16 108 | elif dtype_str == "int8": 109 | return torch.int8 110 | else: 111 | raise ValueError("Invalid dtype {}".format(dtype_str)) 112 | 113 | 114 | def batch_data(data_list, batch_size=1): 115 | n = len(data_list) // batch_size 116 | batch_data = [] 117 | for i in range(n - 1): 118 | start = i * batch_size 119 | end = (i + 1) * batch_size 120 | batch_data.append(data_list[start:end]) 121 | 122 | last_start = (n - 1) * batch_size 123 | last_end = MAX_INT 124 | batch_data.append(data_list[last_start:last_end]) 125 | return batch_data 126 | 127 | 128 | class MyCorrelation(Correlation): 129 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 130 | # remove nan in metrics scores 131 | none_metric_scores_idxs = [idx for idx, 132 | x in enumerate(metric_scores) if x is None] 133 | logging.info("Remove {} nan scores from {} scores".format( 134 | len(none_metric_scores_idxs), 135 | len(metric_scores) 136 | )) 137 | gold_scores = gold_scores.copy() 138 | # set gold scores to None if metric scores are None 139 | for idx in none_metric_scores_idxs[::-1]: 140 | gold_scores[idx] = None 141 | super().__init__(num_sys, gold_scores, metric_scores) 142 | 143 | 144 | def main(args): 145 | 146 | if args.output_path is not None: 147 | output_file = Path(args.output_path) 148 | else: 149 | output_file = Path(args.data_path).with_suffix( 150 | '.xgptscore.output.json') 151 | if not output_file.exists() or args.overwrite: 152 | logging.info("Loading model...") 153 | sampling_params = SamplingParams( 154 | temperature=0, top_p=1, max_tokens=1024) 155 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1) 156 | logging.info("Model loaded from {}".format(args.model_name_or_path)) 157 | 158 | eval_outputs = [] 159 | 160 | logging.info("Load input data from {}".format(args.data_path)) 161 | with open(args.data_path, "r") as f: 162 | input_data = json.load(f) 163 | formatted_data = [] 164 | for item in input_data: 165 | for cand in item['candidates']: 166 | inst = Template(FINETUNE_INST).substitute(task=args.task) 167 | input_ = Template(FINETUNE_INPUT).substitute( 168 | task=args.task, 169 | generation_instruction=item['instruction'], 170 | input_context=item['input'], 171 | hypothesis_output=cand['text'], 172 | ) 173 | formatted_data.append({ 174 | "instruction": inst, 175 | "input": input_, 176 | }) 177 | prompt_sources = [example['instruction'] + '\n' + 178 | example['input'] for example in formatted_data] 179 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources] 180 | 181 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size) 182 | 183 | for idx, batch_prompt in 
enumerate(batch_prompts):
184 |             if isinstance(batch_prompt, list):
185 |                 pass
186 |             else:
187 |                 batch_prompt = [batch_prompt]
188 | 
189 |             completions = llm.generate(batch_prompt, sampling_params)
190 |             for output in completions:
191 |                 generated_text = output.outputs[0].text
192 |                 eval_outputs.append(generated_text)
193 | 
194 |         cand_idx = 0
195 |         for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
196 |             for cand in item['candidates']:
197 |                 cand['eval_output'] = eval_outputs[cand_idx]
198 |                 score_reduction = find_first_float(eval_outputs[cand_idx])
199 |                 if score_reduction is not None:
200 |                     cand['vanilla_xgptscore'] = -float(score_reduction)
201 |                 else:
202 |                     cand['vanilla_xgptscore'] = None
203 |                 cand_idx += 1
204 | 
205 |         with open(output_file, 'w') as f:
206 |             json.dump(input_data, f, indent=4, ensure_ascii=False)
207 |         logging.info("Saved eval results to {}".format(output_file))
208 |     else:
209 |         with open(output_file, 'r') as f:
210 |             input_data = json.load(f)
211 |         for ex in input_data:
212 |             for cand in ex['candidates']:
213 |                 score_reduction = find_first_float(cand["eval_output"])
214 |                 if score_reduction is not None:
215 |                     cand['vanilla_xgptscore'] = -float(score_reduction)
216 |                 else:
217 |                     cand['vanilla_xgptscore'] = None
218 |         with open(output_file, 'w') as f:
219 |             json.dump(input_data, f, indent=4, ensure_ascii=False)
220 |         logging.info("Loaded eval results from {}".format(output_file))
221 |     # Compute correlation
222 |     human_score_names = args.human_score_names.split(',')
223 | 
224 |     for h_name in human_score_names:
225 |         human_scores = []
226 |         xgptscores = []
227 |         for item in input_data:
228 |             for cand in item['candidates']:
229 |                 for s_name, score in cand['scores'].items():
230 |                     if s_name == h_name:
231 |                         xgptscores.append(cand['vanilla_xgptscore'])
232 |                         human_scores.append(score)
233 |                         break
234 |         corr = MyCorrelation(1, human_scores, xgptscores)
235 |         print("Human score: {}".format(h_name))
236 |         print("Pearson correlation: {}".format(corr.Pearson()))
237 |         print("Spearman correlation: {}".format(corr.Spearman()))
238 |         print("Kendall correlation: {}".format(corr.Kendall()))
239 | 
240 | 
241 | if __name__ == "__main__":
242 | 
243 |     logging.basicConfig(level=logging.INFO)
244 |     parser = argparse.ArgumentParser()
245 |     parser.add_argument("--model_name_or_path", type=str, default=None)
246 |     parser.add_argument("--data_path", type=str, default=None)
247 |     parser.add_argument("--output_path", type=str, default=None)
248 |     parser.add_argument("--overwrite", action="store_true")
249 |     parser.add_argument("--task", type=str, default="summarization")
250 |     parser.add_argument("--batch_size", type=int, default=1)
251 |     parser.add_argument("--human_score_names", type=str, default="score")
252 |     args = parser.parse_args()
253 |     main(args)
254 | 
--------------------------------------------------------------------------------
/tigerscore/finetune/trainer.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import os
3 | from transformers.trainer import *
4 | from peft import PeftModel
5 | 
6 | 
7 | class CustomLoraTrainer(Trainer):
8 |     def _save(self, output_dir: Optional[str] = None, state_dict=None):
9 |         # If we are executing this function, we are the process zero, so we don't check for that.
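        # This override follows transformers' Trainer._save(), with an extra branch so that
        # PeftModel (LoRA) checkpoints are written via save_pretrained() instead of dumping
        # the full base-model state dict.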
10 | output_dir = output_dir if output_dir is not None else self.args.output_dir 11 | os.makedirs(output_dir, exist_ok=True) 12 | logger.info(f"Saving model checkpoint to {output_dir}") 13 | # Save a trained model and configuration using `save_pretrained()`. 14 | # They can then be reloaded using `from_pretrained()` 15 | if not isinstance(self.model, PreTrainedModel) and not isinstance(self.model, PeftModel): 16 | if state_dict is None: 17 | state_dict = self.model.state_dict() 18 | 19 | if isinstance(unwrap_model(self.model), PreTrainedModel): 20 | unwrap_model(self.model).save_pretrained( 21 | output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors 22 | ) 23 | else: 24 | logger.info( 25 | "Trainer.model is not a `PreTrainedModel`, only saving its state dict.") 26 | if self.args.save_safetensors: 27 | safetensors.torch.save_file( 28 | state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME)) 29 | else: 30 | torch.save(state_dict, os.path.join( 31 | output_dir, WEIGHTS_NAME)) 32 | else: 33 | print("Saving LoRA model...") 34 | self.model.save_pretrained( 35 | output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors 36 | ) 37 | 38 | if self.tokenizer is not None: 39 | self.tokenizer.save_pretrained(output_dir) 40 | 41 | # Good practice: save your training arguments together with the trained model 42 | torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) 43 | -------------------------------------------------------------------------------- /tigerscore/finetune/utils.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import logging 3 | import math 4 | import os 5 | import io 6 | import sys 7 | import time 8 | import json 9 | from typing import Optional, Sequence, Union, Dict 10 | 11 | import openai 12 | import tqdm 13 | from openai import openai_object 14 | import copy 15 | 16 | StrOrOpenAIObject = Union[str, openai_object.OpenAIObject] 17 | 18 | openai_org = os.getenv("OPENAI_ORG") 19 | if openai_org is not None: 20 | openai.organization = openai_org 21 | logging.warning( 22 | f"Switching to organization: {openai_org} for OAI API key.") 23 | 24 | 25 | @dataclasses.dataclass 26 | class OpenAIDecodingArguments(object): 27 | max_tokens: int = 1800 28 | temperature: float = 0.2 29 | top_p: float = 1.0 30 | n: int = 1 31 | stream: bool = False 32 | stop: Optional[Sequence[str]] = None 33 | presence_penalty: float = 0.0 34 | frequency_penalty: float = 0.0 35 | suffix: Optional[str] = None 36 | logprobs: Optional[int] = None 37 | echo: bool = False 38 | 39 | 40 | def openai_completion( 41 | prompts: Union[str, Sequence[str], Sequence[Dict[str, str]], Dict[str, str]], 42 | decoding_args: OpenAIDecodingArguments, 43 | model_name="text-davinci-003", 44 | sleep_time=2, 45 | batch_size=1, 46 | max_instances=sys.maxsize, 47 | max_batches=sys.maxsize, 48 | return_text=False, 49 | **decoding_kwargs, 50 | ) -> Union[Union[StrOrOpenAIObject, StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]: 51 | """Decode with OpenAI API. 52 | 53 | Args: 54 | prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted 55 | as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. 
If it is a chat model 56 | it can also be a dictionary (or list thereof) as explained here: 57 | https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb 58 | decoding_args: Decoding arguments. 59 | model_name: Model name. Can be either in the format of "org/model" or just "model". 60 | sleep_time: Time to sleep once the rate-limit is hit. 61 | batch_size: Number of prompts to send in a single request. Only for non chat model. 62 | max_instances: Maximum number of prompts to decode. 63 | max_batches: Maximum number of batches to decode. This argument will be deprecated in the future. 64 | return_text: If True, return text instead of full completion object (which contains things like logprob). 65 | decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them. 66 | 67 | Returns: 68 | A completion or a list of completions. 69 | Depending on return_text, return_openai_object, and decoding_args.n, the completion type can be one of 70 | - a string (if return_text is True) 71 | - an openai_object.OpenAIObject object (if return_text is False) 72 | - a list of objects of the above types (if decoding_args.n > 1) 73 | """ 74 | is_single_prompt = isinstance(prompts, (str, dict)) 75 | if is_single_prompt: 76 | prompts = [prompts] 77 | 78 | if max_batches < sys.maxsize: 79 | logging.warning( 80 | "`max_batches` will be deprecated in the future, please use `max_instances` instead." 81 | "Setting `max_instances` to `max_batches * batch_size` for now." 82 | ) 83 | max_instances = max_batches * batch_size 84 | 85 | prompts = prompts[:max_instances] 86 | num_prompts = len(prompts) 87 | prompt_batches = [ 88 | prompts[batch_id * batch_size: (batch_id + 1) * batch_size] 89 | for batch_id in range(int(math.ceil(num_prompts / batch_size))) 90 | ] 91 | 92 | completions = [] 93 | for batch_id, prompt_batch in tqdm.tqdm( 94 | enumerate(prompt_batches), 95 | desc="prompt_batches", 96 | total=len(prompt_batches), 97 | ): 98 | batch_decoding_args = copy.deepcopy( 99 | decoding_args) # cloning the decoding_args 100 | 101 | while True: 102 | try: 103 | shared_kwargs = dict( 104 | model=model_name, 105 | **batch_decoding_args.__dict__, 106 | **decoding_kwargs, 107 | ) 108 | completion_batch = openai.Completion.create( 109 | prompt=prompt_batch, **shared_kwargs) 110 | choices = completion_batch.choices 111 | 112 | for choice in choices: 113 | choice["total_tokens"] = completion_batch.usage.total_tokens 114 | completions.extend(choices) 115 | break 116 | except openai.error.OpenAIError as e: 117 | logging.warning(f"OpenAIError: {e}.") 118 | if "Please reduce your prompt" in str(e): 119 | batch_decoding_args.max_tokens = int( 120 | batch_decoding_args.max_tokens * 0.8) 121 | logging.warning( 122 | f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...") 123 | else: 124 | logging.warning("Hit request rate limit; retrying...") 125 | time.sleep(sleep_time) # Annoying rate limit on requests. 126 | 127 | if return_text: 128 | completions = [completion.text for completion in completions] 129 | if decoding_args.n > 1: 130 | # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries. 131 | completions = [completions[i: i + decoding_args.n] 132 | for i in range(0, len(completions), decoding_args.n)] 133 | if is_single_prompt: 134 | # Return non-tuple if only 1 input and 1 generation. 
135 | (completions,) = completions 136 | return completions 137 | 138 | 139 | def _make_w_io_base(f, mode: str): 140 | if not isinstance(f, io.IOBase): 141 | f_dirname = os.path.dirname(f) 142 | if f_dirname != "": 143 | os.makedirs(f_dirname, exist_ok=True) 144 | f = open(f, mode=mode) 145 | return f 146 | 147 | 148 | def _make_r_io_base(f, mode: str): 149 | if not isinstance(f, io.IOBase): 150 | f = open(f, mode=mode) 151 | return f 152 | 153 | 154 | def jdump(obj, f, mode="w", indent=4, default=str): 155 | """Dump a str or dictionary to a file in json format. 156 | 157 | Args: 158 | obj: An object to be written. 159 | f: A string path to the location on disk. 160 | mode: Mode for opening the file. 161 | indent: Indent for storing json dictionaries. 162 | default: A function to handle non-serializable entries; defaults to `str`. 163 | """ 164 | f = _make_w_io_base(f, mode) 165 | if isinstance(obj, (dict, list)): 166 | json.dump(obj, f, indent=indent, default=default) 167 | elif isinstance(obj, str): 168 | f.write(obj) 169 | else: 170 | raise ValueError(f"Unexpected type: {type(obj)}") 171 | f.close() 172 | 173 | 174 | def jload(f, mode="r"): 175 | """Load a .json file into a dictionary.""" 176 | f = _make_r_io_base(f, mode) 177 | jdict = json.load(f) 178 | f.close() 179 | return jdict 180 | -------------------------------------------------------------------------------- /tigerscore/get_error_types/get_error_types.py: -------------------------------------------------------------------------------- 1 | # Example usage 2 | """ 3 | This file isn't used in final version. 4 | """ 5 | import os 6 | import sys 7 | import fire 8 | import json 9 | from pathlib import Path 10 | os.environ["OPENAI_API_KEY"] = "" 11 | os.environ["OPENAI_API_BASE"] = "" 12 | os.environ["OPENAI_API_TYPE"] = "azure" 13 | os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview" 14 | sys.path.append("../") 15 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 16 | from xgptscore.constants import EVAL_ASPECTS 17 | from string import Template 18 | 19 | TEMPLATE = """\ 20 | You are evaluating an ${task} task. Some errors in an incorrect output could be attributed to the following aspects: 21 | ${aspects_descriptions} 22 | 23 | Please elaborate 10 specific error types for each aspect above. Each error type should represent a specific error that falls under the aspect. 
Error types should be mutually exclusive and collectively exhaustive.\
24 | """
25 | 
26 | 
27 | def main(
28 |     task: str,
29 | ):
30 | 
31 |     task_aspects = EVAL_ASPECTS[task]
32 |     prompt = Template(TEMPLATE).substitute(
33 |         task=task,
34 |         aspects_descriptions="\n".join([f"- {aspect}: {description}" for aspect, description in task_aspects.items()])
35 |     )
36 |     prompts = [prompt]
37 |     chatmls = [[{"role": "system",
38 |                  "content": " You are an AI assistant that helps people find information."},
39 |                 {"role": "user",
40 |                  "content": prompt}] for prompt in prompts[:1]]
41 | 
42 |     chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls]
43 |     results = openai_completions(chatml_prompts, model_name="gpt-4")
44 |     output_file = Path("./error_types/" + task + ".txt")
45 |     output_file.parent.mkdir(parents=True, exist_ok=True)
46 |     results['prompts'] = prompts
47 |     with open(output_file, "w") as f:
48 |         json.dump(results, f, indent=4, ensure_ascii=False)
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     fire.Fire(main)
--------------------------------------------------------------------------------
/tigerscore/scorer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/tigerscore/scorer/__init__.py
--------------------------------------------------------------------------------
/tigerscore/xgptscore/README.md:
--------------------------------------------------------------------------------
1 | ## XGPTScore Overview
2 | This folder contains all the templates we use to query ChatGPT or GPT-4 for the errors they identify in a hypothesis output, for every task that TIGERScore covers. We call this query method XGPTScore, short for an e**X**plainable **Scoring** method that works by querying **GPT** models.
3 | 
4 | The overall pipeline of XGPTScore is:
5 | 
6 | 1. We define a query template that asks the GPT models to identify errors in the hypothesis output based on the task instruction, the source text, and the reference text.
7 | 2. We manually construct the evaluation aspects to focus on for each task, as listed in [./constants.py](./constants.py).
8 | 3. Then, by applying the template and specifying the aspects to focus on, the GPT models are asked to return the identified errors in a predefined format (e.g., JSON).
9 | 
10 | GPT models sometimes produce noticeably lower-quality evaluations when they are forced to answer in a rigid format. To reduce the effect of the required format on response quality, we run a two-round evaluation. In the first round we focus on the evaluation itself and let the GPT models produce a free-form assessment of the hypothesis output. In the second round we ask them to reformat that free-form response into the required structure and fill in the elaborated fields, which is a much easier task for them.
11 | 
12 | ## Quick start
13 | 
14 | We provide a single function, `xgptscore()`, as the interface. It takes the `xgptitems` to evaluate, the template mode, and the OpenAI model name, and runs the queries.
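The per-round completions can also be collapsed into a single numeric score. The sketch below is only an illustration: it assumes you have already run the example under *Example Usage* below (so each candidate carries a `responses` list) and that the final round of the chosen mode is post-processed into the JSON error format consumed by `get_xgptscore_from_json` in [./process_utils.py](./process_utils.py); the import path may need to be adjusted to your setup.

```python
from xgptscore.process_utils import get_xgptscore_from_json  # adjust the import to your setup

for item in items:
    for cand in item['candidates']:
        final_round = cand['responses'][-1]                        # last-round (formatted) completion
        cand['xgptscore'] = get_xgptscore_from_json(final_round)   # None if the JSON could not be parsed
```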
15 | 
16 | Example Usage:
17 | ```python
18 | task = "translation"
19 | with open("example.json", "r") as f:
20 |     items = json.load(f)
21 | xgptitems = []
22 | for item in items:
23 |     for cand in item['candidates']:
24 |         xgptitems.append(XPGTItem(
25 |             task=task,
26 |             instruction=item['instruction'],
27 |             input=item['input'],
28 |             ref_output=item['output'],
29 |             hypo_output=cand['text']
30 |         ))
31 | result = xgptscore(xgptitems, "ea", "ChatGPT")
32 | idx = 0
33 | for item in items:
34 |     for cand in item['candidates']:
35 |         cand['responses'] = result['round_completions'][idx]
36 |         cand['messages_records'] = result['messages_records'][idx]
37 |         idx += 1
38 | json.dump(items, open("example_result.json", "w"), indent=4, ensure_ascii=False)
39 | ```
40 | 
41 | Please check out the input file `example.json` and the result file `example_result.json` to better understand how it actually works.
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/align_score.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "align_score",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 60,
8 |         "request_timeout": 60
9 |     },
10 |     "max_lengths": {
11 |         "inst": null,
12 |         "input": 600,
13 |         "hypo_output": 400,
14 |         "ref_output": 400
15 |     }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/default.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "default",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 60
8 |     },
9 |     "max_lengths": {
10 |         "inst": null,
11 |         "input": 512,
12 |         "hypo_output": 400,
13 |         "ref_output": 400
14 |     }
15 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/kb_txt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "default",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 120,
8 |         "request_timeout": 120
9 |     },
10 |     "max_lengths": {
11 |         "inst": null,
12 |         "input": 512,
13 |         "hypo_output": 400,
14 |         "ref_output": 400
15 |     }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/wmt_mqm.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "wmt_mqm",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 60,
8 |         "request_timeout": 60
9 |     },
10 |     "max_lengths": {
11 |         "inst": null,
12 |         "input": 400,
13 |         "hypo_output": 400,
14 |         "ref_output": 400
15 |     }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/openai_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | if os.environ.get('OPENAI_API_TYPE', None) == 'azure':
3 |     # pip install openai<=0.28.1, fire, numpy, tiktoken
4 |     from .openai_utils_azure import (
5 |         openai_completions,
6 |         _prompt_to_chatml,
7 |         _chatml_to_prompt,
8 |     )
9 |     import openai
10 |     assert openai.VERSION <= "0.28.1", "Azure API is only supported in openai-python 0.28.1 or earlier."
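# Both non-Azure backends below require the openai-python 1.0.0+ client.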
11 | elif os.environ.get('OPENAI_UTILS_TYPE', None) == 'curl': 12 | # pip install openai>=1.0.0, fire, numpy, tiktoken 13 | from .openai_utils_curl import ( 14 | openai_completions, 15 | _prompt_to_chatml, 16 | _chatml_to_prompt, 17 | ) 18 | import openai 19 | assert openai.VERSION >= "1.0.0", "OpenAI API is only supported in openai-python 1.0.0 or later." 20 | else: 21 | # pip install openai>=1.0.0, fire, numpy, tiktoken 22 | from .openai_utils_openAI import ( 23 | openai_completions, 24 | _prompt_to_chatml, 25 | _chatml_to_prompt, 26 | ) 27 | import openai 28 | assert openai.VERSION >= "1.0.0", "OpenAI API is only supported in openai-python 1.0.0 or later." 29 | -------------------------------------------------------------------------------- /tigerscore/xgptscore/process_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import json5 4 | import logging 5 | from dataclasses import dataclass 6 | from transformers import AutoTokenizer 7 | from tqdm import tqdm 8 | from typing import List, Union 9 | from itertools import chain 10 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 11 | 12 | 13 | @dataclass 14 | class XPGTItem(): 15 | task: str 16 | instruction: str 17 | input: str 18 | ref_output: Union[str, List[str]] 19 | hypo_output: str 20 | 21 | # Message map functions 22 | 23 | 24 | def default_msg_map(cur_message: dict, messages: List[dict]): 25 | """ Map the text and old messages to the new messages for query 26 | Args: 27 | text (str): the prompt text 28 | messages (List[dict]): the messages list before this query 29 | Returns: 30 | prompt (str): the prompt text 31 | """ 32 | new_messages = messages + [{ 33 | "role": cur_message['role'], 34 | "content": cur_message['content']} 35 | ] 36 | return new_messages 37 | 38 | # Postprocess functions 39 | 40 | 41 | def default_postprocess(content: str): 42 | return content 43 | 44 | 45 | def json_postprocess(content: str): 46 | try: 47 | # find the json content 48 | json_content = content[content.find("{"):content.rfind("}") + 1] 49 | json_content = json.loads(json_content) 50 | return json_content 51 | except json.decoder.JSONDecodeError: 52 | try: 53 | json_content = json5.loads(json_content) 54 | return json_content 55 | except Exception: 56 | return content 57 | 58 | 59 | tokenizer = None 60 | 61 | 62 | def truncate_texts(texts: Union[List[str], List[List[str]]], max_length: int = None): 63 | """ 64 | Truncate the texts to the max length. 65 | Args: 66 | texts (List[str] or List[List[str]]): The list of texts. 67 | max_length (int): The max length. 68 | Returns: 69 | List[str]: The truncated texts. 70 | """ 71 | if max_length is None: 72 | return texts 73 | if isinstance(texts[0], list) and \ 74 | ( 75 | all([len(x) == 0 for x in texts]) or 76 | all([x is None for x in list(chain(*texts))]) 77 | ) or isinstance(texts[0], str) and \ 78 | all([x is None for x in list(chain(texts))]): 79 | logging.warning("All texts are None, skip truncating") 80 | return texts 81 | # using llama tokenizer by default 82 | global tokenizer 83 | disable_tqdm = len(texts) < 1000 84 | logging.warning(f"Truncating texts to max length {max_length}") 85 | if tokenizer is None: 86 | tokenizer = AutoTokenizer.from_pretrained( 87 | "meta-llama/Llama-2-7b-hf", use_auth_token=True) 88 | # ... 
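    # Tokenize each text (or list of texts), cut it to max_length tokens, and decode
    # back to a string, appending " ..." to anything that was truncated.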
89 | token_ids = [] 90 | for text in tqdm(texts, desc="Truncating texts (tokenizing)", disable=disable_tqdm): 91 | if isinstance(text, list): 92 | token_ids.append( 93 | [tokenizer.encode(x, add_special_tokens=False) for x in text]) 94 | else: 95 | token_ids.append(tokenizer.encode(text, add_special_tokens=False)) 96 | # ... 97 | truncated_texts = [] 98 | for i, _token_ids in tqdm(enumerate(token_ids), desc="Truncating texts (truncating)", disable=disable_tqdm): 99 | if (len(_token_ids)) and isinstance(_token_ids[0], list): 100 | truncated_texts.append([]) 101 | for _token_id in _token_ids: 102 | if len(_token_id) > max_length: 103 | truncated_text = tokenizer.decode( 104 | _token_id[:max_length], skip_special_tokens=True) 105 | truncated_text = truncated_text + " ..." 106 | else: 107 | truncated_text = tokenizer.decode( 108 | _token_id, skip_special_tokens=True) 109 | truncated_texts[i].append(truncated_text) 110 | else: 111 | if len(_token_ids) > max_length: 112 | truncated_text = tokenizer.decode( 113 | _token_ids[:max_length], skip_special_tokens=True) 114 | truncated_text = truncated_text + " ..." 115 | else: 116 | truncated_text = tokenizer.decode( 117 | _token_ids, skip_special_tokens=True) 118 | 119 | truncated_texts.append(truncated_text) 120 | return truncated_texts 121 | 122 | 123 | def truncate_items(items: List[XPGTItem], max_lengths): 124 | """ 125 | Truncate the texts in the items to the max length. 126 | Args: 127 | items (List[XPGTItem]): The list of items. 128 | max_length (int): The max length. 129 | Returns: 130 | List[XPGTItem]: The truncated items. 131 | """ 132 | truncated_inputs = truncate_texts( 133 | [item.input for item in items], max_lengths.get("input", None)) 134 | truncated_insts = truncate_texts( 135 | [item.instruction for item in items], max_lengths.get("instruction", None)) 136 | truncated_ref_outputs = truncate_texts( 137 | [item.ref_output for item in items], max_lengths.get("ref_output", None)) 138 | truncated_hypo_outputs = truncate_texts( 139 | [item.hypo_output for item in items], max_lengths.get("hypo_output", None)) 140 | for i, item in enumerate(items): 141 | item.instruction = truncated_insts[i] 142 | item.input = truncated_inputs[i] 143 | item.ref_output = truncated_ref_outputs[i] 144 | item.hypo_output = truncated_hypo_outputs[i] 145 | return items 146 | 147 | 148 | def get_query_messages(messages: List[dict], queried_messages: List[dict]): 149 | """ 150 | Args: 151 | messages (List[dict]): the messages list to add for query 152 | queried_messages (List[dict]): the messages list already queried, which contains the query responses also, 153 | Returns: 154 | new_messages (List[dict]): the new messages list to query 155 | postprocess (function): the postprocess function for the query response 156 | """ 157 | if len(queried_messages) == 0: 158 | last_prompt_idx = -1 159 | else: 160 | assert len( 161 | queried_messages) >= 2, "queried_messages should have at least 2 messages, i.e., the user (system) and the response" 162 | last_prompt = queried_messages[-2]['content'] 163 | prompt_texts = [x['content'] for x in messages] 164 | last_prompt_idx = prompt_texts.index(last_prompt) 165 | if last_prompt_idx == len(messages) - 1: 166 | return None 167 | new_messages = queried_messages.copy() 168 | for idx in range(last_prompt_idx + 1, len(messages)): 169 | new_messages = messages[idx]["map_func"](messages[idx], new_messages) 170 | if messages[idx]["do_query"]: 171 | break 172 | return new_messages, messages[idx]["postprocess"] 173 | 174 | 175 | def 
get_xgptscore_from_json(json_content: dict): 176 | """ 177 | Args: 178 | json_content (dict): the json content 179 | Returns: 180 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors 181 | """ 182 | if isinstance(json_content, str): 183 | return None 184 | try: 185 | xgptscore = 0 186 | for error in json_content['errors'].values(): 187 | if error['score_reduction'] == "N/A": 188 | continue 189 | xgptscore -= error['score_reduction'] 190 | return xgptscore 191 | except Exception: 192 | return None 193 | 194 | 195 | def get_xgptscore_from_json_star(json_content: dict): 196 | """ 197 | Args: 198 | json_content (dict): the json content 199 | Returns: 200 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors 201 | """ 202 | xgptscore = 0 203 | res = {} 204 | for aspect_key, aspect in json_content.items(): 205 | if isinstance(aspect, dict): 206 | score = aspect['Score'] 207 | try: 208 | score = float(score) 209 | except Exception: 210 | score = 0 211 | xgptscore += score 212 | res["xgptscore_" + aspect_key] = score 213 | res["xgptscore"] = xgptscore 214 | return res 215 | 216 | 217 | def get_xgptscore_from_json_per_aspect(json_content: dict): 218 | """ 219 | Args: 220 | json_content (dict): the json content 221 | Returns: 222 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors 223 | """ 224 | if not isinstance(json_content, dict): 225 | return None 226 | xgptscore = 0 227 | res = {} 228 | for error in json_content['errors'].values(): 229 | if error['error_aspect'] is not None: 230 | if ("xgptscore_" + error['error_aspect'] not in res): 231 | res["xgptscore_" + error['error_aspect']] = 0 232 | res["xgptscore_" + error['error_aspect']] -= error['score_reduction'] 233 | xgptscore -= error['score_reduction'] 234 | res["xgptscore"] = xgptscore 235 | return res 236 | -------------------------------------------------------------------------------- /tigerscore/xgptscore/xgptscore.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | from .process import MODE_PROCESS_MAP 5 | from .process_utils import XPGTItem, truncate_items, get_query_messages 6 | from .openai_utils import openai_completions, _chatml_to_prompt 7 | from typing import List, Union 8 | from dacite import from_dict 9 | from pathlib import Path 10 | from functools import partial 11 | 12 | 13 | def xgptscore( 14 | items: List[Union[XPGTItem, dict]], 15 | mode: str, 16 | model_name: str, 17 | num_workers: int = None, 18 | batch_size: int = None, 19 | **kwargs, 20 | ): 21 | config_path = os.path.join(os.path.dirname( 22 | __file__), f"mode_configs/{mode}.json") 23 | config_path = Path(config_path) 24 | if not config_path.exists(): 25 | logging.warning( 26 | f"Config file {config_path} does not exist. 
Use default config.") 27 | config_path = config_path.with_name("default.json") 28 | 29 | with open(config_path, "r") as f: 30 | config = json.load(f) 31 | config.update(kwargs) 32 | if "max_lengths" in config: 33 | items = truncate_items(items, config["max_lengths"]) 34 | 35 | if isinstance(items[0], dict): 36 | items = [from_dict(data_class=XPGTItem, data=item) for item in items] 37 | process_func = MODE_PROCESS_MAP[mode] 38 | if "process_kwargs" in config: 39 | process_func = partial(process_func, **config["process_kwargs"]) 40 | process_results = list(map(process_func, items)) 41 | 42 | total_round = len([x for x in process_results[0] if x['do_query']]) 43 | logging.warning(f"Total chat rounds: {total_round}") 44 | logging.warning(f"Total chat messages: {len(items)}") 45 | # query and process 46 | round = 0 47 | queried_messages = [[] for _ in range(len(items))] 48 | total_price = 0 49 | total_time = 0 50 | round_completions = [] 51 | while True: 52 | round += 1 53 | logging.warning(f"Processing chat round {round}/{total_round}") 54 | query_messages = list( 55 | map(get_query_messages, process_results, queried_messages)) 56 | query_messages, postprocess_funcs = list(zip(*query_messages)) 57 | chatml_prompts = list(map(_chatml_to_prompt, query_messages)) 58 | openai_results = openai_completions( 59 | chatml_prompts, 60 | model_name=model_name, 61 | num_procs=num_workers, 62 | batch_size=batch_size, 63 | **config['decoding'], 64 | ) 65 | completions = openai_results['completions'] 66 | total_price += sum(openai_results['price_per_example']) 67 | total_time += sum(openai_results['time_per_example']) 68 | logging.warning(f"Round {round} price: {total_price}$") 69 | logging.warning(f"Round {round} time: {total_time}") 70 | postprocess_completions = [postprocess_funcs[idx]( 71 | completion) for idx, completion in enumerate(completions)] 72 | round_completions.append(postprocess_completions) 73 | for idx, completion in enumerate(completions): 74 | queried_messages[idx] = query_messages[idx] + \ 75 | [{"role": "assistant", "content": completion} 76 | ] # add the assistant response 77 | if round == total_round: 78 | _query_messages = list( 79 | map(get_query_messages, process_results, queried_messages)) 80 | assert all([x is None for x in _query_messages] 81 | ), "All messages should be queried" 82 | break 83 | logging.warning(f"Total price: {total_price}$") 84 | logging.warning(f"Total time: {total_time}") 85 | logging.warning(f"Total time per example: {total_time / len(items)}") 86 | round_completions = list(zip(*round_completions)) 87 | return dict( 88 | round_completions=round_completions, 89 | messages_records=queried_messages, 90 | ) 91 | 92 | 93 | """ 94 | Example Usage: 95 | task = "translation" 96 | with open("example.json", "r") as f: 97 | items = json.load(f) 98 | xgptitems = [] 99 | for item in items: 100 | for cand in item['candidates']: 101 | xgptitems.append(XPGTItem( 102 | task=task, 103 | instruction=item['instruction'], 104 | input=item['input'], 105 | ref_output=item['output'], 106 | hypo_output=cand['text'] 107 | )) 108 | result = xgptscore(xgptitems, "ea", "ChatGPT") 109 | idx = 0 110 | for item in items: 111 | for cand in item['candidates']: 112 | cand['responses'] = result['round_completions'][idx] 113 | cand['messages_records'] = result['messages_records'][idx] 114 | json.dump(items, open("example_result.json", "w"), indent=4, ensure_ascii=False) 115 | """ 116 | --------------------------------------------------------------------------------