├── .gitattributes
├── .gitignore
├── .gitmodules
├── LICENSE
├── README.md
├── github_overview.png
├── requirements.txt
├── setup.py
├── tigerscore
│   ├── __init__.py
│   ├── candidates_generation
│   │   ├── _generate_candidates.sh
│   │   ├── downmodel.py
│   │   ├── engine.py
│   │   ├── eval_candidates.py
│   │   ├── eval_candidates.sh
│   │   ├── finetune_base_model.py
│   │   ├── finetune_base_model.sh
│   │   ├── generate_candidates.py
│   │   ├── generate_candidates.sh
│   │   ├── generate_candidates_by_gpt.py
│   │   ├── generate_candidates_by_gpt.sh
│   │   ├── generate_candidates_series.sh
│   │   ├── generate_ref_by_gpt4.py
│   │   └── model_utils.py
│   ├── common
│   │   ├── InstructScore.py
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── bart_score.py
│   │   ├── cor_eval.py
│   │   ├── datasets_config.py
│   │   ├── download.sh
│   │   ├── evaluation.py
│   │   ├── flan_score.py
│   │   ├── prism.py
│   │   ├── requirements.txt
│   │   └── utils.py
│   ├── download_dataset
│   │   ├── bartscore_data_process.py
│   │   ├── datasets_scripts
│   │   │   └── fetaqa.sh
│   │   ├── download_bartscore_data.sh
│   │   ├── download_general_datasets.py
│   │   ├── download_general_datasets.sh
│   │   ├── preprocess_utils_totto.py
│   │   └── utils.py
│   ├── eval_scripts
│   │   ├── bs_analysis.py
│   │   ├── bs_utils.py
│   │   ├── check_data.py
│   │   ├── check_data.sh
│   │   ├── check_responses.py
│   │   ├── check_responses.sh
│   │   ├── eval_baseline.py
│   │   ├── eval_baseline.sh
│   │   ├── generate_distill_data.py
│   │   ├── generate_distill_data.sh
│   │   ├── generate_inst_synthetic_data.py
│   │   ├── generate_inst_synthetic_data.sh
│   │   ├── generate_synthesis_distill_data.py
│   │   ├── generate_synthesis_distill_data.sh
│   │   ├── get_systhesis_ref_data.sh
│   │   ├── lfqa_gpt_rate.py
│   │   ├── lfqa_gpt_rate.sh
│   │   ├── mathqa_rate.py
│   │   ├── test_ref_diff.py
│   │   ├── test_xgptscore.py
│   │   ├── test_xgptscore.sh
│   │   └── utils.py
│   ├── finetune
│   │   ├── ds_llama_config.json
│   │   ├── finetune_llama.sh
│   │   ├── finetune_mistral.sh
│   │   ├── format_data_v2.py
│   │   ├── format_data_v2.sh
│   │   ├── format_distill_data.py
│   │   ├── format_distill_data.sh
│   │   ├── format_synthesis_distill_data.py
│   │   ├── format_synthesis_distill_data.sh
│   │   ├── ft_llama_lora.sh
│   │   ├── test_llama.py
│   │   ├── test_llama.sh
│   │   ├── test_llama_vllm.py
│   │   ├── test_llama_vllm.sh
│   │   ├── test_llama_vllm_distance.py
│   │   ├── test_llama_vllm_vanilla.py
│   │   ├── train.py
│   │   ├── trainer.py
│   │   └── utils.py
│   ├── get_error_types
│   │   ├── error_types
│   │   │   └── error_types.json
│   │   └── get_error_types.py
│   ├── scorer
│   │   ├── __init__.py
│   │   └── tigerscore.py
│   └── xgptscore
│       ├── README.md
│       ├── constants.py
│       ├── example.json
│       ├── example_result.json
│       ├── mode_configs
│       │   ├── align_score.json
│       │   ├── default.json
│       │   ├── kb_txt.json
│       │   └── wmt_mqm.json
│       ├── openai_utils.py
│       ├── openai_utils_azure.py
│       ├── openai_utils_curl.py
│       ├── openai_utils_openAI.py
│       ├── process.py
│       ├── process_utils.py
│       ├── templates.py
│       └── xgptscore.py
└── tigerscore_example_usage.ipynb
/.gitattributes:
--------------------------------------------------------------------------------
1 | data/evaluation/instruct/mixinstruct/test_data_prepared.json filter=lfs diff=lfs merge=lfs -text
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | tigerscore/xgptscore/cache/
162 | jobs/
163 | /tigerscore/common/models
164 |
165 | /TigerScore.zip
166 | /hf_space
167 | !/hf_space/TIGERScore
168 | /hf_evaluate
169 | /raw_datasets
170 |
171 | /data/real-world/summarization/summeval/cnndm
172 | /data/real-world/summarization/summeval/M*
173 | /data/**/train_data.json
174 | /data/**/**/train_data.json
175 | /data/**/**/**/train_data.json
176 | /data/clean_real_world_data
177 | /data/clean_real_world
178 | /test.ipynb
179 | /data/synthesis/synthesis
180 | /data/*.json
181 | /data/*.jsonl
182 | /data/*.ipynb
183 | /tigerscore/xgptscore/cache
184 | /test.sh
185 | /tigerscore/eval_scripts/check_data_private.sh
186 | /tigerscore/finetune/wandb/
187 | /data/additional
188 | /tigerscore/eval_scripts/eval_inst_baseline.sh
189 | /data/evaluation/translation
190 | /data/evaluation
191 | /data/data_dist
192 | /data/evaluation/pair_cmp
193 | /test*
194 |
--------------------------------------------------------------------------------
/.gitmodules:
--------------------------------------------------------------------------------
1 | [submodule "hf_space/TIGERScore"]
2 | path = hf_space/TIGERScore
3 | url = https://huggingface.co/spaces/TIGER-Lab/TIGERScore
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 TIGER Lab
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/github_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/github_overview.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers
2 | datasets
3 | torch
4 | accelerate
5 | wget
6 | pycocoevalcap
7 | spacy
8 | evaluate
9 | prettytable
10 | gdcm
11 | pydicom
12 | bitsandbytes
13 | openai
14 | nltk
15 | scipy
16 | json5
17 | peft
18 | fire
19 | gradio
20 | sentencepiece
21 | tiktoken
22 | dacite
23 | wandb
24 | bs4
25 | py7zr
26 | gdown
27 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | description = """
4 | TIGERScore, a Trained metric that follows Instruction Guidance to perform Explainable, and Reference-free evaluation over a wide spectrum of text generation tasks.
5 | Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text.
6 | """
7 |
8 | setup(
9 | name='tigerscore',
10 | version='0.0.1',
11 | description=description,
12 | author='Dongfu Jiang',
13 | author_email='dongfu.jiang@uwaterloo.ca',
14 | packages=find_packages(),
15 | url='https://tiger-ai-lab.github.io/TIGERScore/',
16 | install_requires=[
17 | 'torch',
18 | 'transformers',
19 | 'datasets',
20 | 'accelerate',
21 | 'gradio',
22 | 'tiktoken',
23 | 'llama-cpp-python',
24 | 'protobuf',
25 | 'sentencepiece',
26 | 'accelerate'
27 | ],
28 | )
29 |
--------------------------------------------------------------------------------
/tigerscore/__init__.py:
--------------------------------------------------------------------------------
1 | from tigerscore.scorer.tigerscore import TIGERScorer
--------------------------------------------------------------------------------
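The package root simply re-exports `TIGERScorer`. A minimal usage sketch follows; note that the constructor argument, the `score()` call signature, and the `TIGER-Lab/TIGERScore-7B` model id are assumptions inferred from the example notebook and model card, not from code shown in this listing.

```python
# Hedged sketch only: the TIGERScorer constructor/score() signature and the
# model id below are assumptions, not confirmed by the code shown above.
from tigerscore import TIGERScorer

scorer = TIGERScorer(model_name="TIGER-Lab/TIGERScore-7B")  # assumed model id

instructions = ["Summarize the following article in one sentence."]
input_contexts = ["<article text>"]
hypo_outputs = ["<a system-generated summary to evaluate>"]

results = scorer.score(instructions, hypo_outputs, input_contexts)
print(results)  # expected: per-example error analysis and penalty scores
```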
/tigerscore/candidates_generation/_generate_candidates.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=30:00:00
3 | #SBATCH --job-name=generate_candidates
4 | #SBATCH --output ../../jobs/%j.out
5 | #SBATCH --hint=memory_bound
6 | #SBATCH --mem=60G
7 | #SBATCH --gres=gpu:a6000:2
8 | #SBATCH --qos=normal
9 | #SBATCH -n 1
10 |
11 | nvidia-smi
12 | # candidates will be saved in ../../data/${dataset}/candidates/${decoding_method}/${model}.json
13 | dataset=$1
14 | set=$2
15 | model_type=$3
16 | model=$4
17 | output_max_length=$5
18 | no_instruction=$6
19 | input_max_length=$7
20 | decoding_method=$8
21 | image2text=$9
22 | start_idx=${10}
23 | end_idx=${11}
24 | data_dir="../../data"
25 | dtype="float16"
26 | num_candidates=5
27 | num_beams=$num_candidates
28 | num_beam_groups=$num_candidates
29 | overwrite=False
30 | inference_bs=1
31 |
32 |
33 | if [ -z "$start_idx" ] && [ -z "$end_idx" ]; then
34 | echo "start_idx and end_idx are not provided, set to None"
35 | else
36 | echo "start_idx: $start_idx"
37 | echo "end_idx: $end_idx"
38 | fi
39 | if [ -z "$output_max_length" ]; then
40 | output_max_length=300
41 | echo "output_max_length is not provided, set to $output_max_length"
42 | else
43 | echo "output_max_length: $output_max_length"
44 | fi
45 |
46 | if [ -z "$input_max_length" ]; then
47 | input_max_length=300
48 | echo "input_max_length is not provided, set to $input_max_length"
49 | else
50 | echo "input_max_length: $input_max_length"
51 | fi
52 |
53 | if [ -z "$image2text" ]; then
54 | image2text=False
55 | echo "image2text is not provided, set to $image2text"
56 | else
57 | echo "image2text: $image2text"
58 | fi
59 | if [ -z "$no_instruction" ]; then
60 | no_instruction=False
61 | echo "no_instruction is not provided, set to $no_instruction"
62 | else
63 | echo "no_instruction: $no_instruction"
64 | fi
65 | if [ -z "$decoding_method" ]; then
66 | decoding_method="top_p_sampling"
67 | echo "decoding_method is not provided, set to $decoding_method"
68 | else
69 | echo "decoding_method: $decoding_method"
70 | fi
71 | python ./generate_candidates.py \
72 | --model_type $model_type \
73 | --model $model \
74 | --data_dir $data_dir \
75 | --dataset $dataset \
76 | --set $set \
77 | --num_return_sequences $num_candidates \
78 | --decoding_method $decoding_method \
79 | --inference_bs $inference_bs \
80 | --prompt_max_length $input_max_length \
81 | --output_max_length $output_max_length \
82 | --dtype $dtype \
83 | --num_beams $num_beams \
84 | --num_beam_groups $num_beam_groups \
85 | --no_repeat_ngram_size 3 \
86 | --start_idx "$start_idx" \
87 | --end_idx "$end_idx" \
88 | --overwrite $overwrite \
89 | --image2text "$image2text" \
90 | --no_instruction "$no_instruction" \
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/downmodel.py:
--------------------------------------------------------------------------------
1 | # SLURM jobs cannot reliably handle long downloads, so download the models from the shell beforehand.
2 | from model_utils import build_model, build_tokenizer
3 | import os
4 | from pathlib import Path
5 | import fire
6 |
7 |
8 | def main( models: str = None, model_type: str = None, cache_dir: str = None):
9 | models = models
10 | model_type = model_type
11 | cache_dir = (
12 | cache_dir or Path(os.path.abspath(__file__)).parent.parent.parent / "hf_models"
13 | )
14 | for model in models.split(","):
15 | tokenizer = build_tokenizer(
16 | model,
17 | cache_dir=cache_dir,
18 | resume_download=True,
19 | trust_remote_code=True,
20 | )
21 | model = build_model(
22 | model_type,
23 | model,
24 | cache_dir=cache_dir,
25 | resume_download=True,
26 | trust_remote_code=True,
27 | )
28 |
29 |
30 | if __name__ == "__main__":
31 | fire.Fire(main)
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/engine.py:
--------------------------------------------------------------------------------
1 | """
2 | This file is modified based on:
3 | https://github.com/Ravoxsg/SummaReranker-ACL-22-/blob/main/src/candidate_generation/engine.py
4 | We thank the authors for sharing their code.
5 | """
6 | import gc
7 | import torch
8 | import torch.nn.functional as F
9 | from typing import Dict
10 |
11 |
12 | def beam_search_step(inputs: Dict, tokenizer, base_model, args, **kwargs):
13 |     """Run one candidate-generation step with the configured decoding method.
14 | 
15 |     Args:
16 |         inputs (dict): tokenized model inputs (e.g. input_ids, attention_mask)
17 |         tokenizer (transformers tokenizer): Tokenizer
18 |         base_model (transformers model): Model
19 |         args (Namespace): decoding settings (decoding_method, num_beams, top_p, etc.)
20 |
21 | Returns:
22 | dict: generated candidates and their logprobs in batch
23 | """
24 | kwargs['return_dict_in_generate'] = True
25 | kwargs['output_scores'] = True
26 | # 1 - beam search
27 | if args.decoding_method == "beam_search":
28 | outputs = base_model.generate(
29 | **inputs,
30 | num_beams=args.num_beams,
31 | num_return_sequences=args.num_return_sequences,
32 | max_new_tokens=args.output_max_length,
33 | repetition_penalty=args.repetition_penalty,
34 | length_penalty=args.length_penalty,
35 | no_repeat_ngram_size=args.no_repeat_ngram_size,
36 | use_cache=True,
37 | early_stopping=True,
38 | temperature=args.temperature,
39 | **kwargs
40 | )
41 | # 2 - diverse beam search
42 | if args.decoding_method == "diverse_beam_search":
43 | outputs = base_model.generate(
44 | **inputs,
45 | num_beams=args.num_beams,
46 | num_beam_groups=args.num_beam_groups,
47 | num_return_sequences=args.num_return_sequences,
48 | max_new_tokens=args.output_max_length,
49 | diversity_penalty=args.diversity_penalty,
50 | repetition_penalty=args.repetition_penalty,
51 | length_penalty=args.length_penalty,
52 | no_repeat_ngram_size=args.no_repeat_ngram_size,
53 | use_cache=True,
54 | early_stopping=True,
55 | temperature=args.temperature,
56 | **kwargs
57 | )
58 | # 3 - top-p sampling
59 | if args.decoding_method == "top_p_sampling":
60 | outputs = base_model.generate(
61 | **inputs,
62 | num_beams=1,
63 | do_sample=True,
64 | top_p=args.top_p,
65 | num_return_sequences=args.num_return_sequences,
66 | max_new_tokens=args.output_max_length,
67 | repetition_penalty=args.repetition_penalty,
68 | length_penalty=args.length_penalty,
69 | no_repeat_ngram_size=args.no_repeat_ngram_size,
70 | use_cache=True,
71 | early_stopping=True,
72 | temperature=args.temperature,
73 | **kwargs
74 | )
75 | # 4 - top-k sampling
76 | if args.decoding_method == "top_k_sampling":
77 | outputs = base_model.generate(
78 | **inputs,
79 | num_beams=1,
80 | do_sample=True,
81 | top_k=args.top_k,
82 | num_return_sequences=args.num_return_sequences,
83 | max_new_tokens=args.output_max_length,
84 | repetition_penalty=args.repetition_penalty,
85 | length_penalty=args.length_penalty,
86 | no_repeat_ngram_size=args.no_repeat_ngram_size,
87 | use_cache=True,
88 | early_stopping=True,
89 | temperature=args.temperature,
90 | **kwargs
91 | )
92 |     # For top-p and top-k sampling, some scores are masked as -inf; they stay at -inf after the log-softmax below.
93 |     masked_logits = torch.stack(outputs.scores, dim=0)
94 |     masked_logits = F.log_softmax(masked_logits, dim=-1)  # normalize over the vocabulary dimension
95 | summary_ids = outputs.sequences
96 | logprobs = []
97 | # Different process for decoder-only models and encoder-decoder models
98 | if "input_ids" in inputs and \
99 | summary_ids.shape[1] == inputs['input_ids'].shape[1] + masked_logits.shape[0]:
100 | # for decoder-only models
101 | # remove input_ids
102 | summary_ids = summary_ids[:, inputs['input_ids'].shape[1]:]
103 | for i in range(summary_ids.shape[0]):
104 | logprobs.append([])
105 | for j in range(summary_ids.shape[1]): # token_idx
106 | if summary_ids[i][j] == tokenizer.eos_token_id:
107 | break
108 | logprobs[i].append(
109 | masked_logits[j, i, summary_ids[i][j]].item())
110 | else:
111 | # for encoder-decoder models
112 | for i in range(summary_ids.shape[0]):
113 | logprobs.append([])
114 | # shift of decoder because of the additional bos_token
115 | for j in range(summary_ids.shape[1] - 1): # token_idx
116 | if summary_ids[i][j + 1] == tokenizer.eos_token_id:
117 | break
118 | logprobs[i].append(
119 | masked_logits[j, i, summary_ids[i][j + 1]].item())
120 |
121 | logprobs = [sum(_probs) for _probs in logprobs]
122 | generated = tokenizer.batch_decode(
123 | summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
124 | del summary_ids
125 | gc.collect()
126 |
127 | batch_generated = []
128 | batch_logprobs = []
129 | bz = list(inputs.values())[0].shape[0]
130 | for i in range(bz):
131 | batch_generated.append(
132 | generated[i * args.num_return_sequences:(i + 1) * args.num_return_sequences])
133 | batch_logprobs.append(
134 | logprobs[i * args.num_return_sequences:(i + 1) * args.num_return_sequences])
135 | return {
136 | "generated": batch_generated,
137 | "logprobs": batch_logprobs
138 | }
139 |
--------------------------------------------------------------------------------
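For orientation, here is a small driver sketch (not part of the repository) showing how `beam_search_step` above might be called; the model name is just an example, and the `args` namespace mirrors the attributes the function reads.

```python
# Sketch: drive beam_search_step with a small seq2seq model.
# Assumes it is run from tigerscore/candidates_generation so `engine` is importable.
from types import SimpleNamespace

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from engine import beam_search_step

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-small")

# Fields mirror the attributes beam_search_step reads from `args`.
args = SimpleNamespace(
    decoding_method="top_p_sampling",
    num_beams=1, num_beam_groups=1, diversity_penalty=0.0,
    num_return_sequences=3, output_max_length=64,
    repetition_penalty=1.0, length_penalty=1.0, no_repeat_ngram_size=3,
    top_p=0.95, top_k=50, temperature=1.0,
)

inputs = tokenizer(["Summarize: The cat sat on the mat all day."], return_tensors="pt")
with torch.no_grad():
    out = beam_search_step(dict(inputs), tokenizer, model, args)
print(out["generated"][0])  # three sampled candidates for the single input
print(out["logprobs"][0])   # their summed token log-probabilities
```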
/tigerscore/candidates_generation/eval_candidates.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=24:00:00
3 | #SBATCH --job-name=eval_candidates
4 | #SBATCH --output ../../jobs/%j.out
5 | #SBATCH --gres=gpu:2080:1
6 | #SBATCH --nodes=1
7 | #SBATCH -n 2
8 |
9 | data_dir="../../data"
10 | # dataset="samsum,xsum,newsroom" # summarization
11 | # dataset="wmt16/cs-en,wmt16/de-en,wmt16/tr-en,wmt17/fi-en,wmt18/zh-en" # translation
12 | # dataset="totto,kasnerz/wikitabletext" # data2text
13 | dataset="din0s/asqa,DongfuTingle/FeTaQA,cosmos_qa,eli5" # long-form QA
14 | # dataset="databricks/databricks-dolly-15k"
15 | # dataset="gsm8k:main,math_qa"
16 |
17 | # dataset="common_gen,vicgalle/alpaca-gpt4,xnli/en,knkarthick/dialogsum"
18 | set="test"
19 | num_workers=1
20 | metrics="bleu,rouge,bart_score,bart_score_cnn"
21 | overwrite="True"
22 | echo "dataset: $dataset"
23 | echo "set: $set"
24 | python eval_candidates.py \
25 | --data_dir $data_dir \
26 | --dataset $dataset \
27 | --set $set \
28 | --num_workers $num_workers \
29 | --metrics $metrics \
30 | --overwrite $overwrite
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/finetune_base_model.py:
--------------------------------------------------------------------------------
1 | """
2 | This file fine-tunes base models for candidate generation.
3 | Code based on the Hugging Face tutorial.
4 | """
5 | from common.evaluation import overall_eval
6 | from model_utils import (
7 | build_model,
8 | build_tokenizer,
9 | )
10 | from typing import Optional, Sequence, Dict, List
11 | from generate_candidates import get_model_size, get_torch_dtype
12 | from dataclasses import dataclass, field
13 | from transformers import (
14 | TrainingArguments,
15 | Seq2SeqTrainer,
16 | Seq2SeqTrainingArguments
17 | )
18 | import numpy as np
19 | import logging
20 | import transformers
21 | import torch
22 | import json
23 | import os
24 | import sys
25 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
26 | sys.path.append("..")
27 | IGNORE_INDEX = -100
28 |
29 |
30 | @dataclass
31 | class ModelArguments:
32 | model_type: str
33 | model_name_or_path: str
34 | dtype: str = "float32"
35 | cache_dir: Optional[str] = None
36 |
37 |
38 | @dataclass
39 | class DataArguments:
40 | data_dir: str
41 | train_file: str
42 | eval_file: str = None
43 | eval_metrics: List[str] = field(default_factory=lambda: ["bleu", "rouge"])
44 | input_max_length: int = 512
45 | output_max_length: int = 128
46 | with_instruction: bool = False
47 |
48 |
49 | def load_dataset(data_args):
50 | with open(data_args.train_file, 'r') as f:
51 | train_data = json.load(f)
52 | if data_args.eval_file:
53 | with open(data_args.eval_file, 'r') as f:
54 | eval_data = json.load(f)
55 | else:
56 | eval_data = None
57 |
58 | return train_data, eval_data
59 |
60 |
61 | class SupervisedDataset(torch.utils.data.Dataset):
62 | def __init__(self, encodings):
63 | self.encodings = encodings
64 |
65 | def __getitem__(self, idx):
66 | return {key: val[idx] for key, val in self.encodings.items()}
67 |
68 | def __len__(self):
69 | return len(self.encodings["input_ids"])
70 |
71 |
72 | def preprocess_function(examples, tokenizer, data_args):
73 | if data_args.with_instruction:
74 | inputs = [x["instruction"] + "\n" + x["input"] for x in examples]
75 | else:
76 | inputs = [x["input"] for x in examples]
77 | inputs = [x.strip(' \n') for x in inputs]
78 | outputs = [x["output"] for x in examples]
79 |
80 | logging.warning("# of examples: {}".format(len(inputs)))
81 | logging.warning("Example of inputs:")
82 | print(inputs[0])
83 | logging.warning("Example of outputs:")
84 | print(outputs[0])
85 |
86 | model_inputs = tokenizer(
87 | inputs, max_length=data_args.input_max_length, truncation=True)
88 | # Setup the tokenizer for targets
89 | with tokenizer.as_target_tokenizer():
90 | labels = tokenizer(
91 | outputs, max_length=data_args.output_max_length, truncation=True)
92 |
93 | logging.warning("Example of model inputs:")
94 | print("input_ids", model_inputs['input_ids'][0])
95 | print("attention_mask", model_inputs['attention_mask'][0])
96 | logging.warning("Example of labels:")
97 | print(labels['input_ids'][0])
98 | labels["input_ids"] = [
99 | [(_l if _l != tokenizer.pad_token_id else IGNORE_INDEX) for _l in label] for label in labels["input_ids"]
100 | ]
101 | model_inputs["labels"] = labels["input_ids"]
102 | return SupervisedDataset(model_inputs)
103 |
104 |
105 | @dataclass
106 | class DataCollatorForSupervisedDataset(object):
107 | """Collate examples for supervised fine-tuning."""
108 |
109 | tokenizer: transformers.PreTrainedTokenizer
110 |
111 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
112 | input_ids, labels = tuple([torch.tensor(
113 | instance[key]) for instance in instances] for key in ("input_ids", "labels"))
114 | input_ids = torch.nn.utils.rnn.pad_sequence(
115 | input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
116 | )
117 | labels = torch.nn.utils.rnn.pad_sequence(
118 | labels, batch_first=True, padding_value=IGNORE_INDEX)
119 | # print(self.tokenizer.batch_decode(input_ids))
120 | # print(self.tokenizer.batch_decode(labels.masked_fill(labels == IGNORE_INDEX, self.tokenizer.pad_token_id)))
121 | # print("##" * 30)
122 | return dict(
123 | input_ids=input_ids,
124 | labels=labels,
125 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
126 | )
127 |
128 |
129 | def main(
130 | model_args: ModelArguments,
131 | data_args: DataArguments,
132 | training_args: TrainingArguments,
133 | ):
134 |
135 | model = build_model(
136 | model_args.model_type,
137 | model_args.model_name_or_path,
138 | torch_dtype=get_torch_dtype(model_args.dtype),
139 | device_map="auto",
140 | cache_dir=model_args.cache_dir, resume_download=True)
141 | n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
142 | logging.warning("The {} has {} trainable parameters".format(
143 | model_args.model_name_or_path, get_model_size(n_params)))
144 | tokenizer = build_tokenizer(
145 | model_args.model_name_or_path,
146 | cache_dir=model_args.cache_dir, resume_download=True)
147 | logging.warning("Loading dataset...")
148 |
149 | train_data, eval_data = load_dataset(data_args)
150 | logging.warning("Dataset loaded.")
151 | logging.warning("Preprocessing dataset...")
152 | train_dataset = preprocess_function(train_data, tokenizer, data_args)
153 | eval_dataset = preprocess_function(eval_data, tokenizer, data_args)
154 | logging.warning("Dataset preprocessed.")
155 | logging.warning("Loading data collator...")
156 | data_collator = DataCollatorForSupervisedDataset(tokenizer)
157 | logging.warning("Data collator loaded.")
158 | logging.warning("Loading trainer...")
159 |
160 | def compute_metrics(eval_pred):
161 |
162 | logits, labels = eval_pred
163 | labels[labels == IGNORE_INDEX] = tokenizer.pad_token_id
164 | logits[logits == IGNORE_INDEX] = tokenizer.pad_token_id
165 | predictions = tokenizer.batch_decode(logits, skip_special_tokens=True)
166 | labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
167 | logging.warning("Example of predictions:")
168 | print(predictions[:3])
169 | logging.warning("Example of labels:")
170 | print(labels[:3])
171 | scores = overall_eval(predictions, labels,
172 | metrics=data_args.eval_metrics)
173 | return {
174 | key: np.mean(value) for key, value in scores.items()
175 | }
176 |
177 | training_args.evaluation_strategy = "epoch"
178 | training_args.weight_decay = 0.01
179 | training_args.save_total_limit = 5
180 | training_args.predict_with_generate = True
181 | training_args.generation_num_beams = 4
182 | training_args.generation_max_length = data_args.output_max_length
183 | training_args.load_best_model_at_end = True
184 | logging.warning("Training arguments:")
185 | print(training_args)
186 | trainer = Seq2SeqTrainer(
187 | model=model,
188 | args=training_args,
189 | tokenizer=tokenizer,
190 | train_dataset=train_dataset,
191 | eval_dataset=eval_dataset,
192 | data_collator=data_collator,
193 | compute_metrics=compute_metrics,
194 | )
195 | logging.warning("Trainer loaded.")
196 | logging.warning("Training...")
197 | trainer.train()
198 | logging.warning("Training finished.")
199 | logging.warning("Saving model...")
200 | trainer.save_model(output_dir=os.path.join(
201 | training_args.output_dir, "checkpoint-best"))
202 | logging.warning("Model saved.")
203 |
204 |
205 | if __name__ == "__main__":
206 | parser = transformers.HfArgumentParser(
207 | (ModelArguments, DataArguments, Seq2SeqTrainingArguments))
208 | model_args, data_args, training_args = parser.parse_args_into_dataclasses()
209 | main(model_args, data_args, training_args)
210 |
--------------------------------------------------------------------------------
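For reference, `load_dataset` above reads `train_file` / `eval_file` as JSON lists and `preprocess_function` looks up `instruction`, `input`, and `output` on each record, so a minimal data file could be produced as in the sketch below (field values are illustrative only; the file name matches the one used in `finetune_base_model.sh`).

```python
# Illustrative only: build a train_file that load_dataset()/preprocess_function()
# above can consume (a JSON list of records with instruction/input/output keys).
import json

examples = [
    {
        "instruction": "Answer the question based on the passage.",
        "input": "Passage: ... Question: ...",
        "output": "A short reference answer.",
    },
]

with open("finetune_data.json", "w") as f:
    json.dump(examples, f, indent=2)
```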
/tigerscore/candidates_generation/finetune_base_model.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=24:00:00
3 | #SBATCH --job-name=finetune
4 | #SBATCH --output ../../jobs/finetune_base_models/%j.out
5 | #SBATCH --gres=gpu:2080:1
6 | #SBATCH --nodes=1
7 | #SBATCH -n 1
8 |
9 | model_type="t5"
10 | model_name_or_path="google/flan-t5-large"
11 | data_dir="../../data"
12 | dataset="cosmos_qa"
13 | train_file="${data_dir}/${dataset}/finetune_data.json"
14 | eval_file="${data_dir}/${dataset}/validation_data.json"
15 | with_instruction=True
16 | run_name="ft_${dataset}"
17 | learning_rate=1e-4
18 | num_train_epochs=10
19 | per_device_train_batch_size=2
20 | per_device_eval_batch_size=8
21 | gradient_accumulation_steps=16
22 | max_grad_norm=1
23 | input_max_length=512
24 | output_max_length=256
25 | optim="adafactor"
26 | lr_scheduler_type="linear"
27 | warmup_ratio=0.1
28 | fp16=False
29 | output_dir="../../finetuned_models/${model_name_or_path}/${run_name}"
30 | cache_dir="../../hf_models"
31 | localhost=$RANDOM # random port number
32 | n_gpu=1
33 | torchrun \
34 | --rdzv_backend=c10d \
35 | --rdzv_endpoint="localhost:${localhost}" \
36 | --nnodes 1 \
37 | --nproc_per_node ${n_gpu} \
38 | finetune_base_model.py \
39 | --model_type $model_type \
40 | --model_name_or_path $model_name_or_path \
41 | --data_dir $data_dir \
42 | --train_file $train_file \
43 | --eval_file $eval_file \
44 | --with_instruction $with_instruction \
45 | --run_name $run_name \
46 | --learning_rate $learning_rate \
47 | --optim $optim \
48 | --fp16 $fp16 \
49 | --lr_scheduler_type $lr_scheduler_type \
50 | --num_train_epochs $num_train_epochs \
51 | --per_device_train_batch_size $per_device_train_batch_size \
52 | --per_device_eval_batch_size $per_device_eval_batch_size \
53 | --gradient_accumulation_steps $gradient_accumulation_steps \
54 | --max_grad_norm $max_grad_norm \
55 | --input_max_length $input_max_length \
56 | --output_max_length $output_max_length \
57 | --output_dir $output_dir \
58 | --cache_dir $cache_dir \
59 | --report_to "wandb" \
60 | --logging_steps 2 \
61 |
62 |
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/generate_candidates_by_gpt.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | Generate candidates by GPT-3.5 or GPT-4.
4 | """
5 | import json
6 | import random
7 | import logging
8 | import sys
9 | import fire
10 | from pathlib import Path
11 | sys.path.append(str(Path(__file__).parent.parent))
12 | from xgptscore.process_utils import XPGTItem
13 | from xgptscore.xgptscore import xgptscore
14 | logging.basicConfig(level=logging.WARNING)
15 |
16 |
17 | def main(
18 | task: str,
19 | data_path: str,
20 | dataset: str,
21 | output_file: str = None,
22 | xgptscore_mode: str = "instruction",
23 | model_name: str = "ChatGPT",
24 | overwrite: bool = False,
25 | max_size: int = None,
26 | seed: int = 42,
27 | shuffle_file: bool = False,
28 | source_max_length: int = None,
29 | ref_max_length: int = None,
30 | hypo_max_length: int = None,
31 | dataset_split: str = "test",
32 | ):
33 |     """Generate candidates by GPT-3.5 or GPT-4.
34 |
35 | Args:
36 | task (str): Task name.
37 | data_path (str): Path to the data.
38 | dataset (str): Dataset name.
39 | output_file (str, optional): Defaults to None.
40 | xgptscore_mode (str, optional): Defaults to "instruction".
41 | model_name (str, optional): Defaults to "ChatGPT".
42 | overwrite (bool, optional): Defaults to False.
43 | max_size (int, optional): Defaults to None.
44 | seed (int, optional): Defaults to 42.
45 | shuffle_file (bool, optional): Defaults to False.
46 | source_max_length (int, optional): Defaults to None.
47 | ref_max_length (int, optional): Defaults to None.
48 | hypo_max_length (int, optional): Defaults to None.
49 | dataset_split (str, optional): Defaults to "test".
50 | """
51 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4)))
52 | # load data
53 | data_path = Path(data_path)
54 | input_file = data_path / dataset / (dataset_split + "_data.json")
55 |
56 | input_file = Path(input_file)
57 | if not output_file:
58 | output_file = data_path / dataset / "candidates" / \
59 | dataset_split / "top_p_sampling" / f"{model_name}.json"
60 | if not output_file.parent.parent.exists():
61 | output_file.parent.parent.mkdir(parents=True)
62 | if not output_file.parent.exists():
63 | output_file.parent.mkdir()
64 | else:
65 | output_file = Path(output_file)
66 | with open(input_file, "r") as f:
67 | items = json.load(f)
68 | logging.warning("Loaded {} items from {}".format(
69 | len(items), input_file))
70 | logging.warning("Preparing writing to {}...".format(output_file))
71 |
72 | random.seed(seed)
73 | logging.warning("Set seed to {}".format(seed))
74 | if shuffle_file:
75 | random.shuffle(items)
76 | logging.warning("Shuffled {} items".format(len(items)))
77 | if isinstance(max_size, int) and max_size > 0:
78 | items = items[:max_size]
79 | logging.warning("Truncated to {} items".format(len(items)))
80 |
81 | xgptitems = []
82 | for item in items:
83 | xgptitems.append(XPGTItem(
84 | task=task,
85 | instruction=item['instruction'],
86 | input=item['input'],
87 | ref_output=item['output'] if "output" in item else item['refs'],
88 | hypo_output=None,
89 | ))
90 | if "candidates" in item:
91 | del item["candidates"]
92 |
93 | if not output_file.exists() or overwrite:
94 | logging.warning("Running xgptscore")
95 | # run xgptscore
96 | xgptscore_params = {
97 | "max_lengths": {
98 | "input": source_max_length,
99 | "hypo_output": hypo_max_length,
100 | "ref_output": ref_max_length,
101 | },
102 | }
103 | result = xgptscore(xgptitems, mode=xgptscore_mode,
104 | model_name=model_name, **xgptscore_params)
105 | for i, item in enumerate(items):
106 | item['responses'] = result['round_completions'][i]
107 | item['messages_records'] = result['messages_records'][i]
108 | item['candidates'] = [
109 | {"text": result['round_completions'][i][0],
110 | "scores": {}
111 | }]
112 | # print(items)
113 | with open(output_file, "w") as f:
114 | json.dump(items, f, indent=4, ensure_ascii=False)
115 | logging.warning("Saved to {}".format(output_file))
116 | else:
117 | logging.warning("Loading from {}".format(output_file))
118 | with open(output_file, "r") as f:
119 | items = json.load(f)
120 |
121 |
122 | if __name__ == "__main__":
123 | fire.Fire(main)
124 |
--------------------------------------------------------------------------------
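Because the entry point is wrapped with `fire.Fire`, the same keyword arguments can be passed on the command line (see `generate_candidates_by_gpt.sh` below) or from Python. A hedged sketch, with placeholder paths and assuming OpenAI-compatible credentials are already configured for `xgptscore`:

```python
# Hedged sketch: the data path is a placeholder, and xgptscore needs OpenAI
# credentials configured in the environment before this will run.
from generate_candidates_by_gpt import main

main(
    task="long-form QA",
    data_path="../../data",   # expects <data_path>/<dataset>/test_data.json
    dataset="eli5",
    model_name="ChatGPT",
    max_size=100,             # only process the first 100 items
    overwrite=False,
)
```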
/tigerscore/candidates_generation/generate_candidates_by_gpt.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=generate_candidates_by_gpt
3 | #SBATCH --time=24:00:00
4 | #SBATCH --output=../../jobs/%j.out
5 |
6 |
7 | # datasets=("GAIR/lima" "tatsu-lab/alpaca_farm:alpaca_instructions" "HuggingFaceH4/oasst1_en" "JosephusCheung/GuanacoDataset" "databricks/databricks-dolly-15k")
8 | dataset=$1
9 | task=$2
10 | data_path=""
11 | python generate_candidates_by_gpt.py \
12 | --task $task \
13 | --data_path $data_path \
14 | --dataset $dataset \
15 | --source_max_length 512 \
16 | --overwrite "False"
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/generate_candidates_series.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=12:00:00
3 | #SBATCH --job-name=generate_candidates
4 | #SBATCH --output ../../jobs/%j.out
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --qos=normal
7 | #SBATCH -n 1
8 |
9 | # This script is used to generate candidates via GPT-3.5 and local models.
10 |
11 | CMD="sbatch"
12 |
13 | # models=("google/flan-t5-small" "google/flan-t5-base" "google/flan-t5-large" "google/flan-t5-xl" "google/flan-t5-xxl")
14 | # models=("lmsys/vicuna-33b-v1.3" "lmsys/vicuna-13b-v1.3" "lmsys/vicuna-7b-v1.3") # vicuna
15 | models=("lmsys/vicuna-33b-v1.3") # vicuna-33b-v1.3 need two gpus
16 | # models=("lmsys/vicuna-13b-v1.3" "lmsys/vicuna-7b-v1.3") # vicuna
17 | # model_type="t5"
18 | model_type="llama"
19 | dataset="din0s/asqa"
20 | dataset="DongfuTingle/FeTaQA"
21 | # dataset="cosmos_qa"
22 | # dataset="eli5"
23 | set="test"
24 | output_max_length=512
25 | for model in "${models[@]}"; do
26 | ${CMD} _generate_candidates.sh "$dataset" "$set" "$model_type" "$model" "$output_max_length"
27 | done
28 | # data_path=""
29 | # python generate_candidates_by_gpt.py \
30 | # --task "long-form QA" \
31 | # --data_path $data_path \
32 | # --dataset $dataset \
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/generate_ref_by_gpt4.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | Generate reference outputs with GPT-4.
4 | """
5 |
6 | import json
7 | import random
8 | import logging
9 | import sys
10 | import fire
11 | from pathlib import Path
12 | sys.path.append(str(Path(__file__).parent.parent))
13 | from xgptscore.process_utils import XPGTItem
14 | from xgptscore.xgptscore import xgptscore
15 | logging.basicConfig(level=logging.WARNING)
16 |
17 |
18 | def main(
19 | task: str,
20 | data_path: str,
21 | xgptscore_mode: str = "instruction",
22 | model_name: str = "gpt-4",
23 | overwrite: bool = False,
24 | max_size: int = None,
25 | seed: int = 42,
26 | shuffle_file: bool = False,
27 | source_max_length: int = None,
28 | ref_max_length: int = None,
29 | hypo_max_length: int = None,
30 | dataset_split: str = "test",
31 | ):
32 |     """Generate reference outputs with GPT-4.
33 |
34 | Args:
35 | task (str): Task name.
36 | data_path (str): Path to the data.
37 | dataset (str): Dataset name.
38 | output_file (str, optional): Defaults to None.
39 | xgptscore_mode (str, optional): Defaults to "instruction".
40 |         model_name (str, optional): Defaults to "gpt-4".
41 | overwrite (bool, optional): Defaults to False.
42 | max_size (int, optional): Defaults to None.
43 | seed (int, optional): Defaults to 42.
44 | shuffle_file (bool, optional): Defaults to False.
45 | source_max_length (int, optional): Defaults to None.
46 | ref_max_length (int, optional): Defaults to None.
47 | hypo_max_length (int, optional): Defaults to None.
48 | dataset_split (str, optional): Defaults to "test".
49 | """
50 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4)))
51 | # load data
52 | data_path = Path(data_path)
53 | input_file = data_path
54 |
55 | input_file = Path(input_file)
56 | output_file = input_file
57 | with open(input_file, "r") as f:
58 | items = json.load(f)
59 | logging.warning("Loaded {} items from {}".format(
60 | len(items), input_file))
61 | logging.warning("Preparing writing to {}...".format(output_file))
62 |
63 | random.seed(seed)
64 | logging.warning("Set seed to {}".format(seed))
65 | if shuffle_file:
66 | random.shuffle(items)
67 | logging.warning("Shuffled {} items".format(len(items)))
68 | if isinstance(max_size, int) and max_size > 0:
69 | items = items[:max_size]
70 | logging.warning("Truncated to {} items".format(len(items)))
71 |
72 | xgptitems = []
73 | for item in items:
74 | xgptitems.append(XPGTItem(
75 | task=task,
76 | instruction=item['instruction'],
77 | input=item['input'],
78 | ref_output=item['output'] if "output" in item else item['refs'],
79 | hypo_output=None,
80 | ))
81 |
82 | if not output_file.exists() or overwrite:
83 | logging.warning("Running xgptscore")
84 | # run xgptscore
85 | xgptscore_params = {
86 | "max_lengths": {
87 | "input": source_max_length,
88 | "hypo_output": hypo_max_length,
89 | "ref_output": ref_max_length,
90 | },
91 | }
92 | result = xgptscore(xgptitems, mode=xgptscore_mode,
93 | model_name=model_name,num_workers=5, **xgptscore_params)
94 | for i, item in enumerate(items):
95 | item['responses'] = result['round_completions'][i]
96 | item['messages_records'] = result['messages_records'][i]
97 | if item["output"] is not None:
98 | item["output"] = result['round_completions'][i][0]
99 | # print(items)
100 | with open(output_file, "w") as f:
101 | json.dump(items, f, indent=4, ensure_ascii=False)
102 | logging.warning("Saved to {}".format(output_file))
103 | else:
104 | logging.warning("Loading from {}".format(output_file))
105 | with open(output_file, "r") as f:
106 | items = json.load(f)
107 |
108 |
109 | if __name__ == "__main__":
110 | fire.Fire(main)
111 |
--------------------------------------------------------------------------------
/tigerscore/candidates_generation/model_utils.py:
--------------------------------------------------------------------------------
1 | from transformers import (
2 | AutoTokenizer,
3 | AutoModelForSeq2SeqLM,
4 | AutoModelForCausalLM,
5 | AutoModel,
6 | VisionEncoderDecoderModel,
7 | ViTImageProcessor,
8 | )
9 | decoder_only_models = ["alpaca", "llama", "opt", "bloom",
10 | "gpt", "vicuna", "koala", "Wizard", "stablelm"]
11 |
12 |
13 | def build_model(model_type, model_name, **kwargs):
14 | """
15 | Build the model from the model name
16 | """
17 | if any([x in model_type for x in decoder_only_models]) or any([x in model_name for x in decoder_only_models]):
18 | model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs)
19 | elif model_type in ["vit"]:
20 | model = VisionEncoderDecoderModel.from_pretrained(model_name)
21 | elif model_type in ["bart", "t5", "mbart", "m2m100", "nllb", "opus_mt", "unifiedqa", "opus-mt", "pegasus"]:
22 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **kwargs)
23 | else:
24 | model = AutoModel.from_pretrained(model_name, **kwargs)
25 |
26 | return model
27 |
28 |
29 | def build_tokenizer(model_name, **kwargs):
30 | """
31 | Build the tokenizer from the model name
32 | """
33 |
34 | if "vicuna" in model_name:
35 | tokenizer = AutoTokenizer.from_pretrained(
36 | model_name, padding_side="left", use_fast=False, **kwargs)
37 | # elif "Wizard" in model_name:
38 | # tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", return_token_type_ids=False, **kwargs)
39 | elif any([x in model_name for x in decoder_only_models]):
40 | # padding left
41 | tokenizer = AutoTokenizer.from_pretrained(
42 | model_name, padding_side="left", **kwargs)
43 | else:
44 | tokenizer = AutoTokenizer.from_pretrained(
45 | model_name, **kwargs) # , use_fast=False)
46 | if tokenizer.pad_token is None:
47 | tokenizer.pad_token = tokenizer.eos_token
48 | tokenizer.pad_token_id = tokenizer.eos_token_id
49 | return tokenizer
50 |
51 |
52 | def build_processor(model_type, model_name, **kwargs):
53 | """
54 | Build the processor from the model name
55 | """
56 | if model_type in ["vit"]:
57 | processor = ViTImageProcessor.from_pretrained(model_name, **kwargs)
58 | else:
59 | raise NotImplementedError
60 | return processor
61 |
--------------------------------------------------------------------------------
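A short sketch of how `build_tokenizer` and `build_model` above pair up (the model names are only examples): the substring match against `decoder_only_models` decides between causal-LM loading with left padding and seq2seq loading.

```python
# Sketch: load one seq2seq and one decoder-only model with the helpers above.
from model_utils import build_model, build_tokenizer

# model_type "t5" is routed to AutoModelForSeq2SeqLM
t5_tokenizer = build_tokenizer("google/flan-t5-small")
t5_model = build_model("t5", "google/flan-t5-small")

# "gpt" matches decoder_only_models, so gpt2 is loaded with AutoModelForCausalLM
# and its tokenizer uses left padding with pad_token set to eos_token.
gpt_tokenizer = build_tokenizer("gpt2")
gpt_model = build_model("gpt", "gpt2")
print(gpt_tokenizer.padding_side, gpt_tokenizer.pad_token)
```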
/tigerscore/common/README.md:
--------------------------------------------------------------------------------
1 | ## Installation
2 | To reproduce our experiment results, first create a `tigerscore_baseline` environment:
3 | ```bash
4 | conda create -n tigerscore_baseline python=3.9
5 | conda activate tigerscore_baseline
6 | pip install -r requirements.txt
7 | pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp39-cp39-manylinux1_x86_64.whl
8 | pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118
9 | ```
10 |
--------------------------------------------------------------------------------
/tigerscore/common/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | cur_folder = os.path.dirname(os.path.abspath(__file__))
4 | if cur_folder not in sys.path:
5 | sys.path.append(cur_folder)
6 |
--------------------------------------------------------------------------------
/tigerscore/common/bart_score.py:
--------------------------------------------------------------------------------
1 | # %%
2 | """
3 | From https://github.com/neulab/BARTScore
4 | """
5 | import torch
6 | import torch.nn as nn
7 | import traceback
8 | from transformers import BartTokenizer, BartForConditionalGeneration
9 |
10 |
11 | class BARTScorer:
12 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'):
13 | # Set up model
14 | self.device = device
15 | self.max_length = max_length
16 | self.tokenizer = BartTokenizer.from_pretrained(checkpoint)
17 | self.model = BartForConditionalGeneration.from_pretrained(checkpoint)
18 | self.model.eval()
19 | self.model.to(device)
20 |
21 | # Set up loss
22 | self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
23 | self.lsm = nn.LogSoftmax(dim=1)
24 |
25 | def load(self, path='./models/bart.pth'):
26 | """ Load model from paraphrase finetuning """
27 | self.model.load_state_dict(torch.load(path, map_location=self.device))
28 |
29 | def score(self, srcs, tgts, batch_size):
30 | """ Score a batch of examples """
31 | score_list = []
32 | for i in range(0, len(srcs), batch_size):
33 | src_list = srcs[i: i + batch_size]
34 | tgt_list = tgts[i: i + batch_size]
35 | try:
36 | with torch.no_grad():
37 | encoded_src = self.tokenizer(
38 | src_list,
39 | max_length=self.max_length,
40 | truncation=True,
41 | padding=True,
42 | return_tensors='pt'
43 | )
44 | encoded_tgt = self.tokenizer(
45 | tgt_list,
46 | max_length=self.max_length,
47 | truncation=True,
48 | padding=True,
49 | return_tensors='pt'
50 | )
51 | src_tokens = encoded_src['input_ids'].to(self.device)
52 | src_mask = encoded_src['attention_mask'].to(self.device)
53 |
54 | tgt_tokens = encoded_tgt['input_ids'].to(self.device)
55 | tgt_mask = encoded_tgt['attention_mask']
56 | tgt_len = tgt_mask.sum(dim=1).to(self.device)
57 |
58 | output = self.model(
59 | input_ids=src_tokens,
60 | attention_mask=src_mask,
61 | labels=tgt_tokens
62 | )
63 | logits = output.logits.view(-1, self.model.config.vocab_size)
64 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
65 | loss = loss.view(tgt_tokens.shape[0], -1)
66 | loss = loss.sum(dim=1) / tgt_len
67 | curr_score_list = [-x.item() for x in loss]
68 | score_list += curr_score_list
69 |
70 | except RuntimeError:
71 | traceback.print_exc()
72 | print(f'source: {src_list}')
73 | print(f'target: {tgt_list}')
74 | exit(0)
75 | return score_list
76 |
--------------------------------------------------------------------------------
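Usage mirrors the upstream BARTScore repository: instantiate the scorer once and call `score` with parallel source/target lists. A small sketch (checkpoint and device are examples; scores are length-normalized log-likelihoods, so values closer to zero are better):

```python
# Sketch: score targets given sources with the BARTScorer defined above.
from bart_score import BARTScorer

bart_scorer = BARTScorer(device="cpu", checkpoint="facebook/bart-large-cnn")
srcs = ["The quick brown fox jumps over the lazy dog."]
tgts = ["A fast brown fox leaps over a sleepy dog."]
print(bart_scorer.score(srcs, tgts, batch_size=1))  # one (negative) score per src/tgt pair
```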
/tigerscore/common/cor_eval.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.stats
3 |
4 |
5 | def cor_pearson(hypo_scores, ref_scores):
6 | """
7 | Args:
8 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates
9 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates
10 | returns:
11 |         cor: float, the mean Pearson correlation across candidates
12 | """
13 | if isinstance(hypo_scores, list):
14 | hypo_scores = np.array(hypo_scores)
15 | if isinstance(ref_scores, list):
16 | ref_scores = np.array(ref_scores)
17 | assert hypo_scores.shape == ref_scores.shape
18 | bz, c = hypo_scores.shape
19 | hypo_scores = hypo_scores.reshape(bz, c).T
20 | ref_scores = ref_scores.reshape(bz, c).T
21 | cor = 0
22 | for i in range(c):
23 | cor += np.corrcoef(hypo_scores[i], ref_scores[i])[0, 1]
24 | cor /= c
25 | return cor
26 |
27 |
28 | def cor_spearman(hypo_scores, ref_scores):
29 | """
30 | Args:
31 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates
32 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates
33 | returns:
34 |         cor: float, the mean Spearman correlation across candidates
35 | """
36 | if isinstance(hypo_scores, list):
37 | hypo_scores = np.array(hypo_scores)
38 | if isinstance(ref_scores, list):
39 | ref_scores = np.array(ref_scores)
40 | assert hypo_scores.shape == ref_scores.shape
41 | bz, c = hypo_scores.shape
42 | hypo_scores = hypo_scores.reshape(bz, c).T
43 | ref_scores = ref_scores.reshape(bz, c).T
44 | cor = 0
45 | for i in range(c):
46 | cor += scipy.stats.spearmanr(hypo_scores[i], ref_scores[i]).correlation
47 | cor /= c
48 | return cor
49 |
50 |
51 | def cor_spearman_footrule(hypo_scores, ref_scores):
52 | """
53 | Args:
54 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates
55 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates
56 | returns:
57 |         cor: float, the mean summed absolute difference between the two score arrays (the Spearman footrule when the inputs are ranks)
58 | """
59 | if isinstance(hypo_scores, list):
60 | hypo_scores = np.array(hypo_scores)
61 | if isinstance(ref_scores, list):
62 | ref_scores = np.array(ref_scores)
63 | assert hypo_scores.shape == ref_scores.shape
64 | bz, c = hypo_scores.shape
65 | hypo_scores = hypo_scores.reshape(bz, c)
66 | ref_scores = ref_scores.reshape(bz, c)
67 | return np.abs(hypo_scores - ref_scores).sum(axis=-1).mean()
68 |
--------------------------------------------------------------------------------
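Both correlation helpers above take `(n_samples, n_candidates)` score arrays and average the per-candidate correlation computed across samples. A small worked example (the numbers are illustrative):

```python
# Sketch: correlate metric scores with reference (e.g. human) scores.
import numpy as np

from cor_eval import cor_pearson, cor_spearman, cor_spearman_footrule

metric_scores = np.array([[0.1, 0.9],
                          [0.4, 0.3],
                          [0.6, 0.5],
                          [0.8, 0.2]])  # 4 samples x 2 candidates
human_scores = np.array([[0.2, 0.8],
                         [0.3, 0.4],
                         [0.7, 0.6],
                         [0.9, 0.1]])   # same shape

print(cor_pearson(metric_scores, human_scores))           # mean per-candidate Pearson r
print(cor_spearman(metric_scores, human_scores))          # mean per-candidate Spearman rho
print(cor_spearman_footrule(metric_scores, human_scores)) # mean summed |difference| per sample
```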
/tigerscore/common/download.sh:
--------------------------------------------------------------------------------
1 | # Download BLEURT
2 | wget https://storage.googleapis.com/bleurt-oss/bleurt-large-512.zip
3 | unzip bleurt-large-512.zip
4 | mv bleurt-large-512 models/
5 | rm bleurt-large-512.zip
6 |
7 | # Download PRISM
8 | wget http://data.statmt.org/prism/m39v1.tar
9 | tar xf m39v1.tar
10 | mv m39v1 models/
11 | rm m39v1.tar
--------------------------------------------------------------------------------
/tigerscore/common/flan_score.py:
--------------------------------------------------------------------------------
1 | # %%
2 | """
3 | From https://github.com/xu1998hz/SEScore3
4 | """
5 | import torch
6 | import torch.nn as nn
7 | import traceback
8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
9 |
10 |
11 | class FLANScorer:
12 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='google/flan-t5-base'):
13 | # Set up model
14 | self.device = device
15 | self.max_length = max_length
16 | self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
17 | self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
18 | self.model.eval()
19 | self.model.to(device)
20 | # Set up loss
21 | self.loss_fct = nn.NLLLoss(
22 | reduction='none', ignore_index=self.model.config.pad_token_id)
23 | self.lsm = nn.LogSoftmax(dim=1)
24 |
25 | def load(self):
26 | """ Load model from paraphrase finetuning """
27 | self.model.load_state_dict(torch.load(
28 | 'models/bart.pth', map_location=self.device))
29 |
30 | def score(self, srcs, tgts, batch_size):
31 | """ Score a batch of examples """
32 | score_list = []
33 | for i in range(0, len(srcs), batch_size):
34 | src_list = srcs[i: i + batch_size]
35 | tgt_list = tgts[i: i + batch_size]
36 | if i < 1:
37 | pass
38 | # print('src_list: ',src_list)
39 | # print('tgt_list: ', tgt_list)
40 | try:
41 | with torch.no_grad():
42 | encoded_src = self.tokenizer(
43 | src_list,
44 | max_length=self.max_length,
45 | truncation=True,
46 | padding=True,
47 | return_tensors='pt'
48 | )
49 | encoded_tgt = self.tokenizer(
50 | tgt_list,
51 | max_length=self.max_length,
52 | truncation=True,
53 | padding=True,
54 | return_tensors='pt'
55 | )
56 | src_tokens = encoded_src['input_ids'].to(self.device)
57 | src_mask = encoded_src['attention_mask'].to(self.device)
58 | tgt_tokens = encoded_tgt['input_ids'].to(self.device)
59 | tgt_mask = encoded_tgt['attention_mask']
60 | tgt_len = tgt_mask.sum(dim=1).to(self.device)
61 |
62 | output = self.model(
63 | input_ids=src_tokens,
64 | attention_mask=src_mask,
65 | labels=tgt_tokens
66 | )
67 | logits = output.logits.view(-1,
68 | self.model.config.vocab_size)
69 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
70 | loss = loss.view(tgt_tokens.shape[0], -1)
71 | loss = loss.sum(dim=1) / tgt_len
72 | curr_score_list = [-x.item() for x in loss]
73 | score_list += curr_score_list
74 |
75 | except RuntimeError:
76 | traceback.print_exc()
77 | print(f'source: {src_list}')
78 | print(f'target: {tgt_list}')
79 | exit(0)
80 | return score_list
81 |
--------------------------------------------------------------------------------
/tigerscore/common/requirements.txt:
--------------------------------------------------------------------------------
1 | torch
2 | transformers
3 | git+https://github.com/Unbabel/COMET.git
4 | git+https://github.com/jdf-prog/UniEval.git
5 | nltk
6 | git+https://github.com/google-research/bleurt.git
7 | fire
8 | rouge_score
9 | bert_score
10 | git+https://github.com/huggingface/evaluate@18932858570b9fa97ac478e1e6e709438e4d093b
11 | pycocoevalcap
12 | spacy
13 | git+https://github.com/google-research/mt-metrics-eval.git
14 | prettytable
15 | psutil
16 | sacrebleu
17 | mosestokenizer
18 | pytorch-lightning==2.0.0
--------------------------------------------------------------------------------
/tigerscore/common/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 | import os
3 | import numpy as np
4 | import torch
5 | import argparse
6 | import hashlib
7 | import requests
8 | import time
9 | from io import BytesIO
10 | from tqdm import tqdm
11 | from PIL import Image
12 | from concurrent.futures import ThreadPoolExecutor
13 | from functools import partial
14 | from datasets.utils.file_utils import get_datasets_user_agent
15 |
16 | USER_AGENT = get_datasets_user_agent()
17 |
18 |
19 | def seed_everything(seed=42):
20 | """
21 | Seed everything for reproducibility
22 | """
23 | random.seed(seed)
24 | os.environ['PYTHONHASHSEED'] = str(seed)
25 | np.random.seed(seed)
26 | torch.manual_seed(seed)
27 | torch.cuda.manual_seed(seed)
28 | torch.backends.cudnn.deterministic = True
29 |
30 |
31 | def str2bool(v):
32 | """
33 | Convert string to boolean
34 | """
35 | if isinstance(v, bool):
36 | return v
37 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
38 | return True
39 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
40 | return False
41 | else:
42 | raise argparse.ArgumentTypeError('Boolean value expected.')
43 |
44 |
45 | def empty2None(x):
46 | if x == '':
47 | return None
48 | elif isinstance(x, str):
49 | return x
50 | else:
51 | raise argparse.ArgumentTypeError('String value expected.')
52 |
53 |
54 | def empty2Noneint(x):
55 | if x == '':
56 | return None
57 | elif isinstance(x, int):
58 | return x
59 | elif isinstance(x, str):
60 | return int(x)
61 | else:
62 | raise argparse.ArgumentTypeError('Integer value expected.')
63 |
64 |
65 | def empty2zero(x):
66 | if x == '':
67 | return 0
68 | elif isinstance(x, int):
69 | return x
70 | elif isinstance(x, str):
71 | return int(x)
72 | else:
73 | raise argparse.ArgumentTypeError('Integer value expected.')
74 |
75 |
76 | def generate_hash_code(text):
77 | if text is None:
78 | return None
79 | # Convert the text to bytes and create a hash object
80 | hash_object = hashlib.sha256(text.encode())
81 |
82 | # Get the hexadecimal representation of the hash code
83 | hex_code = hash_object.hexdigest()
84 |
85 | # Return the first 16 digits of the hexadecimal code
86 | return hex_code[:16]
87 |
88 |
89 | def fetch_single_image(image_url, timeout=None, retries=2):
90 | """
91 | Fetch a single image from a URL.
92 | """
93 | if os.path.exists(image_url):
94 | # fetch from local
95 | try:
96 | image = Image.open(image_url).convert("RGB")
97 |         except Exception:
98 |             if retries <= 0:
99 |                 raise  # no retries left; avoid falling through to `return image` with `image` unbound
100 |             return fetch_single_image(image_url, timeout=timeout, retries=retries - 1)
101 | else:
102 | # fetch from url
103 | try:
104 | r = requests.get(image_url, timeout=timeout,
105 | stream=True, headers={"User-Agent": USER_AGENT})
106 | r.raise_for_status()
107 | image = Image.open(BytesIO(r.content)).convert("RGB")
108 | except Exception as e:
109 | if retries > 0:
110 | time.sleep(3) # Wait 3 seconds before retrying
111 | return fetch_single_image(image_url, timeout=timeout, retries=retries - 1)
112 | else:
113 | print(
114 | f"Failed to fetch image from {image_url} after {retries} retries")
115 | raise e
116 | return image
117 |
118 |
119 | def fetch_images(image_urls, num_threads, timeout=None, retries=2):
120 | """
121 | Fetch images from a list of URLs in parallel.
122 | Args:
123 | image_urls (list): List of image URLs.
124 | num_threads (int): Number of threads to use.
125 | timeout (int, optional): Timeout for the request. Defaults to None.
126 |         retries (int, optional): Number of retries. Defaults to 2.
127 | Returns:
128 | list: List of PIL images.
129 | """
130 | fetch_single_image_with_args = partial(
131 | fetch_single_image, timeout=timeout, retries=retries)
132 | with ThreadPoolExecutor(max_workers=num_threads) as executor:
133 | images = list(
134 | tqdm(
135 | executor.map(fetch_single_image_with_args, image_urls),
136 | total=len(image_urls),
137 | desc="Fetching images")
138 | )
139 | print("Fetched {} images".format(len(images)))
140 | return images
141 |
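142 | # Example usage (sketch; the URL and argument values are illustrative):
143 | # if __name__ == "__main__":
144 | #     seed_everything(42)
145 | #     parser = argparse.ArgumentParser()
146 | #     parser.add_argument("--shuffle", type=str2bool, default=False)
147 | #     parser.add_argument("--max_size", type=empty2Noneint, default=None)
148 | #     args = parser.parse_args()
149 | #     images = fetch_images(["https://example.com/a.jpg"], num_threads=4, timeout=10)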
--------------------------------------------------------------------------------
/tigerscore/download_dataset/bartscore_data_process.py:
--------------------------------------------------------------------------------
1 | """
2 | Unpickle the downloaded data files and convert them to JSON format.
3 | """
4 | import os
5 | import json
6 | import argparse
7 | import pickle
8 | from pathlib import Path
9 |
10 | if __name__ == '__main__':
11 | parser = argparse.ArgumentParser()
12 | parser.add_argument('--data_dir', type=str, required=True)
13 | parser.add_argument('--task', type=str, required=True)
14 | parser.add_argument('--rm_old', action='store_true')
15 |
16 | args = parser.parse_args()
17 | data_dir = args.data_dir
18 |
19 | task_dir = Path(data_dir) / args.task
20 | for data_file in os.listdir(task_dir):
21 | if not data_file.endswith('.pkl'):
22 | continue
23 | print("Data file: ", data_file)
24 | data_path = task_dir / data_file
25 | with open(data_path, 'rb') as f:
26 | data = pickle.load(f)
27 | print("# of data: ", len(data))
28 | if isinstance(data, dict):
29 | print("Data Example: ", data[list(data.keys())[0]])
30 | elif isinstance(data, list):
31 | print("Data example: ", data[0])
32 | with open(data_path.with_suffix('.json'), 'w') as f:
33 | json.dump(data, f, indent=4)
34 | if args.rm_old:
35 | data_path.unlink()
36 |
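37 | # Example invocation (sketch; the --data_dir path is illustrative):
38 | #   python bartscore_data_process.py --data_dir ../../data/bartscore_data --task summarization --rm_old
39 | # Each <task>/<name>.pkl is loaded with pickle and re-written as <task>/<name>.json alongside it.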
--------------------------------------------------------------------------------
/tigerscore/download_dataset/datasets_scripts/fetaqa.sh:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/tigerscore/download_dataset/datasets_scripts/fetaqa.sh
--------------------------------------------------------------------------------
/tigerscore/download_dataset/download_bartscore_data.sh:
--------------------------------------------------------------------------------
1 | # Download the BARTScore used system outputs and references
2 | scripts_dir=$(pwd)
3 | data_dir="../../data/bartscore_data"
4 | mkdir -p $data_dir
5 |
6 | # Summarization
7 | cd $data_dir
8 | datasets=("Newsroom" "QAGS_CNN" "QAGS_XSUM" "REALSumm" "Rank19" "SummEval")
9 | mkdir -p summarization
10 | for dataset in ${datasets[@]}; do
11 | wget "https://github.com/neulab/BARTScore/raw/main/SUM/${dataset}/data.pkl" -O "summarization/${dataset}.pkl"
12 | done
13 | cd $scripts_dir
14 | python bartscore_data_process.py --data_dir "$data_dir" --task "summarization"
15 |
16 |
17 | # Translation
18 | cd $data_dir
19 | datasets=("de-en" "fi-en" "gu-en" "kk-en" "lt-en" "ru-en" "zh-en")
20 | mkdir -p translation
21 | for dataset in ${datasets[@]}; do
22 | wget "https://github.com/neulab/BARTScore/raw/main/WMT/${dataset}/data.pkl" -O "translation/${dataset}.pkl"
23 | done
24 | cd $scripts_dir
25 | python bartscore_data_process.py --data_dir "$data_dir" --task "translation"
26 |
27 | # Data2Text
28 | cd $data_dir
29 | datasets=("BAGEL" "SFHOT" "SFRES")
30 | mkdir -p data2text
31 | for dataset in ${datasets[@]}; do
32 | wget "https://github.com/neulab/BARTScore/raw/main/D2T/${dataset}/data.pkl" -O "data2text/${dataset}.pkl"
33 | done
34 | cd $scripts_dir
35 | python bartscore_data_process.py --data_dir "$data_dir" --task "data2text"
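36 | # After the three sections above, $data_dir holds summarization/, translation/ and data2text/,
37 | # each containing one <DATASET>.json converted from the corresponding BARTScore pickle.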
--------------------------------------------------------------------------------
/tigerscore/download_dataset/download_general_datasets.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --time=18:00:00
3 | #SBATCH --job-name=downloading_general_datasets
4 | #SBATCH --output ../../jobs/%j.out
5 | #SBATCH --nodelist=ink-gary
6 | #SBATCH -n 1
7 |
8 | python download_general_datasets.py --task "mathQA" --overwrite False
9 | python download_general_datasets.py --task "summarization" --overwrite False
10 | python download_general_datasets.py --task "translation" --overwrite False
11 | python download_general_datasets.py --task "data2text" --overwrite False
12 | python download_general_datasets.py --task "long-form QA" --overwrite False
13 | python download_general_datasets.py --task "instruction-following" --overwrite False
14 | # python download_general_datasets.py --task "story_generation"
15 | # python download_general_datasets.py --task "image_captioning"
16 | python download_general_datasets.py --task "code"
--------------------------------------------------------------------------------
/tigerscore/download_dataset/preprocess_utils_totto.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # Copyright 2018 The Google AI Language Team Authors.
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | """Baseline preprocessing utilities."""
16 | import copy
17 |
18 |
19 | def _add_adjusted_col_offsets(table):
20 | """Add adjusted column offsets to take into account multi-column cells."""
21 | adjusted_table = []
22 | for row in table:
23 | real_col_index = 0
24 | adjusted_row = []
25 | for cell in row:
26 | adjusted_cell = copy.deepcopy(cell)
27 | adjusted_cell["adjusted_col_start"] = real_col_index
28 | adjusted_cell["adjusted_col_end"] = (
29 | adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"])
30 | real_col_index += adjusted_cell["column_span"]
31 | adjusted_row.append(adjusted_cell)
32 | adjusted_table.append(adjusted_row)
33 | return adjusted_table
34 |
35 |
36 | def _get_heuristic_row_headers(adjusted_table, row_index, col_index):
37 | """Heuristic to find row headers."""
38 | row_headers = []
39 | row = adjusted_table[row_index]
40 | for i in range(0, col_index):
41 | if row[i]["is_header"]:
42 | row_headers.append(row[i])
43 | return row_headers
44 |
45 |
46 | def _get_heuristic_col_headers(adjusted_table, row_index, col_index):
47 | """Heuristic to find column headers."""
48 | adjusted_cell = adjusted_table[row_index][col_index]
49 | adjusted_col_start = adjusted_cell["adjusted_col_start"]
50 | adjusted_col_end = adjusted_cell["adjusted_col_end"]
51 | col_headers = []
52 | for r in range(0, row_index):
53 | row = adjusted_table[r]
54 | for cell in row:
55 | if (cell["adjusted_col_start"] < adjusted_col_end and
56 | cell["adjusted_col_end"] > adjusted_col_start):
57 | if cell["is_header"]:
58 | col_headers.append(cell)
59 |
60 | return col_headers
61 |
62 |
63 | def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False):
64 | """Extract out the highlighted part of a table."""
65 | highlighted_table = []
66 |
67 | adjusted_table = _add_adjusted_col_offsets(table)
68 |
69 | for (row_index, col_index) in cell_indices:
70 | cell = table[row_index][col_index]
71 | if with_heuristic_headers:
72 | row_headers = _get_heuristic_row_headers(adjusted_table, row_index,
73 | col_index)
74 | col_headers = _get_heuristic_col_headers(adjusted_table, row_index,
75 | col_index)
76 | else:
77 | row_headers = []
78 | col_headers = []
79 |
80 | highlighted_cell = {
81 | "cell": cell,
82 | "row_headers": row_headers,
83 | "col_headers": col_headers
84 | }
85 | highlighted_table.append(highlighted_cell)
86 |
87 | return highlighted_table
88 |
89 |
90 | def linearize_full_table(table, cell_indices, table_page_title,
91 | table_section_title):
92 | """Linearize full table with localized headers and return a string."""
93 | table_str = ""
94 | if table_page_title:
95 |         table_str += "<page_title> " + table_page_title + " </page_title> "
96 |     if table_section_title:
97 |         table_str += "<section_title> " + table_section_title + " </section_title> "
98 | 
99 |     table_str += "<table> "
100 | adjusted_table = _add_adjusted_col_offsets(table)
101 | for r_index, row in enumerate(table):
102 |         row_str = "<row> "
103 | for c_index, col in enumerate(row):
104 |
105 | row_headers = _get_heuristic_row_headers(
106 | adjusted_table, r_index, c_index)
107 | col_headers = _get_heuristic_col_headers(
108 | adjusted_table, r_index, c_index)
109 |
110 | # Distinguish between highlighted and non-highlighted cells.
111 | if [r_index, c_index] in cell_indices:
112 |                 start_cell_marker = "<highlighted_cell> "
113 |                 end_cell_marker = "</highlighted_cell> "
114 | else:
115 | start_cell_marker = "| "
116 | end_cell_marker = " | "
117 |
118 | # The value of the cell.
119 | item_str = start_cell_marker + col["value"] + " "
120 |
121 | # All the column headers associated with this cell.
122 | for col_header in col_headers:
123 |                 item_str += "<col_header> " + \
124 |                     col_header["value"] + " </col_header> "
125 |
126 | # All the row headers associated with this cell.
127 | for row_header in row_headers:
128 |                 item_str += "<row_header> " + \
129 |                     row_header["value"] + " </row_header> "
130 |
131 | item_str += end_cell_marker
132 | row_str += item_str
133 |
134 |         row_str += "</row> "
135 | table_str += row_str
136 |
137 |     table_str += "</table>"
138 |     if cell_indices:
139 |         assert "<highlighted_cell>" in table_str
140 | return table_str
141 |
142 |
143 | def linearize_subtable(subtable, table_page_title, table_section_title):
144 | """Linearize the highlighted subtable and return a string of its contents."""
145 | table_str = ""
146 | if table_page_title:
147 |         table_str += "<page_title> " + table_page_title + " </page_title> "
148 |     if table_section_title:
149 |         table_str += "<section_title> " + table_section_title + " </section_title> "
150 |     table_str += "<table> "
151 |
152 | for item in subtable:
153 | cell = item["cell"]
154 | row_headers = item["row_headers"]
155 | col_headers = item["col_headers"]
156 |
157 | # The value of the cell.
158 | item_str = "| " + cell["value"] + " "
159 |
160 | # All the column headers associated with this cell.
161 | for col_header in col_headers:
162 |             item_str += "<col_header> " + \
163 |                 col_header["value"] + " </col_header> "
164 |
165 | # All the row headers associated with this cell.
166 | for row_header in row_headers:
167 |             item_str += "<row_header> " + \
168 |                 row_header["value"] + " </row_header> "
169 |
170 | item_str += " | "
171 | table_str += item_str
172 |
173 |     table_str += "</table>"
174 | return table_str
175 |
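176 | # Example usage (sketch; the toy table below is illustrative):
177 | # table = [[{"value": "Year", "is_header": True, "column_span": 1},
178 | #           {"value": "1999", "is_header": False, "column_span": 1}]]
179 | # subtable = get_highlighted_subtable(table, [(0, 1)], with_heuristic_headers=True)
180 | # print(linearize_subtable(subtable, table_page_title="Page title", table_section_title="Albums"))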
--------------------------------------------------------------------------------
/tigerscore/download_dataset/utils.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import argparse
3 |
4 |
5 | def generate_hash_code(text):
6 | # Convert the text to bytes and create a hash object
7 | hash_object = hashlib.sha256(text.encode())
8 |
9 | # Get the hexadecimal representation of the hash code
10 | hex_code = hash_object.hexdigest()
11 |
12 | # Return the first 16 digits of the hexadecimal code
13 | return hex_code[:16]
14 |
15 |
16 | def str2bool(v):
17 | if isinstance(v, bool):
18 | return v
19 | if v.lower() in ('yes', 'true', 't', 'y', '1'):
20 | return True
21 | elif v.lower() in ('no', 'false', 'f', 'n', '0'):
22 | return False
23 | else:
24 | raise argparse.ArgumentTypeError('Boolean value expected.')
25 |
26 |
27 | def empty2None(x):
28 | if x == '':
29 | return None
30 | else:
31 | return x
32 |
33 |
34 | def empty2zero(x):
35 | if x == '':
36 | return 0
37 | elif isinstance(x, int):
38 | return x
39 | elif isinstance(x, str):
40 | return int(x)
41 | else:
42 | raise argparse.ArgumentTypeError('Integer value expected.')
43 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/check_data.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
3 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt
4 | import fire
5 | import json
6 | import random
7 | from string import Template
8 |
9 |
10 | template = """
11 | ${instruction}
12 | ${input}
13 |
14 | Model-generated output:
15 | ${output}
16 |
17 | An error analysis provided:
18 | ${error_analysis}
19 |
20 | Is the error analysis reasonable? Answer me "yes" or "no" only.\
21 | """
22 |
23 | def main(input_file, output_file, model_name="gpt-4", num_samples=None, num_procs=5):
24 | with open(input_file, "r") as f:
25 | if input_file.endswith(".jsonl"):
26 | input_data = [json.loads(line) for line in f]
27 | elif input_file.endswith(".json"):
28 | input_data = json.load(f)
29 | if num_samples is None:
30 | num_samples = len(input_data)
31 | print(num_samples)
32 | input_data = input_data[:num_samples]
33 |
34 | def process_data(item):
35 | prompt = Template(template=template).substitute(
36 | instruction=item["instruction"],
37 | input=item["input_context"],
38 | output=item["hypo_output"],
39 | error_analysis=item["errors"]
40 | )
41 | message = [{
42 | "role": "user",
43 | "content": prompt
44 | }]
45 | chatml_prompt = _chatml_to_prompt(message)
46 | return chatml_prompt
47 |
48 | prompts = list(map(process_data, input_data))
49 | print(prompts[0])
50 | completions = openai_completions(prompts, model_name=model_name, num_procs=num_procs, use_cache=False)
51 | print(f"Finished generating {len(completions['completions'])} completions.")
52 | print(f"Total prices: {sum(completions['price_per_example'])}")
53 | for i, completion in enumerate(completions['completions']):
54 | input_data[i]["completion"] = completion
55 | with open(output_file, "w") as f:
56 | if output_file.endswith(".jsonl"):
57 | for item in input_data:
58 | json.dump(item, f)
59 | f.write("\n")
60 | elif output_file.endswith(".json"):
61 | json.dump(input_data, f)
62 |
63 | if __name__ == "__main__":
64 | fire.Fire(main)
65 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/check_data.sh:
--------------------------------------------------------------------------------
1 | # python check_data.py \
2 | # --input_file "../../data/new_std_400s_m_200s_l_1100s_i3-32k.json" \
3 | # --output_file "../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.json" \
4 | # --model_name "gpt-4" \
5 | # --num_procs 5
6 |
7 |
8 | # python check_data.py \
9 | # --input_file "../../data/train_mix.jsonl" \
10 | # --output_file "../../data/train_mix.check_ChatGPT.jsonl" \
11 | # --model_name "ChatGPT"
12 |
13 | python check_data.py \
14 | --input_file "../../data/good.jsonl" \
15 | --output_file "../../data/good.check.json" \
16 | --model_name "ChatGPT" \
17 | --num_procs 5
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/check_responses.sh:
--------------------------------------------------------------------------------
1 | model_name="gpt-4"
2 | if [ ${model_name} == "gpt-4" ]; then
3 | export OPENAI_API_KEY=
4 | export OPENAI_API_BASE=""
5 | export OPENAI_API_TYPE="azure"
6 | export OPENAI_API_VERSION="2023-07-01-preview"
7 | fi
8 |
9 |
10 | # python check_responses.py \
11 | # --input_file "/home//WorkSpace/ExplainableGPTScore/data/wmt/zh-en/train_data.wmt_mqm.distil_new_wmt_mqm_200.json" \
12 | # --output_file "/home//WorkSpace/ExplainableGPTScore/data/wmt/zh-en/train_data.wmt_mqm.distil_new_wmt_mqm_200.check.json" \
13 | # --model_name ${model_name} \
14 |
15 | # python check_responses.py \
16 | # --input_file "/home//WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.json" \
17 | # --output_file "/home//WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.check.json" \
18 | # --model_name ${model_name} \
19 |
20 |
21 | python check_responses.py \
22 | --input_file "../../data/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.json" \
23 | --output_file "../../data/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.${model_name}.check.json" \
24 | --model_name ${model_name} \
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/eval_baseline.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=eval_baseline
3 | #SBATCH -c 3
4 | #SBATCH --partition=a100
5 | #SBATCH --gres=gpu:1
6 | #SBATCH --time=24:00:00
7 | #SBATCH --mem=50G
8 | #SBATCH --output=../../jobs/%x/%j.out
9 | metrics=("bleu" "rouge" "bertscore" "bleurt" "comet_da" "bart_score_cnn" "bart_score_para" "bart_score_cnn_src_hypo" "bart_score_para_src_hypo" "unieval_sum" "cometkiwi_da")
10 |
11 | # # summarization
12 | # input_file="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.json"
13 | # output_file="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.eval.json"
14 | # human_score_names="coherence,consistency,fluency,relevance"
15 | # cp -u $input_file $output_file
16 | # for metric in "${metrics[@]}"; do
17 | # echo "Evaluating $metric"
18 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
19 | # --human_score_names "$human_score_names"
20 | # done
21 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
22 | # --human_score_names "$human_score_names" --print_results True
23 |
24 | # # data2text
25 | # input_file="../../data_bak/webnlg/webnlg2020_gen_with_scores.json"
26 | # output_file="../../data_bak/webnlg/webnlg2020_gen_with_scores.eval.json"
27 | # input_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json"
28 | # output_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.eval.json"
29 | # human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure"
30 | # cp -u $input_file $output_file
31 | # metrics=("${metrics[@]}" "instructscore_d2t" "gptscore_flan_d2t" "gptscore_flan_d2t_src_hypo")
32 | # for metric in "${metrics[@]}"; do
33 | # echo "Evaluating $metric"
34 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
35 | # --human_score_names "$human_score_names"
36 | # done
37 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
38 | # --human_score_names "$human_score_names" --print_results True
39 |
40 | # # # long_form_QA
41 | # input_file="../../data_bak/lfqa/test.gpt-4.rank.json"
42 | # output_file="../../data_bak/lfqa/test.gpt-4.rank.eval.json"
43 | # human_score_names="rank"
44 | # cp -u $input_file $output_file
45 | # for metric in "${metrics[@]}"; do
46 | # echo "Evaluating $metric"
47 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
48 | # --human_score_names "$human_score_names"
49 | # done
50 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
51 | # --human_score_names "$human_score_names" --print_results True
52 |
53 | # # instruction-following
54 | # input_file="../../data_bak/llm-blender/mix-instruct/test_data_prepared_300.json"
55 | # output_file="../../data_bak/llm-blender/mix-instruct/test_data_prepared_300.eval.json"
56 | # input_file="../../data/evaluation/instruct/mixinstruct/test_data_prepared.json"
57 | # output_file="../../data/evaluation/instruct/mixinstruct/test_data_prepared.eval.json"
58 | # human_score_names="gpt_rank_score"
59 | # # cp -u $input_file $output_file
60 | # metrics=("tigerscore")
61 | # # for metric in "${metrics[@]}"; do
62 | # # echo "Evaluating $metric"
63 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
64 | # # --human_score_names "$human_score_names"
65 | # # done
66 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
67 | # --human_score_names "$human_score_names" --print_results True --average_by "sys" --as_rank "True"
68 |
69 | # mathqa
70 | # input_file="../../data_bak/mathqa/gsm8k_test_output_prepared.json"
71 | # output_file="../../data_bak/mathqa/gsm8k_test_output_prepared.eval.json"
72 | # input_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json"
73 | # output_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.eval.json"
74 | # human_score_names="accuracy"
75 | # metrics=("instructscore")
76 | # cp -u $input_file $output_file
77 | # for metric in "${metrics[@]}"; do
78 | # echo "Evaluating $metric"
79 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
80 | # --human_score_names "$human_score_names"
81 | # done
82 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
83 | # --human_score_names "$human_score_names" --print_results True
84 |
85 |
86 | # # # story_gen
87 | # input_file="../../data/evaluation/storygen/test_data_prepared.json"
88 | # output_file="../../data/evaluation/storygen/test_data_prepared_eval.json"
89 | # metrics=("instructscore")
90 | # human_score_names="human"
91 | # cp -u $input_file $output_file
92 | # # for metric in "${metrics[@]}"; do
93 | # # echo "Evaluating $metric"
94 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
95 | # # --human_score_names "$human_score_names"
96 | # # done
97 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
98 | # --human_score_names "$human_score_names" --print_results True
99 |
100 | # translation
101 | # input_file="../../data/evaluation/translation/wmt22/zh-en/eval_data.json"
102 | # output_file="../../data/evaluation/translation/wmt22/zh-en/eval_data.eval.json"
103 | # human_score_names="mqm"
104 | # metrics=("instructscore_mt_zh-en")
105 | # cp -u $input_file $output_file
106 | # # for metric in "${metrics[@]}"; do
107 | # # echo "Evaluating $metric"
108 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
109 | # # --human_score_names "$human_score_names"
110 | # # done
111 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
112 | # --human_score_names "$human_score_names" --print_results True
113 |
114 | # input_file="../../data/evaluation/hhh_alignment/hhh_alignment.json"
115 | # output_file="../../data/evaluation/hhh_alignment/hhh_alignment.eval.json"
116 | # human_score_names="human_preference"
117 | # metrics=("bart_score_para_src_hypo")
118 | # cp -u $input_file $output_file
119 | # for metric in "${metrics[@]}"; do
120 | # echo "Evaluating $metric"
121 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
122 | # --human_score_names "$human_score_names"
123 | # done
124 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
125 | # --human_score_names "$human_score_names" --add_aggrement True --print_results True
126 |
127 | # input_file="../../data/evaluation/mtbench/mt_bench_human_judgments.json"
128 | # output_file="../../data/evaluation/mtbench/mt_bench_human_judgments.eval.json"
129 | # human_score_names="human_preference"
130 | # metrics=("bart_score_para_src_hypo")
131 | # cp -u $input_file $output_file
132 | # for metric in "${metrics[@]}"; do
133 | # echo "Evaluating $metric"
134 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
135 | # --human_score_names "$human_score_names"
136 | # done
137 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
138 | # --human_score_names "$human_score_names" --add_aggrement True --print_results True
139 |
140 |
141 | # input_file="../../data/evaluation/pair_cmp/test_data_prepared.json"
142 | # output_file="../../data/evaluation/pair_cmp/test_data_prepared.eval.json"
143 | # human_score_names="gpt_rank_score"
144 | # cp -u $input_file $output_file
145 | # # metrics=("bleu" "rouge" "bertscore" "bleurt" "comet_da" "bart_score_cnn" "unieval_sum" "cometkiwi_da")
146 | # metrics=("unieval_sum")
147 | # for metric in "${metrics[@]}"; do
148 | # echo "Evaluating $metric"
149 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \
150 | # --human_score_names "$human_score_names"
151 | # done
152 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \
153 | # --human_score_names "$human_score_names" --print_results True --average_by "sys" --as_rank "True"
154 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/generate_distill_data.py:
--------------------------------------------------------------------------------
1 | """
2 |
3 | """
4 | import json
5 | import random
6 | import logging
7 | import sys
8 | import fire
9 | from pathlib import Path
10 | sys.path.append(str(Path(__file__).parent.parent))
11 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json_per_aspect
12 | from xgptscore.xgptscore import xgptscore
13 | from xgptscore.constants import EVAL_ASPECTS
14 | logging.basicConfig(level=logging.WARNING)
15 |
16 |
17 | def main(
18 | task: str,
19 | xgptscore_mode: str,
20 | model_name: str,
21 | input_file: str,
22 | version_key: str = None,
23 | overwrite: bool = False,
24 | max_size: int = None,
25 | seed: int = 42,
26 | shuffle: bool = False,
27 | ):
28 |
29 | logging.warning("Loading from {}".format(input_file))
30 | with open(input_file, "r") as f:
31 | items = json.load(f)
32 | if shuffle:
33 | random.seed(seed)
34 | random.shuffle(items)
35 | suffix = f".{xgptscore_mode}.{model_name}"
36 | if version_key:
37 | suffix += f".{version_key}"
38 | if isinstance(max_size, int) and max_size > 0:
39 | items = items[:max_size]
40 | suffix += f".{max_size}"
41 | output_file = Path(input_file).with_suffix(f"{suffix}.json")
42 |
43 | xgptitems = []
44 | for item in items:
45 | for cand in item['candidates']:
46 | xgptitems.append(XPGTItem(
47 | task=task,
48 | instruction=item['instruction'],
49 | input=item['input'],
50 | ref_output=item['refs'] if 'refs' in item else item['output'],
51 | hypo_output=cand['text']
52 | ))
53 |
54 | if not output_file.exists() or overwrite:
55 | logging.warning("Running xgptscore")
56 | # run xgptscore
57 | result = xgptscore(xgptitems, mode=xgptscore_mode,
58 | model_name=model_name, num_workers=5)
59 | idx = 0
60 | aspects = EVAL_ASPECTS[task].keys()
61 | score_dict = {"xgptscore_" + aspect: 0 for aspect in aspects}
62 | for item in items:
63 | for cand in item['candidates']:
64 | cand['responses'] = result['round_completions'][idx]
65 | cand['messages_records'] = result['messages_records'][idx]
66 | xgptscore_ans = get_xgptscore_from_json_per_aspect(
67 | cand['responses'][-1])
68 | if xgptscore_ans is None:
69 | logging.info(f"XGPTScore failed for {cand['text']}")
70 | # cand['scores']['xgptscore'] = None
71 | else:
72 | cand['scores'].update(score_dict)
73 | cand['scores'].update(xgptscore_ans)
74 | idx += 1
75 | with open(output_file, "w") as f:
76 | json.dump(items, f, indent=4, ensure_ascii=False)
77 | logging.info("Saved to {}".format(output_file))
78 | else:
79 | logging.warning("Found existing {}".format(output_file))
80 | logging.warning("Skipping xgptscore")
81 |
82 |
83 | if __name__ == "__main__":
84 |     logging.basicConfig(level=logging.WARNING)
85 | fire.Fire(main)
86 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/generate_distill_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=generate_distill_data
3 | #SBATCH -c 2
4 | #SBATCH --time=24:00:00
5 | #SBATCH --mem=10G
6 | #SBATCH --output=../../jobs/%x/%j.out
7 |
8 | version_key="distill"
9 | overwrite=True
10 | model_name="ChatGPT"
11 | if [ ${model_name} == "gpt-4" ]; then
12 | export OPENAI_API_KEY=
13 | export OPENAI_API_BASE=""
14 | export OPENAI_API_TYPE="azure"
15 | export OPENAI_API_VERSION="2023-07-01-preview"
16 | fi
17 |
18 | # task='translation'
19 | # xgptscore_mode="wmt_mqm"
20 | # input_file="../../data/synthesis_min/translation/train_data.kb_txt.distill.syn_cand.json"
21 | # python generate_distill_data.py \
22 | # --task ${task} \
23 | # --input_file ${input_file} \
24 | # --xgptscore_mode ${xgptscore_mode} \
25 | # --version_key ${version_key} \
26 | # --model_name ${model_name} \
27 | # --overwrite ${overwrite} \
28 |
29 | # task='summarization'
30 | # xgptscore_mode="align_score"
31 | # input_file="../../data/synthesis_min/summarization/train_data.kb_txt.distill.syn_cand.json"
32 | # python generate_distill_data.py \
33 | # --task ${task} \
34 | # --input_file ${input_file} \
35 | # --xgptscore_mode ${xgptscore_mode} \
36 | # --version_key ${version_key} \
37 | # --model_name ${model_name} \
38 | # --overwrite ${overwrite} \
39 |
40 | # task='data2text'
41 | # xgptscore_mode="d2t"
42 | # input_file="../../data/synthesis_min/data2text/train_data.kb_txt.distill.syn_cand.json"
43 | # python generate_distill_data.py \
44 | # --task ${task} \
45 | # --input_file ${input_file} \
46 | # --xgptscore_mode ${xgptscore_mode} \
47 | # --version_key ${version_key} \
48 | # --model_name ${model_name} \
49 | # --overwrite ${overwrite} \
50 |
51 | # task='instruction-following'
52 | # xgptscore_mode="instruction_following"
53 | # input_file="../../data/synthesis_min/instruction-following/train_data.kb_txt.distill.syn_cand.json"
54 | # python generate_distill_data.py \
55 | # --task ${task} \
56 | # --input_file ${input_file} \
57 | # --xgptscore_mode ${xgptscore_mode} \
58 | # --version_key ${version_key} \
59 | # --model_name ${model_name} \
60 | # --overwrite ${overwrite} \
61 |
62 | task='long-form QA'
63 | xgptscore_mode="longform_qa"
64 | input_file="../../data/synthesis_min/long-form QA/train_data.kb_txt.distill.syn_cand.json"
65 | python generate_distill_data.py \
66 | --task "${task}" \
67 | --input_file "${input_file}" \
68 | --xgptscore_mode ${xgptscore_mode} \
69 | --version_key ${version_key} \
70 | --model_name ${model_name} \
71 | --overwrite ${overwrite} \
72 |
73 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/generate_inst_synthetic_data.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append("..")
3 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt
4 | import fire
5 | import json
6 | import random
7 | from string import Template
8 |
9 |
10 | template = """
11 | Instruction:
12 | ${instruction}
13 | ${input}
14 |
15 | A ground-truth response:
16 | ${output}
17 |
18 | A model will be asked to respond to this instruction. However, that response might contain errors in various aspects.
19 |
20 | Please first output 5 possible error aspects if a model is asked to generate a response for the above instruction. The error aspects don't have to be one of the above aspects and can be any aspect that you think is reasonable for this instruction.
21 |
22 | Then generate an incorrect response that contains up to ${num_errors} errors in these aspects. Each error corresponds to one of the aspects.
23 | The incorrect response should mimic the style of a real model generation.
24 |
25 | Then give an analysis of these errors. For each error, give me the
26 | - error location (the substring that is wrong in the generated incorrect output)
27 | - error aspect
28 | - explanation (the generic error type description, why it's an error, and the correction suggestions)
29 | - severity ("major" or "minor")
30 | - score reduction (an integer between 1 to 5 given the severity of the error)
31 |
32 | Output format:
33 | Generated incorrect output:
34 |
35 | Error location 1:
36 | Error aspect 1:
37 | Explanation 1:
38 | Severity 1:
39 | Score reduction 1:
40 | ...
41 | """
42 |
43 | math_template = """
44 | Question:
45 | ${instruction}
46 | ${input}
47 |
48 | A ground-truth answer:
49 | ${output}
50 |
51 | A model will be asked to answer this math question. However, that response might contain errors in various aspects such as Problem Understanding, Problem Formulation, Computing Accuracy, Solution Interpretation, etc.
52 |
53 | Please first output a few possible error aspects if a model is asked to generate a response for the above instruction. The error aspects don't have to be one of the above aspects and can be any aspect that you think is reasonable for this instruction.
54 |
55 | Then generate an incorrect response that contains up to ${num_errors} errors in these aspects. Each error corresponds to one of the aspects.
56 | The incorrect response should mimic the style of a real model generation.
57 |
58 | Then give an analysis of these errors. For each error, give me the
59 | - error location (the substring that is wrong in the generated incorrect output)
60 | - error aspect
61 | - explanation (the generic error type description, why it's an error, and the correction suggestions)
62 | - severity ("major" or "minor")
63 | - score reduction (an integer between 0.5 to 5 given the severity of the error)
64 |
65 | Output format:
66 | Generated incorrect output:
67 |
68 | Error location 1:
69 | Error aspect 1:
70 | Explanation 1:
71 | Severity 1:
72 | Score reduction 1:
73 | ...
74 | """
75 |
76 | def main(
77 | input_file, output_file,
78 | model_name="gpt-4", num_samples=None,
79 | num_procs=5, seed=42,
80 | task='inst-fol'):
81 | random.seed(seed)
82 | with open(input_file, "r") as f:
83 | if input_file.endswith(".jsonl"):
84 | input_data = [json.loads(line) for line in f]
85 | elif input_file.endswith(".json"):
86 | input_data = json.load(f)
87 | if num_samples is None:
88 | num_samples = len(input_data)
89 | print(num_samples)
90 | input_data = input_data[:num_samples]
91 |
92 | def process_data(item):
93 | if task == 'math':
94 | _template = math_template
95 | else:
96 | _template = template
97 | prompt = Template(template=_template).substitute(
98 | instruction=item["instruction"],
99 | input=item["input"],
100 | output=item["output"],
101 | num_errors=random.randint(1, 5)
102 | )
103 | message = [{
104 | "role": "user",
105 | "content": prompt
106 | }]
107 | chatml_prompt = _chatml_to_prompt(message)
108 | return chatml_prompt
109 |
110 | prompts = list(map(process_data, input_data))
111 | print(prompts[0])
112 | completions = openai_completions(prompts, model_name=model_name, num_procs=num_procs, use_cache=True)
113 | print(f"Finished generating {len(completions['completions'])} completions.")
114 | print(f"Total prices: {sum(completions['price_per_example'])}")
115 | for i, completion in enumerate(completions['completions']):
116 | input_data[i]["completion"] = completion
117 | with open(output_file, "w") as f:
118 | if output_file.endswith(".jsonl"):
119 | for item in input_data:
120 | json.dump(item, f)
121 | f.write("\n")
122 | elif output_file.endswith(".json"):
123 | json.dump(input_data, f)
124 |
125 | if __name__ == "__main__":
126 | fire.Fire(main)
127 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/generate_inst_synthetic_data.sh:
--------------------------------------------------------------------------------
1 | # python generate_inst_synthetic_data.py \
2 | # --input_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.10k.jsonl" \
3 | # --output_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.10k.gen.jsonl" \
4 | # --model_name "gpt-4" \
5 | # --num_samples 8000
6 |
7 | python generate_inst_synthetic_data.py \
8 | --input_file "../../data/additional/metamath/metamath.8k.jsonl" \
9 | --output_file "../../data/additional/metamath/metamath.8k.gen.jsonl" \
10 | --model_name "gpt-4" \
11 | --num_samples 10
12 |
13 | # python generate_inst_synthetic_data.py \
14 | # --input_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.jsonl" \
15 | # --output_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.jsonl" \
16 | # --model_name "gpt-4" \
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/generate_synthesis_distill_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Generate synthesis distillation data from a json file.
3 | """
4 | import json
5 | import random
6 | import logging
7 | import sys
8 | import fire
9 | from pathlib import Path
10 | sys.path.append(str(Path(__file__).parent.parent))
11 | from xgptscore.process_utils import XPGTItem
12 | from xgptscore.xgptscore import xgptscore
13 | logging.basicConfig(level=logging.WARNING)
14 |
15 |
16 | def main(
17 | task: str,
18 | input_file: str,
19 | output_file: str = None,
20 | xgptscore_mode: str = "kb_txt",
21 | model_name: str = "gpt-4",
22 | version_key: str = "default",
23 | overwrite: bool = False,
24 | max_size: int = None,
25 | seed: int = 42,
26 | shuffle_file: bool = False,
27 | source_max_length: int = None,
28 | ref_max_length: int = None,
29 | hypo_max_length: int = None,
30 | ):
31 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4)))
32 | # params
33 | if isinstance(max_size, int) and max_size > 0:
34 | version_key = f"{version_key}_{max_size}"
35 | # load data
36 | input_file = Path(input_file)
37 | if not output_file:
38 | output_file = input_file.with_suffix(
39 | f".{xgptscore_mode}.{version_key}.json")
40 | else:
41 | output_file = Path(output_file)
42 | with open(input_file, "r") as f:
43 | items = json.load(f)
44 | logging.warning("Loaded {} items from {}".format(
45 | len(items), input_file))
46 | logging.warning("Preparing writing to {}...".format(output_file))
47 |
48 | random.seed(seed)
49 | logging.warning("Set seed to {}".format(seed))
50 | if shuffle_file:
51 | random.shuffle(items)
52 | logging.warning("Shuffled {} items".format(len(items)))
53 | if isinstance(max_size, int) and max_size > 0:
54 | items = items[:max_size]
55 | logging.warning("Truncated to {} items".format(len(items)))
56 | elif isinstance(max_size, float) and max_size > 0 and max_size < 1:
57 | items = random.sample(items, int(len(items) * max_size))
58 | logging.warning("Sampled to {} items".format(len(items)))
59 |
60 | xgptitems = []
61 | for item in items:
62 | xgptitems.append(XPGTItem(
63 | task=task,
64 | instruction=item['instruction'],
65 | input=item['input'],
66 | ref_output=item['output'] if "output" in item else item['refs'],
67 | hypo_output=None,
68 | ))
69 | if "candidates" in item:
70 | del item["candidates"]
71 |
72 | if not output_file.exists() or overwrite:
73 | logging.warning("Running xgptscore")
74 | # run xgptscore
75 | xgptscore_params = {
76 | "max_lengths": {
77 | "input": source_max_length,
78 | "hypo_output": hypo_max_length,
79 | "ref_output": ref_max_length,
80 | },
81 | }
82 | result = xgptscore(xgptitems, mode=xgptscore_mode,
83 | model_name=model_name, **xgptscore_params)
84 | for i, item in enumerate(items):
85 | item['responses'] = result['round_completions'][i]
86 | item['messages_records'] = result['messages_records'][i]
87 | with open(output_file, "w") as f:
88 | json.dump(items, f, indent=4, ensure_ascii=False)
89 | logging.warning("Saved to {}".format(output_file))
90 | else:
91 | logging.warning("Loading from {}".format(output_file))
92 | with open(output_file, "r") as f:
93 | items = json.load(f)
94 |
95 |
96 | if __name__ == "__main__":
97 | fire.Fire(main)
98 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/generate_synthesis_distill_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=synthesis_distill_data
3 | #SBATCH --time=48:00:00
4 | #SBATCH --output=../../jobs/synthesis_distill_data/%j.out
5 |
6 | xgptscore_mode="kb_txt"
7 | version_key="distill"
8 | model_name="gpt-4"
9 | if [ ${model_name} == "gpt-4" ]; then
10 | export OPENAI_API_KEY=
11 | export OPENAI_API_BASE=""
12 | export OPENAI_API_TYPE="azure"
13 | export OPENAI_API_VERSION="2023-07-01-preview"
14 | fi
15 |
16 | IFS=$'\n'
17 | tasks=("translation" "long-form QA" "summarization" "data2text" "mathQA" "instruction-following")
18 | for task in ${tasks[@]}; do
19 | input_file="/home//WorkSpace/ExplainableGPTScore/data/synthesis/${task}/train_data.json"
20 | echo task: $task
21 | python generate_synthesis_distill_data.py \
22 | --task $task \
23 | --xgptscore_mode $xgptscore_mode \
24 | --version_key $version_key \
25 | --model_name $model_name \
26 | --input_file $input_file \
27 | --source_max_length 512 \
28 | --overwrite "False" \
29 |
30 | done
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/get_systhesis_ref_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=synthesis_distill_data
3 | #SBATCH --time=48:00:00
4 | #SBATCH --output=../../jobs/synthesis_distill_data/%j.out
5 |
6 | xgptscore_mode="paraphrase"
7 | version_key="distill"
8 | model_name="gpt-4"
9 | if [ ${model_name} == "gpt-4" ]; then
10 | export OPENAI_API_KEY=
11 | export OPENAI_API_BASE=""
12 | export OPENAI_API_TYPE="azure"
13 | export OPENAI_API_VERSION="2023-07-01-preview"
14 | fi
15 |
16 | IFS=$'\n'
17 | # tasks=("translation" "long-form QA" "summarization" "data2text" "mathQA" "instruction-following")
18 | tasks=("translation")
19 | for task in ${tasks[@]}; do
20 | input_file="../../data/synthesis/${task}/train_data.json"
21 | echo task: $task
22 | python generate_synthesis_distill_data.py \
23 | --task $task \
24 | --xgptscore_mode $xgptscore_mode \
25 | --version_key $version_key \
26 | --model_name $model_name \
27 | --input_file $input_file \
28 | --source_max_length 512 \
29 | --overwrite "False" \
30 | --shuffle_file True \
31 | --max_size 0.15 \
32 |
33 | done
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/lfqa_gpt_rate.py:
--------------------------------------------------------------------------------
1 | """
2 | This file isn't used in our final version.
3 | """
4 | import sys
5 | import fire
6 | import json
7 | import logging
8 | import regex as re
9 | import random
10 | sys.path.append("..")
11 | from collections import Counter, defaultdict
12 | from string import Template
13 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt
14 | logging.basicConfig(level=logging.WARNING)
15 |
16 | rank_template = """
17 | 4 different models are asked to follow a given instruction to generate an answer based on a given source input.
18 | The instruction is: ${instruction}
19 | The source input is: ${source}
20 | The generated output of model 1 is: ${model1_generated}
21 | The generated output of model 2 is: ${model2_generated}
22 | The generated output of model 3 is: ${model3_generated}
23 | The generated output of model 4 is: ${model4_generated}
24 | The reference output is: ${reference}
25 |
26 | Now please rank the 4 models' outputs from best to worst.
27 | Please first output the rank results in the following format:
28 | [best] [second best] [third best] [worst] (e.g. 1 2 3 4)
29 | Then give your brief comments on why you rank the outputs in this way.
30 | """
31 |
32 |
33 | def get_rank_prompts(
34 | item: dict
35 | ):
36 | random.shuffle(item['candidates'])
37 | rank_prompt = Template(rank_template).substitute(
38 | instruction=item['instruction'],
39 | source=item['input'],
40 | model1_generated=item['candidates'][0]['text'],
41 | model2_generated=item['candidates'][1]['text'],
42 | model3_generated=item['candidates'][2]['text'],
43 | model4_generated=item['candidates'][3]['text'],
44 | reference=item.get('output') or item.get("refs")[0],
45 | )
46 | return rank_prompt
47 |
48 |
49 | def main(
50 | input_file: str,
51 | output_file: str,
52 | seed: int = 42,
53 | model_name: str = "ChatGPT",
54 | ):
55 | random.seed(seed)
56 | with open(input_file, "r") as f:
57 | data = json.load(f)
58 |
59 | rank_prompts = list(map(get_rank_prompts, data))
60 |     chatmls = [[{"role": "system", "content": "You are a helpful AI assistant helping the user find information."},
61 | {"role": "user", "content": prompt}] for prompt in rank_prompts]
62 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls]
63 |
64 | decoding_kwargs = {
65 | # "max_tokens": 1024,
66 | "temperature": 0,
67 | "top_p": 1.0,
68 | "timeout": 30,
69 | "request_timeout": 30
70 | }
71 | results = openai_completions(
72 | chatml_prompts, model_name=model_name, **decoding_kwargs)
73 | logging.warning("Total price: {:.4f}$".format(
74 | sum(results['price_per_example'])))
75 | completions = results['completions']
76 |
77 | best_model_idxs = []
78 | model_ranks = defaultdict(list)
79 | for i, item in enumerate(data):
80 | item['rank_prompt'] = rank_prompts[i]
81 | item['rank_response'] = completions[i]
82 | try:
83 | first_digit_idx = re.search(r"\d", item['rank_response']).start()
84 | item['ranks'] = re.search(
85 | r"(\d)[\n ](\d)[\n ](\d)[\n ](\d)", item['rank_response'])
86 | if not item['ranks']:
87 | item['ranks'] = re.search(
88 | "\[best\] (\d) \[second best\] (\d) \[third best\] (\d) \[worst\] (\d)", item['rank_response'])
89 | if not item['ranks']:
90 | item['ranks'] = re.search(
91 | "\[best\] Model (\d)[\n ]\[second best\] Model (\d)[\n ]\[third best\] Model (\d)[\n ]\[worst\] Model (\d)", item['rank_response'])
92 | # item['ranks'] = item['rank_response'][first_digit_idx:item['rank_response'].index("\n")].split(" ")
93 | item['ranks'] = [int(rank) for rank in item['ranks'].groups()]
94 | except Exception:
95 | print(item['ranks'])
96 | for j, cand in enumerate(item['candidates']):
97 | cand['scores']['gpt_rank_{}'.format(
98 | model_name)] = - item['ranks'][j]
99 | model_ranks[cand['source']].append(item['ranks'][j])
100 | best_model_idxs.append(item['ranks'][0])
101 |
102 | print(Counter(best_model_idxs))
103 | for model, ranks in model_ranks.items():
104 | c = Counter(ranks)
105 | print(model, sorted(c.items(), key=lambda x: x[0]))
106 | with open(output_file, "w") as f:
107 | json.dump(data, f, indent=4, ensure_ascii=False)
108 | logging.warning(f"Saved to {output_file}")
109 |
110 |
111 | if __name__ == "__main__":
112 | fire.Fire(main)
113 |
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/lfqa_gpt_rate.sh:
--------------------------------------------------------------------------------
1 | model_name="gpt-4"
2 | if [ ${model_name} == "gpt-4" ]; then
3 | export OPENAI_API_KEY=
4 | export OPENAI_API_BASE=""
5 | export OPENAI_API_TYPE="azure"
6 | export OPENAI_API_VERSION="2023-07-01-preview"
7 | fi
8 |
9 | python lfqa_gpt_rate.py \
10 | --input_file "../../data_bak/lfqa/test.json" \
11 | --output_file "../../data_bak/lfqa/test.${model_name}.rank.json" \
12 | --model_name ${model_name} \
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/mathqa_rate.py:
--------------------------------------------------------------------------------
1 | """
2 | This file isn't used in our final version.
3 | """
4 | import sys
5 | import fire
6 | import json
7 | import logging
8 | import regex as re
9 | import copy
10 | import random
11 | sys.path.append("..")
12 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt
13 | from typing import List, Dict
14 | from string import Template
15 | from collections import Counter, defaultdict
16 | logging.basicConfig(level=logging.WARNING)
17 |
18 | template = """
19 | ${instruction}
20 | ${source}
21 |
22 | A correct output is:
23 | ${reference}
24 |
25 | A model generated output is:
26 | ${model1_generated}
27 |
28 | Now please evaluate the errors in the model-generated output.
29 | For each error associated with problem understanding, problem formulation, computing accuracy, or solution interpretation, reduce the score by 1 or 2.
30 | Finally, give me the total score reduction as the evaluation of this model-generated output, starting with "Total Score Reduction: ".
31 | """
32 |
33 |
34 | def get_prompts(
35 | item: dict
36 | ):
37 | prompts = []
38 | random.shuffle(item['candidates'])
39 | for cand in item['candidates']:
40 | prompt = Template(template).substitute(
41 | instruction=item['instruction'].strip("\n "),
42 | source=item['input'].strip("\n "),
43 | reference=(item.get('output') or item.get("refs")[0]).strip("\n "),
44 | model1_generated=cand['text'].strip("\n "),
45 | )
46 | prompts.append(prompt)
47 | return prompts
48 |
49 | def main(
50 | input_file: str,
51 | output_file: str,
52 | seed: int = 42,
53 | model_name: str = "ChatGPT",
54 | ):
55 | random.seed(seed)
56 | with open(input_file, "r") as f:
57 | data = json.load(f)
58 |
59 | prompts = list(map(get_prompts, data))
60 | flatten_prompts = [prompt for prompts_ in prompts for prompt in prompts_]
61 |     chatmls = [[{"role":"system","content":"You are a helpful AI assistant helping the user find information."},
62 | {"role":"user","content": prompt}] for prompt in flatten_prompts]
63 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls]
64 |
65 | decoding_kwargs = {
66 | # "max_tokens": 1024,
67 | "temperature": 0,
68 | "top_p": 1.0,
69 | "timeout": 30,
70 | "request_timeout": 30
71 | }
72 | results = openai_completions(chatml_prompts, model_name=model_name, **decoding_kwargs)
73 | logging.warning("Total price: {:.4f}$".format(sum(results['price_per_example'])))
74 | completions = results['completions']
75 |
76 | idx = 0
77 | for i, item in enumerate(data):
78 | for j, cand in enumerate(item['candidates']):
79 | total_score_reduction = re.search("Total Score Reduction: (\d+)", completions[idx])
80 | if not total_score_reduction:
81 | total_score_reduction = re.search("Total Score Reduction: -(\d+)", completions[idx])
82 | if not total_score_reduction:
83 | total_score_reduction = re.search("Total Score Reduction is (\d+)", completions[idx])
84 | if not total_score_reduction:
85 | total_score_reduction = re.search("Total Score Reduction is -(\d+)", completions[idx])
86 | if total_score_reduction:
87 | cand['scores']['gpt_score_reduction'] = - abs(int(total_score_reduction.groups()[0]))
88 | else:
89 |                 # no score reduction found in the completion; default to 0
90 |                 cand['scores']['gpt_score_reduction'] = 0
91 | cand['gpt_score_output'] = completions[idx]
92 | idx += 1
93 | with open(output_file, "w") as f:
94 | json.dump(data, f, indent=4, ensure_ascii=False)
95 | logging.warning(f"Saved to {output_file}")
96 |
97 | if __name__ == "__main__":
98 | fire.Fire(main)
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/test_ref_diff.py:
--------------------------------------------------------------------------------
1 | """
2 | This file isn't used in our final version.
3 | Calculate the distance between our score and the reference score.
4 | Maybe Pearson is better. Or we can draw a QQ plot.
5 | """
6 | import json
7 | import random
8 | import logging
9 | import sys
10 | import numpy as np
11 | import pickle
12 | from pathlib import Path
13 | from utils import MyCorrelation
14 | sys.path.append(str(Path(__file__).parent.parent))
15 | from xgptscore.xgptscore import xgptscore
16 | from itertools import chain
17 | from collections import Counter
18 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json_per_aspect
19 | from xgptscore.constants import EVAL_ASPECTS
20 | logging.basicConfig(level=logging.INFO)
21 |
22 | # params
23 | task='data2text'
24 | bart_version="D2T"
25 | dataset="SFHOT"
26 | data_dir="../../BARTScore"
27 | xgptscore_mode="d2t"
28 | version_key=f"{xgptscore_mode}.ref.end_1_5"
29 | our_score_name="xgptscore"
30 | model_name="ChatGPT"
31 | overwrite=False
32 | max_size=200 # set to None to use all examples
33 | num_sys=2
34 | if isinstance(max_size, int) and max_size > 0:
35 | version_key = f"{version_key}_{max_size}"
36 |
37 | # load data
38 | input_file=Path(f"{data_dir}/{bart_version}/{dataset}/final_p_with_xgptscore.json")
39 | if version_key:
40 | output_file = input_file.with_suffix(f".{version_key}.json")
41 | else:
42 | output_file = input_file.with_suffix(f".default.json")
43 |
44 | if not output_file.exists() or overwrite:
45 | # Load and shuffle data
46 | logging.info("Loading from {}".format(input_file))
47 | with open(input_file, "r") as f:
48 | items = json.load(f)
49 | if isinstance(max_size, int) and max_size > 0:
50 | items = items[:max_size]
51 | # random will cause wrong results
52 |
53 | # Data processing
54 | xgptitems = []
55 | for item in items:
56 | item['candidates'] = [
57 | {
58 | "model": "reference",
59 | "decoding_method": "greedy",
60 | "text": item['output'] if isinstance(item['output'], str) else item['output'][0],
61 | "scores": {},
62 | }
63 | ]
64 | xgptitems.append(XPGTItem(
65 | task=task,
66 | instruction=item['instruction'],
67 | input=item['input'],
68 | # ref_output=item['output'],
69 | ref_output="N/A",
70 | hypo_output=item['output'] if isinstance(item['output'], str) else item['output'][0],
71 | ))
72 | # Run xgptscore
73 | result = xgptscore(xgptitems, mode=xgptscore_mode, model_name=model_name,num_workers=5)
74 | idx = 0
75 | aspects = EVAL_ASPECTS[task].keys()
76 | score_dict = {"xgptscore_"+aspect: 0 for aspect in aspects}
77 | for item in items:
78 | for cand in item['candidates']:
79 | cand['responses'] = result['round_completions'][idx]
80 | cand['messages_records'] = result['messages_records'][idx]
81 | xgptscore_ans = get_xgptscore_from_json_per_aspect(cand['responses'][-1])
82 | if xgptscore_ans is None:
83 | logging.info(f"XGPTScore failed for {cand['text']}")
84 | # cand['scores']['xgptscore'] = None
85 | else:
86 | cand['scores'].update(score_dict)
87 | cand['scores'].update(xgptscore_ans)
88 | idx += 1
89 |
90 | # Save results
91 | with open(output_file, "w") as f:
92 | json.dump(items, f, indent=4, ensure_ascii=False)
93 | logging.info("Saved to {}".format(output_file))
94 | else:
95 | logging.info("Loading existing results from {}".format(output_file))
96 | with open(output_file, "r") as f:
97 | items = json.load(f)
98 |
99 |
100 | # by system
101 | # Compute bias
102 | xgptscores = []
103 | for item in items:
104 | for cand in item['candidates']:
105 | if our_score_name in cand['scores']:
106 | xgptscores.append(cand['scores'][our_score_name])
107 |
108 | print(f"Mean: {np.mean(xgptscores)}")
109 | print(f"Distribution: {Counter(xgptscores)}")
110 | print(f"Std: {np.std(xgptscores)}")
111 | print(f"Min: {np.min(xgptscores)}")
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/test_xgptscore.py:
--------------------------------------------------------------------------------
1 | """
2 | This script is used to test xgptscore for prompt engineering.
3 | """
4 |
5 | import sys
6 | from pathlib import Path
7 | sys.path.append(str(Path(__file__).parent.parent))  # make the xgptscore and common packages importable
8 | from common import str2bool
9 | from xgptscore.xgptscore import xgptscore
10 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json
11 | from itertools import chain
12 | import json
13 | import logging
14 | import numpy as np
15 | import fire
16 | from utils import MyCorrelation
17 | logging.basicConfig(level=logging.INFO)
18 |
19 |
20 | def main(input_file: str, task: str, model_name: str, output_file: str = None, xgptscore_mode: str = "prompt", max_size: int = None, overwrite: str = "false"):
21 |     overwrite = str2bool(overwrite)
22 |     if output_file is None:
23 |         output_file = Path(input_file).parent / (Path(input_file).stem + "." + xgptscore_mode + ".json")
24 |     output_file = Path(output_file)  # ensure a Path even when a path string is passed via the CLI
25 | if not output_file.exists() or overwrite:
26 | logging.info("Loading from {}".format(input_file))
27 | with open(input_file, "r") as f:
28 | items = json.load(f)
29 | np.random.seed(42)
30 | np.random.shuffle(items)
31 | if isinstance(max_size, int) and max_size > 0:
32 | items = items[:max_size]
33 |
34 | # Data processing
35 | xgptitems = []
36 | for item in items:
37 | for cand in item['candidates']:
38 | xgptitems.append(XPGTItem(
39 | task=task,
40 | instruction=item['instruction'],
41 | input=item['input'],
42 | ref_output=item['output'],
43 | hypo_output=cand['text']
44 | ))
45 | # Run xgptscore
46 | result = xgptscore(xgptitems, mode=xgptscore_mode,
47 | model_name=model_name, num_workers=5)
48 | idx = 0
49 | for item in items:
50 | for cand in item['candidates']:
51 | cand['responses'] = result['round_completions'][idx]
52 | cand['messages_records'] = result['messages_records'][idx]
53 | cand['scores']['xgptscore'] = get_xgptscore_from_json(
54 | cand['responses'][-1])
55 | idx += 1
56 |
57 | # Save results
58 | with open(output_file, "w") as f:
59 | json.dump(items, f, indent=4, ensure_ascii=False)
60 | logging.info("Saved to {}".format(output_file))
61 | else:
62 | logging.info("Loading existing results from {}".format(output_file))
63 | with open(output_file, "r") as f:
64 | items = json.load(f)
65 |
66 | # evaluate system
67 |
68 | num_cands = len(items[0]['candidates'])
69 | human_scores = [[cand['scores']["rank"]
70 | for cand in item['candidates']] for item in items]
71 | human_scores = list(chain(*zip(*human_scores))) # transpose and flatten
72 | metrics = ["xgptscore", "bleu", "rouge1", "rouge2",
73 | "rougeL", "rougeLsum", "bart_score", "bart_score_cnn"]
74 | # metrics = ["xgptscore"]
75 |
76 | Pearson_corr = {}
77 | Spearman_corr = {}
78 | Kendall_corr = {}
79 | for metric in metrics:
80 | metric_scores = [[cand['scores'][metric]
81 | for cand in item['candidates']] for item in items]
82 | metric_scores = list(chain(*zip(*metric_scores))
83 | ) # transpose and flatten
84 | metric_corr = MyCorrelation(num_cands, human_scores, metric_scores)
85 | Pearson_corr[metric] = metric_corr.Pearson()
86 | Spearman_corr[metric] = metric_corr.Spearman()
87 | Kendall_corr[metric] = metric_corr.Kendall()
88 |
89 | # sort Corr
90 | Pearson_corr = {k: v for k, v in sorted(
91 | Pearson_corr.items(), key=lambda item: item[1][0], reverse=True)}
92 | Spearman_corr = {k: v for k, v in sorted(
93 | Spearman_corr.items(), key=lambda item: item[1][0], reverse=True)}
94 | Kendall_corr = {k: v for k, v in sorted(
95 | Kendall_corr.items(), key=lambda item: item[1][0], reverse=True)}
96 | Corr_record = {
97 | "Pearson": Pearson_corr,
98 | "Spearman": Spearman_corr,
99 | "Kendall": Kendall_corr,
100 | }
101 | # Save correlation results
102 | corr_results_file = Path("./eval_results/") / \
103 | (output_file.stem + ".corr.json")
104 | corr_results_file.parent.mkdir(parents=True, exist_ok=True)
105 | with open(corr_results_file, "w") as f:
106 | json.dump(Corr_record, f, indent=4, ensure_ascii=False)
107 | logging.info("Saved to {}".format(corr_results_file))
108 | # save to another location
109 | corr_results_file = output_file.parent / \
110 | "eval_results" / (output_file.stem + ".corr.json")
111 | corr_results_file.parent.mkdir(parents=True, exist_ok=True)
112 | with open(corr_results_file, "w") as f:
113 | json.dump(Corr_record, f, indent=4, ensure_ascii=False)
114 | logging.info("Saved to {}".format(corr_results_file))
115 | # print("Correlation results:")
116 | # print(json.dumps(Corr_record, indent=4, ensure_ascii=False))
117 |
118 |
119 | if __name__ == "__main__":
120 | fire.Fire(main)
121 |
--------------------------------------------------------------------------------
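
The `list(chain(*zip(*scores)))` expressions in test_xgptscore.py transpose the per-item candidate scores and flatten them, so that all scores belonging to the same candidate system become contiguous before being handed to `MyCorrelation` along with `num_cands` as the number of systems. A tiny standalone illustration with made-up numbers:

from itertools import chain

# Two items, each scored for the same three candidate systems A, B, C.
per_item_scores = [
    [0.1, 0.2, 0.3],  # item 1: A, B, C
    [0.4, 0.5, 0.6],  # item 2: A, B, C
]

# transpose and flatten, exactly as in test_xgptscore.py
system_major = list(chain(*zip(*per_item_scores)))
print(system_major)  # [0.1, 0.4, 0.2, 0.5, 0.3, 0.6]: A's scores, then B's, then C's
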
/tigerscore/eval_scripts/test_xgptscore.sh:
--------------------------------------------------------------------------------
1 | model_name="chatgpt"
2 |
3 | ## Summarization ##
4 | input_file="../../data/evaluation/summarization/summeval/test_data_prepared.json"
5 | python ./test_xgptscore.py \
6 | --input_file $input_file \
7 | --task "summarization" \
8 | --model_name $model_name
9 |
10 | ## Translation ##
11 | input_file="../../data/evaluation/translation/test_data_prepared.json"
12 | python ./test_xgptscore.py \
13 | --input_file $input_file \
14 | --task "translation" \
15 | --model_name $model_name
16 |
17 | ## Data2Text ##
18 | input_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json"
19 | python ./test_xgptscore.py \
20 | --input_file $input_file \
21 | --task "data2text" \
22 | --model_name $model_name
23 |
24 | ## Instructions ##
25 | input_file="../../data/evaluation/instructions/just-eval-instruct/test_data_prepared.json"
26 | python ./test_xgptscore.py \
27 | --input_file $input_file \
28 | --task "instructions" \
29 | --model_name $model_name
30 |
31 | ## Long Form QA ##
32 | input_file="../../data/evaluation/lfqa/test_data_prepared.json"
33 | python ./test_xgptscore.py \
34 | --input_file $input_file \
35 | --task "long-form QA" \
36 | --model_name $model_name
37 |
38 | ## Math QA ##
39 | input_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json"
40 | python ./test_xgptscore.py \
41 | --input_file $input_file \
42 | --task "mathQA" \
43 | --model_name $model_name
44 |
45 | ## Story Generation ##
46 | input_file="../../data/evaluation/storygen/test_data_prepared.json"
47 | python ./test_xgptscore.py \
48 | --input_file $input_file \
49 | --task "story_generation" \
50 | --model_name $model_name
--------------------------------------------------------------------------------
/tigerscore/eval_scripts/utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from mt_metrics_eval.stats import Correlation
3 | from typing import List
4 |
5 |
6 | class MyCorrelation(Correlation):
7 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]):
8 | # remove nan in metrics scores
9 | none_metric_scores_idxs = [idx for idx,
10 | x in enumerate(metric_scores) if x is None]
11 | logging.info("Remove {} nan scores from {} scores".format(
12 | len(none_metric_scores_idxs),
13 | len(metric_scores)
14 | ))
15 | gold_scores = gold_scores.copy()
16 | # set gold scores to None if metric scores are None
17 | for idx in none_metric_scores_idxs[::-1]:
18 | gold_scores[idx] = None
19 | super().__init__(num_sys, gold_scores, metric_scores)
20 |
--------------------------------------------------------------------------------
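
A minimal usage sketch of `MyCorrelation` with made-up scores: metric entries that are `None` (e.g. failed parses) cause the paired gold scores to be set to `None` as well before the data reaches `mt_metrics_eval`'s `Correlation`, so those pairs are excluded from the correlation instead of breaking it. The correlation methods are called the same way as in test_xgptscore.py above, where the first element of each returned value is treated as the coefficient.

from utils import MyCorrelation  # the wrapper defined above

gold_scores = [1, 2, 3, 4]                # e.g. human ranks, illustrative values
metric_scores = [-0.5, None, -3.0, -6.5]  # one metric output failed to parse

corr = MyCorrelation(num_sys=2, gold_scores=gold_scores, metric_scores=metric_scores)
print(corr.Pearson())   # coefficient first, as indexed with [0] in test_xgptscore.py
print(corr.Spearman())
print(corr.Kendall())
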
/tigerscore/finetune/ds_llama_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "bf16": {
3 | "enabled": "auto"
4 | },
5 | "scheduler": {
6 | "type": "WarmupLR",
7 | "params": {
8 | "warmup_min_lr": "auto",
9 | "warmup_max_lr": "auto",
10 | "warmup_num_steps": "auto"
11 | }
12 | },
13 | "zero_optimization": {
14 | "stage": 3,
15 | "overlap_comm": true,
16 | "contiguous_gradients": true,
17 | "sub_group_size": 1e9,
18 | "reduce_bucket_size": "auto",
19 | "stage3_prefetch_bucket_size": "auto",
20 | "stage3_param_persistence_threshold": "auto",
21 | "stage3_max_live_parameters": 1e9,
22 | "stage3_max_reuse_distance": 1e9,
23 | "stage3_gather_16bit_weights_on_model_save": true
24 | },
25 | "gradient_accumulation_steps": "auto",
26 | "gradient_clipping": "auto",
27 | "steps_per_print": 1,
28 | "train_batch_size": "auto",
29 | "train_micro_batch_size_per_gpu": "auto",
30 | "wall_clock_breakdown": false
31 | }
--------------------------------------------------------------------------------
/tigerscore/finetune/finetune_llama.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=llama_finetune
3 | #SBATCH -c 10
4 | #SBATCH --partition=a100
5 | #SBATCH --gres=gpu:4
6 | #SBATCH --time=24:00:00
7 | #SBATCH --mem=100G
8 | #SBATCH --output=../../jobs/%x/%j.out
9 |
10 | nvidia-smi
11 | MASTER_PORT=4637
12 | MODEL_DIR="meta-llama/Llama-2-7b-hf" # 13b
13 | run_name="llama.train_mix.check.clean.mathQA" # change this every time you run a new experiment
14 |
15 | output_dir="../../outputs/${MODEL_DIR}/${run_name}"
16 |
17 | train_data_path="../../data/train_mix.check.clean.mathQA.format_v2.json" #
18 |
19 | mkdir -p ${output_dir}
20 |
21 | # slurm system gpus can't connect to each other by default
22 | # set the following environment variables to enable nccl
23 | export NCCL_IB_DISABLE=1;
24 | export NCCL_P2P_DISABLE=1;
25 |
26 | export NCCL_DEBUG=INFO;
27 | export NCCL_SOCKET_IFNAME=en,eth,em,bond;
28 | export CXX=g++;
29 |
30 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128
31 | # epoch size: alpaca using 3 epochs for 52k data
32 | # epoch size: translation data size, only 8k
33 | # epoch size: sum, data2text, trans, 30k, epoch_size = 4
34 |
35 | # deepspeed \
36 | # --num_gpus 4 \
37 | # --num_nodes 1 \
38 | # --master_port ${MASTER_PORT} \
39 | # train.py \
40 | # --model_name_or_path ${MODEL_DIR} \
41 | # --train_data_path ${train_data_path} \
42 | # --bf16 True \
43 | # --output_dir ${output_dir} \
44 | # --num_train_epochs 3 \
45 | # --per_device_train_batch_size 2 \
46 | # --per_device_eval_batch_size 2 \
47 | # --gradient_accumulation_steps 16 \
48 | # --model_max_length 1024 \
49 | # --evaluation_strategy "no" \
50 | # --save_strategy "epoch" \
51 | # --save_steps 200 \
52 | # --save_total_limit 1 \
53 | # --learning_rate 2e-5 \
54 | # --weight_decay 0. \
55 | # --warmup_ratio 0.1 \
56 | # --lr_scheduler_type "cosine" \
57 | # --logging_steps 2 \
58 | # --tf32 True \
59 | # --deepspeed ds_llama_config.json \
60 | # --run_name ${run_name} \
61 | # --seed 42 \
62 | # --is_lora False \
63 |
64 | CUDA_VISIBLE_DEVICES=0,1,2,3 deepspeed \
65 | --num_gpus 4 \
66 | --num_nodes 1 \
67 | --master_port ${MASTER_PORT} \
68 | train.py \
69 | --model_name_or_path ${MODEL_DIR} \
70 | --train_data_path ${train_data_path} \
71 | --bf16 True \
72 | --output_dir ${output_dir} \
73 | --num_train_epochs 3 \
74 | --per_device_train_batch_size 1 \
75 | --per_device_eval_batch_size 2 \
76 | --gradient_accumulation_steps 32 \
77 | --model_max_length 1024 \
78 | --evaluation_strategy "no" \
79 | --save_strategy "epoch" \
80 | --save_steps 64 \
81 | --save_total_limit 6 \
82 | --learning_rate 2e-5 \
83 | --weight_decay 0. \
84 | --warmup_ratio 0.1 \
85 | --lr_scheduler_type "cosine" \
86 | --logging_steps 2 \
87 | --tf32 True \
88 | --deepspeed ds_llama_config.json \
89 | --run_name ${run_name} \
90 | --seed 42 \
91 | --is_lora False \
92 |
93 | # # LIMA config
94 | # deepspeed \
95 | # --num_gpus 4 \
96 | # --num_nodes 1 \
97 | # --master_port ${MASTER_PORT} \
98 | # train.py \
99 | # --model_name_or_path ${MODEL_DIR} \
100 | # --train_data_path ${train_data_path} \
101 | # --bf16 True \
102 | # --output_dir ${output_dir} \
103 | # --num_train_epochs 15 \
104 | # --per_device_train_batch_size 1 \
105 | # --per_device_eval_batch_size 2 \
106 | # --gradient_accumulation_steps 32 \
107 | # --model_max_length 1024 \
108 | # --evaluation_strategy "no" \
109 | # --save_strategy "epoch" \
110 | # --save_steps 200 \
111 | # --save_total_limit 1 \
112 | # --learning_rate 1e-5 \
113 | # --adam_beta1 0.9 \
114 | # --adam_beta2 0.95 \
115 | # --weight_decay 0.1 \
116 | # --warmup_ratio 0. \
117 | # --lr_scheduler_type "linear" \
118 | # --logging_steps 2 \
119 | # --tf32 True \
120 | # --deepspeed ds_llama_config.json \
121 | # --run_name ${run_name} \
122 | # --seed 42 \
123 | # --is_lora False \
--------------------------------------------------------------------------------
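
As a quick check of the `batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128` note in finetune_llama.sh, the active (uncommented) launch uses a per-device batch of 1, 32 gradient-accumulation steps and 4 GPUs:

# Effective global batch size of the active deepspeed launch in finetune_llama.sh.
per_device_train_batch_size = 1
gradient_accumulation_steps = 32
num_gpus = 4

effective_batch_size = per_device_train_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # 128, matching the comment in the script
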
/tigerscore/finetune/finetune_mistral.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=llama_finetune
3 | #SBATCH -c 10
4 | #SBATCH --partition=a100
5 | #SBATCH --gres=gpu:4
6 | #SBATCH --time=24:00:00
7 | #SBATCH --mem=100G
8 | #SBATCH --output=../../jobs/%x/%j.out
9 |
10 | nvidia-smi
11 | MASTER_PORT=4637
12 | MODEL_DIR="mistralai/Mistral-7B-v0.1" # 13b
13 | run_name="train_mix.check_ChatGPT.clean" # change this every time you run a new experiment
14 |
15 | output_dir="../../outputs/${MODEL_DIR}/${run_name}"
16 | train_data_path="../../data/train_mix.check_ChatGPT.clean.format_v2.json" #
17 |
18 | mkdir -p ${output_dir}
19 |
20 | # slurm system gpus can't connect to each other by default
21 | # set the following environment variables to enable nccl
22 | export NCCL_IB_DISABLE=1;
23 | export NCCL_P2P_DISABLE=1;
24 |
25 | export NCCL_DEBUG=INFO;
26 | export NCCL_SOCKET_IFNAME=en,eth,em,bond;
27 | export CXX=g++;
28 |
29 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128
30 | # epoch size: alpaca using 3 epochs for 52k data
31 | # epoch size: translation data size, only 8k
32 | # epoch size: sum, data2text, trans, 30k, epoch_size = 4
33 |
34 | CUDA_VISIBLE_DEVICES="0,1,2,3" deepspeed \
35 | --num_gpus 4 \
36 | --num_nodes 1 \
37 | --master_port ${MASTER_PORT} \
38 | train.py \
39 | --model_name_or_path ${MODEL_DIR} \
40 | --train_data_path ${train_data_path} \
41 | --bf16 True \
42 | --output_dir ${output_dir} \
43 | --num_train_epochs 3 \
44 | --per_device_train_batch_size 1 \
45 | --per_device_eval_batch_size 2 \
46 | --gradient_accumulation_steps 32 \
47 | --model_max_length 1024 \
48 | --evaluation_strategy "no" \
49 | --save_strategy "epoch" \
50 | --save_steps 64 \
51 | --save_total_limit 6 \
52 | --learning_rate 2e-5 \
53 | --weight_decay 0. \
54 | --warmup_ratio 0.1 \
55 | --lr_scheduler_type "cosine" \
56 | --logging_steps 2 \
57 | --tf32 True \
58 | --deepspeed ds_llama_config.json \
59 | --run_name ${run_name} \
60 | --seed 42 \
61 | --is_lora False \
--------------------------------------------------------------------------------
/tigerscore/finetune/format_data_v2.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage: transforms the xgptscore data format into the alpaca data format for finetuning.
3 |
4 | """
5 | import sys
6 | import os
7 | sys.path.append("../")
8 | templates_path = os.path.join(os.path.dirname(__file__), "..")
9 | sys.path.append(templates_path)
10 | from tqdm import tqdm
11 | from transformers import AutoTokenizer
12 | from common.datasets_config import DATASETS_CONFIG
13 | from pathlib import Path
14 | from string import Template
15 | import json
16 | import logging
17 | import fire
18 | import regex as re
19 | import numpy as np
20 | from collections import Counter
21 | from itertools import chain
22 |
23 |
24 | # FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task."
25 | # FINETUNE_INPUT = """\
26 | # Task instruction: ${generation_instruction}
27 | # Source: ${input_context}
28 | # Model-generated Output: ${hypothesis_output}
29 |
30 | # Based on the given task instruction and source, identify the major and minor errors in this model-generated output.
31 | # Note that Major errors refer to actual errors that affects the task severely, and Minor errors refer to small imperfections, and purely subjective opinions about the output.
32 | # For each error you give in the response, please also elaborate the following information:
33 | # - error location (the words that are wrong in the output)
34 | # - error aspect it belongs to.
35 | # - explanation why it's an error, and the correction suggestions.
36 | # - severity of the error ("Major" or "Minor").
37 | # - reduction of score (between 0.5 and 5)
38 |
39 | # Your evaluation output in the json format:
40 | # """
41 | INST = "You are evaluating errors in a model-generated output for a given instruction."
42 | TEMPLATE = """\
43 | Instruction:
44 | ${generation_instruction}
45 | ${input_context}
46 |
47 | Model-generated Output:
48 | ${hypothesis_output}
49 |
50 | For each error you give in the response, please also elaborate the following information:
51 | - error location (the words that are wrong in the output)
52 | - error aspect it belongs to.
53 | - explanation why it's an error, and the correction suggestions.
54 | - severity of the error ("Major" or "Minor").
55 | - reduction of score (between 0.5 and 5 given the severity of the error)
56 |
57 | Your evaluation output:\
58 | """
59 |
60 | def main(
61 | seed: int = 42,
62 | input_file: str = None,
63 | output_file: str = None,
64 | overwrite: bool = False,
65 | max_eval_input_length: int = None,
66 | max_eval_hyp_length: int = None,
67 | max_eval_output_length: int = None,
68 | ):  # note: seed, overwrite and the max_eval_*_length arguments are currently unused in this script
69 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
70 |
71 | with open(input_file, 'r') as f:
72 | if input_file.endswith(".json"):
73 | data = json.load(f)
74 | elif input_file.endswith(".jsonl"):
75 | data = [json.loads(line) for line in f]
76 | formatted_data = []
77 | for item in data:
78 | inst = INST
79 | input_ = Template(TEMPLATE).substitute(
80 | generation_instruction=item['instruction'],
81 | input_context=item['input_context'],
82 | hypothesis_output=item['hypo_output']
83 | )
84 | output_ = item['errors']
85 | formatted_data.append({
86 | "instruction": inst,
87 | "input": input_,
88 | "output": output_,
89 | })
90 |
91 | with open(output_file, 'w') as f:
92 | json.dump(formatted_data, f, indent=4, ensure_ascii=False)
93 | logging.info(f"Saved to {output_file}")
94 |
95 | # count the dataset statistics
96 | dataset_statistics = {}
97 | dataset_statistics["#total"] = len(formatted_data)
98 | dataset_statistics["#unique input"] = len(
99 | set([item["input"] for item in formatted_data]))
100 | input_lens = [len(tokenizer.encode(item["input"]))
101 | for item in tqdm(formatted_data, desc="Counting input length")]
102 | output_lens = [len(tokenizer.encode(item["output"]))
103 | for item in tqdm(formatted_data, desc="Counting output length")]
104 | total_lens = [x + y for x, y in zip(input_lens, output_lens)]
105 | dataset_statistics["input_length"] = {}
106 | dataset_statistics["input_length"]["mean"] = np.mean(input_lens).item()
107 | dataset_statistics["input_length"]["percentile"] = np.percentile(
108 | input_lens, [0, 25, 50, 90, 100]).tolist()
109 | dataset_statistics["input_length"]["max"] = max(input_lens)
110 | dataset_statistics["input_length"]["min"] = min(input_lens)
111 | dataset_statistics["output_length"] = {}
112 | dataset_statistics["output_length"]["mean"] = np.mean(output_lens).item()
113 | dataset_statistics["output_length"]["percentile"] = np.percentile(
114 | output_lens, [0, 25, 50, 90, 100]).tolist()
115 | dataset_statistics["output_length"]["max"] = max(output_lens)
116 | dataset_statistics["output_length"]["min"] = min(output_lens)
117 | dataset_statistics["total_length"] = {}
118 | dataset_statistics["total_length"]["mean"] = np.mean(total_lens).item()
119 | dataset_statistics["total_length"]["percentile"] = np.percentile(
120 | total_lens, [0, 25, 50, 90, 100]).tolist()
121 | dataset_statistics["total_length"]["max"] = max(total_lens)
122 | dataset_statistics["total_length"]["min"] = min(total_lens)
123 | error_aspects = [re.findall(
124 | r'(?<=Error aspect \d+: )[ \w]+', item['output']) for item in formatted_data]
125 | error_aspects = list(chain(*error_aspects))
126 | dataset_statistics["error_aspects_distribution"] = Counter(error_aspects)
127 |
128 | num_errors = [len(re.findall(r'(?<=Error location \d+: ).*(?=\n|$)',
129 | item['output'])) for item in formatted_data]
130 | dataset_statistics["num_errors_distribution"] = Counter(num_errors)
131 | # severity distributions
132 | severities = [re.findall(
133 | r'(?<=Severity \d+: ).*(?=\n|$)', item['output']) for item in formatted_data]
134 | severities = list(chain(*severities))
135 | dataset_statistics["severity_distribution"] = Counter(severities)
136 | # score reduction distributions
137 | score_reductions = [re.findall(
138 | r'(?<=Score reduction \d+: ).*(?=\n|$)', item['output']) for item in formatted_data]
139 | score_reductions = list(chain(*score_reductions))
140 | score_reductions = [abs(float(x.replace(" ", "")))
141 | for x in score_reductions]
142 | dataset_statistics["score_reduction_distribution"] = Counter(
143 | score_reductions)
144 |
145 | print(dataset_statistics)
146 | output_file = Path(output_file).with_suffix(".statistics.json")
147 | with open(output_file, "w") as f:
148 | json.dump(dataset_statistics, f, indent=4, ensure_ascii=False)
149 | logging.info(f"Saved statistics to {output_file}")
150 |
151 |
152 | if __name__ == "__main__":
153 | logging.basicConfig(level=logging.INFO)
154 | fire.Fire(main)
155 |
--------------------------------------------------------------------------------
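
A minimal sketch of the record-level transformation performed by format_data_v2.py, reusing the `INST` and `TEMPLATE` constants defined in the script (it assumes the sketch is run from tigerscore/finetune so the script's own sys.path tweaks resolve). The item below is fabricated; only its field names (`instruction`, `input_context`, `hypo_output`, `errors`) follow what `main()` actually reads.

from string import Template
from format_data_v2 import INST, TEMPLATE  # constants defined in the script above

# Fabricated training item in the expected schema.
item = {
    "instruction": "Translate the sentence to English.",
    "input_context": "Bonjour le monde.",
    "hypo_output": "Good morning world.",
    "errors": "The model-generated output contains 1 errors, with a total score reduction of 1.\n...",
}

formatted = {
    "instruction": INST,                     # fixed evaluation instruction
    "input": Template(TEMPLATE).substitute(  # instruction + source context + hypothesis
        generation_instruction=item["instruction"],
        input_context=item["input_context"],
        hypothesis_output=item["hypo_output"],
    ),
    "output": item["errors"],                # the error analysis becomes the target
}
print(formatted["input"])
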
/tigerscore/finetune/format_data_v2.sh:
--------------------------------------------------------------------------------
1 | # INPUT_FILE="../../data/train_mix.check.clean.jsonl"
2 | # OUTPUT_FILE="../../data/train_mix.check.clean.format_v2.json"
3 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
4 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
5 |
6 | # INPUT_FILE="../../data/train_mix.jsonl"
7 | # OUTPUT_FILE="../../data/train_mix.format_v2.json"
8 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
9 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
10 |
11 | # tasks=('data2text' 'instruction-following' 'long-form QA' 'mathQA' 'summarization' 'translation')
12 | # for task in "${tasks[@]}"; do
13 | # INPUT_FILE="../../data/train_mix.${task}.jsonl"
14 | # OUTPUT_FILE="../../data/train_mix.${task}.format_v2.json"
15 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
16 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
17 | # done
18 |
19 | INPUT_FILE="../../data/train_mix.check.clean.mathQA.jsonl"
20 | OUTPUT_FILE="../../data/train_mix.check.clean.mathQA.format_v2.json"
21 | python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
22 | --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
23 |
24 | # INPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.8k.gen.jsonl"
25 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.8k.gen.format_v2.json"
26 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
27 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
28 |
29 | # INPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.2k.jsonl"
30 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.2k.format_v2.json"
31 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
32 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
33 |
34 | # INPUT_FILE="../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.clean.jsonl"
35 | # OUTPUT_FILE="../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.clean.format_v2.jsonl"
36 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
37 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
38 |
39 | # INPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.ref.extracted.jsonl"
40 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.ref.extracted.format_v2.jsonl"
41 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
42 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
43 |
44 | # INPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.extracted.jsonl"
45 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.extracted.format_v2.jsonl"
46 | # # INPUT_FILE="TIGERScore/data/32k_final.json"
47 | # # OUTPUT_FILE="TIGERScore/data/32k_final_distill.json"
48 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
49 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
50 |
51 | # INPUT_FILE="../../data/additional/metamath/metamath.1k.ref.extracted.jsonl"
52 | # OUTPUT_FILE="../../data/additional/metamath/metamath.1k.ref.extracted.format_v2.jsonl"
53 | # # INPUT_FILE="TIGERScore/data/32k_final.json"
54 | # # OUTPUT_FILE="TIGERScore/data/32k_final_distill.json"
55 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
56 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
--------------------------------------------------------------------------------
/tigerscore/finetune/format_distill_data.sh:
--------------------------------------------------------------------------------
1 | DATA_DIR="../../data"
2 |
3 | # # translation
4 | # INPUT_FILE="${DATA_DIR}/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.json"
5 | # OUTPUT_FILE="${DATA_DIR}/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.format_txt.json"
6 | # python format_distill_data.py --task "translation" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
7 |
8 | # # summarization
9 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.json"
10 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.format_txt.json"
11 | # python format_distill_data.py --task "summarization" \
12 | # --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
13 | # --max_eval_input_length 400 --max_eval_hyp_length 300 --max_eval_output_length 400 \
14 |
15 | # # data2text
16 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/d2t/train_data.d2t.filter_v1.json"
17 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/d2t/train_data.d2t.filter_v1.format_txt.json"
18 | # python format_distill_data.py --task "data2text" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
19 | # --max_eval_input_length 400 --max_eval_hyp_length 400 --max_eval_output_length 400 \
20 | # # long-form QA
21 |
22 | # # SEScore3 zh-en debug
23 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore/data/sescore3/sescore3_zh_en_llama_formatted_data.json"
24 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore/data/sescore3/sescore3_zh_en_llama_formatted_data.format_txt.json"
25 | # python format_distill_data.py --task "translation" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
26 | # # --max_eval_input_length 400 --max_eval_hyp_length 400 --max_eval_output_length 400 \
27 |
28 | # # summarization v3
29 | # INPUT_FILE="../../data/sum/train_data.align_score.filter_v3.json"
30 | # OUTPUT_FILE="../../data/sum/train_data.align_score.filter_v3.format_txt.json"
31 | # python format_distill_data.py --task "summarization" \
32 | # --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
33 | # --max_eval_input_length 400 --max_eval_hyp_length 300 --max_eval_output_length 400 \
34 |
35 |
36 | IFS=$'\n'
37 | tasks=("translation" "long-form QA" "summarization" "data2text" "instruction-following")
38 | for task in ${tasks[@]}; do
39 | INPUT_FILE="../../data/real_world/${task}.json"
40 | OUTPUT_FILE="../../data/real_world/${task}.format_txt.json"
41 | python format_distill_data.py --task ${task} \
42 | --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \
43 | --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400
44 | done
45 |
--------------------------------------------------------------------------------
/tigerscore/finetune/format_synthesis_distill_data.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage: transforms the xgptscore data format into the alpaca data format for finetuning.
3 |
4 | """
5 | import sys
6 | import os
7 | sys.path.append("../")
8 | templates_path = os.path.join(os.path.dirname(__file__), "..")
9 | sys.path.append(templates_path)
10 | from tqdm import tqdm
11 | from transformers import AutoTokenizer
12 | from common.datasets_config import DATASETS_CONFIG
13 | from pathlib import Path
14 | from string import Template
15 | import json
16 | import logging
17 | import fire
18 | import regex as re
19 | import numpy as np
20 | from collections import Counter
21 | from itertools import chain
22 |
23 |
24 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a given instruction."
25 | FINETUNE_INPUT = """\
26 | Instruction:
27 | ${generation_instruction}
28 | ${input_context}
29 |
30 | Model-generated Output:
31 | ${hypothesis_output}
32 |
33 | For each error you give in the response, please also elaborate the following information:
34 | - error location (the words that are wrong in the output)
35 | - error aspect it belongs to.
36 | - explanation why it's an error, and the correction suggestions.
37 | - severity of the error ("Major" or "Minor").
38 | - reduction of score (between 0.5 and 5 given the severity of the error)
39 |
40 | Your evaluation output:\
41 | """
42 |
43 |
44 | def main(
45 | task: str,
46 | seed: int = 42,
47 | input_file: str = None,
48 | output_file: str = None,
49 | overwrite: bool = False,
50 | max_eval_input_length: int = None,
51 | max_eval_hyp_length: int = None,
52 | max_eval_output_length: int = None,
53 | ):
54 | assert task in DATASETS_CONFIG.keys()
55 |
56 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
57 |
58 | with open(input_file, 'r') as f:
59 | data = json.load(f)
60 | formatted_data = []
61 | for item in data:
62 | syn_output = item['responses'][-1]
63 | syn_output = syn_output.replace(": \n", ": ")
64 | # decode the synthesis outputs
65 | try:
66 | start_pos = syn_output.index(
67 | "Generated incorrect output: ") + len("Generated incorrect output: ")
68 | end_pos = syn_output.index("\nError location 1")
69 | hyp = syn_output[start_pos:end_pos].strip('\n ')
70 | assert len(hyp) > 0
71 | except Exception:
72 | logging.warning(
73 | "Failed to parse the synthesis output: {}".format(syn_output))
74 | continue
75 | inst = Template(FINETUNE_INST).substitute(task=task)
76 | input_context_ids = tokenizer.encode(
77 | item['input'], add_special_tokens=False)
78 | hyp_ids = tokenizer.encode(hyp, add_special_tokens=False)
79 | if max_eval_input_length is not None and len(input_context_ids) > max_eval_input_length:
80 | input_context = tokenizer.decode(
81 | input_context_ids[:max_eval_input_length]) + "..."
82 | else:
83 | input_context = item['input']
84 | if max_eval_hyp_length is not None and len(hyp_ids) > max_eval_hyp_length:
85 | hypothesis_output = tokenizer.decode(
86 | hyp_ids[:max_eval_hyp_length]) + "..."
87 | else:
88 | hypothesis_output = hyp
89 | input_ = Template(FINETUNE_INPUT).substitute(
90 | generation_instruction=item['instruction'],
91 | input_context=input_context,
92 | hypothesis_output=hypothesis_output,
93 | )
94 | try:
95 | error_locations = re.findall(
96 | r'(?<=Error location \d+: ).*(?=\n|$)', syn_output)
97 | error_aspects = re.findall(
98 | r'(?<=Error aspect \d+: ).*(?=\n|$)', syn_output)
99 | explanations = re.findall(
100 | r'(?<=Explanation \d+: ).*(?=\n|$)', syn_output)
101 | severities = re.findall(
102 | r'(?<=Severity \d+: ).*(?=\n|$)', syn_output)
103 | score_reductions = re.findall(
104 | r'(?<=Score reduction \d+: ).*(?=\n|$)', syn_output)
105 | score_reductions = [abs(int(x.replace(" ", "")))
106 | for x in score_reductions]
107 | except Exception:
108 | logging.warning(
109 | "Failed to parse the synthesis output: {}".format(syn_output))
110 | continue
111 |
112 | if not len(error_locations) == len(error_aspects) == len(explanations) == len(severities) == len(score_reductions):
113 | logging.warning(
114 | "The number of errors properties does not match!: {}".format(syn_output))
115 | continue
116 |
117 | txt_output = "The model-generated output contains {} errors, with a total score reduction of {}.".format(
118 | len(error_locations),
119 | sum([int(score) for score in score_reductions]),
120 | )
121 | for i in range(len(error_locations)):
122 | txt_output += "\nError location {}: {}\n".format(
123 | i + 1, error_locations[i])
124 | txt_output += "Error aspect {}: {}\n".format(
125 | i + 1, error_aspects[i])
126 | txt_output += "Explanation {}: {}\n".format(i + 1, explanations[i])
127 | txt_output += "Severity {}: {}\n".format(i + 1, severities[i])
128 | txt_output += "Score reduction {}: {}".format(
129 | i + 1, score_reductions[i])
130 | output_ = txt_output.strip(' \n')
131 | formatted_data.append({
132 | "instruction": inst,
133 | "input": input_,
134 | "output": output_,
135 | "task": task,
136 | })
137 |
138 | # # append 20% non-error examples
139 | # for item in data:
140 | # if random.random() < 0.2:
141 | # inst = Template(FINETUNE_INST).substitute(task=task)
142 | # input_context_ids = tokenizer.encode(item['input'], add_special_tokens=False)
143 | # if max_eval_input_length is not None and len(input_context_ids) > max_eval_input_length:
144 | # input_context = tokenizer.decode(input_context_ids[:max_eval_input_length]) + "..."
145 | # else:
146 | # input_context = item['input']
147 | # input_ = Template(FINETUNE_INPUT).substitute(
148 | # generation_instruction=item['instruction'],
149 | # input_context=input_context,
150 | # hypothesis_output=item['output'],
151 | # )
152 | # output_ = "The model-generated output contains 0 errors, with a total score reduction of 0."
153 | # formatted_data.append({
154 | # "instruction": inst,
155 | # "input": input_,
156 | # "output": output_,
157 | # "task": task,
158 | # })
159 |
160 | with open(output_file, 'w') as f:
161 | json.dump(formatted_data, f, indent=4, ensure_ascii=False)
162 | logging.info(f"Saved to {output_file}")
163 |
164 | # count the dataset statistics
165 | dataset_statistics = {}
166 | dataset_statistics["#total"] = len(formatted_data)
167 | dataset_statistics["#unique input"] = len(
168 | set([item["input"] for item in formatted_data]))
169 | input_lens = [len(tokenizer.encode(item["input"]))
170 | for item in tqdm(formatted_data, desc="Counting input length")]
171 | output_lens = [len(tokenizer.encode(item["output"]))
172 | for item in tqdm(formatted_data, desc="Counting output length")]
173 | total_lens = [x + y for x, y in zip(input_lens, output_lens)]
174 | dataset_statistics["input_length"] = {}
175 | dataset_statistics["input_length"]["mean"] = np.mean(input_lens).item()
176 | dataset_statistics["input_length"]["percentile"] = np.percentile(
177 | input_lens, [0, 25, 50, 90, 100]).tolist()
178 | dataset_statistics["input_length"]["max"] = max(input_lens)
179 | dataset_statistics["input_length"]["min"] = min(input_lens)
180 | dataset_statistics["output_length"] = {}
181 | dataset_statistics["output_length"]["mean"] = np.mean(output_lens).item()
182 | dataset_statistics["output_length"]["percentile"] = np.percentile(
183 | output_lens, [0, 25, 50, 90, 100]).tolist()
184 | dataset_statistics["output_length"]["max"] = max(output_lens)
185 | dataset_statistics["output_length"]["min"] = min(output_lens)
186 | dataset_statistics["total_length"] = {}
187 | dataset_statistics["total_length"]["mean"] = np.mean(total_lens).item()
188 | dataset_statistics["total_length"]["percentile"] = np.percentile(
189 | total_lens, [0, 25, 50, 90, 100]).tolist()
190 | dataset_statistics["total_length"]["max"] = max(total_lens)
191 | dataset_statistics["total_length"]["min"] = min(total_lens)
192 | error_aspects = [re.findall(
193 | r'(?<=Error aspect \d+: ).*(?=\n|$)', item['output']) for item in formatted_data]
194 | error_aspects = list(chain(*error_aspects))
195 | dataset_statistics["error_aspects_distribution"] = Counter(error_aspects)
196 | # number of errors distributions
197 | num_errors = [len(re.findall(r'(?<=Error location \d+: ).*(?=\n|$)',
198 | item['output'])) for item in formatted_data]
199 | dataset_statistics["num_errors_distribution"] = Counter(num_errors)
200 | # severity distributions
201 | severities = [re.findall(
202 | r'(?<=Severity \d+: ).*(?=\n|$)', item['output']) for item in formatted_data]
203 | severities = list(chain(*severities))
204 | dataset_statistics["severity_distribution"] = Counter(severities)
205 | # score reduction distributions
206 | score_reductions = [re.findall(
207 | r'(?<=Score reduction \d+: ).*(?=\n|$)', item['output']) for item in formatted_data]
208 | score_reductions = list(chain(*score_reductions))
209 | score_reductions = [abs(int(x.replace(" ", ""))) for x in score_reductions]
210 | dataset_statistics["score_reduction_distribution"] = Counter(
211 | score_reductions)
212 |
213 | print(dataset_statistics)
214 | output_file = Path(output_file).with_suffix(".statistics.json")
215 | with open(output_file, "w") as f:
216 | json.dump(dataset_statistics, f, indent=4, ensure_ascii=False)
217 | logging.info(f"Saved statistics to {output_file}")
218 |
219 |
220 | if __name__ == "__main__":
221 | logging.basicConfig(level=logging.INFO)
222 | fire.Fire(main)
223 |
--------------------------------------------------------------------------------
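
A self-contained illustration of the parsing that format_synthesis_distill_data.py applies to each synthesis response. The `syn_output` string is fabricated but follows the `Generated incorrect output: ... / Error location N: ...` layout the script expects; note the script imports the third-party `regex` package, whose variable-length lookbehinds would not work with the standard-library `re`.

import regex as re  # same third-party regex flavor as the script above

syn_output = (
    "Generated incorrect output: The Eiffel Tower is located in Berlin.\n"
    "Error location 1: Berlin\n"
    "Error aspect 1: Factual accuracy\n"
    "Explanation 1: The Eiffel Tower is in Paris, so Berlin should be Paris.\n"
    "Severity 1: Major\n"
    "Score reduction 1: 4"
)

# The hypothesis sits between the marker and the first error field.
start = syn_output.index("Generated incorrect output: ") + len("Generated incorrect output: ")
end = syn_output.index("\nError location 1")
hyp = syn_output[start:end].strip("\n ")

error_locations = re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', syn_output)
severities = re.findall(r'(?<=Severity \d+: ).*(?=\n|$)', syn_output)
score_reductions = re.findall(r'(?<=Score reduction \d+: ).*(?=\n|$)', syn_output)

print(hyp)               # The Eiffel Tower is located in Berlin.
print(error_locations)   # ['Berlin']
print(severities)        # ['Major']
print(score_reductions)  # ['4']
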
/tigerscore/finetune/ft_llama_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=ft_llama_lora
3 | #SBATCH --gres=gpu:a6000:1
4 | #SBATCH --time=24:00:00
5 | #SBATCH --qos=general
6 | #SBATCH --output=../../jobs/llama_finetune/%j.out
7 |
8 | MASTER_PORT=4635
9 | MODEL_DIR="meta-llama/Llama-2-7b-hf" # 13b
10 | run_name="model_len_1024_lora_debug" # change this every time you run a new experiment
11 | output_dir="../../outputs/${MODEL_DIR}/${run_name}"
12 | # train_data_path="../../data/wmt/train_data.wmt_mqm.distill.format.json"
13 | train_data_path="../../WorkSpace/ExplainableGPTScore/finetune_data/translation/train.json"
14 | # train_data_path="../../WorkSpace/ExplainableGPTScore/finetune_data/translation/train/wmt18_zh-en.json"
15 | mkdir -p ${output_dir}
16 |
17 | # slurm system gpus can't connect to each other by default
18 | # set the following environment variables to enable nccl
19 | export NCCL_IB_DISABLE=1;
20 | export NCCL_P2P_DISABLE=1;
21 |
22 | export NCCL_DEBUG=INFO;
23 | export NCCL_SOCKET_IFNAME=en,eth,em,bond;
24 | export CXX=g++;
25 |
26 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128
27 | # epoch size: alpaca using 3 epochs for 52k data
28 | # epoch size: translation data size, only 8k
29 |
30 | ../../.conda/envs/llm_reranker/bin/deepspeed \
31 | --num_gpus 1 \
32 | --num_nodes 1 \
33 | --master_port ${MASTER_PORT} \
34 | train.py \
35 | --model_name_or_path ${MODEL_DIR} \
36 | --train_data_path ${train_data_path} \
37 | --bf16 True \
38 | --output_dir ${output_dir} \
39 | --num_train_epochs 3 \
40 | --per_device_train_batch_size 4 \
41 | --per_device_eval_batch_size 2 \
42 | --gradient_accumulation_steps 32 \
43 | --model_max_length 1024 \
44 | --evaluation_strategy "no" \
45 | --save_strategy "epoch" \
46 | --save_steps 200 \
47 | --save_total_limit 3 \
48 | --learning_rate 3e-4 \
49 | --weight_decay 0. \
50 | --warmup_ratio 0.1 \
51 | --lr_scheduler_type "linear" \
52 | --logging_steps 2 \
53 | --tf32 True \
54 | --deepspeed ds_llama_config.json \
55 | --run_name ${run_name} \
56 | --seed 42 \
57 | --is_lora True \
58 |
59 | # lora Config
60 | # lr: 3e-4
--------------------------------------------------------------------------------
/tigerscore/finetune/test_llama.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=test_llama
3 | #SBATCH --gres=gpu:6000:1
4 | #SBATCH --time=24:00:00
5 | #SBATCH --output=../../jobs/test_llama/%j.out
6 | nvidia-smi
7 |
8 | model_name="meta-llama/Llama-2-7b-hf"
9 | outputs_dir=""
10 |
11 | # outputs_dir="../../outputs"
12 | checkpoint_name="model_len_1024_mix_v2"
13 | checkpoint_path="${outputs_dir}/${model_name}/${checkpoint_name}/checkpoint-best"
14 | # task="translation"
15 | # # finetune test
16 | # data_path="/home//WorkSpace/ExplainableGPTScore/finetune_data/${task}/test.json"
17 |
18 | # BARTScore test
19 | # data_path="/home//WorkSpace/ExplainableGPTScore/BARTScore/WMT/zh-en/final_p_with_xgptscore.test_llama_new.json"
20 |
21 | # mtme test mqm
22 | # task="translation"
23 | # human_score_names="mqm,da"
24 | # data_path="../../data/wmt22/zh-en/eval_data.random_2.json"
25 |
26 | # sum test relevance
27 | # task="summarization"
28 | # human_score_names="coherence,consistency,fluency,relevance"
29 | # data_path="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.json"
30 |
31 | # d2t test Correctness
32 | # task="data2text"
33 | # human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure"
34 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/webnlg/webnlg2020_gen_with_scores.json"
35 |
36 | # instruction-following
37 | # rank
38 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/databricks/databricks-dolly-15k/rank_eval_mid.json"
39 |
40 | # task="instruction-following"
41 | # human_score_names="gpt_rank_score"
42 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/llm-blender/mix-instruct/test_data_prepared_300.json"
43 |
44 | # long-form QA
45 | ### ATTENTION: the space in the task name is not allowed unquoted; you need to use --task "long-form QA" instead of --task ${task}
46 | # task="long-form QA"
47 | # human_score_names="rank"
48 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/lfqa/test.json"
49 |
50 | # Math QA
51 | # accuracy
52 | # task="mathQA"
53 | # human_score_names="accuracy"
54 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/gsm8k-ScRel/data/test_acc.json"
55 |
56 | output_path="${data_path}.llama_2_7b_${checkpoint_name}.output"
57 |
58 | # seems batch_size=1 is faster than batch_size=2 or higher
59 | python test_llama.py \
60 | --model_name_or_path ${checkpoint_path} \
61 | --task ${task} \
62 | --data_path ${data_path} \
63 | --output_path ${output_path} \
64 | --torch_dtype "bfloat16" \
65 | --batch_size 1 \
66 | --human_score_names ${human_score_names} \
67 | --model_max_length 1024 \
68 | --max_eval_input_length 512 \
69 | --max_eval_hyp_length 512 \
70 | --max_eval_output_length 1024 \
71 | --overwrite True \
--------------------------------------------------------------------------------
/tigerscore/finetune/test_llama_vllm.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 | import json
4 | import torch
5 | import logging
6 | import sys
7 | import regex as re
8 | from pathlib import Path
9 | sys.path.append(str(Path(__file__).parent.parent))
10 | from vllm import LLM, SamplingParams
11 | from typing import List
12 | from string import Template
13 | from mt_metrics_eval.stats import Correlation
14 |
15 |
16 | MAX_INT = sys.maxsize
17 |
18 | IGNORE_INDEX = -100
19 | DEFAULT_PAD_TOKEN = "[PAD]"
20 | DEFAULT_EOS_TOKEN = ""
21 | DEFAULT_BOS_TOKEN = ""
22 | DEFAULT_UNK_TOKEN = ""
23 | PROMPT_DICT = {
24 | "prompt_input": (
25 | "Below is an instruction that describes a task, paired with an input that provides further context. "
26 | "Write a response that appropriately completes the request.\n\n"
27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
28 | ),
29 | "prompt_no_input": (
30 | "Below is an instruction that describes a task. "
31 | "Write a response that appropriately completes the request.\n\n"
32 | "### Instruction:\n{instruction}\n\n### Response:"
33 | ),
34 | }
35 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a given instruction."
36 | FINETUNE_INPUT = """\
37 | Instruction:
38 | ${generation_instruction}
39 | ${input_context}
40 |
41 | Model-generated Output:
42 | ${hypothesis_output}
43 |
44 | For each error you give in the response, please also elaborate the following information:
45 | - error location (the words that are wrong in the output)
46 | - error aspect it belongs to.
47 | - explanation why it's an error, and the correction suggestions.
48 | - severity of the error ("Major" or "Minor").
49 | - reduction of score (between 0.5 and 5 given the severity of the error)
50 |
51 | Your evaluation output:\
52 | """
53 |
54 |
55 | def get_sum_penalties(eval_output: dict):
56 | """
57 | Args:
58 | eval_output: dict, the json output of the eval function
59 |
60 | Returns:
61 | """
62 | try:
63 | penalty_score = 0
64 | for aspect in eval_output:
65 | for penalty_point in eval_output[aspect]["penalty_points"]:
66 | penalty_score += penalty_point["score_reduction"]
67 | return - penalty_score
68 | except Exception:
69 | return None
70 |
71 |
72 | def get_torch_dtype(dtype_str):
73 | """
74 | Get the torch dtype from a string
75 | """
76 | if dtype_str == "float32":
77 | return torch.float32
78 | elif dtype_str == "float16":
79 | return torch.float16
80 | elif dtype_str == "bfloat16":
81 | return torch.bfloat16
82 | elif dtype_str == "int8":
83 | return torch.int8
84 | else:
85 | raise ValueError("Invalid dtype {}".format(dtype_str))
86 |
87 |
88 | def batch_data(data_list, batch_size=1):
89 | n = len(data_list) // batch_size
90 | batch_data = []
91 | for i in range(n - 1):
92 | start = i * batch_size
93 | end = (i + 1) * batch_size
94 | batch_data.append(data_list[start:end])
95 |
96 | last_start = (n - 1) * batch_size
97 | last_end = MAX_INT
98 |     batch_data.append(data_list[last_start:last_end])  # the final batch absorbs any remainder, so it may exceed batch_size
99 | return batch_data
100 |
101 |
102 | class MyCorrelation(Correlation):
103 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]):
104 | # remove nan in metrics scores
105 | none_metric_scores_idxs = [idx for idx,
106 | x in enumerate(metric_scores) if x is None]
107 | logging.info("Remove {} nan scores from {} scores".format(
108 | len(none_metric_scores_idxs),
109 | len(metric_scores)
110 | ))
111 | gold_scores = gold_scores.copy()
112 | # set gold scores to None if metric scores are None
113 | for idx in none_metric_scores_idxs[::-1]:
114 | gold_scores[idx] = None
115 | super().__init__(num_sys, gold_scores, metric_scores)
116 |
117 |
118 | def main(args):
119 |
120 | if args.output_path is not None:
121 | output_file = Path(args.output_path)
122 | else:
123 | output_file = Path(args.data_path).with_suffix(
124 | '.xgptscore.output.json')
125 | if not output_file.exists() or args.overwrite:
126 | logging.info("Loading model...")
127 | sampling_params = SamplingParams(
128 | temperature=0, top_p=1, max_tokens=1024)
129 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1)
130 | logging.info("Model loaded from {}".format(args.model_name_or_path))
131 |
132 | eval_outputs = []
133 |
134 | logging.info("Load input data from {}".format(args.data_path))
135 | with open(args.data_path, "r") as f:
136 | input_data = json.load(f)
137 | formatted_data = []
138 | for item in input_data:
139 | for cand in item['candidates']:
140 | inst = Template(FINETUNE_INST).substitute(task=args.task)
141 | input_ = Template(FINETUNE_INPUT).substitute(
142 | task=args.task,
143 | generation_instruction=item['instruction'],
144 | input_context=item['input'],
145 | hypothesis_output=cand['text'],
146 | )
147 | formatted_data.append({
148 | "instruction": inst,
149 | "input": input_,
150 | })
151 | prompt_sources = [example['instruction'] + '\n' +
152 | example['input'] for example in formatted_data]
153 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources]
154 |
155 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size)
156 |
157 | for idx, batch_prompt in enumerate(batch_prompts):
158 | if isinstance(batch_prompt, list):
159 | pass
160 | else:
161 | batch_prompt = [batch_prompt]
162 |
163 | completions = llm.generate(batch_prompt, sampling_params)
164 | for output in completions:
165 | generated_text = output.outputs[0].text
166 | eval_outputs.append(generated_text)
167 |
168 | cand_idx = 0
169 | for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
170 | for cand in item['candidates']:
171 | cand['eval_output'] = eval_outputs[cand_idx]
172 | score_reductions = re.findall(
173 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_outputs[cand_idx])
174 | cand['xgptscore'] = -sum(map(float, score_reductions))
175 | cand_idx += 1
176 |
177 | with open(output_file, 'w') as f:
178 | json.dump(input_data, f, indent=4, ensure_ascii=False)
179 | logging.info("Saved eval results to {}".format(output_file))
180 | else:
181 | with open(output_file, 'r') as f:
182 | input_data = json.load(f)
183 | for ex in input_data:
184 | for cand in ex['candidates']:
185 | score_reductions = re.findall(
186 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", cand['eval_output'])
187 | cand['xgptscore'] = -sum(map(float, score_reductions))
188 | with open(output_file, 'w') as f:
189 | json.dump(input_data, f, indent=4, ensure_ascii=False)
190 | logging.info("Loaded eval results from {}".format(output_file))
191 | # Compute correlation
192 | human_score_names = args.human_score_names.split(',')
193 |
194 | for h_name in human_score_names:
195 | human_scores = []
196 | xgptscores = []
197 | for item in input_data:
198 | for cand in item['candidates']:
199 | for s_name, score in cand['scores'].items():
200 | if s_name == h_name:
201 | xgptscores.append(cand['xgptscore'])
202 | human_scores.append(score)
203 | break
204 | corr = MyCorrelation(1, human_scores, xgptscores)
205 | logging.info("Human score: {}".format(h_name))
206 | logging.info("Pearson correlation: {}".format(corr.Pearson()))
207 | logging.info("Spearman correlation: {}".format(corr.Spearman()))
208 | logging.info("Kendall correlation: {}".format(corr.Kendall()))
209 |
210 |
211 | if __name__ == "__main__":
212 |
213 | logging.basicConfig(level=logging.INFO)
214 | parser = argparse.ArgumentParser()
215 | parser.add_argument("--model_name_or_path", type=str, default=None)
216 | parser.add_argument("--data_path", type=str, default=None)
217 | parser.add_argument("--output_path", type=str, default=None)
218 | parser.add_argument("--overwrite", action="store_true")
219 | parser.add_argument("--task", type=str, default="summarization")
220 | parser.add_argument("--batch_size", type=int, default=1)
221 | parser.add_argument("--human_score_names", type=str, default="score")
222 | args = parser.parse_args()
223 | main(args)
224 |
--------------------------------------------------------------------------------
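
In test_llama_vllm.py the final score of a candidate is simply the negated sum of every `Score reduction N:` value parsed from the generated critique. A minimal sketch with a fabricated `eval_output`:

import regex as re  # same regex flavor as the script above (variable-length lookbehind)

eval_output = (
    "The model-generated output contains 2 errors, with a total score reduction of 5.5.\n"
    "Error location 1: ...\n"
    "Severity 1: Major\n"
    "Score reduction 1: 4\n"
    "Error location 2: ...\n"
    "Severity 2: Minor\n"
    "Score reduction 2: 1.5"
)

score_reductions = re.findall(
    r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_output)
xgptscore = -sum(map(float, score_reductions))
print(score_reductions)  # ['4', '1.5']
print(xgptscore)         # -5.5
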
/tigerscore/finetune/test_llama_vllm.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | #SBATCH --job-name=test_llama
3 | #SBATCH --gres=gpu:a6000:1
4 | #SBATCH --time=24:00:00
5 | #SBATCH --output=../../jobs/test_llama/%j.out
6 | nvidia-smi
7 |
8 |
9 | ## Note
10 | # please download the data into the working directory as described in the Data Preparation section of the README
11 | # quick command: gdown https://drive.google.com/uc?id=1DAjvig-A_57CuBvENLg8A2PycOaz9ZkT
12 | ##
13 |
14 | model_name="meta-llama/Llama-2-7b-hf"
15 | outputs_dir=""
16 |
17 | # outputs_dir="../../outputs"
18 | checkpoint_name="ref"
19 | # checkpoint_path="${outputs_dir}/${model_name}/${checkpoint_name}/checkpoint-532"
20 | checkpoint_path="TIGER-Lab/TIGERScore-13B"
21 |
22 | human_score_names="gpt_rank_score"
23 | data_path="../../data/evaluation/lfqa/test_data_prepared.json"
24 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
25 | python test_llama_vllm.py \
26 | --model_name_or_path ${checkpoint_path} \
27 | --task "long-form QA" \
28 | --data_path ${data_path} \
29 | --output_path ${output_path} \
30 | --batch_size 60 \
31 | --human_score_names ${human_score_names} \
32 | --overwrite
33 |
34 | task="instruction-following"
35 | human_score_names="gpt_rank_score"
36 | data_path="../../data/evaluation/instruct/just-eval-instruct/test_data_prepared.json"
37 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
38 | python test_llama_vllm.py \
39 | --model_name_or_path ${checkpoint_path} \
40 | --task ${task} \
41 | --data_path ${data_path} \
42 | --output_path ${output_path} \
43 | --batch_size 60 \
44 | --human_score_names ${human_score_names} \
45 | --overwrite
46 |
47 | task="mathQA"
48 | human_score_names="accuracy"
49 | data_path="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json"
50 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
51 | python test_llama_vllm.py \
52 | --model_name_or_path ${checkpoint_path} \
53 | --task ${task} \
54 | --data_path ${data_path} \
55 | --output_path ${output_path} \
56 | --batch_size 60 \
57 | --human_score_names ${human_score_names} \
58 | --overwrite
59 |
60 |
61 | # mtme test mqm
62 | task="translation"
63 | human_score_names="mqm"
64 | data_path="../../data/evaluation/translation/wmt22/zh-en/eval_data.json"
65 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
66 | python test_llama_vllm.py \
67 | --model_name_or_path ${checkpoint_path} \
68 | --task ${task} \
69 | --data_path ${data_path} \
70 | --output_path ${output_path} \
71 | --batch_size 60 \
72 | --human_score_names ${human_score_names} \
73 | --overwrite
74 |
75 | # sum test relevance
76 | task="summarization"
77 | human_score_names="coherence,consistency,fluency,relevance"
78 | data_path="../../data/evaluation/summarization/summeval/test_data_prepared.json"
79 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
80 | python test_llama_vllm.py \
81 | --model_name_or_path ${checkpoint_path} \
82 | --task ${task} \
83 | --data_path ${data_path} \
84 | --output_path ${output_path} \
85 | --batch_size 60 \
86 | --human_score_names ${human_score_names} \
87 | --overwrite
88 |
89 | # d2t test Correctness
90 | task="data2text"
91 | human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure"
92 | data_path="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json"
93 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
94 | python test_llama_vllm.py \
95 | --model_name_or_path ${checkpoint_path} \
96 | --task ${task} \
97 | --data_path ${data_path} \
98 | --output_path ${output_path} \
99 | --batch_size 60 \
100 | --human_score_names ${human_score_names} \
101 | --overwrite
102 |
103 |
104 | # storygen test human
105 | task="storygen"
106 | human_score_names="human"
107 | data_path="../../data/evaluation/storygen/test_data_prepared.json"
108 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output"
109 | python test_llama_vllm.py \
110 | --model_name_or_path ${checkpoint_path} \
111 | --task ${task} \
112 | --data_path ${data_path} \
113 | --output_path ${output_path} \
114 | --batch_size 60 \
115 | --human_score_names ${human_score_names} \
116 | --overwrite
117 |
--------------------------------------------------------------------------------
/tigerscore/finetune/test_llama_vllm_distance.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 | import json
4 | import torch
5 | import logging
6 | import sys
7 | import regex as re
8 | import numpy as np
9 | from pathlib import Path
10 | sys.path.append(str(Path(__file__).parent.parent))
11 | from vllm import LLM, SamplingParams
12 | from typing import List
13 | from string import Template
14 | from mt_metrics_eval.stats import Correlation
15 |
16 |
17 | MAX_INT = sys.maxsize
18 |
19 | IGNORE_INDEX = -100
20 | DEFAULT_PAD_TOKEN = "[PAD]"
21 | DEFAULT_EOS_TOKEN = ""
22 | DEFAULT_BOS_TOKEN = ""
23 | DEFAULT_UNK_TOKEN = ""
24 | PROMPT_DICT = {
25 | "prompt_input": (
26 | "Below is an instruction that describes a task, paired with an input that provides further context. "
27 | "Write a response that appropriately completes the request.\n\n"
28 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
29 | ),
30 | "prompt_no_input": (
31 | "Below is an instruction that describes a task. "
32 | "Write a response that appropriately completes the request.\n\n"
33 | "### Instruction:\n{instruction}\n\n### Response:"
34 | ),
35 | }
36 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task."
37 | FINETUNE_INPUT = """\
38 | Task instruction: ${generation_instruction}
39 | Source: ${input_context}
40 | Model-generated Output: ${hypothesis_output}
41 |
42 | Based on the given task instruction and source, identify errors in this model-generated output.
43 | For each error you give in the response, please also elaborate the following information:
44 | - error location (the words that are wrong in the output)
45 | - error aspect it belongs to.
46 | - explanation why it's an error, and the correction suggestions.
47 | - severity of the error ("Major" or "Minor").
48 | - reduction of score (an integer between 0.5 and 5 given the severity of the error)
49 |
50 | Your evaluation output:
51 | """
52 |
53 |
54 | def get_sum_penalties(eval_output: dict):
55 | """
56 | Args:
57 | eval_output: dict, the json output of the eval function
58 |
59 | Returns: the negative sum of all score reductions, or None if the eval output cannot be parsed.
60 | """
61 | try:
62 | penalty_score = 0
63 | for aspect in eval_output:
64 | for penalty_point in eval_output[aspect]["penalty_points"]:
65 | penalty_score += penalty_point["score_reduction"]
66 | return - penalty_score
67 | except Exception:
68 | return None
69 |
70 |
71 | def get_torch_dtype(dtype_str):
72 | """
73 | Get the torch dtype from a string
74 | """
75 | if dtype_str == "float32":
76 | return torch.float32
77 | elif dtype_str == "float16":
78 | return torch.float16
79 | elif dtype_str == "bfloat16":
80 | return torch.bfloat16
81 | elif dtype_str == "int8":
82 | return torch.int8
83 | else:
84 | raise ValueError("Invalid dtype {}".format(dtype_str))
85 |
86 |
87 | def batch_data(data_list, batch_size=1):
88 | n = len(data_list) // batch_size
89 | batch_data = []
90 | for i in range(n - 1):
91 | start = i * batch_size
92 | end = (i + 1) * batch_size
93 | batch_data.append(data_list[start:end])
94 |
95 | last_start = (n - 1) * batch_size
96 | last_end = MAX_INT
97 | batch_data.append(data_list[last_start:last_end])
98 | return batch_data
99 |
100 |
101 | class MyCorrelation(Correlation):
102 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]):
103 | # remove nan in metrics scores
104 | none_metric_scores_idxs = [idx for idx,
105 | x in enumerate(metric_scores) if x is None]
106 | logging.info("Remove {} nan scores from {} scores".format(
107 | len(none_metric_scores_idxs),
108 | len(metric_scores)
109 | ))
110 | gold_scores = gold_scores.copy()
111 | # set gold scores to None if metric scores are None
112 | for idx in none_metric_scores_idxs[::-1]:
113 | gold_scores[idx] = None
114 | super().__init__(num_sys, gold_scores, metric_scores)
115 |
116 |
117 | def main(args):
118 |
119 | if args.output_path is not None:
120 | output_file = Path(args.output_path)
121 | else:
122 | output_file = Path(args.data_path).with_suffix(
123 | '.xgptscore.output.json')
124 | if not output_file.exists() or args.overwrite:
125 | logging.info("Loading model...")
126 | sampling_params = SamplingParams(
127 | temperature=0, top_p=1, max_tokens=1024)
128 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1)
129 | logging.info("Model loaded from {}".format(args.model_name_or_path))
130 |
131 | eval_outputs = []
132 |
133 | logging.info("Load input data from {}".format(args.data_path))
134 | with open(args.data_path, "r") as f:
135 | input_data = json.load(f)
136 | formatted_data = []
137 | for item in input_data:
138 | inst = Template(FINETUNE_INST).substitute(task=args.task)
139 | refs = item['output'] if "output" in item else item["refs"]
140 | item["candidates"] = []
141 | if isinstance(refs,list):
142 | for ref in refs:
143 | item["candidates"].append(
144 | {
145 | "text":ref,
146 | "source":"unknown",
147 | "scores":{}
148 | }
149 | )
150 | else:
151 | item["candidates"].append(
152 | {
153 | "text":refs,
154 | "source":"unknown",
155 | "scores":{}
156 | }
157 | )
158 | for cand in item['candidates']:
159 | inst = Template(FINETUNE_INST).substitute(task=args.task)
160 | input_ = Template(FINETUNE_INPUT).substitute(
161 | task=args.task,
162 | generation_instruction=item['instruction'],
163 | input_context=item['input'],
164 | hypothesis_output=cand['text'],
165 | )
166 | formatted_data.append({
167 | "instruction": inst,
168 | "input": input_,
169 | })
170 | prompt_sources = [example['instruction'] + '\n' +
171 | example['input'] for example in formatted_data]
172 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources]
173 |
174 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size)
175 |
176 | for idx, batch_prompt in enumerate(batch_prompts):
177 | if isinstance(batch_prompt, list):
178 | pass
179 | else:
180 | batch_prompt = [batch_prompt]
181 |
182 | completions = llm.generate(batch_prompt, sampling_params)
183 | for output in completions:
184 | generated_text = output.outputs[0].text
185 | eval_outputs.append(generated_text)
186 |
187 | cand_idx = 0
188 | for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
189 | for cand in item['candidates']:
190 | cand['eval_output'] = eval_outputs[cand_idx]
191 | score_reductions = re.findall(
192 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_outputs[cand_idx])
193 | cand['xgptscore'] = -sum(map(float, score_reductions))
194 | cand_idx += 1
195 |
196 | with open(output_file, 'w') as f:
197 | json.dump(input_data, f, indent=4, ensure_ascii=False)
198 | logging.info("Saved eval results to {}".format(output_file))
199 | else:
200 | with open(output_file, 'r') as f:
201 | input_data = json.load(f)
202 | for ex in input_data:
203 | for cand in ex['candidates']:
204 | score_reductions = re.findall(
205 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", cand['eval_output'])
206 | cand['xgptscore'] = -sum(map(float, score_reductions))
207 | with open(output_file, 'w') as f:
208 | json.dump(input_data, f, indent=4, ensure_ascii=False)
209 | logging.info("Loaded eval results from {}".format(output_file))
210 | # Compute score statistics over all candidates
211 | xgptscores = []
212 | for item in input_data:
213 | xgptscores.extend(cand['xgptscore'] for cand in item['candidates'])
214 | print("Absolute score sum: {}".format(abs(sum(xgptscores))))
215 | print("Average score: {}".format(sum(xgptscores) / len(xgptscores)))
216 | print("Median score: {}".format(np.median(xgptscores)))
217 | print("Standard deviation: {}".format(np.std(list(map(abs, xgptscores)))))
218 |
219 |
220 |
221 | if __name__ == "__main__":
222 |
223 | logging.basicConfig(level=logging.INFO)
224 | parser = argparse.ArgumentParser()
225 | parser.add_argument("--model_name_or_path", type=str, default=None)
226 | parser.add_argument("--data_path", type=str, default=None)
227 | parser.add_argument("--output_path", type=str, default=None)
228 | parser.add_argument("--overwrite", action="store_true")
229 | parser.add_argument("--task", type=str, default="summarization")
230 | parser.add_argument("--batch_size", type=int, default=1)
231 | parser.add_argument("--human_score_names", type=str, default="score")
232 | args = parser.parse_args()
233 | main(args)
234 |
--------------------------------------------------------------------------------
/tigerscore/finetune/test_llama_vllm_vanilla.py:
--------------------------------------------------------------------------------
1 |
2 | import argparse
3 | import json
4 | import torch
5 | import logging
6 | import sys
7 | import regex as re
8 | from pathlib import Path
9 | sys.path.append(str(Path(__file__).parent.parent))
10 | from vllm import LLM, SamplingParams
11 | from typing import List
12 | from string import Template
13 | from mt_metrics_eval.stats import Correlation
14 |
15 |
16 | MAX_INT = sys.maxsize
17 |
18 | IGNORE_INDEX = -100
19 | DEFAULT_PAD_TOKEN = "[PAD]"
20 | DEFAULT_EOS_TOKEN = "</s>"
21 | DEFAULT_BOS_TOKEN = "<s>"
22 | DEFAULT_UNK_TOKEN = "<unk>"
23 | PROMPT_DICT = {
24 | "prompt_input": (
25 | "Below is an instruction that describes a task, paired with an input that provides further context. "
26 | "Write a response that appropriately completes the request.\n\n"
27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
28 | ),
29 | "prompt_no_input": (
30 | "Below is an instruction that describes a task. "
31 | "Write a response that appropriately completes the request.\n\n"
32 | "### Instruction:\n{instruction}\n\n### Response:"
33 | ),
34 | }
35 | # FINETUNE_INST = """"""
36 | # FINETUNE_INPUT = """\
37 | # ${generation_instruction}
38 | # ${input_context}
39 |
40 | # Model-generated Output:
41 | # ${hypothesis_output}
42 |
43 |
44 | # You should rate Model-generated Output on a scale from 0.5 (worst) to 10 (best).
45 | # Rating: \
46 | # """
47 | FINETUNE_INST = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n"""
48 | FINETUNE_INPUT = """\
49 | ${generation_instruction}
50 | ${input_context}
51 |
52 | Model-generated Output:
53 | ${hypothesis_output}
54 |
55 |
56 | You should rate Model-generated Output on a scale from 0.5 (worst) to 10 (best). [/INST] Rating: \
57 | """
58 | # FINETUNE_INPUT = """\
59 | # USER:You are evaluating errors in a model-generated output for a(an) ${task} task.
60 | # Task instruction: ${generation_instruction}
61 | # Source: ${input_context}
62 | # Model-generated Output: ${hypothesis_output}
63 |
64 | # Based on the given task instruction and source, identify errors in this model-generated output.
65 | # For each error you give in the response, please also elaborate the following information:
66 | # - error location (the words that are wrong in the output)
67 | # - error aspect it belongs to.
68 | # - explanation why it's an error, and the correction suggestions.
69 | # - severity of the error ("Major" or "Minor").
70 | # - reduction of score (between 0.5 and 5 given the severity of the error)
71 |
72 | # Please give a summary of the errors you found in the output, and the total score reduction.
73 | # The model-generated output contains {num_errors} errors, with a total score reduction of {total_score_reduction}.
74 |
75 | # Your evaluation output: ASSISTANT:\
76 | # """
77 | def find_first_float(s):
78 | match = re.search(r"[-+]?\d*\.\d+|\d+", s)
79 | return float(match.group()) if match else None
80 |
81 | def get_sum_penalties(eval_output: dict):
82 | """
83 | Args:
84 | eval_output: dict, the json output of the eval function
85 |
86 | Returns: the negative sum of all score reductions, or None if the eval output cannot be parsed.
87 | """
88 | try:
89 | penalty_score = 0
90 | for aspect in eval_output:
91 | for penalty_point in eval_output[aspect]["penalty_points"]:
92 | penalty_score += penalty_point["score_reduction"]
93 | return - penalty_score
94 | except Exception:
95 | return None
96 |
97 |
98 | def get_torch_dtype(dtype_str):
99 | """
100 | Get the torch dtype from a string
101 | """
102 | if dtype_str == "float32":
103 | return torch.float32
104 | elif dtype_str == "float16":
105 | return torch.float16
106 | elif dtype_str == "bfloat16":
107 | return torch.bfloat16
108 | elif dtype_str == "int8":
109 | return torch.int8
110 | else:
111 | raise ValueError("Invalid dtype {}".format(dtype_str))
112 |
113 |
114 | def batch_data(data_list, batch_size=1):
115 | n = len(data_list) // batch_size
116 | batch_data = []
117 | for i in range(n - 1):
118 | start = i * batch_size
119 | end = (i + 1) * batch_size
120 | batch_data.append(data_list[start:end])
121 |
122 | last_start = (n - 1) * batch_size
123 | last_end = MAX_INT
124 | batch_data.append(data_list[last_start:last_end])
125 | return batch_data
126 |
127 |
128 | class MyCorrelation(Correlation):
129 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]):
130 | # remove nan in metrics scores
131 | none_metric_scores_idxs = [idx for idx,
132 | x in enumerate(metric_scores) if x is None]
133 | logging.info("Remove {} nan scores from {} scores".format(
134 | len(none_metric_scores_idxs),
135 | len(metric_scores)
136 | ))
137 | gold_scores = gold_scores.copy()
138 | # set gold scores to None if metric scores are None
139 | for idx in none_metric_scores_idxs[::-1]:
140 | gold_scores[idx] = None
141 | super().__init__(num_sys, gold_scores, metric_scores)
142 |
143 |
144 | def main(args):
145 |
146 | if args.output_path is not None:
147 | output_file = Path(args.output_path)
148 | else:
149 | output_file = Path(args.data_path).with_suffix(
150 | '.xgptscore.output.json')
151 | if not output_file.exists() or args.overwrite:
152 | logging.info("Loading model...")
153 | sampling_params = SamplingParams(
154 | temperature=0, top_p=1, max_tokens=1024)
155 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1)
156 | logging.info("Model loaded from {}".format(args.model_name_or_path))
157 |
158 | eval_outputs = []
159 |
160 | logging.info("Load input data from {}".format(args.data_path))
161 | with open(args.data_path, "r") as f:
162 | input_data = json.load(f)
163 | formatted_data = []
164 | for item in input_data:
165 | for cand in item['candidates']:
166 | inst = Template(FINETUNE_INST).substitute(task=args.task)
167 | input_ = Template(FINETUNE_INPUT).substitute(
168 | task=args.task,
169 | generation_instruction=item['instruction'],
170 | input_context=item['input'],
171 | hypothesis_output=cand['text'],
172 | )
173 | formatted_data.append({
174 | "instruction": inst,
175 | "input": input_,
176 | })
177 | prompt_sources = [example['instruction'] + '\n' +
178 | example['input'] for example in formatted_data]
179 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources]
180 |
181 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size)
182 |
183 | for idx, batch_prompt in enumerate(batch_prompts):
184 | if isinstance(batch_prompt, list):
185 | pass
186 | else:
187 | batch_prompt = [batch_prompt]
188 |
189 | completions = llm.generate(batch_prompt, sampling_params)
190 | for output in completions:
191 | generated_text = output.outputs[0].text
192 | eval_outputs.append(generated_text)
193 |
194 | cand_idx = 0
195 | for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
196 | for cand in item['candidates']:
197 | cand['eval_output'] = eval_outputs[cand_idx]
198 | score_reduction = find_first_float(eval_outputs[cand_idx])
199 | if score_reduction is not None:
200 | cand['vanilla_xgptscore'] = -float(score_reduction)
201 | else:
202 | cand['vanilla_xgptscore'] = None
203 | cand_idx += 1
204 |
205 | with open(output_file, 'w') as f:
206 | json.dump(input_data, f, indent=4, ensure_ascii=False)
207 | logging.info("Saved eval results to {}".format(output_file))
208 | else:
209 | with open(output_file, 'r') as f:
210 | input_data = json.load(f)
211 | for ex in input_data:
212 | for cand in ex['candidates']:
213 | score_reduction = find_first_float(cand["eval_output"])
214 | if score_reduction is not None:
215 | cand['vanilla_xgptscore'] = -float(score_reduction)
216 | else:
217 | cand['vanilla_xgptscore'] = None
218 | with open(output_file, 'w') as f:
219 | json.dump(input_data, f, indent=4, ensure_ascii=False)
220 | logging.info("Loaded eval results from {}".format(output_file))
221 | # Compute correlation
222 | human_score_names = args.human_score_names.split(',')
223 |
224 | for h_name in human_score_names:
225 | human_scores = []
226 | xgptscores = []
227 | for item in input_data:
228 | for cand in item['candidates']:
229 | for s_name, score in cand['scores'].items():
230 | if s_name == h_name:
231 | xgptscores.append(cand['vanilla_xgptscore'])
232 | human_scores.append(score)
233 | break
234 | corr = MyCorrelation(1, human_scores, xgptscores)
235 | print("Human score: {}".format(h_name))
236 | print("Pearson correlation: {}".format(corr.Pearson()))
237 | print("Spearman correlation: {}".format(corr.Spearman()))
238 | print("Kendall correlation: {}".format(corr.Kendall()))
239 |
240 |
241 | if __name__ == "__main__":
242 |
243 | logging.basicConfig(level=logging.INFO)
244 | parser = argparse.ArgumentParser()
245 | parser.add_argument("--model_name_or_path", type=str, default=None)
246 | parser.add_argument("--data_path", type=str, default=None)
247 | parser.add_argument("--output_path", type=str, default=None)
248 | parser.add_argument("--overwrite", action="store_true")
249 | parser.add_argument("--task", type=str, default="summarization")
250 | parser.add_argument("--batch_size", type=int, default=1)
251 | parser.add_argument("--human_score_names", type=str, default="score")
252 | args = parser.parse_args()
253 | main(args)
254 |
--------------------------------------------------------------------------------
/tigerscore/finetune/trainer.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import os
3 | from transformers.trainer import *
4 | from peft import PeftModel
5 |
6 |
7 | class CustomLoraTrainer(Trainer):
8 | def _save(self, output_dir: Optional[str] = None, state_dict=None):
9 | # If we are executing this function, we are the process zero, so we don't check for that.
10 | output_dir = output_dir if output_dir is not None else self.args.output_dir
11 | os.makedirs(output_dir, exist_ok=True)
12 | logger.info(f"Saving model checkpoint to {output_dir}")
13 | # Save a trained model and configuration using `save_pretrained()`.
14 | # They can then be reloaded using `from_pretrained()`
15 | if not isinstance(self.model, PreTrainedModel) and not isinstance(self.model, PeftModel):
16 | if state_dict is None:
17 | state_dict = self.model.state_dict()
18 |
19 | if isinstance(unwrap_model(self.model), PreTrainedModel):
20 | unwrap_model(self.model).save_pretrained(
21 | output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
22 | )
23 | else:
24 | logger.info(
25 | "Trainer.model is not a `PreTrainedModel`, only saving its state dict.")
26 | if self.args.save_safetensors:
27 | safetensors.torch.save_file(
28 | state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME))
29 | else:
30 | torch.save(state_dict, os.path.join(
31 | output_dir, WEIGHTS_NAME))
32 | else:
33 | print("Saving LoRA model...")
34 | self.model.save_pretrained(
35 | output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors
36 | )
37 |
38 | if self.tokenizer is not None:
39 | self.tokenizer.save_pretrained(output_dir)
40 |
41 | # Good practice: save your training arguments together with the trained model
42 | torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
43 |
--------------------------------------------------------------------------------
/tigerscore/finetune/utils.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | import logging
3 | import math
4 | import os
5 | import io
6 | import sys
7 | import time
8 | import json
9 | from typing import Optional, Sequence, Union, Dict
10 |
11 | import openai
12 | import tqdm
13 | from openai import openai_object
14 | import copy
15 |
16 | StrOrOpenAIObject = Union[str, openai_object.OpenAIObject]
17 |
18 | openai_org = os.getenv("OPENAI_ORG")
19 | if openai_org is not None:
20 | openai.organization = openai_org
21 | logging.warning(
22 | f"Switching to organization: {openai_org} for OAI API key.")
23 |
24 |
25 | @dataclasses.dataclass
26 | class OpenAIDecodingArguments(object):
27 | max_tokens: int = 1800
28 | temperature: float = 0.2
29 | top_p: float = 1.0
30 | n: int = 1
31 | stream: bool = False
32 | stop: Optional[Sequence[str]] = None
33 | presence_penalty: float = 0.0
34 | frequency_penalty: float = 0.0
35 | suffix: Optional[str] = None
36 | logprobs: Optional[int] = None
37 | echo: bool = False
38 |
39 |
40 | def openai_completion(
41 | prompts: Union[str, Sequence[str], Sequence[Dict[str, str]], Dict[str, str]],
42 | decoding_args: OpenAIDecodingArguments,
43 | model_name="text-davinci-003",
44 | sleep_time=2,
45 | batch_size=1,
46 | max_instances=sys.maxsize,
47 | max_batches=sys.maxsize,
48 | return_text=False,
49 | **decoding_kwargs,
50 | ) -> Union[StrOrOpenAIObject, Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]]]:
51 | """Decode with OpenAI API.
52 |
53 | Args:
54 | prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted
55 | as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. If it is a chat model
56 | it can also be a dictionary (or list thereof) as explained here:
57 | https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb
58 | decoding_args: Decoding arguments.
59 | model_name: Model name. Can be either in the format of "org/model" or just "model".
60 | sleep_time: Time to sleep once the rate-limit is hit.
61 | batch_size: Number of prompts to send in a single request. Only for non chat model.
62 | max_instances: Maximum number of prompts to decode.
63 | max_batches: Maximum number of batches to decode. This argument will be deprecated in the future.
64 | return_text: If True, return text instead of full completion object (which contains things like logprob).
65 | decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them.
66 |
67 | Returns:
68 | A completion or a list of completions.
69 | Depending on return_text and decoding_args.n, the completion type can be one of
70 | - a string (if return_text is True)
71 | - an openai_object.OpenAIObject object (if return_text is False)
72 | - a list of objects of the above types (if decoding_args.n > 1)
73 | """
74 | is_single_prompt = isinstance(prompts, (str, dict))
75 | if is_single_prompt:
76 | prompts = [prompts]
77 |
78 | if max_batches < sys.maxsize:
79 | logging.warning(
80 | "`max_batches` will be deprecated in the future, please use `max_instances` instead."
81 | "Setting `max_instances` to `max_batches * batch_size` for now."
82 | )
83 | max_instances = max_batches * batch_size
84 |
85 | prompts = prompts[:max_instances]
86 | num_prompts = len(prompts)
87 | prompt_batches = [
88 | prompts[batch_id * batch_size: (batch_id + 1) * batch_size]
89 | for batch_id in range(int(math.ceil(num_prompts / batch_size)))
90 | ]
91 |
92 | completions = []
93 | for batch_id, prompt_batch in tqdm.tqdm(
94 | enumerate(prompt_batches),
95 | desc="prompt_batches",
96 | total=len(prompt_batches),
97 | ):
98 | batch_decoding_args = copy.deepcopy(
99 | decoding_args) # cloning the decoding_args
100 |
101 | while True:
102 | try:
103 | shared_kwargs = dict(
104 | model=model_name,
105 | **batch_decoding_args.__dict__,
106 | **decoding_kwargs,
107 | )
108 | completion_batch = openai.Completion.create(
109 | prompt=prompt_batch, **shared_kwargs)
110 | choices = completion_batch.choices
111 |
112 | for choice in choices:
113 | choice["total_tokens"] = completion_batch.usage.total_tokens
114 | completions.extend(choices)
115 | break
116 | except openai.error.OpenAIError as e:
117 | logging.warning(f"OpenAIError: {e}.")
118 | if "Please reduce your prompt" in str(e):
119 | batch_decoding_args.max_tokens = int(
120 | batch_decoding_args.max_tokens * 0.8)
121 | logging.warning(
122 | f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...")
123 | else:
124 | logging.warning("Hit request rate limit; retrying...")
125 | time.sleep(sleep_time) # Annoying rate limit on requests.
126 |
127 | if return_text:
128 | completions = [completion.text for completion in completions]
129 | if decoding_args.n > 1:
130 | # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries.
131 | completions = [completions[i: i + decoding_args.n]
132 | for i in range(0, len(completions), decoding_args.n)]
133 | if is_single_prompt:
134 | # Return non-tuple if only 1 input and 1 generation.
135 | (completions,) = completions
136 | return completions
137 |
138 |
139 | def _make_w_io_base(f, mode: str):
140 | if not isinstance(f, io.IOBase):
141 | f_dirname = os.path.dirname(f)
142 | if f_dirname != "":
143 | os.makedirs(f_dirname, exist_ok=True)
144 | f = open(f, mode=mode)
145 | return f
146 |
147 |
148 | def _make_r_io_base(f, mode: str):
149 | if not isinstance(f, io.IOBase):
150 | f = open(f, mode=mode)
151 | return f
152 |
153 |
154 | def jdump(obj, f, mode="w", indent=4, default=str):
155 | """Dump a str or dictionary to a file in json format.
156 |
157 | Args:
158 | obj: An object to be written.
159 | f: A string path to the location on disk.
160 | mode: Mode for opening the file.
161 | indent: Indent for storing json dictionaries.
162 | default: A function to handle non-serializable entries; defaults to `str`.
163 | """
164 | f = _make_w_io_base(f, mode)
165 | if isinstance(obj, (dict, list)):
166 | json.dump(obj, f, indent=indent, default=default)
167 | elif isinstance(obj, str):
168 | f.write(obj)
169 | else:
170 | raise ValueError(f"Unexpected type: {type(obj)}")
171 | f.close()
172 |
173 |
174 | def jload(f, mode="r"):
175 | """Load a .json file into a dictionary."""
176 | f = _make_r_io_base(f, mode)
177 | jdict = json.load(f)
178 | f.close()
179 | return jdict
180 |
--------------------------------------------------------------------------------
/tigerscore/get_error_types/get_error_types.py:
--------------------------------------------------------------------------------
1 | # Example usage
2 | """
3 | This file isn't used in final version.
4 | """
5 | import os
6 | import sys
7 | import fire
8 | import json
9 | from pathlib import Path
10 | os.environ["OPENAI_API_KEY"] = ""
11 | os.environ["OPENAI_API_BASE"] = ""
12 | os.environ["OPENAI_API_TYPE"] = "azure"
13 | os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview"
14 | sys.path.append("../")
15 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt
16 | from xgptscore.constants import EVAL_ASPECTS
17 | from string import Template
18 |
19 | TEMPLATE = """\
20 | You are evaluating an ${task} task. Some errors in an incorrect output could be attributed to the following aspects:
21 | ${aspects_descriptions}
22 |
23 | Please elaborate 10 specific error types for each aspect above. Each error type should represent a specific error that falls under the aspect. Error types should be mutually exclusive and collectively exhaustive.\
24 | """
25 |
26 |
27 | def main(
28 | task: str,
29 | ):
30 |
31 | task_aspects = EVAL_ASPECTS[task]
32 | prompt = Template(TEMPLATE).substitute(
33 | task=task,
34 | aspects_descriptions="\n".join([f"- {aspect}: {description}" for aspect, description in task_aspects.items()])
35 | )
36 | prompts = [prompt]
37 | chatmls = [[{"role": "system",
38 | "content": " You are an AI assistant that helps people find information."},
39 | {"role": "user",
40 | "content": prompt}] for prompt in prompts[:1]]
41 |
42 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls]
43 | results = openai_completions(chatml_prompts, model_name="gpt-4")
44 | output_file = Path("./error_types/" + task + ".txt")
45 | output_file.parent.mkdir(parents=True, exist_ok=True)
46 | results['prompts'] = prompts
47 | with open(output_file, "w") as f:
48 | json.dump(results, f, indent=4, ensure_ascii=False)
49 |
50 |
51 | if __name__ == "__main__":
52 | fire.Fire(main)
--------------------------------------------------------------------------------
/tigerscore/scorer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/tigerscore/scorer/__init__.py
--------------------------------------------------------------------------------
/tigerscore/xgptscore/README.md:
--------------------------------------------------------------------------------
1 | ## XGPTScore Overview
2 | This folder contains all the templates that we used to query ChatGPT or GPT-4 to identify errors in the hypothesis output for the different tasks that TIGERScore covers. We call this API-based query method XGPTScore: an e**X**plainable **Scoring** method that queries **GPT** models.
3 |
4 | The overall pipeline of XGPTScore is:
5 |
6 | 1. We define a query template that asks GPT models to identify errors in the hypothesis output based on the task instruction, source text and reference text.
7 | 2. We manually construct the evaluation aspects to focus on for each task, as shown in [./constants.py](./constants.py) (see the illustrative sketch after this list).
8 | 3. Then, by applying the templates and specifying the aspects to focus on, GPT models are asked to return the identified errors in a predefined format (e.g., JSON).
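
For reference, the aspect definitions in `constants.py` are organized as a task-to-aspects mapping, roughly shaped like the sketch below (the aspect names and descriptions here are illustrative, not the exact entries in the file):

```python
# Hypothetical excerpt -- the real aspect names and descriptions live in ./constants.py
EVAL_ASPECTS = {
    "translation": {
        "Accuracy": "Does the output preserve the meaning of the source text?",
        "Fluency": "Is the output grammatical and natural in the target language?",
    },
    "summarization": {
        "Consistency": "Is the summary factually consistent with the source?",
        "Relevance": "Does the summary keep the salient information of the source?",
    },
}
```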
9 |
10 | GPT models sometimes produce noticeably lower-quality output when required to respond in a specific format. To mitigate the effect of the predefined format on response quality, we conduct a two-round evaluation. In the first round we focus on the evaluation itself, letting the GPT models produce free-form evaluation results for the hypothesis output. In the second round we ask them to reformat that free-form response into the predefined format and fill in the elaborated information, which is an easier task for the models (the message flow is sketched below).
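
To make the two rounds concrete, the recorded conversation for one candidate roughly takes the shape below (a sketch only: the actual prompt wording is built from the templates in [./templates.py](./templates.py), and the message texts here are paraphrased):

```python
# Illustrative two-round conversation record; the real prompts come from templates.py
messages_record = [
    # Round 1: free-form error analysis
    {"role": "user", "content": "Here are the task instruction, source, reference and "
                                "model-generated output. Identify and explain the errors you find."},
    {"role": "assistant", "content": "Free-form analysis of the errors in the hypothesis output ..."},
    # Round 2: reformat the analysis into the predefined structure
    {"role": "user", "content": "Now organize your analysis into the required JSON format, giving each "
                                "error's location, aspect, explanation, severity and score reduction."},
    {"role": "assistant", "content": '{"errors": {"error_1": {"location": "...", "score_reduction": 1.0}}}'},
]
```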
11 |
12 | ## Quick start
13 |
14 | We provide a single function `xgptscore()` as the interface, which takes the `xgptitems`, the template mode and the OpenAI model name as input and starts the query.
15 |
16 | Example Usage:
17 | ```python
18 | task = "translation"
19 | with open("example.json", "r") as f:
20 | items = json.load(f)
21 | xgptitems = []
22 | for item in items:
23 | for cand in item['candidates']:
24 | xgptitems.append(XPGTItem(
25 | task=task,
26 | instruction=item['instruction'],
27 | input=item['input'],
28 | ref_output=item['output'],
29 | hypo_output=cand['text']
30 | ))
31 | result = xgptscore(xgptitems, "ea", "ChatGPT")
32 | idx = 0
33 | for item in items:
34 | for cand in item['candidates']:
35 | cand['responses'] = result['round_completions'][idx]
36 | cand['messages_records'] = result['messages_records'][idx]
37 | json.dump(items, open("example_result.json", "w"), indent=4, ensure_ascii=False)
38 | ```
39 |
40 | Please check out the input file `example.json` and the result file `example_result.json` to better understand how it actually works.
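
Once the final-round response has been parsed into the JSON error structure, a scalar score can be derived from it, for example with `get_xgptscore_from_json` from [./process_utils.py](./process_utils.py). A minimal sketch, assuming the `tigerscore` directory is on your `PYTHONPATH` and that the last entry of `cand['responses']` is the postprocessed JSON dict of errors:

```python
from xgptscore.process_utils import get_xgptscore_from_json

for item in items:
    for cand in item['candidates']:
        # Negative sum of the score reductions over all identified errors;
        # returns None if the response could not be parsed into the expected JSON.
        cand['xgptscore'] = get_xgptscore_from_json(cand['responses'][-1])
```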
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/align_score.json:
--------------------------------------------------------------------------------
1 | {
2 | "mode": "align_score",
3 | "decoding": {
4 | "max_tokens": 3600,
5 | "temperature": 0.0,
6 | "top_p": 1.0,
7 | "timeout": 60,
8 | "request_timeout": 60
9 | },
10 | "max_lengths": {
11 | "inst": null,
12 | "input": 600,
13 | "hypo_output": 400,
14 | "ref_output": 400
15 | }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/default.json:
--------------------------------------------------------------------------------
1 | {
2 | "mode": "default",
3 | "decoding": {
4 | "max_tokens": 3600,
5 | "temperature": 0.0,
6 | "top_p": 1.0,
7 | "timeout": 60
8 | },
9 | "max_lengths": {
10 | "inst": null,
11 | "input": 512,
12 | "hypo_output": 400,
13 | "ref_output": 400
14 | }
15 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/kb_txt.json:
--------------------------------------------------------------------------------
1 | {
2 | "mode": "default",
3 | "decoding": {
4 | "max_tokens": 3600,
5 | "temperature": 0.0,
6 | "top_p": 1.0,
7 | "timeout": 120,
8 | "request_timeout": 120
9 | },
10 | "max_lengths": {
11 | "inst": null,
12 | "input": 512,
13 | "hypo_output": 400,
14 | "ref_output": 400
15 | }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/wmt_mqm.json:
--------------------------------------------------------------------------------
1 | {
2 | "mode": "wmt_mqm",
3 | "decoding": {
4 | "max_tokens": 3600,
5 | "temperature": 0.0,
6 | "top_p": 1.0,
7 | "timeout": 60,
8 | "request_timeout": 60
9 | },
10 | "max_lengths": {
11 | "inst": null,
12 | "input": 400,
13 | "hypo_output": 400,
14 | "ref_output": 400
15 | }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/openai_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | if os.environ.get('OPENAI_API_TYPE', None) == 'azure':
3 | # pip install openai<=0.28.1, fire, numpy, tiktoken
4 | from .openai_utils_azure import (
5 | openai_completions,
6 | _prompt_to_chatml,
7 | _chatml_to_prompt,
8 | )
9 | import openai
10 | assert openai.VERSION <= "0.28.1", "The Azure utilities require openai-python 0.28.1 or earlier."
11 | elif os.environ.get('OPENAI_UTILS_TYPE', None) == 'curl':
12 | # pip install openai>=1.0.0, fire, numpy, tiktoken
13 | from .openai_utils_curl import (
14 | openai_completions,
15 | _prompt_to_chatml,
16 | _chatml_to_prompt,
17 | )
18 | import openai
19 | assert openai.VERSION >= "1.0.0", "OpenAI API is only supported in openai-python 1.0.0 or later."
20 | else:
21 | # pip install openai>=1.0.0, fire, numpy, tiktoken
22 | from .openai_utils_openAI import (
23 | openai_completions,
24 | _prompt_to_chatml,
25 | _chatml_to_prompt,
26 | )
27 | import openai
28 | assert openai.VERSION >= "1.0.0", "OpenAI API is only supported in openai-python 1.0.0 or later."
29 |
--------------------------------------------------------------------------------
/tigerscore/xgptscore/process_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import json5
4 | import logging
5 | from dataclasses import dataclass
6 | from transformers import AutoTokenizer
7 | from tqdm import tqdm
8 | from typing import List, Union
9 | from itertools import chain
10 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
11 |
12 |
13 | @dataclass
14 | class XPGTItem():
15 | task: str
16 | instruction: str
17 | input: str
18 | ref_output: Union[str, List[str]]
19 | hypo_output: str
20 |
21 | # Message map functions
22 |
23 |
24 | def default_msg_map(cur_message: dict, messages: List[dict]):
25 | """ Map the text and old messages to the new messages for query
26 | Args:
27 | text (str): the prompt text
28 | messages (List[dict]): the messages list before this query
29 | Returns:
30 | prompt (str): the prompt text
31 | """
32 | new_messages = messages + [{
33 | "role": cur_message['role'],
34 | "content": cur_message['content']}
35 | ]
36 | return new_messages
37 |
38 | # Postprocess functions
39 |
40 |
41 | def default_postprocess(content: str):
42 | return content
43 |
44 |
45 | def json_postprocess(content: str):
46 | try:
47 | # find the json content
48 | json_content = content[content.find("{"):content.rfind("}") + 1]
49 | json_content = json.loads(json_content)
50 | return json_content
51 | except json.decoder.JSONDecodeError:
52 | try:
53 | json_content = json5.loads(json_content)
54 | return json_content
55 | except Exception:
56 | return content
57 |
58 |
59 | tokenizer = None
60 |
61 |
62 | def truncate_texts(texts: Union[List[str], List[List[str]]], max_length: int = None):
63 | """
64 | Truncate the texts to the max length.
65 | Args:
66 | texts (List[str] or List[List[str]]): The list of texts.
67 | max_length (int): The max length.
68 | Returns:
69 | List[str]: The truncated texts.
70 | """
71 | if max_length is None:
72 | return texts
73 | if isinstance(texts[0], list) and \
74 | (
75 | all([len(x) == 0 for x in texts]) or
76 | all([x is None for x in list(chain(*texts))])
77 | ) or isinstance(texts[0], str) and \
78 | all([x is None for x in list(chain(texts))]):
79 | logging.warning("All texts are None, skip truncating")
80 | return texts
81 | # using llama tokenizer by default
82 | global tokenizer
83 | disable_tqdm = len(texts) < 1000
84 | logging.warning(f"Truncating texts to max length {max_length}")
85 | if tokenizer is None:
86 | tokenizer = AutoTokenizer.from_pretrained(
87 | "meta-llama/Llama-2-7b-hf", use_auth_token=True)
88 | # ...
89 | token_ids = []
90 | for text in tqdm(texts, desc="Truncating texts (tokenizing)", disable=disable_tqdm):
91 | if isinstance(text, list):
92 | token_ids.append(
93 | [tokenizer.encode(x, add_special_tokens=False) for x in text])
94 | else:
95 | token_ids.append(tokenizer.encode(text, add_special_tokens=False))
96 | # ...
97 | truncated_texts = []
98 | for i, _token_ids in tqdm(enumerate(token_ids), desc="Truncating texts (truncating)", disable=disable_tqdm):
99 | if (len(_token_ids)) and isinstance(_token_ids[0], list):
100 | truncated_texts.append([])
101 | for _token_id in _token_ids:
102 | if len(_token_id) > max_length:
103 | truncated_text = tokenizer.decode(
104 | _token_id[:max_length], skip_special_tokens=True)
105 | truncated_text = truncated_text + " ..."
106 | else:
107 | truncated_text = tokenizer.decode(
108 | _token_id, skip_special_tokens=True)
109 | truncated_texts[i].append(truncated_text)
110 | else:
111 | if len(_token_ids) > max_length:
112 | truncated_text = tokenizer.decode(
113 | _token_ids[:max_length], skip_special_tokens=True)
114 | truncated_text = truncated_text + " ..."
115 | else:
116 | truncated_text = tokenizer.decode(
117 | _token_ids, skip_special_tokens=True)
118 |
119 | truncated_texts.append(truncated_text)
120 | return truncated_texts
121 |
122 |
123 | def truncate_items(items: List[XPGTItem], max_lengths):
124 | """
125 | Truncate the texts in the items to the max length.
126 | Args:
127 | items (List[XPGTItem]): The list of items.
128 | max_length (int): The max length.
129 | Returns:
130 | List[XPGTItem]: The truncated items.
131 | """
132 | truncated_inputs = truncate_texts(
133 | [item.input for item in items], max_lengths.get("input", None))
134 | truncated_insts = truncate_texts(
135 | [item.instruction for item in items], max_lengths.get("instruction", None))
136 | truncated_ref_outputs = truncate_texts(
137 | [item.ref_output for item in items], max_lengths.get("ref_output", None))
138 | truncated_hypo_outputs = truncate_texts(
139 | [item.hypo_output for item in items], max_lengths.get("hypo_output", None))
140 | for i, item in enumerate(items):
141 | item.instruction = truncated_insts[i]
142 | item.input = truncated_inputs[i]
143 | item.ref_output = truncated_ref_outputs[i]
144 | item.hypo_output = truncated_hypo_outputs[i]
145 | return items
146 |
147 |
148 | def get_query_messages(messages: List[dict], queried_messages: List[dict]):
149 | """
150 | Args:
151 | messages (List[dict]): the messages list to add for query
152 | queried_messages (List[dict]): the messages list already queried, which contains the query responses also,
153 | Returns:
154 | new_messages (List[dict]): the new messages list to query
155 | postprocess (function): the postprocess function for the query response
156 | """
157 | if len(queried_messages) == 0:
158 | last_prompt_idx = -1
159 | else:
160 | assert len(
161 | queried_messages) >= 2, "queried_messages should have at least 2 messages, i.e., the user (system) and the response"
162 | last_prompt = queried_messages[-2]['content']
163 | prompt_texts = [x['content'] for x in messages]
164 | last_prompt_idx = prompt_texts.index(last_prompt)
165 | if last_prompt_idx == len(messages) - 1:
166 | return None
167 | new_messages = queried_messages.copy()
168 | for idx in range(last_prompt_idx + 1, len(messages)):
169 | new_messages = messages[idx]["map_func"](messages[idx], new_messages)
170 | if messages[idx]["do_query"]:
171 | break
172 | return new_messages, messages[idx]["postprocess"]
173 |
174 |
175 | def get_xgptscore_from_json(json_content: dict):
176 | """
177 | Args:
178 | json_content (dict): the json content
179 | Returns:
180 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors
181 | """
182 | if isinstance(json_content, str):
183 | return None
184 | try:
185 | xgptscore = 0
186 | for error in json_content['errors'].values():
187 | if error['score_reduction'] == "N/A":
188 | continue
189 | xgptscore -= error['score_reduction']
190 | return xgptscore
191 | except Exception:
192 | return None
193 |
194 |
195 | def get_xgptscore_from_json_star(json_content: dict):
196 | """
197 | Args:
198 | json_content (dict): the json content
199 | Returns:
200 | res (dict): the per-aspect scores ("xgptscore_<aspect>") plus their sum under "xgptscore"
201 | """
202 | xgptscore = 0
203 | res = {}
204 | for aspect_key, aspect in json_content.items():
205 | if isinstance(aspect, dict):
206 | score = aspect['Score']
207 | try:
208 | score = float(score)
209 | except Exception:
210 | score = 0
211 | xgptscore += score
212 | res["xgptscore_" + aspect_key] = score
213 | res["xgptscore"] = xgptscore
214 | return res
215 |
216 |
217 | def get_xgptscore_from_json_per_aspect(json_content: dict):
218 | """
219 | Args:
220 | json_content (dict): the json content
221 | Returns:
222 | res (dict): the per-aspect score reductions ("xgptscore_<aspect>") and the total "xgptscore", or None if json_content is not a dict
223 | """
224 | if not isinstance(json_content, dict):
225 | return None
226 | xgptscore = 0
227 | res = {}
228 | for error in json_content['errors'].values():
229 | if error['error_aspect'] is not None:
230 | if ("xgptscore_" + error['error_aspect'] not in res):
231 | res["xgptscore_" + error['error_aspect']] = 0
232 | res["xgptscore_" + error['error_aspect']] -= error['score_reduction']
233 | xgptscore -= error['score_reduction']
234 | res["xgptscore"] = xgptscore
235 | return res
236 |
--------------------------------------------------------------------------------
/tigerscore/xgptscore/xgptscore.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import logging
4 | from .process import MODE_PROCESS_MAP
5 | from .process_utils import XPGTItem, truncate_items, get_query_messages
6 | from .openai_utils import openai_completions, _chatml_to_prompt
7 | from typing import List, Union
8 | from dacite import from_dict
9 | from pathlib import Path
10 | from functools import partial
11 |
12 |
13 | def xgptscore(
14 | items: List[Union[XPGTItem, dict]],
15 | mode: str,
16 | model_name: str,
17 | num_workers: int = None,
18 | batch_size: int = None,
19 | **kwargs,
20 | ):
21 | config_path = os.path.join(os.path.dirname(
22 | __file__), f"mode_configs/{mode}.json")
23 | config_path = Path(config_path)
24 | if not config_path.exists():
25 | logging.warning(
26 | f"Config file {config_path} does not exist. Use default config.")
27 | config_path = config_path.with_name("default.json")
28 |
29 | with open(config_path, "r") as f:
30 | config = json.load(f)
31 | config.update(kwargs)
32 | if "max_lengths" in config:
33 | items = truncate_items(items, config["max_lengths"])
34 |
35 | if isinstance(items[0], dict):
36 | items = [from_dict(data_class=XPGTItem, data=item) for item in items]
37 | process_func = MODE_PROCESS_MAP[mode]
38 | if "process_kwargs" in config:
39 | process_func = partial(process_func, **config["process_kwargs"])
40 | process_results = list(map(process_func, items))
41 |
42 | total_round = len([x for x in process_results[0] if x['do_query']])
43 | logging.warning(f"Total chat rounds: {total_round}")
44 | logging.warning(f"Total chat messages: {len(items)}")
45 | # query and process
46 | round = 0
47 | queried_messages = [[] for _ in range(len(items))]
48 | total_price = 0
49 | total_time = 0
50 | round_completions = []
51 | while True:
52 | round += 1
53 | logging.warning(f"Processing chat round {round}/{total_round}")
54 | query_messages = list(
55 | map(get_query_messages, process_results, queried_messages))
56 | query_messages, postprocess_funcs = list(zip(*query_messages))
57 | chatml_prompts = list(map(_chatml_to_prompt, query_messages))
58 | openai_results = openai_completions(
59 | chatml_prompts,
60 | model_name=model_name,
61 | num_procs=num_workers,
62 | batch_size=batch_size,
63 | **config['decoding'],
64 | )
65 | completions = openai_results['completions']
66 | total_price += sum(openai_results['price_per_example'])
67 | total_time += sum(openai_results['time_per_example'])
68 | logging.warning(f"Round {round} price: {total_price}$")
69 | logging.warning(f"Round {round} time: {total_time}")
70 | postprocess_completions = [postprocess_funcs[idx](
71 | completion) for idx, completion in enumerate(completions)]
72 | round_completions.append(postprocess_completions)
73 | for idx, completion in enumerate(completions):
74 | queried_messages[idx] = query_messages[idx] + \
75 | [{"role": "assistant", "content": completion}
76 | ] # add the assistant response
77 | if round == total_round:
78 | _query_messages = list(
79 | map(get_query_messages, process_results, queried_messages))
80 | assert all([x is None for x in _query_messages]
81 | ), "All messages should be queried"
82 | break
83 | logging.warning(f"Total price: {total_price}$")
84 | logging.warning(f"Total time: {total_time}")
85 | logging.warning(f"Total time per example: {total_time / len(items)}")
86 | round_completions = list(zip(*round_completions))
87 | return dict(
88 | round_completions=round_completions,
89 | messages_records=queried_messages,
90 | )
91 |
92 |
93 | """
94 | Example Usage:
95 | task = "translation"
96 | with open("example.json", "r") as f:
97 | items = json.load(f)
98 | xgptitems = []
99 | for item in items:
100 | for cand in item['candidates']:
101 | xgptitems.append(XPGTItem(
102 | task=task,
103 | instruction=item['instruction'],
104 | input=item['input'],
105 | ref_output=item['output'],
106 | hypo_output=cand['text']
107 | ))
108 | result = xgptscore(xgptitems, "ea", "ChatGPT")
109 | idx = 0
110 | for item in items:
111 | for cand in item['candidates']:
112 | cand['responses'] = result['round_completions'][idx]
113 | cand['messages_records'] = result['messages_records'][idx]
114 | json.dump(items, open("example_result.json", "w"), indent=4, ensure_ascii=False)
115 | """
116 |
--------------------------------------------------------------------------------