├── .gitattributes ├── .gitignore ├── .gitmodules ├── LICENSE ├── README.md ├── github_overview.png ├── requirements.txt ├── setup.py ├── tigerscore ├── __init__.py ├── candidates_generation │ ├── _generate_candidates.sh │ ├── downmodel.py │ ├── engine.py │ ├── eval_candidates.py │ ├── eval_candidates.sh │ ├── finetune_base_model.py │ ├── finetune_base_model.sh │ ├── generate_candidates.py │ ├── generate_candidates.sh │ ├── generate_candidates_by_gpt.py │ ├── generate_candidates_by_gpt.sh │ ├── generate_candidates_series.sh │ ├── generate_ref_by_gpt4.py │ └── model_utils.py ├── common │ ├── InstructScore.py │ ├── README.md │ ├── __init__.py │ ├── bart_score.py │ ├── cor_eval.py │ ├── datasets_config.py │ ├── download.sh │ ├── evaluation.py │ ├── flan_score.py │ ├── prism.py │ ├── requirements.txt │ └── utils.py ├── download_dataset │ ├── bartscore_data_process.py │ ├── datasets_scripts │ │ └── fetaqa.sh │ ├── download_bartscore_data.sh │ ├── download_general_datasets.py │ ├── download_general_datasets.sh │ ├── preprocess_utils_totto.py │ └── utils.py ├── eval_scripts │ ├── bs_analysis.py │ ├── bs_utils.py │ ├── check_data.py │ ├── check_data.sh │ ├── check_responses.py │ ├── check_responses.sh │ ├── eval_baseline.py │ ├── eval_baseline.sh │ ├── generate_distill_data.py │ ├── generate_distill_data.sh │ ├── generate_inst_synthetic_data.py │ ├── generate_inst_synthetic_data.sh │ ├── generate_synthesis_distill_data.py │ ├── generate_synthesis_distill_data.sh │ ├── get_systhesis_ref_data.sh │ ├── lfqa_gpt_rate.py │ ├── lfqa_gpt_rate.sh │ ├── mathqa_rate.py │ ├── test_ref_diff.py │ ├── test_xgptscore.py │ ├── test_xgptscore.sh │ └── utils.py ├── finetune │ ├── ds_llama_config.json │ ├── finetune_llama.sh │ ├── finetune_mistral.sh │ ├── format_data_v2.py │ ├── format_data_v2.sh │ ├── format_distill_data.py │ ├── format_distill_data.sh │ ├── format_synthesis_distill_data.py │ ├── format_synthesis_distill_data.sh │ ├── ft_llama_lora.sh │ ├── test_llama.py │ ├── test_llama.sh │ ├── test_llama_vllm.py │ ├── test_llama_vllm.sh │ ├── test_llama_vllm_distance.py │ ├── test_llama_vllm_vanilla.py │ ├── train.py │ ├── trainer.py │ └── utils.py ├── get_error_types │ ├── error_types │ │ └── error_types.json │ └── get_error_types.py ├── scorer │ ├── __init__.py │ └── tigerscore.py └── xgptscore │ ├── README.md │ ├── constants.py │ ├── example.json │ ├── example_result.json │ ├── mode_configs │ ├── align_score.json │ ├── default.json │ ├── kb_txt.json │ └── wmt_mqm.json │ ├── openai_utils.py │ ├── openai_utils_azure.py │ ├── openai_utils_curl.py │ ├── openai_utils_openAI.py │ ├── process.py │ ├── process_utils.py │ ├── templates.py │ └── xgptscore.py └── tigerscore_example_usage.ipynb /.gitattributes: -------------------------------------------------------------------------------- 1 | data/evaluation/instruct/mixinstruct/test_data_prepared.json filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are 
written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | tigerscore/xgptscore/cache/ 162 | jobs/ 163 | /tigerscore/common/models 164 | 165 | /TigerScore.zip 166 | /hf_space 167 | !/hf_space/TIGERScore 168 | /hf_evaluate 169 | /raw_datasets 170 | 171 | /data/real-world/summarization/summeval/cnndm 172 | /data/real-world/summarization/summeval/M* 173 | /data/**/train_data.json 174 | /data/**/**/train_data.json 175 | /data/**/**/**/train_data.json 176 | /data/clean_real_world_data 177 | /data/clean_real_world 178 | /test.ipynb 179 | /data/synthesis/synthesis 180 | /data/*.json 181 | /data/*.jsonl 182 | /data/*.ipynb 183 | /tigerscore/xgptscore/cache 184 | /test.sh 185 | /tigerscore/eval_scripts/check_data_private.sh 186 | /tigerscore/finetune/wandb/ 187 | /data/additional 188 | /tigerscore/eval_scripts/eval_inst_baseline.sh 189 | /data/evaluation/translation 190 | /data/evaluation 191 | /data/data_dist 192 | /data/evaluation/pair_cmp 193 | /test* 194 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "hf_space/TIGERScore"] 2 | path = hf_space/TIGERScore 3 | url = https://huggingface.co/spaces/TIGER-Lab/TIGERScore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 TIGER Lab 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /github_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/github_overview.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | datasets 3 | torch 4 | accelerate 5 | wget 6 | pycocoevalcap 7 | spacy 8 | evaluate 9 | prettytable 10 | gdcm 11 | pydicom 12 | bitsandbytes 13 | openai 14 | nltk 15 | scipy 16 | json5 17 | peft 18 | fire 19 | gradio 20 | sentencepiece 21 | tiktoken 22 | dacite 23 | wandb 24 | bs4 25 | py7zr 26 | gdown 27 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | description = """ 4 | TIGERScore, a Trained metric that follows Instruction Guidance to perform Explainable, and Reference-free evaluation over a wide spectrum of text generation tasks. 5 | Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by the natural language instruction to provide error analysis to pinpoint the mistakes in the generated text. 6 | """ 7 | 8 | setup( 9 | name='tigerscore', 10 | version='0.0.1', 11 | description=description, 12 | author='Dongfu Jiang', 13 | author_email='dongfu.jiang@uwaterloo.ca', 14 | packages=find_packages(), 15 | url='https://tiger-ai-lab.github.io/TIGERScore/', 16 | install_requires=[ 17 | 'torch', 18 | 'transformers', 19 | 'datasets', 20 | 'accelerate', 21 | 'gradio', 22 | 'tiktoken', 23 | 'llama-cpp-python', 24 | 'protobuf', 25 | 'sentencepiece', 26 | 'accelerate' 27 | ], 28 | ) 29 | -------------------------------------------------------------------------------- /tigerscore/__init__.py: -------------------------------------------------------------------------------- 1 | from tigerscore.scorer.tigerscore import TIGERScorer -------------------------------------------------------------------------------- /tigerscore/candidates_generation/_generate_candidates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=30:00:00 3 | #SBATCH --job-name=generate_candidates 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --hint=memory_bound 6 | #SBATCH --mem=60G 7 | #SBATCH --gres=gpu:a6000:2 8 | #SBATCH --qos=normal 9 | #SBATCH -n 1 10 | 11 | nvidia-smi 12 | # candidates will be saved in ../../data/${dataset}/candidates/${decoding_method}/${model}.json 13 | dataset=$1 14 | set=$2 15 | model_type=$3 16 | model=$4 17 | output_max_length=$5 18 | no_instruction=$6 19 | input_max_length=$7 20 | decoding_method=$8 21 | image2text=$9 22 | start_idx=${10} 23 | end_idx=${11} 24 | data_dir="../../data" 25 | dtype="float16" 26 | num_candidates=5 27 | num_beams=$num_candidates 28 | num_beam_groups=$num_candidates 29 | overwrite=False 30 | inference_bs=1 31 | 32 | 33 | if [ -z "$start_idx" ] && [ -z "$end_idx" ]; then 34 | echo "start_idx and end_idx are not provided, set to None" 35 | else 36 | echo "start_idx: $start_idx" 37 | echo "end_idx: $end_idx" 38 | fi 39 | if [ -z "$output_max_length" ]; then 40 | output_max_length=300 41 | echo "output_max_length is not provided, set to $output_max_length" 42 | else 43 | echo 
"output_max_length: $output_max_length" 44 | fi 45 | 46 | if [ -z "$input_max_length" ]; then 47 | input_max_length=300 48 | echo "input_max_length is not provided, set to $input_max_length" 49 | else 50 | echo "input_max_length: $input_max_length" 51 | fi 52 | 53 | if [ -z "$image2text" ]; then 54 | image2text=False 55 | echo "image2text is not provided, set to $image2text" 56 | else 57 | echo "image2text: $image2text" 58 | fi 59 | if [ -z "$no_instruction" ]; then 60 | no_instruction=False 61 | echo "no_instruction is not provided, set to $no_instruction" 62 | else 63 | echo "no_instruction: $no_instruction" 64 | fi 65 | if [ -z "$decoding_method" ]; then 66 | decoding_method="top_p_sampling" 67 | echo "decoding_method is not provided, set to $decoding_method" 68 | else 69 | echo "decoding_method: $decoding_method" 70 | fi 71 | python ./generate_candidates.py \ 72 | --model_type $model_type \ 73 | --model $model \ 74 | --data_dir $data_dir \ 75 | --dataset $dataset \ 76 | --set $set \ 77 | --num_return_sequences $num_candidates \ 78 | --decoding_method $decoding_method \ 79 | --inference_bs $inference_bs \ 80 | --prompt_max_length $input_max_length \ 81 | --output_max_length $output_max_length \ 82 | --dtype $dtype \ 83 | --num_beams $num_beams \ 84 | --num_beam_groups $num_beam_groups \ 85 | --no_repeat_ngram_size 3 \ 86 | --start_idx "$start_idx" \ 87 | --end_idx "$end_idx" \ 88 | --overwrite $overwrite \ 89 | --image2text "$image2text" \ 90 | --no_instruction "$no_instruction" \ -------------------------------------------------------------------------------- /tigerscore/candidates_generation/downmodel.py: -------------------------------------------------------------------------------- 1 | # The task in slurm connot support long time download,so just download in shell. 2 | from model_utils import build_model, build_tokenizer 3 | import os 4 | from pathlib import Path 5 | import fire 6 | 7 | 8 | def main( models: str = None, model_type: str = None, cache_dir: str = None): 9 | models = models 10 | model_type = model_type 11 | cache_dir = ( 12 | cache_dir or Path(os.path.abspath(__file__)).parent.parent.parent / "hf_models" 13 | ) 14 | for model in models.split(","): 15 | tokenizer = build_tokenizer( 16 | model, 17 | cache_dir=cache_dir, 18 | resume_download=True, 19 | trust_remote_code=True, 20 | ) 21 | model = build_model( 22 | model_type, 23 | model, 24 | cache_dir=cache_dir, 25 | resume_download=True, 26 | trust_remote_code=True, 27 | ) 28 | 29 | 30 | if __name__ == "__main__": 31 | fire.Fire(main) -------------------------------------------------------------------------------- /tigerscore/candidates_generation/engine.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is taken from This file is modified based on: 3 | https://github.com/Ravoxsg/SummaReranker-ACL-22-/blob/main/src/candidate_generation/engine.py 4 | We thank the authors for sharing their code. 
5 | """ 6 | import gc 7 | import torch 8 | import torch.nn.functional as F 9 | from typing import Dict 10 | 11 | 12 | def beam_search_step(inputs: Dict, tokenizer, base_model, args, **kwargs): 13 | """beam search step 14 | 15 | Args: 16 | inputs (dict): settings for beam search 17 | tokenizer (transformers tokenizer): Tokenizer 18 | base_model (transformers model): Model 19 | args (dict): settings for beam search 20 | 21 | Returns: 22 | dict: generated candidates and their logprobs in batch 23 | """ 24 | kwargs['return_dict_in_generate'] = True 25 | kwargs['output_scores'] = True 26 | # 1 - beam search 27 | if args.decoding_method == "beam_search": 28 | outputs = base_model.generate( 29 | **inputs, 30 | num_beams=args.num_beams, 31 | num_return_sequences=args.num_return_sequences, 32 | max_new_tokens=args.output_max_length, 33 | repetition_penalty=args.repetition_penalty, 34 | length_penalty=args.length_penalty, 35 | no_repeat_ngram_size=args.no_repeat_ngram_size, 36 | use_cache=True, 37 | early_stopping=True, 38 | temperature=args.temperature, 39 | **kwargs 40 | ) 41 | # 2 - diverse beam search 42 | if args.decoding_method == "diverse_beam_search": 43 | outputs = base_model.generate( 44 | **inputs, 45 | num_beams=args.num_beams, 46 | num_beam_groups=args.num_beam_groups, 47 | num_return_sequences=args.num_return_sequences, 48 | max_new_tokens=args.output_max_length, 49 | diversity_penalty=args.diversity_penalty, 50 | repetition_penalty=args.repetition_penalty, 51 | length_penalty=args.length_penalty, 52 | no_repeat_ngram_size=args.no_repeat_ngram_size, 53 | use_cache=True, 54 | early_stopping=True, 55 | temperature=args.temperature, 56 | **kwargs 57 | ) 58 | # 3 - top-p sampling 59 | if args.decoding_method == "top_p_sampling": 60 | outputs = base_model.generate( 61 | **inputs, 62 | num_beams=1, 63 | do_sample=True, 64 | top_p=args.top_p, 65 | num_return_sequences=args.num_return_sequences, 66 | max_new_tokens=args.output_max_length, 67 | repetition_penalty=args.repetition_penalty, 68 | length_penalty=args.length_penalty, 69 | no_repeat_ngram_size=args.no_repeat_ngram_size, 70 | use_cache=True, 71 | early_stopping=True, 72 | temperature=args.temperature, 73 | **kwargs 74 | ) 75 | # 4 - top-k sampling 76 | if args.decoding_method == "top_k_sampling": 77 | outputs = base_model.generate( 78 | **inputs, 79 | num_beams=1, 80 | do_sample=True, 81 | top_k=args.top_k, 82 | num_return_sequences=args.num_return_sequences, 83 | max_new_tokens=args.output_max_length, 84 | repetition_penalty=args.repetition_penalty, 85 | length_penalty=args.length_penalty, 86 | no_repeat_ngram_size=args.no_repeat_ngram_size, 87 | use_cache=True, 88 | early_stopping=True, 89 | temperature=args.temperature, 90 | **kwargs 91 | ) 92 | # for top-p and top-k sampling, some scores will be masked as -inf. These scores are not processed by softmax and logrithm. 
93 | masked_logits = torch.stack(outputs.scores, dim=0) 94 | masked_logits = F.log_softmax(masked_logits, dim=1) 95 | summary_ids = outputs.sequences 96 | logprobs = [] 97 | # Different process for decoder-only models and encoder-decoder models 98 | if "input_ids" in inputs and \ 99 | summary_ids.shape[1] == inputs['input_ids'].shape[1] + masked_logits.shape[0]: 100 | # for decoder-only models 101 | # remove input_ids 102 | summary_ids = summary_ids[:, inputs['input_ids'].shape[1]:] 103 | for i in range(summary_ids.shape[0]): 104 | logprobs.append([]) 105 | for j in range(summary_ids.shape[1]): # token_idx 106 | if summary_ids[i][j] == tokenizer.eos_token_id: 107 | break 108 | logprobs[i].append( 109 | masked_logits[j, i, summary_ids[i][j]].item()) 110 | else: 111 | # for encoder-decoder models 112 | for i in range(summary_ids.shape[0]): 113 | logprobs.append([]) 114 | # shift of decoder because of the additional bos_token 115 | for j in range(summary_ids.shape[1] - 1): # token_idx 116 | if summary_ids[i][j + 1] == tokenizer.eos_token_id: 117 | break 118 | logprobs[i].append( 119 | masked_logits[j, i, summary_ids[i][j + 1]].item()) 120 | 121 | logprobs = [sum(_probs) for _probs in logprobs] 122 | generated = tokenizer.batch_decode( 123 | summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True) 124 | del summary_ids 125 | gc.collect() 126 | 127 | batch_generated = [] 128 | batch_logprobs = [] 129 | bz = list(inputs.values())[0].shape[0] 130 | for i in range(bz): 131 | batch_generated.append( 132 | generated[i * args.num_return_sequences:(i + 1) * args.num_return_sequences]) 133 | batch_logprobs.append( 134 | logprobs[i * args.num_return_sequences:(i + 1) * args.num_return_sequences]) 135 | return { 136 | "generated": batch_generated, 137 | "logprobs": batch_logprobs 138 | } 139 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/eval_candidates.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=24:00:00 3 | #SBATCH --job-name=eval_candidates 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --gres=gpu:2080:1 6 | #SBATCH --nodes=1 7 | #SBATCH -n 2 8 | 9 | data_dir="../../data" 10 | # dataset="samsum,xsum,newsroom" # summarization 11 | # dataset="wmt16/cs-en,wmt16/de-en,wmt16/tr-en,wmt17/fi-en,wmt18/zh-en" # translation 12 | # dataset="totto,kasnerz/wikitabletext" # data2text 13 | dataset="din0s/asqa,DongfuTingle/FeTaQA,cosmos_qa,eli5" # long-form QA 14 | # dataset="databricks/databricks-dolly-15k" 15 | # dataset="gsm8k:main,math_qa" 16 | 17 | # dataset="common_gen,vicgalle/alpaca-gpt4,xnli/en,knkarthick/dialogsum" 18 | set="test" 19 | num_workers=1 20 | metrics="bleu,rouge,bart_score,bart_score_cnn" 21 | overwrite="True" 22 | echo "dataset: $dataset" 23 | echo "set: $set" 24 | python eval_candidates.py \ 25 | --data_dir $data_dir \ 26 | --dataset $dataset \ 27 | --set $set \ 28 | --num_workers $num_workers \ 29 | --metrics $metrics \ 30 | --overwrite $overwrite -------------------------------------------------------------------------------- /tigerscore/candidates_generation/finetune_base_model.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file is to finetune basic models for candidates generation. 3 | Code based on Huggingface Turorial. 
4 | """ 5 | from common.evaluation import overall_eval 6 | from model_utils import ( 7 | build_model, 8 | build_tokenizer, 9 | ) 10 | from typing import Optional, Sequence, Dict, List 11 | from generate_candidates import get_model_size, get_torch_dtype 12 | from dataclasses import dataclass, field 13 | from transformers import ( 14 | TrainingArguments, 15 | Seq2SeqTrainer, 16 | Seq2SeqTrainingArguments 17 | ) 18 | import numpy as np 19 | import logging 20 | import transformers 21 | import torch 22 | import json 23 | import os 24 | import sys 25 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 26 | sys.path.append("..") 27 | IGNORE_INDEX = -100 28 | 29 | 30 | @dataclass 31 | class ModelArguments: 32 | model_type: str 33 | model_name_or_path: str 34 | dtype: str = "float32" 35 | cache_dir: Optional[str] = None 36 | 37 | 38 | @dataclass 39 | class DataArguments: 40 | data_dir: str 41 | train_file: str 42 | eval_file: str = None 43 | eval_metrics: List[str] = field(default_factory=lambda: ["bleu", "rouge"]) 44 | input_max_length: int = 512 45 | output_max_length: int = 128 46 | with_instruction: bool = False 47 | 48 | 49 | def load_dataset(data_args): 50 | with open(data_args.train_file, 'r') as f: 51 | train_data = json.load(f) 52 | if data_args.eval_file: 53 | with open(data_args.eval_file, 'r') as f: 54 | eval_data = json.load(f) 55 | else: 56 | eval_data = None 57 | 58 | return train_data, eval_data 59 | 60 | 61 | class SupervisedDataset(torch.utils.data.Dataset): 62 | def __init__(self, encodings): 63 | self.encodings = encodings 64 | 65 | def __getitem__(self, idx): 66 | return {key: val[idx] for key, val in self.encodings.items()} 67 | 68 | def __len__(self): 69 | return len(self.encodings["input_ids"]) 70 | 71 | 72 | def preprocess_function(examples, tokenizer, data_args): 73 | if data_args.with_instruction: 74 | inputs = [x["instruction"] + "\n" + x["input"] for x in examples] 75 | else: 76 | inputs = [x["input"] for x in examples] 77 | inputs = [x.strip(' \n') for x in inputs] 78 | outputs = [x["output"] for x in examples] 79 | 80 | logging.warning("# of examples: {}".format(len(inputs))) 81 | logging.warning("Example of inputs:") 82 | print(inputs[0]) 83 | logging.warning("Example of outputs:") 84 | print(outputs[0]) 85 | 86 | model_inputs = tokenizer( 87 | inputs, max_length=data_args.input_max_length, truncation=True) 88 | # Setup the tokenizer for targets 89 | with tokenizer.as_target_tokenizer(): 90 | labels = tokenizer( 91 | outputs, max_length=data_args.output_max_length, truncation=True) 92 | 93 | logging.warning("Example of model inputs:") 94 | print("input_ids", model_inputs['input_ids'][0]) 95 | print("attention_mask", model_inputs['attention_mask'][0]) 96 | logging.warning("Example of labels:") 97 | print(labels['input_ids'][0]) 98 | labels["input_ids"] = [ 99 | [(_l if _l != tokenizer.pad_token_id else IGNORE_INDEX) for _l in label] for label in labels["input_ids"] 100 | ] 101 | model_inputs["labels"] = labels["input_ids"] 102 | return SupervisedDataset(model_inputs) 103 | 104 | 105 | @dataclass 106 | class DataCollatorForSupervisedDataset(object): 107 | """Collate examples for supervised fine-tuning.""" 108 | 109 | tokenizer: transformers.PreTrainedTokenizer 110 | 111 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]: 112 | input_ids, labels = tuple([torch.tensor( 113 | instance[key]) for instance in instances] for key in ("input_ids", "labels")) 114 | input_ids = torch.nn.utils.rnn.pad_sequence( 115 | input_ids, batch_first=True, 
padding_value=self.tokenizer.pad_token_id 116 | ) 117 | labels = torch.nn.utils.rnn.pad_sequence( 118 | labels, batch_first=True, padding_value=IGNORE_INDEX) 119 | # print(self.tokenizer.batch_decode(input_ids)) 120 | # print(self.tokenizer.batch_decode(labels.masked_fill(labels == IGNORE_INDEX, self.tokenizer.pad_token_id))) 121 | # print("##" * 30) 122 | return dict( 123 | input_ids=input_ids, 124 | labels=labels, 125 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id), 126 | ) 127 | 128 | 129 | def main( 130 | model_args: ModelArguments, 131 | data_args: DataArguments, 132 | training_args: TrainingArguments, 133 | ): 134 | 135 | model = build_model( 136 | model_args.model_type, 137 | model_args.model_name_or_path, 138 | torch_dtype=get_torch_dtype(model_args.dtype), 139 | device_map="auto", 140 | cache_dir=model_args.cache_dir, resume_download=True) 141 | n_params = sum(p.numel() for p in model.parameters() if p.requires_grad) 142 | logging.warning("The {} has {} trainable parameters".format( 143 | model_args.model_name_or_path, get_model_size(n_params))) 144 | tokenizer = build_tokenizer( 145 | model_args.model_name_or_path, 146 | cache_dir=model_args.cache_dir, resume_download=True) 147 | logging.warning("Loading dataset...") 148 | 149 | train_data, eval_data = load_dataset(data_args) 150 | logging.warning("Dataset loaded.") 151 | logging.warning("Preprocessing dataset...") 152 | train_dataset = preprocess_function(train_data, tokenizer, data_args) 153 | eval_dataset = preprocess_function(eval_data, tokenizer, data_args) 154 | logging.warning("Dataset preprocessed.") 155 | logging.warning("Loading data collator...") 156 | data_collator = DataCollatorForSupervisedDataset(tokenizer) 157 | logging.warning("Data collator loaded.") 158 | logging.warning("Loading trainer...") 159 | 160 | def compute_metrics(eval_pred): 161 | 162 | logits, labels = eval_pred 163 | labels[labels == IGNORE_INDEX] = tokenizer.pad_token_id 164 | logits[logits == IGNORE_INDEX] = tokenizer.pad_token_id 165 | predictions = tokenizer.batch_decode(logits, skip_special_tokens=True) 166 | labels = tokenizer.batch_decode(labels, skip_special_tokens=True) 167 | logging.warning("Example of predictions:") 168 | print(predictions[:3]) 169 | logging.warning("Example of labels:") 170 | print(labels[:3]) 171 | scores = overall_eval(predictions, labels, 172 | metrics=data_args.eval_metrics) 173 | return { 174 | key: np.mean(value) for key, value in scores.items() 175 | } 176 | 177 | training_args.evaluation_strategy = "epoch" 178 | training_args.weight_decay = 0.01 179 | training_args.save_total_limit = 5 180 | training_args.predict_with_generate = True 181 | training_args.generation_num_beams = 4 182 | training_args.generation_max_length = data_args.output_max_length 183 | training_args.load_best_model_at_end = True 184 | logging.warning("Training arguments:") 185 | print(training_args) 186 | trainer = Seq2SeqTrainer( 187 | model=model, 188 | args=training_args, 189 | tokenizer=tokenizer, 190 | train_dataset=train_dataset, 191 | eval_dataset=eval_dataset, 192 | data_collator=data_collator, 193 | compute_metrics=compute_metrics, 194 | ) 195 | logging.warning("Trainer loaded.") 196 | logging.warning("Training...") 197 | trainer.train() 198 | logging.warning("Training finished.") 199 | logging.warning("Saving model...") 200 | trainer.save_model(output_dir=os.path.join( 201 | training_args.output_dir, "checkpoint-best")) 202 | logging.warning("Model saved.") 203 | 204 | 205 | if __name__ == "__main__": 206 | parser = 
transformers.HfArgumentParser( 207 | (ModelArguments, DataArguments, Seq2SeqTrainingArguments)) 208 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 209 | main(model_args, data_args, training_args) 210 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/finetune_base_model.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=24:00:00 3 | #SBATCH --job-name=finetune 4 | #SBATCH --output ../../jobs/finetune_base_models/%j.out 5 | #SBATCH --gres=gpu:2080:1 6 | #SBATCH --nodes=1 7 | #SBATCH -n 1 8 | 9 | model_type="t5" 10 | model_name_or_path="google/flan-t5-large" 11 | data_dir="../../data" 12 | dataset="cosmos_qa" 13 | train_file="${data_dir}/${dataset}/finetune_data.json" 14 | eval_file="${data_dir}/${dataset}/validation_data.json" 15 | with_instruction=True 16 | run_name="ft_${dataset}" 17 | learning_rate=1e-4 18 | num_train_epochs=10 19 | per_device_train_batch_size=2 20 | per_device_eval_batch_size=8 21 | gradient_accumulation_steps=16 22 | max_grad_norm=1 23 | input_max_length=512 24 | output_max_length=256 25 | optim="adafactor" 26 | lr_scheduler_type="linear" 27 | warmup_ratio=0.1 28 | fp16=False 29 | output_dir="../../finetuned_models/${model_name_or_path}/${run_name}" 30 | cache_dir="../../hf_models" 31 | localhost=$RANDOM # random port number 32 | n_gpu=1 33 | torchrun \ 34 | --rdzv_backend=c10d \ 35 | --rdzv_endpoint="localhost:${localhost}" \ 36 | --nnodes 1 \ 37 | --nproc_per_node ${n_gpu} \ 38 | finetune_base_model.py \ 39 | --model_type $model_type \ 40 | --model_name_or_path $model_name_or_path \ 41 | --data_dir $data_dir \ 42 | --train_file $train_file \ 43 | --eval_file $eval_file \ 44 | --with_instruction $with_instruction \ 45 | --run_name $run_name \ 46 | --learning_rate $learning_rate \ 47 | --optim $optim \ 48 | --fp16 $fp16 \ 49 | --lr_scheduler_type $lr_scheduler_type \ 50 | --num_train_epochs $num_train_epochs \ 51 | --per_device_train_batch_size $per_device_train_batch_size \ 52 | --per_device_eval_batch_size $per_device_eval_batch_size \ 53 | --gradient_accumulation_steps $gradient_accumulation_steps \ 54 | --max_grad_norm $max_grad_norm \ 55 | --input_max_length $input_max_length \ 56 | --output_max_length $output_max_length \ 57 | --output_dir $output_dir \ 58 | --cache_dir $cache_dir \ 59 | --report_to "wandb" \ 60 | --logging_steps 2 \ 61 | 62 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_candidates_by_gpt.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | Gererate candidates by GPT-3.5 or GPT-4. 
4 | """ 5 | from xgptscore.process_utils import XPGTItem 6 | from xgptscore.xgptscore import xgptscore 7 | import json 8 | import random 9 | import logging 10 | import sys 11 | import fire 12 | from pathlib import Path 13 | sys.path.append(str(Path(__file__).parent.parent)) 14 | logging.basicConfig(level=logging.WARNING) 15 | 16 | 17 | def main( 18 | task: str, 19 | data_path: str, 20 | dataset: str, 21 | output_file: str = None, 22 | xgptscore_mode: str = "instruction", 23 | model_name: str = "ChatGPT", 24 | overwrite: bool = False, 25 | max_size: int = None, 26 | seed: int = 42, 27 | shuffle_file: bool = False, 28 | source_max_length: int = None, 29 | ref_max_length: int = None, 30 | hypo_max_length: int = None, 31 | dataset_split: str = "test", 32 | ): 33 | """Generate candidates by GPT-3.5 or GPT-4. 34 | 35 | Args: 36 | task (str): Task name. 37 | data_path (str): Path to the data. 38 | dataset (str): Dataset name. 39 | output_file (str, optional): Defaults to None. 40 | xgptscore_mode (str, optional): Defaults to "instruction". 41 | model_name (str, optional): Defaults to "ChatGPT". 42 | overwrite (bool, optional): Defaults to False. 43 | max_size (int, optional): Defaults to None. 44 | seed (int, optional): Defaults to 42. 45 | shuffle_file (bool, optional): Defaults to False. 46 | source_max_length (int, optional): Defaults to None. 47 | ref_max_length (int, optional): Defaults to None. 48 | hypo_max_length (int, optional): Defaults to None. 49 | dataset_split (str, optional): Defaults to "test". 50 | """ 51 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4))) 52 | # load data 53 | data_path = Path(data_path) 54 | input_file = data_path / dataset / (dataset_split + "_data.json") 55 | 56 | input_file = Path(input_file) 57 | if not output_file: 58 | output_file = data_path / dataset / "candidates" / \ 59 | dataset_split / "top_p_sampling" / f"{model_name}.json" 60 | if not output_file.parent.parent.exists(): 61 | output_file.parent.parent.mkdir(parents=True) 62 | if not output_file.parent.exists(): 63 | output_file.parent.mkdir() 64 | else: 65 | output_file = Path(output_file) 66 | with open(input_file, "r") as f: 67 | items = json.load(f) 68 | logging.warning("Loaded {} items from {}".format( 69 | len(items), input_file)) 70 | logging.warning("Preparing writing to {}...".format(output_file)) 71 | 72 | random.seed(seed) 73 | logging.warning("Set seed to {}".format(seed)) 74 | if shuffle_file: 75 | random.shuffle(items) 76 | logging.warning("Shuffled {} items".format(len(items))) 77 | if isinstance(max_size, int) and max_size > 0: 78 | items = items[:max_size] 79 | logging.warning("Truncated to {} items".format(len(items))) 80 | 81 | xgptitems = [] 82 | for item in items: 83 | xgptitems.append(XPGTItem( 84 | task=task, 85 | instruction=item['instruction'], 86 | input=item['input'], 87 | ref_output=item['output'] if "output" in item else item['refs'], 88 | hypo_output=None, 89 | )) 90 | if "candidates" in item: 91 | del item["candidates"] 92 | 93 | if not output_file.exists() or overwrite: 94 | logging.warning("Running xgptscore") 95 | # run xgptscore 96 | xgptscore_params = { 97 | "max_lengths": { 98 | "input": source_max_length, 99 | "hypo_output": hypo_max_length, 100 | "ref_output": ref_max_length, 101 | }, 102 | } 103 | result = xgptscore(xgptitems, mode=xgptscore_mode, 104 | model_name=model_name, **xgptscore_params) 105 | for i, item in enumerate(items): 106 | item['responses'] = result['round_completions'][i] 107 | item['messages_records'] =
result['messages_records'][i] 108 | item['candidates'] = [ 109 | {"text": result['round_completions'][i][0], 110 | "scores": {} 111 | }] 112 | # print(items) 113 | with open(output_file, "w") as f: 114 | json.dump(items, f, indent=4, ensure_ascii=False) 115 | logging.warning("Saved to {}".format(output_file)) 116 | else: 117 | logging.warning("Loading from {}".format(output_file)) 118 | with open(output_file, "r") as f: 119 | items = json.load(f) 120 | 121 | 122 | if __name__ == "__main__": 123 | fire.Fire(main) 124 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_candidates_by_gpt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=generate_candidates_by_gpt 3 | #SBATCH --time=24:00:00 4 | #SBATCH --output=../../jobs/%j.out 5 | 6 | 7 | # datasets=("GAIR/lima" "tatsu-lab/alpaca_farm:alpaca_instructions" "HuggingFaceH4/oasst1_en" "JosephusCheung/GuanacoDataset" "databricks/databricks-dolly-15k") 8 | dataset=$1 9 | task=$2 10 | data_path="" 11 | python generate_candidates_by_gpt.py \ 12 | --task $task \ 13 | --data_path $data_path \ 14 | --dataset $dataset \ 15 | --source_max_length 512 \ 16 | --overwrite "False" -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_candidates_series.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=12:00:00 3 | #SBATCH --job-name=generate_candidates 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --qos=normal 7 | #SBATCH -n 1 8 | 9 | # This script is used to generate candidates via GPT-3.5 and local models. 10 | 11 | CMD="sbatch" 12 | 13 | # models=("google/flan-t5-small" "google/flan-t5-base" "google/flan-t5-large" "google/flan-t5-xl" "google/flan-t5-xxl") 14 | # models=("lmsys/vicuna-33b-v1.3" "lmsys/vicuna-13b-v1.3" "lmsys/vicuna-7b-v1.3") # vicuna 15 | models=("lmsys/vicuna-33b-v1.3") # vicuna-33b-v1.3 need two gpus 16 | # models=("lmsys/vicuna-13b-v1.3" "lmsys/vicuna-7b-v1.3") # vicuna 17 | # model_type="t5" 18 | model_type="llama" 19 | dataset="din0s/asqa" 20 | dataset="DongfuTingle/FeTaQA" 21 | # dataset="cosmos_qa" 22 | # dataset="eli5" 23 | set="test" 24 | output_max_length=512 25 | for model in "${models[@]}"; do 26 | ${CMD} _generate_candidates.sh "$dataset" "$set" "$model_type" "$model" "$output_max_length" 27 | done 28 | # data_path="" 29 | # python generate_candidates_by_gpt.py \ 30 | # --task "long-form QA" \ 31 | # --data_path $data_path \ 32 | # --dataset $dataset \ -------------------------------------------------------------------------------- /tigerscore/candidates_generation/generate_ref_by_gpt4.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | Gererate candidates by GPT-3.5 or GPT-4. 
4 | """ 5 | 6 | import json 7 | import random 8 | import logging 9 | import sys 10 | import fire 11 | from pathlib import Path 12 | sys.path.append(str(Path(__file__).parent.parent)) 13 | from xgptscore.process_utils import XPGTItem 14 | from xgptscore.xgptscore import xgptscore 15 | logging.basicConfig(level=logging.WARNING) 16 | 17 | 18 | def main( 19 | task: str, 20 | data_path: str, 21 | xgptscore_mode: str = "instruction", 22 | model_name: str = "gpt-4", 23 | overwrite: bool = False, 24 | max_size: int = None, 25 | seed: int = 42, 26 | shuffle_file: bool = False, 27 | source_max_length: int = None, 28 | ref_max_length: int = None, 29 | hypo_max_length: int = None, 30 | dataset_split: str = "test", 31 | ): 32 | """Generate reference outputs by GPT-3.5 or GPT-4. 33 | 34 | Args: 35 | task (str): Task name. 36 | data_path (str): Path to the data. 37 | dataset (str): Dataset name. 38 | output_file (str, optional): Defaults to None. 39 | xgptscore_mode (str, optional): Defaults to "instruction". 40 | model_name (str, optional): Defaults to "gpt-4". 41 | overwrite (bool, optional): Defaults to False. 42 | max_size (int, optional): Defaults to None. 43 | seed (int, optional): Defaults to 42. 44 | shuffle_file (bool, optional): Defaults to False. 45 | source_max_length (int, optional): Defaults to None. 46 | ref_max_length (int, optional): Defaults to None. 47 | hypo_max_length (int, optional): Defaults to None. 48 | dataset_split (str, optional): Defaults to "test". 49 | """ 50 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4))) 51 | # load data 52 | data_path = Path(data_path) 53 | input_file = data_path 54 | 55 | input_file = Path(input_file) 56 | output_file = input_file 57 | with open(input_file, "r") as f: 58 | items = json.load(f) 59 | logging.warning("Loaded {} items from {}".format( 60 | len(items), input_file)) 61 | logging.warning("Preparing writing to {}...".format(output_file)) 62 | 63 | random.seed(seed) 64 | logging.warning("Set seed to {}".format(seed)) 65 | if shuffle_file: 66 | random.shuffle(items) 67 | logging.warning("Shuffled {} items".format(len(items))) 68 | if isinstance(max_size, int) and max_size > 0: 69 | items = items[:max_size] 70 | logging.warning("Truncated to {} items".format(len(items))) 71 | 72 | xgptitems = [] 73 | for item in items: 74 | xgptitems.append(XPGTItem( 75 | task=task, 76 | instruction=item['instruction'], 77 | input=item['input'], 78 | ref_output=item['output'] if "output" in item else item['refs'], 79 | hypo_output=None, 80 | )) 81 | 82 | if not output_file.exists() or overwrite: 83 | logging.warning("Running xgptscore") 84 | # run xgptscore 85 | xgptscore_params = { 86 | "max_lengths": { 87 | "input": source_max_length, 88 | "hypo_output": hypo_max_length, 89 | "ref_output": ref_max_length, 90 | }, 91 | } 92 | result = xgptscore(xgptitems, mode=xgptscore_mode, 93 | model_name=model_name, num_workers=5, **xgptscore_params) 94 | for i, item in enumerate(items): 95 | item['responses'] = result['round_completions'][i] 96 | item['messages_records'] = result['messages_records'][i] 97 | if item["output"] is not None: 98 | item["output"] = result['round_completions'][i][0] 99 | # print(items) 100 | with open(output_file, "w") as f: 101 | json.dump(items, f, indent=4, ensure_ascii=False) 102 | logging.warning("Saved to {}".format(output_file)) 103 | else: 104 | logging.warning("Loading from {}".format(output_file)) 105 | with open(output_file, "r") as f: 106 | items = json.load(f) 107 | 108 | 109 | if __name__ == "__main__": 110 |
fire.Fire(main) 111 | -------------------------------------------------------------------------------- /tigerscore/candidates_generation/model_utils.py: -------------------------------------------------------------------------------- 1 | from transformers import ( 2 | AutoTokenizer, 3 | AutoModelForSeq2SeqLM, 4 | AutoModelForCausalLM, 5 | AutoModel, 6 | VisionEncoderDecoderModel, 7 | ViTImageProcessor, 8 | ) 9 | decoder_only_models = ["alpaca", "llama", "opt", "bloom", 10 | "gpt", "vicuna", "koala", "Wizard", "stablelm"] 11 | 12 | 13 | def build_model(model_type, model_name, **kwargs): 14 | """ 15 | Build the model from the model name 16 | """ 17 | if any([x in model_type for x in decoder_only_models]) or any([x in model_name for x in decoder_only_models]): 18 | model = AutoModelForCausalLM.from_pretrained(model_name, **kwargs) 19 | elif model_type in ["vit"]: 20 | model = VisionEncoderDecoderModel.from_pretrained(model_name) 21 | elif model_type in ["bart", "t5", "mbart", "m2m100", "nllb", "opus_mt", "unifiedqa", "opus-mt", "pegasus"]: 22 | model = AutoModelForSeq2SeqLM.from_pretrained(model_name, **kwargs) 23 | else: 24 | model = AutoModel.from_pretrained(model_name, **kwargs) 25 | 26 | return model 27 | 28 | 29 | def build_tokenizer(model_name, **kwargs): 30 | """ 31 | Build the tokenizer from the model name 32 | """ 33 | 34 | if "vicuna" in model_name: 35 | tokenizer = AutoTokenizer.from_pretrained( 36 | model_name, padding_side="left", use_fast=False, **kwargs) 37 | # elif "Wizard" in model_name: 38 | # tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left", return_token_type_ids=False, **kwargs) 39 | elif any([x in model_name for x in decoder_only_models]): 40 | # padding left 41 | tokenizer = AutoTokenizer.from_pretrained( 42 | model_name, padding_side="left", **kwargs) 43 | else: 44 | tokenizer = AutoTokenizer.from_pretrained( 45 | model_name, **kwargs) # , use_fast=False) 46 | if tokenizer.pad_token is None: 47 | tokenizer.pad_token = tokenizer.eos_token 48 | tokenizer.pad_token_id = tokenizer.eos_token_id 49 | return tokenizer 50 | 51 | 52 | def build_processor(model_type, model_name, **kwargs): 53 | """ 54 | Build the processor from the model name 55 | """ 56 | if model_type in ["vit"]: 57 | processor = ViTImageProcessor.from_pretrained(model_name, **kwargs) 58 | else: 59 | raise NotImplementedError 60 | return processor 61 | -------------------------------------------------------------------------------- /tigerscore/common/README.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | To reproduce our experimental results, first create a `tigerscore_baseline` environment: 3 | ```bash 4 | conda create -n tigerscore_baseline python=3.9 5 | conda activate tigerscore_baseline 6 | pip install -r requirements.txt 7 | pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp39-cp39-manylinux1_x86_64.whl 8 | pip3 install -U xformers --index-url https://download.pytorch.org/whl/cu118 9 | ``` 10 | -------------------------------------------------------------------------------- /tigerscore/common/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | cur_folder = os.path.dirname(os.path.abspath(__file__)) 4 | if cur_folder not in sys.path: 5 | sys.path.append(cur_folder) 6 | -------------------------------------------------------------------------------- /tigerscore/common/bart_score.py:
-------------------------------------------------------------------------------- 1 | # %% 2 | """ 3 | From https://github.com/neulab/BARTScore 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import traceback 8 | from transformers import BartTokenizer, BartForConditionalGeneration 9 | 10 | 11 | class BARTScorer: 12 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='facebook/bart-large-cnn'): 13 | # Set up model 14 | self.device = device 15 | self.max_length = max_length 16 | self.tokenizer = BartTokenizer.from_pretrained(checkpoint) 17 | self.model = BartForConditionalGeneration.from_pretrained(checkpoint) 18 | self.model.eval() 19 | self.model.to(device) 20 | 21 | # Set up loss 22 | self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id) 23 | self.lsm = nn.LogSoftmax(dim=1) 24 | 25 | def load(self, path='./models/bart.pth'): 26 | """ Load model from paraphrase finetuning """ 27 | self.model.load_state_dict(torch.load(path, map_location=self.device)) 28 | 29 | def score(self, srcs, tgts, batch_size): 30 | """ Score a batch of examples """ 31 | score_list = [] 32 | for i in range(0, len(srcs), batch_size): 33 | src_list = srcs[i: i + batch_size] 34 | tgt_list = tgts[i: i + batch_size] 35 | try: 36 | with torch.no_grad(): 37 | encoded_src = self.tokenizer( 38 | src_list, 39 | max_length=self.max_length, 40 | truncation=True, 41 | padding=True, 42 | return_tensors='pt' 43 | ) 44 | encoded_tgt = self.tokenizer( 45 | tgt_list, 46 | max_length=self.max_length, 47 | truncation=True, 48 | padding=True, 49 | return_tensors='pt' 50 | ) 51 | src_tokens = encoded_src['input_ids'].to(self.device) 52 | src_mask = encoded_src['attention_mask'].to(self.device) 53 | 54 | tgt_tokens = encoded_tgt['input_ids'].to(self.device) 55 | tgt_mask = encoded_tgt['attention_mask'] 56 | tgt_len = tgt_mask.sum(dim=1).to(self.device) 57 | 58 | output = self.model( 59 | input_ids=src_tokens, 60 | attention_mask=src_mask, 61 | labels=tgt_tokens 62 | ) 63 | logits = output.logits.view(-1, self.model.config.vocab_size) 64 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1)) 65 | loss = loss.view(tgt_tokens.shape[0], -1) 66 | loss = loss.sum(dim=1) / tgt_len 67 | curr_score_list = [-x.item() for x in loss] 68 | score_list += curr_score_list 69 | 70 | except RuntimeError: 71 | traceback.print_exc() 72 | print(f'source: {src_list}') 73 | print(f'target: {tgt_list}') 74 | exit(0) 75 | return score_list 76 | -------------------------------------------------------------------------------- /tigerscore/common/cor_eval.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy 3 | 4 | 5 | def cor_pearson(hypo_scores, ref_scores): 6 | """ 7 | Args: 8 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 9 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 10 | returns: 11 | cor: float, the mean correlation coefficient 12 | """ 13 | if isinstance(hypo_scores, list): 14 | hypo_scores = np.array(hypo_scores) 15 | if isinstance(ref_scores, list): 16 | ref_scores = np.array(ref_scores) 17 | assert hypo_scores.shape == ref_scores.shape 18 | bz, c = hypo_scores.shape 19 | hypo_scores = hypo_scores.reshape(bz, c).T 20 | ref_scores = ref_scores.reshape(bz, c).T 21 | cor = 0 22 | for i in range(c): 23 | cor += np.corrcoef(hypo_scores[i], ref_scores[i])[0, 1] 24 | cor /= c 25 | return cor 26 | 27 | 28 | def 
cor_spearman(hypo_scores, ref_scores): 29 | """ 30 | Args: 31 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 32 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 33 | returns: 34 | cor: float, the Spearman correlation between hypo and ref scores, averaged over the c candidates 35 | """ 36 | if isinstance(hypo_scores, list): 37 | hypo_scores = np.array(hypo_scores) 38 | if isinstance(ref_scores, list): 39 | ref_scores = np.array(ref_scores) 40 | assert hypo_scores.shape == ref_scores.shape 41 | bz, c = hypo_scores.shape 42 | hypo_scores = hypo_scores.reshape(bz, c).T 43 | ref_scores = ref_scores.reshape(bz, c).T 44 | cor = 0 45 | for i in range(c): 46 | cor += scipy.stats.spearmanr(hypo_scores[i], ref_scores[i]).correlation 47 | cor /= c 48 | return cor 49 | 50 | 51 | def cor_spearman_footrule(hypo_scores, ref_scores): 52 | """ 53 | Args: 54 | hypo_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 55 | ref_scores: ndarray of shape (n, c) where n is the number of samples, c is the number of candidates 56 | returns: 57 | cor: float, the Spearman footrule distance: absolute score differences summed over candidates, averaged over samples 58 | """ 59 | if isinstance(hypo_scores, list): 60 | hypo_scores = np.array(hypo_scores) 61 | if isinstance(ref_scores, list): 62 | ref_scores = np.array(ref_scores) 63 | assert hypo_scores.shape == ref_scores.shape 64 | bz, c = hypo_scores.shape 65 | hypo_scores = hypo_scores.reshape(bz, c) 66 | ref_scores = ref_scores.reshape(bz, c) 67 | return np.abs(hypo_scores - ref_scores).sum(axis=-1).mean() 68 | -------------------------------------------------------------------------------- /tigerscore/common/download.sh: -------------------------------------------------------------------------------- 1 | # Download BLEURT 2 | wget https://storage.googleapis.com/bleurt-oss/bleurt-large-512.zip .
3 | unzip bleurt-large-512.zip 4 | mv bleurt-large-512 models/ 5 | rm bleurt-large-512.zip 6 | 7 | # Download PRISM 8 | wget http://data.statmt.org/prism/m39v1.tar 9 | tar xf m39v1.tar 10 | mv m39v1 models/ 11 | rm m39v1.tar -------------------------------------------------------------------------------- /tigerscore/common/flan_score.py: -------------------------------------------------------------------------------- 1 | # %% 2 | """ 3 | From https://github.com/xu1998hz/SEScore3 4 | """ 5 | import torch 6 | import torch.nn as nn 7 | import traceback 8 | from transformers import AutoModelForSeq2SeqLM, AutoTokenizer 9 | 10 | 11 | class FLANScorer: 12 | def __init__(self, device='cuda:0', max_length=1024, checkpoint='google/flan-t5-base'): 13 | # Set up model 14 | self.device = device 15 | self.max_length = max_length 16 | self.tokenizer = AutoTokenizer.from_pretrained(checkpoint) 17 | self.model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint) 18 | self.model.eval() 19 | self.model.to(device) 20 | # Set up loss 21 | self.loss_fct = nn.NLLLoss( 22 | reduction='none', ignore_index=self.model.config.pad_token_id) 23 | self.lsm = nn.LogSoftmax(dim=1) 24 | 25 | def load(self): 26 | """ Load model from paraphrase finetuning """ 27 | self.model.load_state_dict(torch.load( 28 | 'models/bart.pth', map_location=self.device)) 29 | 30 | def score(self, srcs, tgts, batch_size): 31 | """ Score a batch of examples """ 32 | score_list = [] 33 | for i in range(0, len(srcs), batch_size): 34 | src_list = srcs[i: i + batch_size] 35 | tgt_list = tgts[i: i + batch_size] 36 | if i < 1: 37 | pass 38 | # print('src_list: ',src_list) 39 | # print('tgt_list: ', tgt_list) 40 | try: 41 | with torch.no_grad(): 42 | encoded_src = self.tokenizer( 43 | src_list, 44 | max_length=self.max_length, 45 | truncation=True, 46 | padding=True, 47 | return_tensors='pt' 48 | ) 49 | encoded_tgt = self.tokenizer( 50 | tgt_list, 51 | max_length=self.max_length, 52 | truncation=True, 53 | padding=True, 54 | return_tensors='pt' 55 | ) 56 | src_tokens = encoded_src['input_ids'].to(self.device) 57 | src_mask = encoded_src['attention_mask'].to(self.device) 58 | tgt_tokens = encoded_tgt['input_ids'].to(self.device) 59 | tgt_mask = encoded_tgt['attention_mask'] 60 | tgt_len = tgt_mask.sum(dim=1).to(self.device) 61 | 62 | output = self.model( 63 | input_ids=src_tokens, 64 | attention_mask=src_mask, 65 | labels=tgt_tokens 66 | ) 67 | logits = output.logits.view(-1, 68 | self.model.config.vocab_size) 69 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1)) 70 | loss = loss.view(tgt_tokens.shape[0], -1) 71 | loss = loss.sum(dim=1) / tgt_len 72 | curr_score_list = [-x.item() for x in loss] 73 | score_list += curr_score_list 74 | 75 | except RuntimeError: 76 | traceback.print_exc() 77 | print(f'source: {src_list}') 78 | print(f'target: {tgt_list}') 79 | exit(0) 80 | return score_list 81 | -------------------------------------------------------------------------------- /tigerscore/common/requirements.txt: -------------------------------------------------------------------------------- 1 | torch 2 | transformers 3 | git+https://github.com/Unbabel/COMET.git 4 | git+https://github.com/jdf-prog/UniEval.git 5 | nltk 6 | git+https://github.com/google-research/bleurt.git 7 | fire 8 | rouge_score 9 | bert_score 10 | git+https://github.com/huggingface/evaluate@18932858570b9fa97ac478e1e6e709438e4d093b 11 | pycocoevalcap 12 | spacy 13 | git+https://github.com/google-research/mt-metrics-eval.git 14 | prettytable 15 | psutil 16 | sacrebleu 17 | 
mosestokenizer 18 | pytorch-lightning==2.0.0 -------------------------------------------------------------------------------- /tigerscore/common/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | import os 3 | import numpy as np 4 | import torch 5 | import argparse 6 | import hashlib 7 | import requests 8 | import time 9 | from io import BytesIO 10 | from tqdm import tqdm 11 | from PIL import Image 12 | from concurrent.futures import ThreadPoolExecutor 13 | from functools import partial 14 | from datasets.utils.file_utils import get_datasets_user_agent 15 | 16 | USER_AGENT = get_datasets_user_agent() 17 | 18 | 19 | def seed_everything(seed=42): 20 | """ 21 | Seed everything for reproducibility 22 | """ 23 | random.seed(seed) 24 | os.environ['PYTHONHASHSEED'] = str(seed) 25 | np.random.seed(seed) 26 | torch.manual_seed(seed) 27 | torch.cuda.manual_seed(seed) 28 | torch.backends.cudnn.deterministic = True 29 | 30 | 31 | def str2bool(v): 32 | """ 33 | Convert string to boolean 34 | """ 35 | if isinstance(v, bool): 36 | return v 37 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 38 | return True 39 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 40 | return False 41 | else: 42 | raise argparse.ArgumentTypeError('Boolean value expected.') 43 | 44 | 45 | def empty2None(x): 46 | if x == '': 47 | return None 48 | elif isinstance(x, str): 49 | return x 50 | else: 51 | raise argparse.ArgumentTypeError('String value expected.') 52 | 53 | 54 | def empty2Noneint(x): 55 | if x == '': 56 | return None 57 | elif isinstance(x, int): 58 | return x 59 | elif isinstance(x, str): 60 | return int(x) 61 | else: 62 | raise argparse.ArgumentTypeError('Integer value expected.') 63 | 64 | 65 | def empty2zero(x): 66 | if x == '': 67 | return 0 68 | elif isinstance(x, int): 69 | return x 70 | elif isinstance(x, str): 71 | return int(x) 72 | else: 73 | raise argparse.ArgumentTypeError('Integer value expected.') 74 | 75 | 76 | def generate_hash_code(text): 77 | if text is None: 78 | return None 79 | # Convert the text to bytes and create a hash object 80 | hash_object = hashlib.sha256(text.encode()) 81 | 82 | # Get the hexadecimal representation of the hash code 83 | hex_code = hash_object.hexdigest() 84 | 85 | # Return the first 16 digits of the hexadecimal code 86 | return hex_code[:16] 87 | 88 | 89 | def fetch_single_image(image_url, timeout=None, retries=2): 90 | """ 91 | Fetch a single image from a URL. 92 | """ 93 | if os.path.exists(image_url): 94 | # fetch from local 95 | try: 96 | image = Image.open(image_url).convert("RGB") 97 | except Exception: 98 | if retries > 0: 99 | time.sleep(3) 100 | return fetch_single_image(image_url, timeout=timeout, retries=retries - 1) 101 | else: 102 | # fetch from url 103 | try: 104 | r = requests.get(image_url, timeout=timeout, 105 | stream=True, headers={"User-Agent": USER_AGENT}) 106 | r.raise_for_status() 107 | image = Image.open(BytesIO(r.content)).convert("RGB") 108 | except Exception as e: 109 | if retries > 0: 110 | time.sleep(3) # Wait 3 seconds before retrying 111 | return fetch_single_image(image_url, timeout=timeout, retries=retries - 1) 112 | else: 113 | print( 114 | f"Failed to fetch image from {image_url} after {retries} retries") 115 | raise e 116 | return image 117 | 118 | 119 | def fetch_images(image_urls, num_threads, timeout=None, retries=2): 120 | """ 121 | Fetch images from a list of URLs in parallel. 122 | Args: 123 | image_urls (list): List of image URLs. 
124 | num_threads (int): Number of threads to use. 125 | timeout (int, optional): Timeout for the request. Defaults to None. 126 | retries (int, optional): Number of retries. Defaults to 0. 127 | Returns: 128 | list: List of PIL images. 129 | """ 130 | fetch_single_image_with_args = partial( 131 | fetch_single_image, timeout=timeout, retries=retries) 132 | with ThreadPoolExecutor(max_workers=num_threads) as executor: 133 | images = list( 134 | tqdm( 135 | executor.map(fetch_single_image_with_args, image_urls), 136 | total=len(image_urls), 137 | desc="Fetching images") 138 | ) 139 | print("Fetched {} images".format(len(images))) 140 | return images 141 | -------------------------------------------------------------------------------- /tigerscore/download_dataset/bartscore_data_process.py: -------------------------------------------------------------------------------- 1 | """ 2 | Unzip the data files and convert them to json format. 3 | """ 4 | import os 5 | import json 6 | import argparse 7 | import pickle 8 | from pathlib import Path 9 | 10 | if __name__ == '__main__': 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--data_dir', type=str, required=True) 13 | parser.add_argument('--task', type=str, required=True) 14 | parser.add_argument('--rm_old', action='store_true') 15 | 16 | args = parser.parse_args() 17 | data_dir = args.data_dir 18 | 19 | task_dir = Path(data_dir) / args.task 20 | for data_file in os.listdir(task_dir): 21 | if not data_file.endswith('.pkl'): 22 | continue 23 | print("Data file: ", data_file) 24 | data_path = task_dir / data_file 25 | with open(data_path, 'rb') as f: 26 | data = pickle.load(f) 27 | print("# of data: ", len(data)) 28 | if isinstance(data, dict): 29 | print("Data Example: ", data[list(data.keys())[0]]) 30 | elif isinstance(data, list): 31 | print("Data example: ", data[0]) 32 | with open(data_path.with_suffix('.json'), 'w') as f: 33 | json.dump(data, f, indent=4) 34 | if args.rm_old: 35 | data_path.unlink() 36 | -------------------------------------------------------------------------------- /tigerscore/download_dataset/datasets_scripts/fetaqa.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/tigerscore/download_dataset/datasets_scripts/fetaqa.sh -------------------------------------------------------------------------------- /tigerscore/download_dataset/download_bartscore_data.sh: -------------------------------------------------------------------------------- 1 | # Download the BARTScore used system outputs and references 2 | scripts_dir=$(pwd) 3 | data_dir="../../data/bartscore_data" 4 | mkdir -p $data_dir 5 | 6 | # Summarization 7 | cd $data_dir 8 | datasets=("Newsroom" "QAGS_CNN" "QAGS_XSUM" "REALSumm" "Rank19" "SummEval") 9 | mkdir -p summarization 10 | for dataset in ${datasets[@]}; do 11 | wget "https://github.com/neulab/BARTScore/raw/main/SUM/${dataset}/data.pkl" -O "summarization/${dataset}.pkl" 12 | done 13 | cd $scripts_dir 14 | python bartscore_data_process.py --data_dir "$data_dir" --task "summarization" 15 | 16 | 17 | # Translation 18 | cd $data_dir 19 | datasets=("de-en" "fi-en" "gu-en" "kk-en" "lt-en" "ru-en" "zh-en") 20 | mkdir -p translation 21 | for dataset in ${datasets[@]}; do 22 | wget "https://github.com/neulab/BARTScore/raw/main/WMT/${dataset}/data.pkl" -O "translation/${dataset}.pkl" 23 | done 24 | cd $scripts_dir 25 | python bartscore_data_process.py --data_dir "$data_dir" 
--task "translation" 26 | 27 | # Data2Text 28 | cd $data_dir 29 | datasets=("BAGEL" "SFHOT" "SFRES") 30 | mkdir -p data2text 31 | for dataset in ${datasets[@]}; do 32 | wget "https://github.com/neulab/BARTScore/raw/main/D2T/${dataset}/data.pkl" -O "data2text/${dataset}.pkl" 33 | done 34 | cd $scripts_dir 35 | python bartscore_data_process.py --data_dir "$data_dir" --task "data2text" -------------------------------------------------------------------------------- /tigerscore/download_dataset/download_general_datasets.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --time=18:00:00 3 | #SBATCH --job-name=downloading_general_datasets 4 | #SBATCH --output ../../jobs/%j.out 5 | #SBATCH --nodelist=ink-gary 6 | #SBATCH -n 1 7 | 8 | python download_general_datasets.py --task "mathQA" --overwrite False 9 | python download_general_datasets.py --task "summarization" --overwrite False 10 | python download_general_datasets.py --task "translation" --overwrite False 11 | python download_general_datasets.py --task "data2text" --overwrite False 12 | python download_general_datasets.py --task "long-form QA" --overwrite False 13 | python download_general_datasets.py --task "instruction-following" --overwrite False 14 | # python download_general_datasets.py --task "story_generation" 15 | # python download_general_datasets.py --task "image_captioning" 16 | python download_general_datasets.py --task "code" -------------------------------------------------------------------------------- /tigerscore/download_dataset/preprocess_utils_totto.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Google AI Language Team Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 
15 | """Baseline preprocessing utilities.""" 16 | import copy 17 | 18 | 19 | def _add_adjusted_col_offsets(table): 20 | """Add adjusted column offsets to take into account multi-column cells.""" 21 | adjusted_table = [] 22 | for row in table: 23 | real_col_index = 0 24 | adjusted_row = [] 25 | for cell in row: 26 | adjusted_cell = copy.deepcopy(cell) 27 | adjusted_cell["adjusted_col_start"] = real_col_index 28 | adjusted_cell["adjusted_col_end"] = ( 29 | adjusted_cell["adjusted_col_start"] + adjusted_cell["column_span"]) 30 | real_col_index += adjusted_cell["column_span"] 31 | adjusted_row.append(adjusted_cell) 32 | adjusted_table.append(adjusted_row) 33 | return adjusted_table 34 | 35 | 36 | def _get_heuristic_row_headers(adjusted_table, row_index, col_index): 37 | """Heuristic to find row headers.""" 38 | row_headers = [] 39 | row = adjusted_table[row_index] 40 | for i in range(0, col_index): 41 | if row[i]["is_header"]: 42 | row_headers.append(row[i]) 43 | return row_headers 44 | 45 | 46 | def _get_heuristic_col_headers(adjusted_table, row_index, col_index): 47 | """Heuristic to find column headers.""" 48 | adjusted_cell = adjusted_table[row_index][col_index] 49 | adjusted_col_start = adjusted_cell["adjusted_col_start"] 50 | adjusted_col_end = adjusted_cell["adjusted_col_end"] 51 | col_headers = [] 52 | for r in range(0, row_index): 53 | row = adjusted_table[r] 54 | for cell in row: 55 | if (cell["adjusted_col_start"] < adjusted_col_end and 56 | cell["adjusted_col_end"] > adjusted_col_start): 57 | if cell["is_header"]: 58 | col_headers.append(cell) 59 | 60 | return col_headers 61 | 62 | 63 | def get_highlighted_subtable(table, cell_indices, with_heuristic_headers=False): 64 | """Extract out the highlighted part of a table.""" 65 | highlighted_table = [] 66 | 67 | adjusted_table = _add_adjusted_col_offsets(table) 68 | 69 | for (row_index, col_index) in cell_indices: 70 | cell = table[row_index][col_index] 71 | if with_heuristic_headers: 72 | row_headers = _get_heuristic_row_headers(adjusted_table, row_index, 73 | col_index) 74 | col_headers = _get_heuristic_col_headers(adjusted_table, row_index, 75 | col_index) 76 | else: 77 | row_headers = [] 78 | col_headers = [] 79 | 80 | highlighted_cell = { 81 | "cell": cell, 82 | "row_headers": row_headers, 83 | "col_headers": col_headers 84 | } 85 | highlighted_table.append(highlighted_cell) 86 | 87 | return highlighted_table 88 | 89 | 90 | def linearize_full_table(table, cell_indices, table_page_title, 91 | table_section_title): 92 | """Linearize full table with localized headers and return a string.""" 93 | table_str = "" 94 | if table_page_title: 95 | table_str += " " + table_page_title + " " 96 | if table_section_title: 97 | table_str += " " + table_section_title + " " 98 | 99 | table_str += " " 100 | adjusted_table = _add_adjusted_col_offsets(table) 101 | for r_index, row in enumerate(table): 102 | row_str = " " 103 | for c_index, col in enumerate(row): 104 | 105 | row_headers = _get_heuristic_row_headers( 106 | adjusted_table, r_index, c_index) 107 | col_headers = _get_heuristic_col_headers( 108 | adjusted_table, r_index, c_index) 109 | 110 | # Distinguish between highlighted and non-highlighted cells. 111 | if [r_index, c_index] in cell_indices: 112 | start_cell_marker = " " 113 | end_cell_marker = " " 114 | else: 115 | start_cell_marker = " " 116 | end_cell_marker = " " 117 | 118 | # The value of the cell. 119 | item_str = start_cell_marker + col["value"] + " " 120 | 121 | # All the column headers associated with this cell. 
122 | for col_header in col_headers: 123 | item_str += " " + \ 124 | col_header["value"] + " " 125 | 126 | # All the row headers associated with this cell. 127 | for row_header in row_headers: 128 | item_str += " " + \ 129 | row_header["value"] + " " 130 | 131 | item_str += end_cell_marker 132 | row_str += item_str 133 | 134 | row_str += " " 135 | table_str += row_str 136 | 137 | table_str += "
" 138 | if cell_indices: 139 | assert "" in table_str 140 | return table_str 141 | 142 | 143 | def linearize_subtable(subtable, table_page_title, table_section_title): 144 | """Linearize the highlighted subtable and return a string of its contents.""" 145 | table_str = "" 146 | if table_page_title: 147 | table_str += " " + table_page_title + " " 148 | if table_section_title: 149 | table_str += " " + table_section_title + " " 150 | table_str += " " 151 | 152 | for item in subtable: 153 | cell = item["cell"] 154 | row_headers = item["row_headers"] 155 | col_headers = item["col_headers"] 156 | 157 | # The value of the cell. 158 | item_str = " " + cell["value"] + " " 159 | 160 | # All the column headers associated with this cell. 161 | for col_header in col_headers: 162 | item_str += " " + \ 163 | col_header["value"] + " " 164 | 165 | # All the row headers associated with this cell. 166 | for row_header in row_headers: 167 | item_str += " " + \ 168 | row_header["value"] + " " 169 | 170 | item_str += " " 171 | table_str += item_str 172 | 173 | table_str += "
" 174 | return table_str 175 | -------------------------------------------------------------------------------- /tigerscore/download_dataset/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import argparse 3 | 4 | 5 | def generate_hash_code(text): 6 | # Convert the text to bytes and create a hash object 7 | hash_object = hashlib.sha256(text.encode()) 8 | 9 | # Get the hexadecimal representation of the hash code 10 | hex_code = hash_object.hexdigest() 11 | 12 | # Return the first 16 digits of the hexadecimal code 13 | return hex_code[:16] 14 | 15 | 16 | def str2bool(v): 17 | if isinstance(v, bool): 18 | return v 19 | if v.lower() in ('yes', 'true', 't', 'y', '1'): 20 | return True 21 | elif v.lower() in ('no', 'false', 'f', 'n', '0'): 22 | return False 23 | else: 24 | raise argparse.ArgumentTypeError('Boolean value expected.') 25 | 26 | 27 | def empty2None(x): 28 | if x == '': 29 | return None 30 | else: 31 | return x 32 | 33 | 34 | def empty2zero(x): 35 | if x == '': 36 | return 0 37 | elif isinstance(x, int): 38 | return x 39 | elif isinstance(x, str): 40 | return int(x) 41 | else: 42 | raise argparse.ArgumentTypeError('Integer value expected.') 43 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/check_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 4 | import fire 5 | import json 6 | import random 7 | from string import Template 8 | 9 | 10 | template = """ 11 | ${instruction} 12 | ${input} 13 | 14 | Model-generated output: 15 | ${output} 16 | 17 | An error analysis provided: 18 | ${error_analysis} 19 | 20 | Is the error analysis reasonable? 
Answer me "yes" or "no" only.\ 21 | """ 22 | 23 | def main(input_file, output_file, model_name="gpt-4", num_samples=None, num_procs=5): 24 | with open(input_file, "r") as f: 25 | if input_file.endswith(".jsonl"): 26 | input_data = [json.loads(line) for line in f] 27 | elif input_file.endswith(".json"): 28 | input_data = json.load(f) 29 | if num_samples is None: 30 | num_samples = len(input_data) 31 | print(num_samples) 32 | input_data = input_data[:num_samples] 33 | 34 | def process_data(item): 35 | prompt = Template(template=template).substitute( 36 | instruction=item["instruction"], 37 | input=item["input_context"], 38 | output=item["hypo_output"], 39 | error_analysis=item["errors"] 40 | ) 41 | message = [{ 42 | "role": "user", 43 | "content": prompt 44 | }] 45 | chatml_prompt = _chatml_to_prompt(message) 46 | return chatml_prompt 47 | 48 | prompts = list(map(process_data, input_data)) 49 | print(prompts[0]) 50 | completions = openai_completions(prompts, model_name=model_name, num_procs=num_procs, use_cache=False) 51 | print(f"Finished generating {len(completions['completions'])} completions.") 52 | print(f"Total prices: {sum(completions['price_per_example'])}") 53 | for i, completion in enumerate(completions['completions']): 54 | input_data[i]["completion"] = completion 55 | with open(output_file, "w") as f: 56 | if output_file.endswith(".jsonl"): 57 | for item in input_data: 58 | json.dump(item, f) 59 | f.write("\n") 60 | elif output_file.endswith(".json"): 61 | json.dump(input_data, f) 62 | 63 | if __name__ == "__main__": 64 | fire.Fire(main) 65 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/check_data.sh: -------------------------------------------------------------------------------- 1 | # python check_data.py \ 2 | # --input_file "../../data/new_std_400s_m_200s_l_1100s_i3-32k.json" \ 3 | # --output_file "../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.json" \ 4 | # --model_name "gpt-4" \ 5 | # --num_procs 5 6 | 7 | 8 | # python check_data.py \ 9 | # --input_file "../../data/train_mix.jsonl" \ 10 | # --output_file "../../data/train_mix.check_ChatGPT.jsonl" \ 11 | # --model_name "ChatGPT" 12 | 13 | python check_data.py \ 14 | --input_file "../../data/good.jsonl" \ 15 | --output_file "../../data/good.check.json" \ 16 | --model_name "ChatGPT" \ 17 | --num_procs 5 -------------------------------------------------------------------------------- /tigerscore/eval_scripts/check_responses.sh: -------------------------------------------------------------------------------- 1 | model_name="gpt-4" 2 | if [ ${model_name} == "gpt-4" ]; then 3 | export OPENAI_API_KEY= 4 | export OPENAI_API_BASE="" 5 | export OPENAI_API_TYPE="azure" 6 | export OPENAI_API_VERSION="2023-07-01-preview" 7 | fi 8 | 9 | 10 | # python check_responses.py \ 11 | # --input_file "/home//WorkSpace/ExplainableGPTScore/data/wmt/zh-en/train_data.wmt_mqm.distil_new_wmt_mqm_200.json" \ 12 | # --output_file "/home//WorkSpace/ExplainableGPTScore/data/wmt/zh-en/train_data.wmt_mqm.distil_new_wmt_mqm_200.check.json" \ 13 | # --model_name ${model_name} \ 14 | 15 | # python check_responses.py \ 16 | # --input_file "/home//WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.json" \ 17 | # --output_file "/home//WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.check.json" \ 18 | # --model_name ${model_name} \ 19 | 20 | 21 | python check_responses.py \ 22 | --input_file 
"../../data/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.json" \ 23 | --output_file "../../data/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.${model_name}.check.json" \ 24 | --model_name ${model_name} \ -------------------------------------------------------------------------------- /tigerscore/eval_scripts/eval_baseline.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=eval_baseline 3 | #SBATCH -c 3 4 | #SBATCH --partition=a100 5 | #SBATCH --gres=gpu:1 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem=50G 8 | #SBATCH --output=../../jobs/%x/%j.out 9 | metrics=("bleu" "rouge" "bertscore" "bleurt" "comet_da" "bart_score_cnn" "bart_score_para" "bart_score_cnn_src_hypo" "bart_score_para_src_hypo" "unieval_sum" "cometkiwi_da") 10 | 11 | # # summarization 12 | # input_file="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.json" 13 | # output_file="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.eval.json" 14 | # human_score_names="coherence,consistency,fluency,relevance" 15 | # cp -u $input_file $output_file 16 | # for metric in "${metrics[@]}"; do 17 | # echo "Evaluating $metric" 18 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 19 | # --human_score_names "$human_score_names" 20 | # done 21 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 22 | # --human_score_names "$human_score_names" --print_results True 23 | 24 | # # data2text 25 | # input_file="../../data_bak/webnlg/webnlg2020_gen_with_scores.json" 26 | # output_file="../../data_bak/webnlg/webnlg2020_gen_with_scores.eval.json" 27 | # input_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json" 28 | # output_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.eval.json" 29 | # human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure" 30 | # cp -u $input_file $output_file 31 | # metrics=("${metrics[@]}" "instructscore_d2t" "gptscore_flan_d2t" "gptscore_flan_d2t_src_hypo") 32 | # for metric in "${metrics[@]}"; do 33 | # echo "Evaluating $metric" 34 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 35 | # --human_score_names "$human_score_names" 36 | # done 37 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 38 | # --human_score_names "$human_score_names" --print_results True 39 | 40 | # # # long_form_QA 41 | # input_file="../../data_bak/lfqa/test.gpt-4.rank.json" 42 | # output_file="../../data_bak/lfqa/test.gpt-4.rank.eval.json" 43 | # human_score_names="rank" 44 | # cp -u $input_file $output_file 45 | # for metric in "${metrics[@]}"; do 46 | # echo "Evaluating $metric" 47 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 48 | # --human_score_names "$human_score_names" 49 | # done 50 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 51 | # --human_score_names "$human_score_names" --print_results True 52 | 53 | # # instruction-following 54 | # input_file="../../data_bak/llm-blender/mix-instruct/test_data_prepared_300.json" 55 | # output_file="../../data_bak/llm-blender/mix-instruct/test_data_prepared_300.eval.json" 56 | # input_file="../../data/evaluation/instruct/mixinstruct/test_data_prepared.json" 57 | # output_file="../../data/evaluation/instruct/mixinstruct/test_data_prepared.eval.json" 58 | # 
human_score_names="gpt_rank_score" 59 | # # cp -u $input_file $output_file 60 | # metrics=("tigerscore") 61 | # # for metric in "${metrics[@]}"; do 62 | # # echo "Evaluating $metric" 63 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 64 | # # --human_score_names "$human_score_names" 65 | # # done 66 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 67 | # --human_score_names "$human_score_names" --print_results True --average_by "sys" --as_rank "True" 68 | 69 | # mathqa 70 | # input_file="../../data_bak/mathqa/gsm8k_test_output_prepared.json" 71 | # output_file="../../data_bak/mathqa/gsm8k_test_output_prepared.eval.json" 72 | # input_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json" 73 | # output_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.eval.json" 74 | # human_score_names="accuracy" 75 | # metrics=("instructscore") 76 | # cp -u $input_file $output_file 77 | # for metric in "${metrics[@]}"; do 78 | # echo "Evaluating $metric" 79 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 80 | # --human_score_names "$human_score_names" 81 | # done 82 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 83 | # --human_score_names "$human_score_names" --print_results True 84 | 85 | 86 | # # # story_gen 87 | # input_file="../../data/evaluation/storygen/test_data_prepared.json" 88 | # output_file="../../data/evaluation/storygen/test_data_prepared_eval.json" 89 | # metrics=("instructscore") 90 | # human_score_names="human" 91 | # cp -u $input_file $output_file 92 | # # for metric in "${metrics[@]}"; do 93 | # # echo "Evaluating $metric" 94 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 95 | # # --human_score_names "$human_score_names" 96 | # # done 97 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 98 | # --human_score_names "$human_score_names" --print_results True 99 | 100 | # translation 101 | # input_file="../../data/evaluation/translation/wmt22/zh-en/eval_data.json" 102 | # output_file="../../data/evaluation/translation/wmt22/zh-en/eval_data.eval.json" 103 | # human_score_names="mqm" 104 | # metrics=("instructscore_mt_zh-en") 105 | # cp -u $input_file $output_file 106 | # # for metric in "${metrics[@]}"; do 107 | # # echo "Evaluating $metric" 108 | # # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 109 | # # --human_score_names "$human_score_names" 110 | # # done 111 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 112 | # --human_score_names "$human_score_names" --print_results True 113 | 114 | # input_file="../../data/evaluation/hhh_alignment/hhh_alignment.json" 115 | # output_file="../../data/evaluation/hhh_alignment/hhh_alignment.eval.json" 116 | # human_score_names="human_preference" 117 | # metrics=("bart_score_para_src_hypo") 118 | # cp -u $input_file $output_file 119 | # for metric in "${metrics[@]}"; do 120 | # echo "Evaluating $metric" 121 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 122 | # --human_score_names "$human_score_names" 123 | # done 124 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 125 | # 
--human_score_names "$human_score_names" --add_aggrement True --print_results True 126 | 127 | # input_file="../../data/evaluation/mtbench/mt_bench_human_judgments.json" 128 | # output_file="../../data/evaluation/mtbench/mt_bench_human_judgments.eval.json" 129 | # human_score_names="human_preference" 130 | # metrics=("bart_score_para_src_hypo") 131 | # cp -u $input_file $output_file 132 | # for metric in "${metrics[@]}"; do 133 | # echo "Evaluating $metric" 134 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 135 | # --human_score_names "$human_score_names" 136 | # done 137 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 138 | # --human_score_names "$human_score_names" --add_aggrement True --print_results True 139 | 140 | 141 | # input_file="../../data/evaluation/pair_cmp/test_data_prepared.json" 142 | # output_file="../../data/evaluation/pair_cmp/test_data_prepared.eval.json" 143 | # human_score_names="gpt_rank_score" 144 | # cp -u $input_file $output_file 145 | # # metrics=("bleu" "rouge" "bertscore" "bleurt" "comet_da" "bart_score_cnn" "unieval_sum" "cometkiwi_da") 146 | # metrics=("unieval_sum") 147 | # for metric in "${metrics[@]}"; do 148 | # echo "Evaluating $metric" 149 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metric" \ 150 | # --human_score_names "$human_score_names" 151 | # done 152 | # python eval_baseline.py --input_file $output_file --output_file $output_file --metrics "$metrics" \ 153 | # --human_score_names "$human_score_names" --print_results True --average_by "sys" --as_rank "True" 154 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_distill_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | import json 5 | import random 6 | import logging 7 | import sys 8 | import fire 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).parent.parent)) 11 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json_per_aspect 12 | from xgptscore.xgptscore import xgptscore 13 | from xgptscore.constants import EVAL_ASPECTS 14 | logging.basicConfig(level=logging.warning) 15 | 16 | 17 | def main( 18 | task: str, 19 | xgptscore_mode: str, 20 | model_name: str, 21 | input_file: str, 22 | version_key: str = None, 23 | overwrite: bool = False, 24 | max_size: int = None, 25 | seed: int = 42, 26 | shuffle: bool = False, 27 | ): 28 | 29 | logging.warning("Loading from {}".format(input_file)) 30 | with open(input_file, "r") as f: 31 | items = json.load(f) 32 | if shuffle: 33 | random.seed(seed) 34 | random.shuffle(items) 35 | suffix = f".{xgptscore_mode}.{model_name}" 36 | if version_key: 37 | suffix += f".{version_key}" 38 | if isinstance(max_size, int) and max_size > 0: 39 | items = items[:max_size] 40 | suffix += f".{max_size}" 41 | output_file = Path(input_file).with_suffix(f"{suffix}.json") 42 | 43 | xgptitems = [] 44 | for item in items: 45 | for cand in item['candidates']: 46 | xgptitems.append(XPGTItem( 47 | task=task, 48 | instruction=item['instruction'], 49 | input=item['input'], 50 | ref_output=item['refs'] if 'refs' in item else item['output'], 51 | hypo_output=cand['text'] 52 | )) 53 | 54 | if not output_file.exists() or overwrite: 55 | logging.warning("Running xgptscore") 56 | # run xgptscore 57 | result = xgptscore(xgptitems, mode=xgptscore_mode, 58 | 
model_name=model_name, num_workers=5) 59 | idx = 0 60 | aspects = EVAL_ASPECTS[task].keys() 61 | score_dict = {"xgptscore_" + aspect: 0 for aspect in aspects} 62 | for item in items: 63 | for cand in item['candidates']: 64 | cand['responses'] = result['round_completions'][idx] 65 | cand['messages_records'] = result['messages_records'][idx] 66 | xgptscore_ans = get_xgptscore_from_json_per_aspect( 67 | cand['responses'][-1]) 68 | if xgptscore_ans is None: 69 | logging.info(f"XGPTScore failed for {cand['text']}") 70 | # cand['scores']['xgptscore'] = None 71 | else: 72 | cand['scores'].update(score_dict) 73 | cand['scores'].update(xgptscore_ans) 74 | idx += 1 75 | with open(output_file, "w") as f: 76 | json.dump(items, f, indent=4, ensure_ascii=False) 77 | logging.info("Saved to {}".format(output_file)) 78 | else: 79 | logging.warning("Found existing {}".format(output_file)) 80 | logging.warning("Skipping xgptscore") 81 | 82 | 83 | if __name__ == "__main__": 84 | logging.basicConfig(level=logging.warning) 85 | fire.Fire(main) 86 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_distill_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=generate_distill_data 3 | #SBATCH -c 2 4 | #SBATCH --time=24:00:00 5 | #SBATCH --mem=10G 6 | #SBATCH --output=../../jobs/%x/%j.out 7 | 8 | version_key="distill" 9 | overwrite=True 10 | model_name="ChatGPT" 11 | if [ ${model_name} == "gpt-4" ]; then 12 | export OPENAI_API_KEY= 13 | export OPENAI_API_BASE="" 14 | export OPENAI_API_TYPE="azure" 15 | export OPENAI_API_VERSION="2023-07-01-preview" 16 | fi 17 | 18 | # task='translation' 19 | # xgptscore_mode="wmt_mqm" 20 | # input_file="../../data/synthesis_min/translation/train_data.kb_txt.distill.syn_cand.json" 21 | # python generate_distill_data.py \ 22 | # --task ${task} \ 23 | # --input_file ${input_file} \ 24 | # --xgptscore_mode ${xgptscore_mode} \ 25 | # --version_key ${version_key} \ 26 | # --model_name ${model_name} \ 27 | # --overwrite ${overwrite} \ 28 | 29 | # task='summarization' 30 | # xgptscore_mode="align_score" 31 | # input_file="../../data/synthesis_min/summarization/train_data.kb_txt.distill.syn_cand.json" 32 | # python generate_distill_data.py \ 33 | # --task ${task} \ 34 | # --input_file ${input_file} \ 35 | # --xgptscore_mode ${xgptscore_mode} \ 36 | # --version_key ${version_key} \ 37 | # --model_name ${model_name} \ 38 | # --overwrite ${overwrite} \ 39 | 40 | # task='data2text' 41 | # xgptscore_mode="d2t" 42 | # input_file="../../data/synthesis_min/data2text/train_data.kb_txt.distill.syn_cand.json" 43 | # python generate_distill_data.py \ 44 | # --task ${task} \ 45 | # --input_file ${input_file} \ 46 | # --xgptscore_mode ${xgptscore_mode} \ 47 | # --version_key ${version_key} \ 48 | # --model_name ${model_name} \ 49 | # --overwrite ${overwrite} \ 50 | 51 | # task='instruction-following' 52 | # xgptscore_mode="instruction_following" 53 | # input_file="../../data/synthesis_min/instruction-following/train_data.kb_txt.distill.syn_cand.json" 54 | # python generate_distill_data.py \ 55 | # --task ${task} \ 56 | # --input_file ${input_file} \ 57 | # --xgptscore_mode ${xgptscore_mode} \ 58 | # --version_key ${version_key} \ 59 | # --model_name ${model_name} \ 60 | # --overwrite ${overwrite} \ 61 | 62 | task='long-form QA' 63 | xgptscore_mode="longform_qa" 64 | input_file="../../data/synthesis_min/long-form QA/train_data.kb_txt.distill.syn_cand.json" 65 | 
python generate_distill_data.py \ 66 | --task "${task}" \ 67 | --input_file "${input_file}" \ 68 | --xgptscore_mode ${xgptscore_mode} \ 69 | --version_key ${version_key} \ 70 | --model_name ${model_name} \ 71 | --overwrite ${overwrite} \ 72 | 73 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_inst_synthetic_data.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append("..") 3 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 4 | import fire 5 | import json 6 | import random 7 | from string import Template 8 | 9 | 10 | template = """ 11 | Instruction: 12 | ${instruction} 13 | ${input} 14 | 15 | A ground-truth response: 16 | ${output} 17 | 18 | A model will be asked to respond to this instruction. However, that response might contain errors in various aspects. 19 | 20 | Please first output 5 possible error aspects if a model is asked to generate a response for the above instruction. The error aspects don't have to be one of the above aspects and can be any aspect that you think is reasonable for this instruction. 21 | 22 | Then generate an incorrect response contains up to ${num_errors} errors of these aspects. Each error corresponds to one of the aspect. 23 | The incorrect response should mimic style the real-generation of a model. 24 | 25 | Then give an analysis of these errors. For each error, give me the 26 | - error location (the substring that is wrong in the generated incorrect output) 27 | - error aspect 28 | - explanation (the generic error type description, why it's an error, and the correction suggestions) 29 | - severity ("major" or "minor") 30 | - score reduction (an integer between 1 to 5 given the severity of the error) 31 | 32 | Output format: 33 | Generated incorrect output: 34 | 35 | Error location 1: 36 | Error aspect 1: 37 | Explanation 1: 38 | Severity 1: 39 | Score reduction 1: 40 | ... 41 | """ 42 | 43 | math_template = """ 44 | Question: 45 | ${instruction} 46 | ${input} 47 | 48 | A ground-truth answer: 49 | ${output} 50 | 51 | A model will be asked to answer this math question. However, that response might contain errors in various aspects such as Problem Understanding, Problem Formulation, Computing Accuracy, Solution Interpretation, etc. 52 | 53 | Please first output a few possible error aspects if a model is asked to generate a response for the above instruction. The error aspects don't have to be one of the above aspects and can be any aspect that you think is reasonable for this instruction. 54 | 55 | Then generate an incorrect response contains up to ${num_errors} errors of these aspects. Each error corresponds to one of the aspect. 56 | The incorrect response should mimic style the real-generation of a model. 57 | 58 | Then give an analysis of these errors. For each error, give me the 59 | - error location (the substring that is wrong in the generated incorrect output) 60 | - error aspect 61 | - explanation (the generic error type description, why it's an error, and the correction suggestions) 62 | - severity ("major" or "minor") 63 | - score reduction (an integer between 0.5 to 5 given the severity of the error) 64 | 65 | Output format: 66 | Generated incorrect output: 67 | 68 | Error location 1: 69 | Error aspect 1: 70 | Explanation 1: 71 | Severity 1: 72 | Score reduction 1: 73 | ... 
74 | """ 75 | 76 | def main( 77 | input_file, output_file, 78 | model_name="gpt-4", num_samples=None, 79 | num_procs=5, seed=42, 80 | task='inst-fol'): 81 | random.seed(seed) 82 | with open(input_file, "r") as f: 83 | if input_file.endswith(".jsonl"): 84 | input_data = [json.loads(line) for line in f] 85 | elif input_file.endswith(".json"): 86 | input_data = json.load(f) 87 | if num_samples is None: 88 | num_samples = len(input_data) 89 | print(num_samples) 90 | input_data = input_data[:num_samples] 91 | 92 | def process_data(item): 93 | if task == 'math': 94 | _template = math_template 95 | else: 96 | _template = template 97 | prompt = Template(template=_template).substitute( 98 | instruction=item["instruction"], 99 | input=item["input"], 100 | output=item["output"], 101 | num_errors=random.randint(1, 5) 102 | ) 103 | message = [{ 104 | "role": "user", 105 | "content": prompt 106 | }] 107 | chatml_prompt = _chatml_to_prompt(message) 108 | return chatml_prompt 109 | 110 | prompts = list(map(process_data, input_data)) 111 | print(prompts[0]) 112 | completions = openai_completions(prompts, model_name=model_name, num_procs=num_procs, use_cache=True) 113 | print(f"Finished generating {len(completions['completions'])} completions.") 114 | print(f"Total prices: {sum(completions['price_per_example'])}") 115 | for i, completion in enumerate(completions['completions']): 116 | input_data[i]["completion"] = completion 117 | with open(output_file, "w") as f: 118 | if output_file.endswith(".jsonl"): 119 | for item in input_data: 120 | json.dump(item, f) 121 | f.write("\n") 122 | elif output_file.endswith(".json"): 123 | json.dump(input_data, f) 124 | 125 | if __name__ == "__main__": 126 | fire.Fire(main) 127 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_inst_synthetic_data.sh: -------------------------------------------------------------------------------- 1 | # python generate_inst_synthetic_data.py \ 2 | # --input_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.10k.jsonl" \ 3 | # --output_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.10k.gen.jsonl" \ 4 | # --model_name "gpt-4" \ 5 | # --num_samples 8000 6 | 7 | python generate_inst_synthetic_data.py \ 8 | --input_file "../../data/additional/metamath/metamath.8k.jsonl" \ 9 | --output_file "../../data/additional/metamath/metamath.8k.gen.jsonl" \ 10 | --model_name "gpt-4" \ 11 | --num_samples 10 12 | 13 | # python generate_inst_synthetic_data.py \ 14 | # --input_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.jsonl" \ 15 | # --output_file "../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.jsonl" \ 16 | # --model_name "gpt-4" \ -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_synthesis_distill_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Generate synthesis distillation data from a json file. 
3 | """ 4 | import json 5 | import random 6 | import logging 7 | import sys 8 | import fire 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).parent.parent)) 11 | from xgptscore.process_utils import XPGTItem 12 | from xgptscore.xgptscore import xgptscore 13 | logging.basicConfig(level=logging.warning) 14 | 15 | 16 | def main( 17 | task: str, 18 | input_file: str, 19 | output_file: str = None, 20 | xgptscore_mode: str = "kb_txt", 21 | model_name: str = "gpt-4", 22 | version_key: str = "default", 23 | overwrite: bool = False, 24 | max_size: int = None, 25 | seed: int = 42, 26 | shuffle_file: bool = False, 27 | source_max_length: int = None, 28 | ref_max_length: int = None, 29 | hypo_max_length: int = None, 30 | ): 31 | logging.warning("Params: \n{}".format(json.dumps(locals(), indent=4))) 32 | # params 33 | if isinstance(max_size, int) and max_size > 0: 34 | version_key = f"{version_key}_{max_size}" 35 | # load data 36 | input_file = Path(input_file) 37 | if not output_file: 38 | output_file = input_file.with_suffix( 39 | f".{xgptscore_mode}.{version_key}.json") 40 | else: 41 | output_file = Path(output_file) 42 | with open(input_file, "r") as f: 43 | items = json.load(f) 44 | logging.warning("Loaded {} items from {}".format( 45 | len(items), input_file)) 46 | logging.warning("Preparing writing to {}...".format(output_file)) 47 | 48 | random.seed(seed) 49 | logging.warning("Set seed to {}".format(seed)) 50 | if shuffle_file: 51 | random.shuffle(items) 52 | logging.warning("Shuffled {} items".format(len(items))) 53 | if isinstance(max_size, int) and max_size > 0: 54 | items = items[:max_size] 55 | logging.warning("Truncated to {} items".format(len(items))) 56 | elif isinstance(max_size, float) and max_size > 0 and max_size < 1: 57 | items = random.sample(items, int(len(items) * max_size)) 58 | logging.warning("Sampled to {} items".format(len(items))) 59 | 60 | xgptitems = [] 61 | for item in items: 62 | xgptitems.append(XPGTItem( 63 | task=task, 64 | instruction=item['instruction'], 65 | input=item['input'], 66 | ref_output=item['output'] if "output" in item else item['refs'], 67 | hypo_output=None, 68 | )) 69 | if "candidates" in item: 70 | del item["candidates"] 71 | 72 | if not output_file.exists() or overwrite: 73 | logging.warning("Running xgptscore") 74 | # run xgptscore 75 | xgptscore_params = { 76 | "max_lengths": { 77 | "input": source_max_length, 78 | "hypo_output": hypo_max_length, 79 | "ref_output": ref_max_length, 80 | }, 81 | } 82 | result = xgptscore(xgptitems, mode=xgptscore_mode, 83 | model_name=model_name, **xgptscore_params) 84 | for i, item in enumerate(items): 85 | item['responses'] = result['round_completions'][i] 86 | item['messages_records'] = result['messages_records'][i] 87 | with open(output_file, "w") as f: 88 | json.dump(items, f, indent=4, ensure_ascii=False) 89 | logging.warning("Saved to {}".format(output_file)) 90 | else: 91 | logging.warning("Loading from {}".format(output_file)) 92 | with open(output_file, "r") as f: 93 | items = json.load(f) 94 | 95 | 96 | if __name__ == "__main__": 97 | fire.Fire(main) 98 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/generate_synthesis_distill_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=synthesis_distill_data 3 | #SBATCH --time=48:00:00 4 | #SBATCH --output=../../jobs/synthesis_distill_data/%j.out 5 | 6 | xgptscore_mode="kb_txt" 7 | version_key="distill" 8 | 
model_name="gpt-4" 9 | if [ ${model_name} == "gpt-4" ]; then 10 | export OPENAI_API_KEY= 11 | export OPENAI_API_BASE="" 12 | export OPENAI_API_TYPE="azure" 13 | export OPENAI_API_VERSION="2023-07-01-preview" 14 | fi 15 | 16 | IFS=$'\n' 17 | tasks=("translation" "long-form QA" "summarization" "data2text" "mathQA" "instruction-following") 18 | for task in ${tasks[@]}; do 19 | input_file="/home//WorkSpace/ExplainableGPTScore/data/synthesis/${task}/train_data.json" 20 | echo task: $task 21 | python generate_synthesis_distill_data.py \ 22 | --task $task \ 23 | --xgptscore_mode $xgptscore_mode \ 24 | --version_key $version_key \ 25 | --model_name $model_name \ 26 | --input_file $input_file \ 27 | --source_max_length 512 \ 28 | --overwrite "False" \ 29 | 30 | done -------------------------------------------------------------------------------- /tigerscore/eval_scripts/get_systhesis_ref_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=synthesis_distill_data 3 | #SBATCH --time=48:00:00 4 | #SBATCH --output=../../jobs/synthesis_distill_data/%j.out 5 | 6 | xgptscore_mode="paraphrase" 7 | version_key="distill" 8 | model_name="gpt-4" 9 | if [ ${model_name} == "gpt-4" ]; then 10 | export OPENAI_API_KEY= 11 | export OPENAI_API_BASE="" 12 | export OPENAI_API_TYPE="azure" 13 | export OPENAI_API_VERSION="2023-07-01-preview" 14 | fi 15 | 16 | IFS=$'\n' 17 | # tasks=("translation" "long-form QA" "summarization" "data2text" "mathQA" "instruction-following") 18 | tasks=("translation") 19 | for task in ${tasks[@]}; do 20 | input_file="../../data/synthesis/${task}/train_data.json" 21 | echo task: $task 22 | python generate_synthesis_distill_data.py \ 23 | --task $task \ 24 | --xgptscore_mode $xgptscore_mode \ 25 | --version_key $version_key \ 26 | --model_name $model_name \ 27 | --input_file $input_file \ 28 | --source_max_length 512 \ 29 | --overwrite "False" \ 30 | --shuffle_file True \ 31 | --max_size 0.15 \ 32 | 33 | done -------------------------------------------------------------------------------- /tigerscore/eval_scripts/lfqa_gpt_rate.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file isn't used in our final version. 3 | """ 4 | import sys 5 | import fire 6 | import json 7 | import logging 8 | import regex as re 9 | import random 10 | sys.path.append("..") 11 | from collections import Counter, defaultdict 12 | from string import Template 13 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 14 | logging.basicConfig(level=logging.WARNING) 15 | 16 | rank_template = """ 17 | 4 different models are asked to follow a given instruction to generate an answer based on a given source input. 18 | The instruction is: ${instruction} 19 | The source input is: ${source} 20 | The generated output of model 1 is: ${model1_generated} 21 | The generated output of model 2 is: ${model2_generated} 22 | The generated output of model 3 is: ${model3_generated} 23 | The generated output of model 4 is: ${model4_generated} 24 | The reference output is: ${reference} 25 | 26 | Now Please rank the 4 model's outputs from best to worst. 27 | Please first output the rank results in the following format: 28 | [best] [second best] [third best] [worst] (e.g. 1 2 3 4) 29 | Then give your brief comments on why you rank the outputs in this way. 
30 | """ 31 | 32 | 33 | def get_rank_prompts( 34 | item: dict 35 | ): 36 | random.shuffle(item['candidates']) 37 | rank_prompt = Template(rank_template).substitute( 38 | instruction=item['instruction'], 39 | source=item['input'], 40 | model1_generated=item['candidates'][0]['text'], 41 | model2_generated=item['candidates'][1]['text'], 42 | model3_generated=item['candidates'][2]['text'], 43 | model4_generated=item['candidates'][3]['text'], 44 | reference=item.get('output') or item.get("refs")[0], 45 | ) 46 | return rank_prompt 47 | 48 | 49 | def main( 50 | input_file: str, 51 | output_file: str, 52 | seed: int = 42, 53 | model_name: str = "ChatGPT", 54 | ): 55 | random.seed(seed) 56 | with open(input_file, "r") as f: 57 | data = json.load(f) 58 | 59 | rank_prompts = list(map(get_rank_prompts, data)) 60 | chatmls = [[{"role": "system", "content": "You are an helpful AI assistant to help user find information."}, 61 | {"role": "user", "content": prompt}] for prompt in rank_prompts] 62 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls] 63 | 64 | decoding_kwargs = { 65 | # "max_tokens": 1024, 66 | "temperature": 0, 67 | "top_p": 1.0, 68 | "timeout": 30, 69 | "request_timeout": 30 70 | } 71 | results = openai_completions( 72 | chatml_prompts, model_name=model_name, **decoding_kwargs) 73 | logging.warning("Total price: {:.4f}$".format( 74 | sum(results['price_per_example']))) 75 | completions = results['completions'] 76 | 77 | best_model_idxs = [] 78 | model_ranks = defaultdict(list) 79 | for i, item in enumerate(data): 80 | item['rank_prompt'] = rank_prompts[i] 81 | item['rank_response'] = completions[i] 82 | try: 83 | first_digit_idx = re.search(r"\d", item['rank_response']).start() 84 | item['ranks'] = re.search( 85 | r"(\d)[\n ](\d)[\n ](\d)[\n ](\d)", item['rank_response']) 86 | if not item['ranks']: 87 | item['ranks'] = re.search( 88 | "\[best\] (\d) \[second best\] (\d) \[third best\] (\d) \[worst\] (\d)", item['rank_response']) 89 | if not item['ranks']: 90 | item['ranks'] = re.search( 91 | "\[best\] Model (\d)[\n ]\[second best\] Model (\d)[\n ]\[third best\] Model (\d)[\n ]\[worst\] Model (\d)", item['rank_response']) 92 | # item['ranks'] = item['rank_response'][first_digit_idx:item['rank_response'].index("\n")].split(" ") 93 | item['ranks'] = [int(rank) for rank in item['ranks'].groups()] 94 | except Exception: 95 | print(item['ranks']) 96 | for j, cand in enumerate(item['candidates']): 97 | cand['scores']['gpt_rank_{}'.format( 98 | model_name)] = - item['ranks'][j] 99 | model_ranks[cand['source']].append(item['ranks'][j]) 100 | best_model_idxs.append(item['ranks'][0]) 101 | 102 | print(Counter(best_model_idxs)) 103 | for model, ranks in model_ranks.items(): 104 | c = Counter(ranks) 105 | print(model, sorted(c.items(), key=lambda x: x[0])) 106 | with open(output_file, "w") as f: 107 | json.dump(data, f, indent=4, ensure_ascii=False) 108 | logging.warning(f"Saved to {output_file}") 109 | 110 | 111 | if __name__ == "__main__": 112 | fire.Fire(main) 113 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/lfqa_gpt_rate.sh: -------------------------------------------------------------------------------- 1 | model_name="gpt-4" 2 | if [ ${model_name} == "gpt-4" ]; then 3 | export OPENAI_API_KEY= 4 | export OPENAI_API_BASE="" 5 | export OPENAI_API_TYPE="azure" 6 | export OPENAI_API_VERSION="2023-07-01-preview" 7 | fi 8 | 9 | python lfqa_gpt_rate.py \ 10 | --input_file "../../data_bak/lfqa/test.json" \ 11 | --output_file 
"../../data_bak/lfqa/test.${model_name}.rank.json" \ 12 | --model_name ${model_name} \ -------------------------------------------------------------------------------- /tigerscore/eval_scripts/mathqa_rate.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file isn't used in our final version. 3 | """ 4 | import sys 5 | import fire 6 | import json 7 | import logging 8 | import regex as re 9 | import copy 10 | import random 11 | sys.path.append("..") 12 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 13 | from typing import List, Dict 14 | from string import Template 15 | from collections import Counter, defaultdict 16 | logging.basicConfig(level=logging.WARNING) 17 | 18 | template = """ 19 | ${instruction} 20 | ${source} 21 | 22 | A correct output is: 23 | ${reference} 24 | 25 | A model generated output is: 26 | ${model1_generated} 27 | 28 | Now please evaluate the errors in the model-generated outputs 29 | For each error associated with problem understanding, problem formulation, computing accuracy, and solution interpretation, reduce 1 or 2 score. 30 | Finally give me a total reductions of score as the evaluation of this model-generated output starting with "Total Score Reduction: ". 31 | """ 32 | 33 | 34 | def get_prompts( 35 | item: dict 36 | ): 37 | prompts = [] 38 | random.shuffle(item['candidates']) 39 | for cand in item['candidates']: 40 | prompt = Template(template).substitute( 41 | instruction=item['instruction'].strip("\n "), 42 | source=item['input'].strip("\n "), 43 | reference=(item.get('output') or item.get("refs")[0]).strip("\n "), 44 | model1_generated=cand['text'].strip("\n "), 45 | ) 46 | prompts.append(prompt) 47 | return prompts 48 | 49 | def main( 50 | input_file: str, 51 | output_file: str, 52 | seed: int = 42, 53 | model_name: str = "ChatGPT", 54 | ): 55 | random.seed(seed) 56 | with open(input_file, "r") as f: 57 | data = json.load(f) 58 | 59 | prompts = list(map(get_prompts, data)) 60 | flatten_prompts = [prompt for prompts_ in prompts for prompt in prompts_] 61 | chatmls = [[{"role":"system","content":"You are an helpful AI assistant to help user find information."}, 62 | {"role":"user","content": prompt}] for prompt in flatten_prompts] 63 | chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls] 64 | 65 | decoding_kwargs = { 66 | # "max_tokens": 1024, 67 | "temperature": 0, 68 | "top_p": 1.0, 69 | "timeout": 30, 70 | "request_timeout": 30 71 | } 72 | results = openai_completions(chatml_prompts, model_name=model_name, **decoding_kwargs) 73 | logging.warning("Total price: {:.4f}$".format(sum(results['price_per_example']))) 74 | completions = results['completions'] 75 | 76 | idx = 0 77 | for i, item in enumerate(data): 78 | for j, cand in enumerate(item['candidates']): 79 | total_score_reduction = re.search("Total Score Reduction: (\d+)", completions[idx]) 80 | if not total_score_reduction: 81 | total_score_reduction = re.search("Total Score Reduction: -(\d+)", completions[idx]) 82 | if not total_score_reduction: 83 | total_score_reduction = re.search("Total Score Reduction is (\d+)", completions[idx]) 84 | if not total_score_reduction: 85 | total_score_reduction = re.search("Total Score Reduction is -(\d+)", completions[idx]) 86 | if total_score_reduction: 87 | cand['scores']['gpt_score_reduction'] = - abs(int(total_score_reduction.groups()[0])) 88 | else: 89 | pass 90 | cand['scores']['gpt_score_reduction'] = 0 91 | cand['gpt_score_output'] = completions[idx] 92 | idx += 1 93 
| with open(output_file, "w") as f: 94 | json.dump(data, f, indent=4, ensure_ascii=False) 95 | logging.warning(f"Saved to {output_file}") 96 | 97 | if __name__ == "__main__": 98 | fire.Fire(main) -------------------------------------------------------------------------------- /tigerscore/eval_scripts/test_ref_diff.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file isn't used in our final version. 3 | Calculate the distance between our score and the reference score. 4 | Maybe Pearson is better. Or we can draw a QQ plot. 5 | """ 6 | import json 7 | import random 8 | import logging 9 | import sys 10 | import numpy as np 11 | import pickle 12 | from pathlib import Path 13 | from utils import MyCorrelation 14 | sys.path.append(str(Path(__file__).parent.parent)) 15 | from xgptscore.xgptscore import xgptscore 16 | from itertools import chain 17 | from collections import Counter 18 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json_per_aspect 19 | from xgptscore.constants import EVAL_ASPECTS 20 | logging.basicConfig(level=logging.INFO) 21 | 22 | # params 23 | task='data2text' 24 | bart_version="D2T" 25 | dataset="SFHOT" 26 | data_dir="../../BARTScore" 27 | xgptscore_mode="d2t" 28 | version_key=f"{xgptscore_mode}.ref.end_1_5" 29 | our_score_name="xgptscore" 30 | model_name="ChatGPT" 31 | overwrite=False 32 | max_size=200 # set to None to use all examples 33 | num_sys=2 34 | if isinstance(max_size, int) and max_size > 0: 35 | version_key = f"{version_key}_{max_size}" 36 | 37 | # load data 38 | input_file=Path(f"{data_dir}/{bart_version}/{dataset}/final_p_with_xgptscore.json") 39 | if version_key: 40 | output_file = input_file.with_suffix(f".{version_key}.json") 41 | else: 42 | output_file = input_file.with_suffix(f".default.json") 43 | 44 | if not output_file.exists() or overwrite: 45 | # Load and shuffle data 46 | logging.info("Loading from {}".format(input_file)) 47 | with open(input_file, "r") as f: 48 | items = json.load(f) 49 | if isinstance(max_size, int) and max_size > 0: 50 | items = items[:max_size] 51 | # random will cause wrong results 52 | 53 | # Data processing 54 | xgptitems = [] 55 | for item in items: 56 | item['candidates'] = [ 57 | { 58 | "model": "reference", 59 | "decoding_method": "greedy", 60 | "text": item['output'] if isinstance(item['output'], str) else item['output'][0], 61 | "scores": {}, 62 | } 63 | ] 64 | xgptitems.append(XPGTItem( 65 | task=task, 66 | instruction=item['instruction'], 67 | input=item['input'], 68 | # ref_output=item['output'], 69 | ref_output="N/A", 70 | hypo_output=item['output'] if isinstance(item['output'], str) else item['output'][0], 71 | )) 72 | # Run xgptscore 73 | result = xgptscore(xgptitems, mode=xgptscore_mode, model_name=model_name,num_workers=5) 74 | idx = 0 75 | aspects = EVAL_ASPECTS[task].keys() 76 | score_dict = {"xgptscore_"+aspect: 0 for aspect in aspects} 77 | for item in items: 78 | for cand in item['candidates']: 79 | cand['responses'] = result['round_completions'][idx] 80 | cand['messages_records'] = result['messages_records'][idx] 81 | xgptscore_ans = get_xgptscore_from_json_per_aspect(cand['responses'][-1]) 82 | if xgptscore_ans is None: 83 | logging.info(f"XGPTScore failed for {cand['text']}") 84 | # cand['scores']['xgptscore'] = None 85 | else: 86 | cand['scores'].update(score_dict) 87 | cand['scores'].update(xgptscore_ans) 88 | idx += 1 89 | 90 | # Save results 91 | with open(output_file, "w") as f: 92 | json.dump(items, f, indent=4, 
ensure_ascii=False) 93 | logging.info("Saved to {}".format(output_file)) 94 | else: 95 | logging.info("Loading existing results from {}".format(output_file)) 96 | with open(output_file, "r") as f: 97 | items = json.load(f) 98 | 99 | 100 | # by system 101 | # Compute bias 102 | xgptscores = [] 103 | for item in items: 104 | for cand in item['candidates']: 105 | if our_score_name in cand['scores']: 106 | xgptscores.append(cand['scores'][our_score_name]) 107 | 108 | print(f"Mean: {np.mean(xgptscores)}") 109 | print(f"Distribution: {Counter(xgptscores)}") 110 | print(f"Std: {np.std(xgptscores)}") 111 | print(f"Max: {np.min(xgptscores)}") -------------------------------------------------------------------------------- /tigerscore/eval_scripts/test_xgptscore.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script is used to test xgptscore for prompt engineering. 3 | """ 4 | 5 | from common import str2bool 6 | from xgptscore.xgptscore import xgptscore 7 | from itertools import chain 8 | from xgptscore.process_utils import XPGTItem, get_xgptscore_from_json 9 | import json 10 | import logging 11 | import sys 12 | import numpy as np 13 | import fire 14 | from pathlib import Path 15 | from utils import MyCorrelation 16 | sys.path.append(str(Path(__file__).parent.parent)) 17 | logging.basicConfig(level=logging.INFO) 18 | 19 | 20 | def main(input_file: str, task: str, model_name: str, output_file: str, xgptscore_mode: str = "prompt", max_size: int = None, overwrite: str = "false"): 21 | overwrite = str2bool(overwrite) 22 | if output_file is None: 23 | output_file = Path(input_file).parent / \ 24 | (Path(input_file).stem + "." + xgptscore_mode + ".json") 25 | if not output_file.exists() or overwrite: 26 | logging.info("Loading from {}".format(input_file)) 27 | with open(input_file, "r") as f: 28 | items = json.load(f) 29 | np.random.seed(42) 30 | np.random.shuffle(items) 31 | if isinstance(max_size, int) and max_size > 0: 32 | items = items[:max_size] 33 | 34 | # Data processing 35 | xgptitems = [] 36 | for item in items: 37 | for cand in item['candidates']: 38 | xgptitems.append(XPGTItem( 39 | task=task, 40 | instruction=item['instruction'], 41 | input=item['input'], 42 | ref_output=item['output'], 43 | hypo_output=cand['text'] 44 | )) 45 | # Run xgptscore 46 | result = xgptscore(xgptitems, mode=xgptscore_mode, 47 | model_name=model_name, num_workers=5) 48 | idx = 0 49 | for item in items: 50 | for cand in item['candidates']: 51 | cand['responses'] = result['round_completions'][idx] 52 | cand['messages_records'] = result['messages_records'][idx] 53 | cand['scores']['xgptscore'] = get_xgptscore_from_json( 54 | cand['responses'][-1]) 55 | idx += 1 56 | 57 | # Save results 58 | with open(output_file, "w") as f: 59 | json.dump(items, f, indent=4, ensure_ascii=False) 60 | logging.info("Saved to {}".format(output_file)) 61 | else: 62 | logging.info("Loading existing results from {}".format(output_file)) 63 | with open(output_file, "r") as f: 64 | items = json.load(f) 65 | 66 | # evaluate system 67 | 68 | num_cands = len(items[0]['candidates']) 69 | human_scores = [[cand['scores']["rank"] 70 | for cand in item['candidates']] for item in items] 71 | human_scores = list(chain(*zip(*human_scores))) # transpose and flatten 72 | metrics = ["xgptscore", "bleu", "rouge1", "rouge2", 73 | "rougeL", "rougeLsum", "bart_score", "bart_score_cnn"] 74 | # metrics = ["xgptscore"] 75 | 76 | Pearson_corr = {} 77 | Spearman_corr = {} 78 | Kendall_corr = {} 79 | for metric in 
metrics: 80 | metric_scores = [[cand['scores'][metric] 81 | for cand in item['candidates']] for item in items] 82 | metric_scores = list(chain(*zip(*metric_scores)) 83 | ) # transpose and flatten 84 | metric_corr = MyCorrelation(num_cands, human_scores, metric_scores) 85 | Pearson_corr[metric] = metric_corr.Pearson() 86 | Spearman_corr[metric] = metric_corr.Spearman() 87 | Kendall_corr[metric] = metric_corr.Kendall() 88 | 89 | # sort Corr 90 | Pearson_corr = {k: v for k, v in sorted( 91 | Pearson_corr.items(), key=lambda item: item[1][0], reverse=True)} 92 | Spearman_corr = {k: v for k, v in sorted( 93 | Spearman_corr.items(), key=lambda item: item[1][0], reverse=True)} 94 | Kendall_corr = {k: v for k, v in sorted( 95 | Kendall_corr.items(), key=lambda item: item[1][0], reverse=True)} 96 | Corr_record = { 97 | "Pearson": Pearson_corr, 98 | "Spearman": Spearman_corr, 99 | "Kendall": Kendall_corr, 100 | } 101 | # Save correlation results 102 | corr_results_file = Path("./eval_results/") / \ 103 | (output_file.stem + ".corr.json") 104 | corr_results_file.parent.mkdir(parents=True, exist_ok=True) 105 | with open(corr_results_file, "w") as f: 106 | json.dump(Corr_record, f, indent=4, ensure_ascii=False) 107 | logging.info("Saved to {}".format(corr_results_file)) 108 | # save to another location 109 | corr_results_file = output_file.parent / \ 110 | "eval_results" / (output_file.stem + ".corr.json") 111 | corr_results_file.parent.mkdir(parents=True, exist_ok=True) 112 | with open(corr_results_file, "w") as f: 113 | json.dump(Corr_record, f, indent=4, ensure_ascii=False) 114 | logging.info("Saved to {}".format(corr_results_file)) 115 | # print("Correlation results:") 116 | # print(json.dumps(Corr_record, indent=4, ensure_ascii=False)) 117 | 118 | 119 | if __name__ == "__main__": 120 | fire.Fire(main) 121 | -------------------------------------------------------------------------------- /tigerscore/eval_scripts/test_xgptscore.sh: -------------------------------------------------------------------------------- 1 | model_name="chatgpt" 2 | 3 | ## Summarization ## 4 | input_file="../../data/evaluation/summarization/summeval/test_data_prepared.json" 5 | python ./test_xgptscore.py \ 6 | --input_file $input_file \ 7 | --task "summarization" \ 8 | --model_name $model_name 9 | 10 | ## Translation ## 11 | input_file="../../data/evaluation/translation/test_data_prepared.json" 12 | python ./test_xgptscore.py \ 13 | --input_file $input_file \ 14 | --task "translation" \ 15 | --model_name $model_name 16 | 17 | ## Data2Text ## 18 | input_file="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json" 19 | python ./test_xgptscore.py \ 20 | --input_file $input_file \ 21 | --task "data2text" \ 22 | --model_name $model_name 23 | 24 | ## Instructions ## 25 | input_file="../../data/evaluation/instructions/just-eval-instruct/test_data_prepared.json" 26 | python ./test_xgptscore.py \ 27 | --input_file $input_file \ 28 | --task "instructions" \ 29 | --model_name $model_name 30 | 31 | ## Long Form QA ## 32 | input_file="../../data/evaluation/lfqa/test_data_prepared.json" 33 | python ./test_xgptscore.py \ 34 | --input_file $input_file \ 35 | --task "long-form QA" \ 36 | --model_name $model_name 37 | 38 | ## Math QA ## 39 | input_file="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json" 40 | python ./test_xgptscore.py \ 41 | --input_file $input_file \ 42 | --task "mathQA" \ 43 | --model_name $model_name 44 | 45 | ## Story Generation ## 46 | input_file="../../data/evaluation/storygen/test_data_prepared.json" 47 
| python ./test_xgptscore.py \ 48 | --input_file $input_file \ 49 | --task "story_generation" \ 50 | --model_name $model_name -------------------------------------------------------------------------------- /tigerscore/eval_scripts/utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from mt_metrics_eval.stats import Correlation 3 | from typing import List 4 | 5 | 6 | class MyCorrelation(Correlation): 7 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 8 | # remove nan in metrics scores 9 | none_metric_scores_idxs = [idx for idx, 10 | x in enumerate(metric_scores) if x is None] 11 | logging.info("Remove {} nan scores from {} scores".format( 12 | len(none_metric_scores_idxs), 13 | len(metric_scores) 14 | )) 15 | gold_scores = gold_scores.copy() 16 | # set gold scores to None if metric scores are None 17 | for idx in none_metric_scores_idxs[::-1]: 18 | gold_scores[idx] = None 19 | super().__init__(num_sys, gold_scores, metric_scores) 20 | -------------------------------------------------------------------------------- /tigerscore/finetune/ds_llama_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "bf16": { 3 | "enabled": "auto" 4 | }, 5 | "scheduler": { 6 | "type": "WarmupLR", 7 | "params": { 8 | "warmup_min_lr": "auto", 9 | "warmup_max_lr": "auto", 10 | "warmup_num_steps": "auto" 11 | } 12 | }, 13 | "zero_optimization": { 14 | "stage": 3, 15 | "overlap_comm": true, 16 | "contiguous_gradients": true, 17 | "sub_group_size": 1e9, 18 | "reduce_bucket_size": "auto", 19 | "stage3_prefetch_bucket_size": "auto", 20 | "stage3_param_persistence_threshold": "auto", 21 | "stage3_max_live_parameters": 1e9, 22 | "stage3_max_reuse_distance": 1e9, 23 | "stage3_gather_16bit_weights_on_model_save": true 24 | }, 25 | "gradient_accumulation_steps": "auto", 26 | "gradient_clipping": "auto", 27 | "steps_per_print": 1, 28 | "train_batch_size": "auto", 29 | "train_micro_batch_size_per_gpu": "auto", 30 | "wall_clock_breakdown": false 31 | } -------------------------------------------------------------------------------- /tigerscore/finetune/finetune_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=llama_finetune 3 | #SBATCH -c 10 4 | #SBATCH --partition=a100 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem=100G 8 | #SBATCH --output=../../jobs/%x/%j.out 9 | 10 | nvidia-smi 11 | MASTER_PORT=4637 12 | MODEL_DIR="meta-llama/Llama-2-7b-hf" # 13b 13 | run_name="llama.train_mix.check.clean.mathQA" # change this every time you run a new experiment 14 | 15 | output_dir="../../outputs/${MODEL_DIR}/${run_name}" 16 | 17 | train_data_path="../../data/train_mix.check.clean.mathQA.format_v2.json" # 18 | 19 | mkdir -p ${output_dir} 20 | 21 | # slurm system gpus can't connect to each other by default 22 | # set the following environment variables to enable nccl 23 | export NCCL_IB_DISABLE=1; 24 | export NCCL_P2P_DISABLE=1; 25 | 26 | export NCCL_DEBUG=INFO; 27 | export NCCL_SOCKET_IFNAME=en,eth,em,bond; 28 | export CXX=g++; 29 | 30 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128 31 | # epoch size: alpaca using 3 epochs for 52k data 32 | # epoch size: translation data size, only 8k 33 | # epoch szie: sum, data2text, trans, 30k, epoch_size = 4 34 | 35 | # deepspeed \ 36 | # --num_gpus 4 \ 37 | # --num_nodes 1 \ 38 | # --master_port ${MASTER_PORT} \ 39 | 
# train.py \ 40 | # --model_name_or_path ${MODEL_DIR} \ 41 | # --train_data_path ${train_data_path} \ 42 | # --bf16 True \ 43 | # --output_dir ${output_dir} \ 44 | # --num_train_epochs 3 \ 45 | # --per_device_train_batch_size 2 \ 46 | # --per_device_eval_batch_size 2 \ 47 | # --gradient_accumulation_steps 16 \ 48 | # --model_max_length 1024 \ 49 | # --evaluation_strategy "no" \ 50 | # --save_strategy "epoch" \ 51 | # --save_steps 200 \ 52 | # --save_total_limit 1 \ 53 | # --learning_rate 2e-5 \ 54 | # --weight_decay 0. \ 55 | # --warmup_ratio 0.1 \ 56 | # --lr_scheduler_type "cosine" \ 57 | # --logging_steps 2 \ 58 | # --tf32 True \ 59 | # --deepspeed ds_llama_config.json \ 60 | # --run_name ${run_name} \ 61 | # --seed 42 \ 62 | # --is_lora False \ 63 | 64 | CUDA_VISIBLE_DEVICES=0,1,2,3 deepspeed \ 65 | --num_gpus 4 \ 66 | --num_nodes 1 \ 67 | --master_port ${MASTER_PORT} \ 68 | train.py \ 69 | --model_name_or_path ${MODEL_DIR} \ 70 | --train_data_path ${train_data_path} \ 71 | --bf16 True \ 72 | --output_dir ${output_dir} \ 73 | --num_train_epochs 3 \ 74 | --per_device_train_batch_size 1 \ 75 | --per_device_eval_batch_size 2 \ 76 | --gradient_accumulation_steps 32 \ 77 | --model_max_length 1024 \ 78 | --evaluation_strategy "no" \ 79 | --save_strategy "epoch" \ 80 | --save_steps 64 \ 81 | --save_total_limit 6 \ 82 | --learning_rate 2e-5 \ 83 | --weight_decay 0. \ 84 | --warmup_ratio 0.1 \ 85 | --lr_scheduler_type "cosine" \ 86 | --logging_steps 2 \ 87 | --tf32 True \ 88 | --deepspeed ds_llama_config.json \ 89 | --run_name ${run_name} \ 90 | --seed 42 \ 91 | --is_lora False \ 92 | 93 | # # LIMA config 94 | # deepspeed \ 95 | # --num_gpus 4 \ 96 | # --num_nodes 1 \ 97 | # --master_port ${MASTER_PORT} \ 98 | # train.py \ 99 | # --model_name_or_path ${MODEL_DIR} \ 100 | # --train_data_path ${train_data_path} \ 101 | # --bf16 True \ 102 | # --output_dir ${output_dir} \ 103 | # --num_train_epochs 15 \ 104 | # --per_device_train_batch_size 1 \ 105 | # --per_device_eval_batch_size 2 \ 106 | # --gradient_accumulation_steps 32 \ 107 | # --model_max_length 1024 \ 108 | # --evaluation_strategy "no" \ 109 | # --save_strategy "epoch" \ 110 | # --save_steps 200 \ 111 | # --save_total_limit 1 \ 112 | # --learning_rate 1e-5 \ 113 | # --adam_beta1 0.9 \ 114 | # --adam_beta2 0.95 \ 115 | # --weight_decay 0.1 \ 116 | # --warmup_ratio 0. 
\ 117 | # --lr_scheduler_type "linear" \ 118 | # --logging_steps 2 \ 119 | # --tf32 True \ 120 | # --deepspeed ds_llama_config.json \ 121 | # --run_name ${run_name} \ 122 | # --seed 42 \ 123 | # --is_lora False \ -------------------------------------------------------------------------------- /tigerscore/finetune/finetune_mistral.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=llama_finetune 3 | #SBATCH -c 10 4 | #SBATCH --partition=a100 5 | #SBATCH --gres=gpu:4 6 | #SBATCH --time=24:00:00 7 | #SBATCH --mem=100G 8 | #SBATCH --output=../../jobs/%x/%j.out 9 | 10 | nvidia-smi 11 | MASTER_PORT=4637 12 | MODEL_DIR="mistralai/Mistral-7B-v0.1" # 13b 13 | run_name="train_mix.check_ChatGPT.clean" # change this every time you run a new experiment 14 | 15 | output_dir="../../outputs/${MODEL_DIR}/${run_name}" 16 | train_data_path="../../data/train_mix.check_ChatGPT.clean.format_v2.json" # 17 | 18 | mkdir -p ${output_dir} 19 | 20 | # slurm system gpus can't connect to each other by default 21 | # set the following environment variables to enable nccl 22 | export NCCL_IB_DISABLE=1; 23 | export NCCL_P2P_DISABLE=1; 24 | 25 | export NCCL_DEBUG=INFO; 26 | export NCCL_SOCKET_IFNAME=en,eth,em,bond; 27 | export CXX=g++; 28 | 29 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128 30 | # epoch size: alpaca using 3 epochs for 52k data 31 | # epoch size: translation data size, only 8k 32 | # epoch szie: sum, data2text, trans, 30k, epoch_size = 4 33 | 34 | CUDA_VISIBLE_DEVICES="0,1,2,3" deepspeed \ 35 | --num_gpus 4 \ 36 | --num_nodes 1 \ 37 | --master_port ${MASTER_PORT} \ 38 | train.py \ 39 | --model_name_or_path ${MODEL_DIR} \ 40 | --train_data_path ${train_data_path} \ 41 | --bf16 True \ 42 | --output_dir ${output_dir} \ 43 | --num_train_epochs 3 \ 44 | --per_device_train_batch_size 1 \ 45 | --per_device_eval_batch_size 2 \ 46 | --gradient_accumulation_steps 32 \ 47 | --model_max_length 1024 \ 48 | --evaluation_strategy "no" \ 49 | --save_strategy "epoch" \ 50 | --save_steps 64 \ 51 | --save_total_limit 6 \ 52 | --learning_rate 2e-5 \ 53 | --weight_decay 0. \ 54 | --warmup_ratio 0.1 \ 55 | --lr_scheduler_type "cosine" \ 56 | --logging_steps 2 \ 57 | --tf32 True \ 58 | --deepspeed ds_llama_config.json \ 59 | --run_name ${run_name} \ 60 | --seed 42 \ 61 | --is_lora False \ -------------------------------------------------------------------------------- /tigerscore/finetune/format_data_v2.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: transforms the xgptscore data format into the alpaca data format for finetuning. 3 | 4 | """ 5 | import sys 6 | import os 7 | sys.path.append("../") 8 | templates_path = os.path.join(os.path.dirname(__file__), "..") 9 | sys.path.append(templates_path) 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer 12 | from common.datasets_config import DATASETS_CONFIG 13 | from pathlib import Path 14 | from string import Template 15 | import json 16 | import logging 17 | import fire 18 | import regex as re 19 | import numpy as np 20 | from collections import Counter 21 | from itertools import chain 22 | 23 | 24 | # FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task." 
25 | # FINETUNE_INPUT = """\ 26 | # Task instruction: ${generation_instruction} 27 | # Source: ${input_context} 28 | # Model-generated Output: ${hypothesis_output} 29 | 30 | # Based on the given task instruction and source, identify the major and minor errors in this model-generated output. 31 | # Note that Major errors refer to actual errors that affects the task severely, and Minor errors refer to small imperfections, and purely subjective opinions about the output. 32 | # For each error you give in the response, please also elaborate the following information: 33 | # - error location (the words that are wrong in the output) 34 | # - error aspect it belongs to. 35 | # - explanation why it's an error, and the correction suggestions. 36 | # - severity of the error ("Major" or "Minor"). 37 | # - reduction of score (between 0.5 and 5) 38 | 39 | # Your evaluation output in the json format: 40 | # """ 41 | INST = "You are evaluating errors in a model-generated output for a given instruction." 42 | TEMPLATE = """\ 43 | Instruction: 44 | ${generation_instruction} 45 | ${input_context} 46 | 47 | Model-generated Output: 48 | ${hypothesis_output} 49 | 50 | For each error you give in the response, please also elaborate the following information: 51 | - error location (the words that are wrong in the output) 52 | - error aspect it belongs to. 53 | - explanation why it's an error, and the correction suggestions. 54 | - severity of the error ("Major" or "Minor"). 55 | - reduction of score (between 0.5 and 5 given the severity of the error) 56 | 57 | Your evaluation output:\ 58 | """ 59 | 60 | def main( 61 | seed: int = 42, 62 | input_file: str = None, 63 | output_file: str = None, 64 | overwrite: bool = False, 65 | max_eval_input_length: int = None, 66 | max_eval_hyp_length: int = None, 67 | max_eval_output_length: int = None, 68 | ): 69 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") 70 | 71 | with open(input_file, 'r') as f: 72 | if input_file.endswith(".json"): 73 | data = json.load(f) 74 | elif input_file.endswith(".jsonl"): 75 | data = [json.loads(line) for line in f] 76 | formatted_data = [] 77 | for item in data: 78 | inst = INST 79 | input_ = Template(TEMPLATE).substitute( 80 | generation_instruction=item['instruction'], 81 | input_context=item['input_context'], 82 | hypothesis_output=item['hypo_output'] 83 | ) 84 | output_ = item['errors'] 85 | formatted_data.append({ 86 | "instruction": inst, 87 | "input": input_, 88 | "output": output_, 89 | }) 90 | 91 | with open(output_file, 'w') as f: 92 | json.dump(formatted_data, f, indent=4, ensure_ascii=False) 93 | logging.info(f"Saved to {output_file}") 94 | 95 | # count the dataset statistics 96 | dataset_statistics = {} 97 | dataset_statistics["#total"] = len(formatted_data) 98 | dataset_statistics["#unique input"] = len( 99 | set([item["input"] for item in formatted_data])) 100 | input_lens = [len(tokenizer.encode(item["input"])) 101 | for item in tqdm(formatted_data, desc="Counting input length")] 102 | output_lens = [len(tokenizer.encode(item["output"])) 103 | for item in tqdm(formatted_data, desc="Counting output length")] 104 | total_lens = [x + y for x, y in zip(input_lens, output_lens)] 105 | dataset_statistics["input_length"] = {} 106 | dataset_statistics["input_length"]["mean"] = np.mean(input_lens).item() 107 | dataset_statistics["input_length"]["percentile"] = np.percentile( 108 | input_lens, [0, 25, 50, 90, 100]).tolist() 109 | dataset_statistics["input_length"]["max"] = max(input_lens) 110 | 
dataset_statistics["input_length"]["min"] = min(input_lens) 111 | dataset_statistics["output_length"] = {} 112 | dataset_statistics["output_length"]["mean"] = np.mean(output_lens).item() 113 | dataset_statistics["output_length"]["percentile"] = np.percentile( 114 | output_lens, [0, 25, 50, 90, 100]).tolist() 115 | dataset_statistics["output_length"]["max"] = max(output_lens) 116 | dataset_statistics["output_length"]["min"] = min(output_lens) 117 | dataset_statistics["total_length"] = {} 118 | dataset_statistics["total_length"]["mean"] = np.mean(total_lens).item() 119 | dataset_statistics["total_length"]["percentile"] = np.percentile( 120 | total_lens, [0, 25, 50, 90, 100]).tolist() 121 | dataset_statistics["total_length"]["max"] = max(total_lens) 122 | dataset_statistics["total_length"]["min"] = min(total_lens) 123 | error_aspects = [re.findall( 124 | r'(?<=Error aspect \d+: )[ \w]+', item['output']) for item in formatted_data] 125 | error_aspects = list(chain(*error_aspects)) 126 | dataset_statistics["error_aspects_distribution"] = Counter(error_aspects) 127 | 128 | num_errors = [len(re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', 129 | item['output'])) for item in formatted_data] 130 | dataset_statistics["num_errors_distribution"] = Counter(num_errors) 131 | # severity distributions 132 | severities = [re.findall( 133 | r'(?<=Severity \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 134 | severities = list(chain(*severities)) 135 | dataset_statistics["severity_distribution"] = Counter(severities) 136 | # score reduction distributions 137 | score_reductions = [re.findall( 138 | r'(?<=Score reduction \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 139 | score_reductions = list(chain(*score_reductions)) 140 | score_reductions = [abs(float(x.replace(" ", ""))) 141 | for x in score_reductions] 142 | dataset_statistics["score_reduction_distribution"] = Counter( 143 | score_reductions) 144 | 145 | print(dataset_statistics) 146 | output_file = Path(output_file).with_suffix(".statistics.json") 147 | with open(output_file, "w") as f: 148 | json.dump(dataset_statistics, f, indent=4, ensure_ascii=False) 149 | logging.info(f"Saved statistics to {output_file}") 150 | 151 | 152 | if __name__ == "__main__": 153 | logging.basicConfig(level=logging.INFO) 154 | fire.Fire(main) 155 | -------------------------------------------------------------------------------- /tigerscore/finetune/format_data_v2.sh: -------------------------------------------------------------------------------- 1 | # INPUT_FILE="../../data/train_mix.check.clean.jsonl" 2 | # OUTPUT_FILE="../../data/train_mix.check.clean.format_v2.json" 3 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 4 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 5 | 6 | # INPUT_FILE="../../data/train_mix.jsonl" 7 | # OUTPUT_FILE="../../data/train_mix.format_v2.json" 8 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 9 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 10 | 11 | # tasks=('data2text' 'instruction-following' 'long-form QA' 'mathQA' 'summarization' 'translation') 12 | # for task in "${tasks[@]}"; do 13 | # INPUT_FILE="../../data/train_mix.${task}.jsonl" 14 | # OUTPUT_FILE="../../data/train_mix.${task}.format_v2.json" 15 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 16 | # --max_eval_input_length 600 --max_eval_hyp_length 
400 --max_eval_output_length 400 17 | # done 18 | 19 | INPUT_FILE="../../data/train_mix.check.clean.mathQA.jsonl" 20 | OUTPUT_FILE="../../data/train_mix.check.clean.mathQA.format_v2.json" 21 | python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 22 | --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 23 | 24 | # INPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.8k.gen.jsonl" 25 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.8k.gen.format_v2.json" 26 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 27 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 28 | 29 | # INPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.2k.jsonl" 30 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/new_alpaca_cleaned.v2.2k.format_v2.json" 31 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 32 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 33 | 34 | # INPUT_FILE="../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.clean.jsonl" 35 | # OUTPUT_FILE="../../data/new_std_400s_m_200s_l_1100s_i3-32k.check.clean.format_v2.jsonl" 36 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 37 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 38 | 39 | # INPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.ref.extracted.jsonl" 40 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.ref.extracted.format_v2.jsonl" 41 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 42 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 43 | 44 | # INPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.extracted.jsonl" 45 | # OUTPUT_FILE="../../data/additional/alpaca_cleaned/alpaca_cleaned.v2.story.1k.gen.extracted.format_v2.jsonl" 46 | # # INPUT_FILE="TIGERScore/data/32k_final.json" 47 | # # OUTPUT_FILE="TIGERScore/data/32k_final_distill.json" 48 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 49 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 50 | 51 | # INPUT_FILE="../../data/additional/metamath/metamath.1k.ref.extracted.jsonl" 52 | # OUTPUT_FILE="../../data/additional/metamath/metamath.1k.ref.extracted.format_v2.jsonl" 53 | # # INPUT_FILE="TIGERScore/data/32k_final.json" 54 | # # OUTPUT_FILE="TIGERScore/data/32k_final_distill.json" 55 | # python format_data_v2.py --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 56 | # --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 -------------------------------------------------------------------------------- /tigerscore/finetune/format_distill_data.sh: -------------------------------------------------------------------------------- 1 | DATA_DIR="../../data" 2 | 3 | # # transllation 4 | # INPUT_FILE="${DATA_DIR}/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.json" 5 | # OUTPUT_FILE="${DATA_DIR}/wmt/train_data.wmt_mqm.distill_new_wmt_mqm.format_txt.json" 6 | # python format_distill_data.py --task "translation" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 7 | 8 | # # summarization 9 | # 
INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.json" 10 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/sum/train_data.align_score.filter_v2.format_txt.json" 11 | # python format_distill_data.py --task "summarization" \ 12 | # --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 13 | # --max_eval_input_length 400 --max_eval_hyp_length 300 --max_eval_output_length 400 \ 14 | 15 | # # data2text 16 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/d2t/train_data.d2t.filter_v1.json" 17 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore_bak/data/d2t/train_data.d2t.filter_v1.format_txt.json" 18 | # python format_distill_data.py --task "data2text" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 19 | # --max_eval_input_length 400 --max_eval_hyp_length 400 --max_eval_output_length 400 \ 20 | # # long-form QA 21 | 22 | # # SEScore3 zh-en debug 23 | # INPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore/data/sescore3/sescore3_zh_en_llama_formatted_data.json" 24 | # OUTPUT_FILE="${DATA_DIR}/WorkSpace/ExplainableGPTScore/data/sescore3/sescore3_zh_en_llama_formatted_data.format_txt.json" 25 | # python format_distill_data.py --task "translation" --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 26 | # # --max_eval_input_length 400 --max_eval_hyp_length 400 --max_eval_output_length 400 \ 27 | 28 | # # summarization v3 29 | # INPUT_FILE="../../data/sum/train_data.align_score.filter_v3.json" 30 | # OUTPUT_FILE="../../data/sum/train_data.align_score.filter_v3.format_txt.json" 31 | # python format_distill_data.py --task "summarization" \ 32 | # --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 33 | # --max_eval_input_length 400 --max_eval_hyp_length 300 --max_eval_output_length 400 \ 34 | 35 | 36 | IFS=$'\n' 37 | tasks=("translation" "long-form QA" "summarization" "data2text" "instruction-following") 38 | for task in ${tasks[@]}; do 39 | INPUT_FILE="../../data/real_world/${task}.json" 40 | OUTPUT_FILE="../../data/real_world/${task}.format_txt.json" 41 | python format_distill_data.py --task ${task} \ 42 | --input_file "${INPUT_FILE}" --output_file "${OUTPUT_FILE}" \ 43 | --max_eval_input_length 600 --max_eval_hyp_length 400 --max_eval_output_length 400 44 | done 45 | -------------------------------------------------------------------------------- /tigerscore/finetune/format_synthesis_distill_data.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: transforms the xgptscore data format into the alpaca data format for finetuning. 3 | 4 | """ 5 | import sys 6 | import os 7 | sys.path.append("../") 8 | templates_path = os.path.join(os.path.dirname(__file__), "..") 9 | sys.path.append(templates_path) 10 | from tqdm import tqdm 11 | from transformers import AutoTokenizer 12 | from common.datasets_config import DATASETS_CONFIG 13 | from pathlib import Path 14 | from string import Template 15 | import json 16 | import logging 17 | import fire 18 | import regex as re 19 | import numpy as np 20 | from collections import Counter 21 | from itertools import chain 22 | 23 | 24 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a given instruction." 
25 | FINETUNE_INPUT = """\ 26 | Instruction: 27 | ${generation_instruction} 28 | ${input_context} 29 | 30 | Model-generated Output: 31 | ${hypothesis_output} 32 | 33 | For each error you give in the response, please also elaborate the following information: 34 | - error location (the words that are wrong in the output) 35 | - error aspect it belongs to. 36 | - explanation why it's an error, and the correction suggestions. 37 | - severity of the error ("Major" or "Minor"). 38 | - reduction of score (between 0.5 and 5 given the severity of the error) 39 | 40 | Your evaluation output:\ 41 | """ 42 | 43 | 44 | def main( 45 | task: str, 46 | seed: int = 42, 47 | input_file: str = None, 48 | output_file: str = None, 49 | overwrite: bool = False, 50 | max_eval_input_length: int = None, 51 | max_eval_hyp_length: int = None, 52 | max_eval_output_length: int = None, 53 | ): 54 | assert task in DATASETS_CONFIG.keys() 55 | 56 | tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") 57 | 58 | with open(input_file, 'r') as f: 59 | data = json.load(f) 60 | formatted_data = [] 61 | for item in data: 62 | syn_output = item['responses'][-1] 63 | syn_output = syn_output.replace(": \n", ": ") 64 | # decode the synthesis outputs 65 | try: 66 | start_pos = syn_output.index( 67 | "Generated incorrect output: ") + len("Generated incorrect output: ") 68 | end_pos = syn_output.index("\nError location 1") 69 | hyp = syn_output[start_pos:end_pos].strip('\n ') 70 | assert len(hyp) > 0 71 | except Exception: 72 | logging.warning( 73 | "Failed to parse the synthesis output: {}".format(syn_output)) 74 | continue 75 | inst = Template(FINETUNE_INST).substitute(task=task) 76 | input_context_ids = tokenizer.encode( 77 | item['input'], add_special_tokens=False) 78 | hyp_ids = tokenizer.encode(hyp, add_special_tokens=False) 79 | if max_eval_input_length is not None and len(input_context_ids) > max_eval_input_length: 80 | input_context = tokenizer.decode( 81 | input_context_ids[:max_eval_input_length]) + "..." 82 | else: 83 | input_context = item['input'] 84 | if max_eval_hyp_length is not None and len(hyp_ids) > max_eval_hyp_length: 85 | hypothesis_output = tokenizer.decode( 86 | hyp_ids[:max_eval_hyp_length]) + "..." 
87 | else: 88 | hypothesis_output = hyp 89 | input_ = Template(FINETUNE_INPUT).substitute( 90 | generation_instruction=item['instruction'], 91 | input_context=input_context, 92 | hypothesis_output=hypothesis_output, 93 | ) 94 | try: 95 | error_locations = re.findall( 96 | r'(?<=Error location \d+: ).*(?=\n|$)', syn_output) 97 | error_aspects = re.findall( 98 | r'(?<=Error aspect \d+: ).*(?=\n|$)', syn_output) 99 | explanations = re.findall( 100 | r'(?<=Explanation \d+: ).*(?=\n|$)', syn_output) 101 | severities = re.findall( 102 | r'(?<=Severity \d+: ).*(?=\n|$)', syn_output) 103 | score_reductions = re.findall( 104 | r'(?<=Score reduction \d+: ).*(?=\n|$)', syn_output) 105 | score_reductions = [abs(int(x.replace(" ", ""))) 106 | for x in score_reductions] 107 | except Exception: 108 | logging.warning( 109 | "Failed to parse the synthesis output: {}".format(syn_output)) 110 | continue 111 | 112 | if not len(error_locations) == len(error_aspects) == len(explanations) == len(severities) == len(score_reductions): 113 | logging.warning( 114 | "The number of errors properties does not match!: {}".format(syn_output)) 115 | continue 116 | 117 | txt_output = "The model-generated output contains {} errors, with a total score reduction of {}.".format( 118 | len(error_locations), 119 | sum([int(score) for score in score_reductions]), 120 | ) 121 | for i in range(len(error_locations)): 122 | txt_output += "\nError location {}: {}\n".format( 123 | i + 1, error_locations[i]) 124 | txt_output += "Error aspect {}: {}\n".format( 125 | i + 1, error_aspects[i]) 126 | txt_output += "Explanation {}: {}\n".format(i + 1, explanations[i]) 127 | txt_output += "Severity {}: {}\n".format(i + 1, severities[i]) 128 | txt_output += "Score reduction {}: {}".format( 129 | i + 1, score_reductions[i]) 130 | output_ = txt_output.strip(' \n') 131 | formatted_data.append({ 132 | "instruction": inst, 133 | "input": input_, 134 | "output": output_, 135 | "task": task, 136 | }) 137 | 138 | # # append 20% non-error examples 139 | # for item in data: 140 | # if random.random() < 0.2: 141 | # inst = Template(FINETUNE_INST).substitute(task=task) 142 | # input_context_ids = tokenizer.encode(item['input'], add_special_tokens=False) 143 | # if max_eval_input_length is not None and len(input_context_ids) > max_eval_input_length: 144 | # input_context = tokenizer.decode(input_context_ids[:max_eval_input_length]) + "..." 145 | # else: 146 | # input_context = item['input'] 147 | # input_ = Template(FINETUNE_INPUT).substitute( 148 | # generation_instruction=item['instruction'], 149 | # input_context=input_context, 150 | # hypothesis_output=item['output'], 151 | # ) 152 | # output_ = "The model-generated output contains 0 errors, with a total score reduction of 0." 
153 | # formatted_data.append({ 154 | # "instruction": inst, 155 | # "input": input_, 156 | # "output": output_, 157 | # "task": task, 158 | # }) 159 | 160 | with open(output_file, 'w') as f: 161 | json.dump(formatted_data, f, indent=4, ensure_ascii=False) 162 | logging.info(f"Saved to {output_file}") 163 | 164 | # count the dataset statistics 165 | dataset_statistics = {} 166 | dataset_statistics["#total"] = len(formatted_data) 167 | dataset_statistics["#unique input"] = len( 168 | set([item["input"] for item in formatted_data])) 169 | input_lens = [len(tokenizer.encode(item["input"])) 170 | for item in tqdm(formatted_data, desc="Counting input length")] 171 | output_lens = [len(tokenizer.encode(item["output"])) 172 | for item in tqdm(formatted_data, desc="Counting output length")] 173 | total_lens = [x + y for x, y in zip(input_lens, output_lens)] 174 | dataset_statistics["input_length"] = {} 175 | dataset_statistics["input_length"]["mean"] = np.mean(input_lens).item() 176 | dataset_statistics["input_length"]["percentile"] = np.percentile( 177 | input_lens, [0, 25, 50, 90, 100]).tolist() 178 | dataset_statistics["input_length"]["max"] = max(input_lens) 179 | dataset_statistics["input_length"]["min"] = min(input_lens) 180 | dataset_statistics["output_length"] = {} 181 | dataset_statistics["output_length"]["mean"] = np.mean(output_lens).item() 182 | dataset_statistics["output_length"]["percentile"] = np.percentile( 183 | output_lens, [0, 25, 50, 90, 100]).tolist() 184 | dataset_statistics["output_length"]["max"] = max(output_lens) 185 | dataset_statistics["output_length"]["min"] = min(output_lens) 186 | dataset_statistics["total_length"] = {} 187 | dataset_statistics["total_length"]["mean"] = np.mean(total_lens).item() 188 | dataset_statistics["total_length"]["percentile"] = np.percentile( 189 | total_lens, [0, 25, 50, 90, 100]).tolist() 190 | dataset_statistics["total_length"]["max"] = max(total_lens) 191 | dataset_statistics["total_length"]["min"] = min(total_lens) 192 | error_aspects = [re.findall( 193 | r'(?<=Error aspect \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 194 | error_aspects = list(chain(*error_aspects)) 195 | dataset_statistics["error_aspects_distribution"] = Counter(error_aspects) 196 | # number of errors distributions 197 | num_errors = [len(re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', 198 | item['output'])) for item in formatted_data] 199 | dataset_statistics["num_errors_distribution"] = Counter(num_errors) 200 | # severity distributions 201 | severities = [re.findall( 202 | r'(?<=Severity \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 203 | severities = list(chain(*severities)) 204 | dataset_statistics["severity_distribution"] = Counter(severities) 205 | # score reduction distributions 206 | score_reductions = [re.findall( 207 | r'(?<=Score reduction \d+: ).*(?=\n|$)', item['output']) for item in formatted_data] 208 | score_reductions = list(chain(*score_reductions)) 209 | score_reductions = [abs(int(x.replace(" ", ""))) for x in score_reductions] 210 | dataset_statistics["score_reduction_distribution"] = Counter( 211 | score_reductions) 212 | 213 | print(dataset_statistics) 214 | output_file = Path(output_file).with_suffix(".statistics.json") 215 | with open(output_file, "w") as f: 216 | json.dump(dataset_statistics, f, indent=4, ensure_ascii=False) 217 | logging.info(f"Saved statistics to {output_file}") 218 | 219 | 220 | if __name__ == "__main__": 221 | logging.basicConfig(level=logging.INFO) 222 | fire.Fire(main) 223 | 
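The serialized error format produced above ("Error location i / Error aspect i / Explanation i / Severity i / Score reduction i") is the same plain-text format that the test scripts later in this directory (e.g. test_llama_vllm.py) parse back into a single number: they regex-extract every score reduction and report the negative sum as the metric value. The snippet below is a minimal, self-contained sketch of that round trip; it is not a file in the repository, and sample_eval_output is an invented model response used purely for illustration. The lookbehind patterns are variable-length, which is why these scripts import the third-party regex package as re rather than the standard-library re module.

import regex as re  # third-party `regex` package, as used throughout the repo (supports variable-length lookbehind)

# Invented model response following the serialized error format shown above (illustration only).
sample_eval_output = (
    "The model-generated output contains 2 errors, with a total score reduction of 4.5.\n"
    "Error location 1: \"in 1995\"\n"
    "Error aspect 1: Factual accuracy\n"
    "Explanation 1: The source says the event happened in 1997, so \"1995\" should be \"1997\".\n"
    "Severity 1: Major\n"
    "Score reduction 1: 4\n"
    "Error location 2: \"recieve\"\n"
    "Error aspect 2: Spelling\n"
    "Explanation 2: \"recieve\" is misspelled and should be \"receive\".\n"
    "Severity 2: Minor\n"
    "Score reduction 2: 0.5"
)

# The same lookbehind patterns used in format_synthesis_distill_data.py and test_llama_vllm.py.
locations = re.findall(r'(?<=Error location \d+: ).*(?=\n|$)', sample_eval_output)
severities = re.findall(r'(?<=Severity \d+: ).*(?=\n|$)', sample_eval_output)
reductions = re.findall(r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", sample_eval_output)

# Final score: the negative sum of all score reductions (an output with no detected errors scores 0).
score = -sum(map(float, reductions))
print(len(locations), severities, score)  # -> 2 ['Major', 'Minor'] -4.5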
-------------------------------------------------------------------------------- /tigerscore/finetune/ft_llama_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=ft_llama_lora 3 | #SBATCH --gres=gpu:a6000:1 4 | #SBATCH --time=24:00:00 5 | #SBATCH --qos=general 6 | #SBATCH --output=../../jobs/llama_finetune/%j.out 7 | 8 | MASTER_PORT=4635 9 | MODEL_DIR="meta-llama/Llama-2-7b-hf" # 13b 10 | run_name="model_len_1024_lora_debug" # change this every time you run a new experiment 11 | output_dir="../../outputs/${MODEL_DIR}/${run_name}" 12 | # train_data_path="../../data/wmt/train_data.wmt_mqm.distill.format.json" 13 | train_data_path="../../WorkSpace/ExplainableGPTScore/finetune_data/translation/train.json" 14 | # train_data_path="../../WorkSpace/ExplainableGPTScore/finetune_data/translation/train/wmt18_zh-en.json" 15 | mkdir -p ${output_dir} 16 | 17 | # slurm system gpus can't connect to each other by default 18 | # set the following environment variables to enable nccl 19 | export NCCL_IB_DISABLE=1; 20 | export NCCL_P2P_DISABLE=1; 21 | 22 | export NCCL_DEBUG=INFO; 23 | export NCCL_SOCKET_IFNAME=en,eth,em,bond; 24 | export CXX=g++; 25 | 26 | # batch_size = train_batch_size * gradient_accumulation_steps * num_gpus = 128 27 | # epoch size: alpaca using 3 epochs for 52k data 28 | # epoch size: translation data size, only 8k 29 | 30 | ../../.conda/envs/llm_reranker/bin/deepspeed \ 31 | --num_gpus 1 \ 32 | --num_nodes 1 \ 33 | --master_port ${MASTER_PORT} \ 34 | train.py \ 35 | --model_name_or_path ${MODEL_DIR} \ 36 | --train_data_path ${train_data_path} \ 37 | --bf16 True \ 38 | --output_dir ${output_dir} \ 39 | --num_train_epochs 3 \ 40 | --per_device_train_batch_size 4 \ 41 | --per_device_eval_batch_size 2 \ 42 | --gradient_accumulation_steps 32 \ 43 | --model_max_length 1024 \ 44 | --evaluation_strategy "no" \ 45 | --save_strategy "epoch" \ 46 | --save_steps 200 \ 47 | --save_total_limit 3 \ 48 | --learning_rate 3e-4 \ 49 | --weight_decay 0. 
\ 50 | --warmup_ratio 0.1 \ 51 | --lr_scheduler_type "linear" \ 52 | --logging_steps 2 \ 53 | --tf32 True \ 54 | --deepspeed ds_llama_config.json \ 55 | --run_name ${run_name} \ 56 | --seed 42 \ 57 | --is_lora True \ 58 | 59 | # lora Config 60 | # lr: 3e-4 -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=test_llama 3 | #SBATCH --gres=gpu:6000:1 4 | #SBATCH --time=24:00:00 5 | #SBATCH --output=../../jobs/test_llama/%j.out 6 | nvidia-smi 7 | 8 | model_name="meta-llama/Llama-2-7b-hf" 9 | outputs_dir="" 10 | 11 | # outputs_dir="../../outputs" 12 | checkpoint_name="model_len_1024_mix_v2" 13 | checkpoint_path="${outputs_dir}/${model_name}/${checkpoint_name}/checkpoint-best" 14 | # task="translation" 15 | # # finetune test 16 | # data_path="/home//WorkSpace/ExplainableGPTScore/finetune_data/${task}/test.json" 17 | 18 | # BARTScore test 19 | # data_path="/home//WorkSpace/ExplainableGPTScore/BARTScore/WMT/zh-en/final_p_with_xgptscore.test_llama_new.json" 20 | 21 | # mtme test mqm 22 | # task="translation" 23 | # human_score_names="mqm,da" 24 | # data_path="../../data/wmt22/zh-en/eval_data.random_2.json" 25 | 26 | # sum test relevance 27 | # task="summarization" 28 | # human_score_names="coherence,consistency,fluency,relevance" 29 | # data_path="../../BARTScore/SUM/SummEval/final_p_with_xgptscore.json" 30 | 31 | # d2t test Correctness 32 | # task="data2text" 33 | # human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure" 34 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/webnlg/webnlg2020_gen_with_scores.json" 35 | 36 | # instruction-following 37 | # rank 38 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/databricks/databricks-dolly-15k/rank_eval_mid.json" 39 | 40 | # task="instruction-following" 41 | # human_score_names="gpt_rank_score" 42 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/llm-blender/mix-instruct/test_data_prepared_300.json" 43 | 44 | # long-form QA 45 | ### ATTENTION the space in the task name is not allowed,you need use --task "long-form QA" instead of --task ${task} 46 | # task="long-form QA" 47 | # human_score_names="rank" 48 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/data/lfqa/test.json" 49 | 50 | # Math QA 51 | # accuracy 52 | # task="mathQA" 53 | # human_score_names="accuracy" 54 | # data_path="/home//WorkSpace/ExplainableGPTScore_bak/gsm8k-ScRel/data/test_acc.json" 55 | 56 | output_path="${data_path}.llama_2_7b_${checkpoint_name}.output" 57 | 58 | # seems batch_size=1 is faster than batch_size=2 or higher 59 | python test_llama.py \ 60 | --model_name_or_path ${checkpoint_path} \ 61 | --task ${task} \ 62 | --data_path ${data_path} \ 63 | --output_path ${output_path} \ 64 | --torch_dtype "bfloat16" \ 65 | --batch_size 1 \ 66 | --human_score_names ${human_score_names} \ 67 | --model_max_length 1024 \ 68 | --max_eval_input_length 512 \ 69 | --max_eval_hyp_length 512 \ 70 | --max_eval_output_length 1024 \ 71 | --overwrite True \ -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama_vllm.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import json 4 | import torch 5 | import logging 6 | import sys 7 | import regex as re 8 | from pathlib import Path 9 | sys.path.append(str(Path(__file__).parent.parent)) 10 
| from vllm import LLM, SamplingParams 11 | from typing import List 12 | from string import Template 13 | from mt_metrics_eval.stats import Correlation 14 | 15 | 16 | MAX_INT = sys.maxsize 17 | 18 | IGNORE_INDEX = -100 19 | DEFAULT_PAD_TOKEN = "[PAD]" 20 | DEFAULT_EOS_TOKEN = "" 21 | DEFAULT_BOS_TOKEN = "" 22 | DEFAULT_UNK_TOKEN = "" 23 | PROMPT_DICT = { 24 | "prompt_input": ( 25 | "Below is an instruction that describes a task, paired with an input that provides further context. " 26 | "Write a response that appropriately completes the request.\n\n" 27 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 28 | ), 29 | "prompt_no_input": ( 30 | "Below is an instruction that describes a task. " 31 | "Write a response that appropriately completes the request.\n\n" 32 | "### Instruction:\n{instruction}\n\n### Response:" 33 | ), 34 | } 35 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a given instruction." 36 | FINETUNE_INPUT = """\ 37 | Instruction: 38 | ${generation_instruction} 39 | ${input_context} 40 | 41 | Model-generated Output: 42 | ${hypothesis_output} 43 | 44 | For each error you give in the response, please also elaborate the following information: 45 | - error location (the words that are wrong in the output) 46 | - error aspect it belongs to. 47 | - explanation why it's an error, and the correction suggestions. 48 | - severity of the error ("Major" or "Minor"). 49 | - reduction of score (between 0.5 and 5 given the severity of the error) 50 | 51 | Your evaluation output:\ 52 | """ 53 | 54 | 55 | def get_sum_penalties(eval_output: dict): 56 | """ 57 | Args: 58 | eval_output: dict, the json output of the eval function 59 | 60 | Returns: 61 | """ 62 | try: 63 | penalty_score = 0 64 | for aspect in eval_output: 65 | for penalty_point in eval_output[aspect]["penalty_points"]: 66 | penalty_score += penalty_point["score_reduction"] 67 | return - penalty_score 68 | except Exception: 69 | return None 70 | 71 | 72 | def get_torch_dtype(dtype_str): 73 | """ 74 | Get the torch dtype from a string 75 | """ 76 | if dtype_str == "float32": 77 | return torch.float32 78 | elif dtype_str == "float16": 79 | return torch.float16 80 | elif dtype_str == "bfloat16": 81 | return torch.bfloat16 82 | elif dtype_str == "int8": 83 | return torch.int8 84 | else: 85 | raise ValueError("Invalid dtype {}".format(dtype_str)) 86 | 87 | 88 | def batch_data(data_list, batch_size=1): 89 | n = len(data_list) // batch_size 90 | batch_data = [] 91 | for i in range(n - 1): 92 | start = i * batch_size 93 | end = (i + 1) * batch_size 94 | batch_data.append(data_list[start:end]) 95 | 96 | last_start = (n - 1) * batch_size 97 | last_end = MAX_INT 98 | batch_data.append(data_list[last_start:last_end]) 99 | return batch_data 100 | 101 | 102 | class MyCorrelation(Correlation): 103 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 104 | # remove nan in metrics scores 105 | none_metric_scores_idxs = [idx for idx, 106 | x in enumerate(metric_scores) if x is None] 107 | logging.info("Remove {} nan scores from {} scores".format( 108 | len(none_metric_scores_idxs), 109 | len(metric_scores) 110 | )) 111 | gold_scores = gold_scores.copy() 112 | # set gold scores to None if metric scores are None 113 | for idx in none_metric_scores_idxs[::-1]: 114 | gold_scores[idx] = None 115 | super().__init__(num_sys, gold_scores, metric_scores) 116 | 117 | 118 | def main(args): 119 | 120 | if args.output_path is not None: 121 | output_file = 
Path(args.output_path) 122 | else: 123 | output_file = Path(args.data_path).with_suffix( 124 | '.xgptscore.output.json') 125 | if not output_file.exists() or args.overwrite: 126 | logging.info("Loading model...") 127 | sampling_params = SamplingParams( 128 | temperature=0, top_p=1, max_tokens=1024) 129 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1) 130 | logging.info("Model loaded from {}".format(args.model_name_or_path)) 131 | 132 | eval_outputs = [] 133 | 134 | logging.info("Load input data from {}".format(args.data_path)) 135 | with open(args.data_path, "r") as f: 136 | input_data = json.load(f) 137 | formatted_data = [] 138 | for item in input_data: 139 | for cand in item['candidates']: 140 | inst = Template(FINETUNE_INST).substitute(task=args.task) 141 | input_ = Template(FINETUNE_INPUT).substitute( 142 | task=args.task, 143 | generation_instruction=item['instruction'], 144 | input_context=item['input'], 145 | hypothesis_output=cand['text'], 146 | ) 147 | formatted_data.append({ 148 | "instruction": inst, 149 | "input": input_, 150 | }) 151 | prompt_sources = [example['instruction'] + '\n' + 152 | example['input'] for example in formatted_data] 153 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources] 154 | 155 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size) 156 | 157 | for idx, batch_prompt in enumerate(batch_prompts): 158 | if isinstance(batch_prompt, list): 159 | pass 160 | else: 161 | batch_prompt = [batch_prompt] 162 | 163 | completions = llm.generate(batch_prompt, sampling_params) 164 | for output in completions: 165 | generated_text = output.outputs[0].text 166 | eval_outputs.append(generated_text) 167 | 168 | cand_idx = 0 169 | for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)): 170 | for cand in item['candidates']: 171 | cand['eval_output'] = eval_outputs[cand_idx] 172 | score_reductions = re.findall( 173 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_outputs[cand_idx]) 174 | cand['xgptscore'] = -sum(map(float, score_reductions)) 175 | cand_idx += 1 176 | 177 | with open(output_file, 'w') as f: 178 | json.dump(input_data, f, indent=4, ensure_ascii=False) 179 | logging.info("Saved eval results to {}".format(output_file)) 180 | else: 181 | with open(output_file, 'r') as f: 182 | input_data = json.load(f) 183 | for ex in input_data: 184 | for cand in ex['candidates']: 185 | score_reductions = re.findall( 186 | r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", cand['eval_output']) 187 | cand['xgptscore'] = -sum(map(float, score_reductions)) 188 | with open(output_file, 'w') as f: 189 | json.dump(input_data, f, indent=4, ensure_ascii=False) 190 | logging.info("Loaded eval results from {}".format(output_file)) 191 | # Compute correlation 192 | human_score_names = args.human_score_names.split(',') 193 | 194 | for h_name in human_score_names: 195 | human_scores = [] 196 | xgptscores = [] 197 | for item in input_data: 198 | for cand in item['candidates']: 199 | for s_name, score in cand['scores'].items(): 200 | if s_name == h_name: 201 | xgptscores.append(cand['xgptscore']) 202 | human_scores.append(score) 203 | break 204 | corr = MyCorrelation(1, human_scores, xgptscores) 205 | logging.info("Human score: {}".format(h_name)) 206 | logging.info("Pearson correlation: {}".format(corr.Pearson())) 207 | logging.info("Spearman correlation: {}".format(corr.Spearman())) 208 | logging.info("Kendall correlation: {}".format(corr.Kendall())) 209 | 210 | 211 | if __name__ == "__main__": 212 | 213 | 
logging.basicConfig(level=logging.INFO) 214 | parser = argparse.ArgumentParser() 215 | parser.add_argument("--model_name_or_path", type=str, default=None) 216 | parser.add_argument("--data_path", type=str, default=None) 217 | parser.add_argument("--output_path", type=str, default=None) 218 | parser.add_argument("--overwrite", action="store_true") 219 | parser.add_argument("--task", type=str, default="summarization") 220 | parser.add_argument("--batch_size", type=int, default=1) 221 | parser.add_argument("--human_score_names", type=str, default="score") 222 | args = parser.parse_args() 223 | main(args) 224 | -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama_vllm.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | #SBATCH --job-name=test_llama 3 | #SBATCH --gres=gpu:a6000:1 4 | #SBATCH --time=24:00:00 5 | #SBATCH --output=../../jobs/test_llama/%j.out 6 | nvidia-smi 7 | 8 | 9 | ## Note 10 | # please download the data in the working directory as indicated in the Data Preparation section in the read me 11 | # quick command: gdown https://drive.google.com/uc?id=1DAjvig-A_57CuBvENLg8A2PycOaz9ZkT 12 | ## 13 | 14 | model_name="meta-llama/Llama-2-7b-hf" 15 | outputs_dir="" 16 | 17 | # outputs_dir="../../outputs" 18 | checkpoint_name="ref" 19 | # checkpoint_path="${outputs_dir}/${model_name}/${checkpoint_name}/checkpoint-532" 20 | checkpoint_path="TIGER-Lab/TIGERScore-13B" 21 | 22 | human_score_names="gpt_rank_score" 23 | data_path="../../data/evaluation/lfqa/test_data_prepared.json" 24 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 25 | python test_llama_vllm.py \ 26 | --model_name_or_path ${checkpoint_path} \ 27 | --task "long-form QA" \ 28 | --data_path ${data_path} \ 29 | --output_path ${output_path} \ 30 | --batch_size 60 \ 31 | --human_score_names ${human_score_names} \ 32 | --overwrite 33 | 34 | task="instruction-following" 35 | human_score_names="gpt_rank_score" 36 | data_path="../../data/evaluation/instruct/just-eval-instruct/test_data_prepared.json" 37 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 38 | python test_llama_vllm.py \ 39 | --model_name_or_path ${checkpoint_path} \ 40 | --task ${task} \ 41 | --data_path ${data_path} \ 42 | --output_path ${output_path} \ 43 | --batch_size 60 \ 44 | --human_score_names ${human_score_names} \ 45 | --overwrite 46 | 47 | task="mathQA" 48 | human_score_names="accuracy" 49 | data_path="../../data/evaluation/mathqa/gsm8k/test_data_prepared.json" 50 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 51 | python test_llama_vllm.py \ 52 | --model_name_or_path ${checkpoint_path} \ 53 | --task ${task} \ 54 | --data_path ${data_path} \ 55 | --output_path ${output_path} \ 56 | --batch_size 60 \ 57 | --human_score_names ${human_score_names} \ 58 | --overwrite 59 | 60 | 61 | # mtme test mqm 62 | task="translation" 63 | human_score_names="mqm" 64 | data_path="../../data/evaluation/translation/wmt22/zh-en/eval_data.json" 65 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 66 | python test_llama_vllm.py \ 67 | --model_name_or_path ${checkpoint_path} \ 68 | --task ${task} \ 69 | --data_path ${data_path} \ 70 | --output_path ${output_path} \ 71 | --batch_size 60 \ 72 | --human_score_names ${human_score_names} \ 73 | --overwrite 74 | 75 | # sum test relevance 76 | task="summarization" 77 | human_score_names="coherence,consistency,fluency,relevance" 78 | 
data_path="../../data/evaluation/summarization/summeval/test_data_prepared.json" 79 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 80 | python test_llama_vllm.py \ 81 | --model_name_or_path ${checkpoint_path} \ 82 | --task ${task} \ 83 | --data_path ${data_path} \ 84 | --output_path ${output_path} \ 85 | --batch_size 60 \ 86 | --human_score_names ${human_score_names} \ 87 | --overwrite 88 | 89 | # d2t test Correctness 90 | task="data2text" 91 | human_score_names="Correctness,DataCoverage,Fluency,Relevance,TextStructure" 92 | data_path="../../data/evaluation/d2t/webnlg_2020/test_data_prepared.json" 93 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 94 | python test_llama_vllm.py \ 95 | --model_name_or_path ${checkpoint_path} \ 96 | --task ${task} \ 97 | --data_path ${data_path} \ 98 | --output_path ${output_path} \ 99 | --batch_size 60 \ 100 | --human_score_names ${human_score_names} \ 101 | --overwrite 102 | 103 | 104 | # storygen test human 105 | task="storygen" 106 | human_score_names="human" 107 | data_path="../../data/evaluation/storygen/test_data_prepared.json" 108 | output_path="${data_path}.llama_2_7b_${checkpoint_name}_test.output" 109 | python test_llama_vllm.py \ 110 | --model_name_or_path ${checkpoint_path} \ 111 | --task ${task} \ 112 | --data_path ${data_path} \ 113 | --output_path ${output_path} \ 114 | --batch_size 60 \ 115 | --human_score_names ${human_score_names} \ 116 | --overwrite 117 | -------------------------------------------------------------------------------- /tigerscore/finetune/test_llama_vllm_distance.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import json 4 | import torch 5 | import logging 6 | import sys 7 | import regex as re 8 | import numpy as np 9 | from pathlib import Path 10 | sys.path.append(str(Path(__file__).parent.parent)) 11 | from vllm import LLM, SamplingParams 12 | from typing import List 13 | from string import Template 14 | from mt_metrics_eval.stats import Correlation 15 | 16 | 17 | MAX_INT = sys.maxsize 18 | 19 | IGNORE_INDEX = -100 20 | DEFAULT_PAD_TOKEN = "[PAD]" 21 | DEFAULT_EOS_TOKEN = "" 22 | DEFAULT_BOS_TOKEN = "" 23 | DEFAULT_UNK_TOKEN = "" 24 | PROMPT_DICT = { 25 | "prompt_input": ( 26 | "Below is an instruction that describes a task, paired with an input that provides further context. " 27 | "Write a response that appropriately completes the request.\n\n" 28 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:" 29 | ), 30 | "prompt_no_input": ( 31 | "Below is an instruction that describes a task. " 32 | "Write a response that appropriately completes the request.\n\n" 33 | "### Instruction:\n{instruction}\n\n### Response:" 34 | ), 35 | } 36 | FINETUNE_INST = "You are evaluating errors in a model-generated output for a(an) ${task} task." 37 | FINETUNE_INPUT = """\ 38 | Task instruction: ${generation_instruction} 39 | Source: ${input_context} 40 | Model-generated Output: ${hypothesis_output} 41 | 42 | Based on the given task instruction and source, identify errors in this model-generated output. 43 | For each error you give in the response, please also elaborate the following information: 44 | - error location (the words that are wrong in the output) 45 | - error aspect it belongs to. 46 | - explanation why it's an error, and the correction suggestions. 47 | - severity of the error ("Major" or "Minor"). 
48 | - reduction of score (an interger between 0.5 and 5 given the severity of the error) 49 | 50 | Your evaluation output: 51 | """ 52 | 53 | 54 | def get_sum_penalties(eval_output: dict): 55 | """ 56 | Args: 57 | eval_output: dict, the json output of the eval function 58 | 59 | Returns: 60 | """ 61 | try: 62 | penalty_score = 0 63 | for aspect in eval_output: 64 | for penalty_point in eval_output[aspect]["penalty_points"]: 65 | penalty_score += penalty_point["score_reduction"] 66 | return - penalty_score 67 | except Exception: 68 | return None 69 | 70 | 71 | def get_torch_dtype(dtype_str): 72 | """ 73 | Get the torch dtype from a string 74 | """ 75 | if dtype_str == "float32": 76 | return torch.float32 77 | elif dtype_str == "float16": 78 | return torch.float16 79 | elif dtype_str == "bfloat16": 80 | return torch.bfloat16 81 | elif dtype_str == "int8": 82 | return torch.int8 83 | else: 84 | raise ValueError("Invalid dtype {}".format(dtype_str)) 85 | 86 | 87 | def batch_data(data_list, batch_size=1): 88 | n = len(data_list) // batch_size 89 | batch_data = [] 90 | for i in range(n - 1): 91 | start = i * batch_size 92 | end = (i + 1) * batch_size 93 | batch_data.append(data_list[start:end]) 94 | 95 | last_start = (n - 1) * batch_size 96 | last_end = MAX_INT 97 | batch_data.append(data_list[last_start:last_end]) 98 | return batch_data 99 | 100 | 101 | class MyCorrelation(Correlation): 102 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 103 | # remove nan in metrics scores 104 | none_metric_scores_idxs = [idx for idx, 105 | x in enumerate(metric_scores) if x is None] 106 | logging.info("Remove {} nan scores from {} scores".format( 107 | len(none_metric_scores_idxs), 108 | len(metric_scores) 109 | )) 110 | gold_scores = gold_scores.copy() 111 | # set gold scores to None if metric scores are None 112 | for idx in none_metric_scores_idxs[::-1]: 113 | gold_scores[idx] = None 114 | super().__init__(num_sys, gold_scores, metric_scores) 115 | 116 | 117 | def main(args): 118 | 119 | if args.output_path is not None: 120 | output_file = Path(args.output_path) 121 | else: 122 | output_file = Path(args.data_path).with_suffix( 123 | '.xgptscore.output.json') 124 | if not output_file.exists() or args.overwrite: 125 | logging.info("Loading model...") 126 | sampling_params = SamplingParams( 127 | temperature=0, top_p=1, max_tokens=1024) 128 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1) 129 | logging.info("Model loaded from {}".format(args.model_name_or_path)) 130 | 131 | eval_outputs = [] 132 | 133 | logging.info("Load input data from {}".format(args.data_path)) 134 | with open(args.data_path, "r") as f: 135 | input_data = json.load(f) 136 | formatted_data = [] 137 | for item in input_data: 138 | inst = Template(FINETUNE_INST).substitute(task=args.task) 139 | refs = item['output'] if "output" in item else item["refs"] 140 | item["candidates"] = [] 141 | if isinstance(refs,list): 142 | for ref in refs: 143 | item["candidates"].append( 144 | { 145 | "text":ref, 146 | "source":"unknown", 147 | "scores":{} 148 | } 149 | ) 150 | else: 151 | item["candidates"].append( 152 | { 153 | "text":refs, 154 | "source":"unknown", 155 | "scores":{} 156 | } 157 | ) 158 | for cand in item['candidates']: 159 | inst = Template(FINETUNE_INST).substitute(task=args.task) 160 | input_ = Template(FINETUNE_INPUT).substitute( 161 | task=args.task, 162 | generation_instruction=item['instruction'], 163 | input_context=item['input'], 164 | hypothesis_output=cand['text'], 165 | ) 
166 |                 formatted_data.append({
167 |                     "instruction": inst,
168 |                     "input": input_,
169 |                 })
170 |     prompt_sources = [example['instruction'] + '\n' +
171 |                       example['input'] for example in formatted_data]
172 |     prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources]
173 | 
174 |     batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size)
175 | 
176 |     for idx, batch_prompt in enumerate(batch_prompts):
177 |         if isinstance(batch_prompt, list):
178 |             pass
179 |         else:
180 |             batch_prompt = [batch_prompt]
181 | 
182 |         completions = llm.generate(batch_prompt, sampling_params)
183 |         for output in completions:
184 |             generated_text = output.outputs[0].text
185 |             eval_outputs.append(generated_text)
186 | 
187 |     cand_idx = 0
188 |     for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
189 |         for cand in item['candidates']:
190 |             cand['eval_output'] = eval_outputs[cand_idx]
191 |             score_reductions = re.findall(
192 |                 r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", eval_outputs[cand_idx])
193 |             cand['xgptscore'] = -sum(map(float, score_reductions))
194 |             cand_idx += 1
195 | 
196 |     with open(output_file, 'w') as f:
197 |         json.dump(input_data, f, indent=4, ensure_ascii=False)
198 |     logging.info("Saved eval results to {}".format(output_file))
199 |     else:
200 |         with open(output_file, 'r') as f:
201 |             input_data = json.load(f)
202 |         for ex in input_data:
203 |             for cand in ex['candidates']:
204 |                 score_reductions = re.findall(
205 |                     r"(?<=\nScore reduction \d+: )(\d+\.\d+|\d+)", cand['eval_output'])
206 |                 cand['xgptscore'] = -sum(map(float, score_reductions))
207 |         with open(output_file, 'w') as f:
208 |             json.dump(input_data, f, indent=4, ensure_ascii=False)
209 |         logging.info("Loaded eval results from {}".format(output_file))
210 |     # Compute correlation
211 |     xgptscores = []
212 |     for item in input_data:
213 |         xgptscores.extend(cand['xgptscore'] for cand in item['candidates'])  # scores are stored per candidate
214 |     print("Absolute score sum: {}".format(abs(sum(xgptscores))))
215 |     print("Average score: {}".format(sum(xgptscores) / len(xgptscores)))
216 |     print("Median score: {}".format(np.median(xgptscores)))
217 |     print("Standard deviation: {}".format(np.std(list(map(abs, xgptscores)))))
218 | 
219 | 
220 | 
221 | if __name__ == "__main__":
222 | 
223 |     logging.basicConfig(level=logging.INFO)
224 |     parser = argparse.ArgumentParser()
225 |     parser.add_argument("--model_name_or_path", type=str, default=None)
226 |     parser.add_argument("--data_path", type=str, default=None)
227 |     parser.add_argument("--output_path", type=str, default=None)
228 |     parser.add_argument("--overwrite", action="store_true")
229 |     parser.add_argument("--task", type=str, default="summarization")
230 |     parser.add_argument("--batch_size", type=int, default=1)
231 |     parser.add_argument("--human_score_names", type=str, default="score")
232 |     args = parser.parse_args()
233 |     main(args)
234 | 
--------------------------------------------------------------------------------
/tigerscore/finetune/test_llama_vllm_vanilla.py:
--------------------------------------------------------------------------------
1 | 
2 | import argparse
3 | import json
4 | import torch
5 | import logging
6 | import sys
7 | import regex as re
8 | from pathlib import Path
9 | sys.path.append(str(Path(__file__).parent.parent))
10 | from vllm import LLM, SamplingParams
11 | from typing import List
12 | from string import Template
13 | from mt_metrics_eval.stats import Correlation
14 | 
15 | 
16 | MAX_INT = sys.maxsize
17 | 
18 | IGNORE_INDEX = -100
19 | DEFAULT_PAD_TOKEN = "[PAD]"
20 | DEFAULT_EOS_TOKEN = "</s>"
21 | DEFAULT_BOS_TOKEN = "<s>"
22 | DEFAULT_UNK_TOKEN = "<unk>"
23 | PROMPT_DICT = {
24 |     "prompt_input": (
25 |         "Below is an instruction that describes a task, paired with an input that provides further context. "
26 |         "Write a response that appropriately completes the request.\n\n"
27 |         "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
28 |     ),
29 |     "prompt_no_input": (
30 |         "Below is an instruction that describes a task. "
31 |         "Write a response that appropriately completes the request.\n\n"
32 |         "### Instruction:\n{instruction}\n\n### Response:"
33 |     ),
34 | }
35 | # FINETUNE_INST = """"""
36 | # FINETUNE_INPUT = """\
37 | # ${generation_instruction}
38 | # ${input_context}
39 | 
40 | # Model-generated Output:
41 | # ${hypothesis_output}
42 | 
43 | 
44 | # You should rate Model-generated Output on a scale from 0.5 (worst) to 10 (best).
45 | # Rating: \
46 | # """
47 | FINETUNE_INST = """[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant.\n<</SYS>>\n"""
48 | FINETUNE_INPUT = """\
49 | ${generation_instruction}
50 | ${input_context}
51 | 
52 | Model-generated Output:
53 | ${hypothesis_output}
54 | 
55 | 
56 | You should rate Model-generated Output on a scale from 0.5 (worst) to 10 (best). [/INST] Rating: \
57 | """
58 | # FINETUNE_INPUT = """\
59 | # USER:You are evaluating errors in a model-generated output for a(an) ${task} task.
60 | # Task instruction: ${generation_instruction}
61 | # Source: ${input_context}
62 | # Model-generated Output: ${hypothesis_output}
63 | 
64 | # Based on the given task instruction and source, identify errors in this model-generated output.
65 | # For each error you give in the response, please also elaborate the following information:
66 | # - error location (the words that are wrong in the output)
67 | # - error aspect it belongs to.
68 | # - explanation why it's an error, and the correction suggestions.
69 | # - severity of the error ("Major" or "Minor").
70 | # - reduction of score (between 0.5 and 5 given the severity of the error)
71 | 
72 | # Please give a summary of the errors you found in the output, and the total score reduction.
73 | # The model-generated output contains {num_errors} errors, with a total score reduction of {total_score_reduction}.
74 | 75 | # Your evaluation output: ASSISTANT:\ 76 | # """ 77 | def find_first_float(s): 78 | match = re.search(r"[-+]?\d*\.\d+|\d+", s) 79 | return float(match.group()) if match else None 80 | 81 | def get_sum_penalties(eval_output: dict): 82 | """ 83 | Args: 84 | eval_output: dict, the json output of the eval function 85 | 86 | Returns: 87 | """ 88 | try: 89 | penalty_score = 0 90 | for aspect in eval_output: 91 | for penalty_point in eval_output[aspect]["penalty_points"]: 92 | penalty_score += penalty_point["score_reduction"] 93 | return - penalty_score 94 | except Exception: 95 | return None 96 | 97 | 98 | def get_torch_dtype(dtype_str): 99 | """ 100 | Get the torch dtype from a string 101 | """ 102 | if dtype_str == "float32": 103 | return torch.float32 104 | elif dtype_str == "float16": 105 | return torch.float16 106 | elif dtype_str == "bfloat16": 107 | return torch.bfloat16 108 | elif dtype_str == "int8": 109 | return torch.int8 110 | else: 111 | raise ValueError("Invalid dtype {}".format(dtype_str)) 112 | 113 | 114 | def batch_data(data_list, batch_size=1): 115 | n = len(data_list) // batch_size 116 | batch_data = [] 117 | for i in range(n - 1): 118 | start = i * batch_size 119 | end = (i + 1) * batch_size 120 | batch_data.append(data_list[start:end]) 121 | 122 | last_start = (n - 1) * batch_size 123 | last_end = MAX_INT 124 | batch_data.append(data_list[last_start:last_end]) 125 | return batch_data 126 | 127 | 128 | class MyCorrelation(Correlation): 129 | def __init__(self, num_sys: int, gold_scores: List[int], metric_scores: List[int]): 130 | # remove nan in metrics scores 131 | none_metric_scores_idxs = [idx for idx, 132 | x in enumerate(metric_scores) if x is None] 133 | logging.info("Remove {} nan scores from {} scores".format( 134 | len(none_metric_scores_idxs), 135 | len(metric_scores) 136 | )) 137 | gold_scores = gold_scores.copy() 138 | # set gold scores to None if metric scores are None 139 | for idx in none_metric_scores_idxs[::-1]: 140 | gold_scores[idx] = None 141 | super().__init__(num_sys, gold_scores, metric_scores) 142 | 143 | 144 | def main(args): 145 | 146 | if args.output_path is not None: 147 | output_file = Path(args.output_path) 148 | else: 149 | output_file = Path(args.data_path).with_suffix( 150 | '.xgptscore.output.json') 151 | if not output_file.exists() or args.overwrite: 152 | logging.info("Loading model...") 153 | sampling_params = SamplingParams( 154 | temperature=0, top_p=1, max_tokens=1024) 155 | llm = LLM(model=args.model_name_or_path, tensor_parallel_size=1) 156 | logging.info("Model loaded from {}".format(args.model_name_or_path)) 157 | 158 | eval_outputs = [] 159 | 160 | logging.info("Load input data from {}".format(args.data_path)) 161 | with open(args.data_path, "r") as f: 162 | input_data = json.load(f) 163 | formatted_data = [] 164 | for item in input_data: 165 | for cand in item['candidates']: 166 | inst = Template(FINETUNE_INST).substitute(task=args.task) 167 | input_ = Template(FINETUNE_INPUT).substitute( 168 | task=args.task, 169 | generation_instruction=item['instruction'], 170 | input_context=item['input'], 171 | hypothesis_output=cand['text'], 172 | ) 173 | formatted_data.append({ 174 | "instruction": inst, 175 | "input": input_, 176 | }) 177 | prompt_sources = [example['instruction'] + '\n' + 178 | example['input'] for example in formatted_data] 179 | prompt_sources = [x.strip(' \n') + "\n" for x in prompt_sources] 180 | 181 | batch_prompts = batch_data(prompt_sources, batch_size=args.batch_size) 182 | 183 | for idx, batch_prompt in 
enumerate(batch_prompts):
184 |             if isinstance(batch_prompt, list):
185 |                 pass
186 |             else:
187 |                 batch_prompt = [batch_prompt]
188 | 
189 |             completions = llm.generate(batch_prompt, sampling_params)
190 |             for output in completions:
191 |                 generated_text = output.outputs[0].text
192 |                 eval_outputs.append(generated_text)
193 | 
194 |         cand_idx = 0
195 |         for idx, (item, eval_output) in enumerate(zip(input_data, eval_outputs)):
196 |             for cand in item['candidates']:
197 |                 cand['eval_output'] = eval_outputs[cand_idx]
198 |                 score_reduction = find_first_float(eval_outputs[cand_idx])
199 |                 if score_reduction is not None:
200 |                     cand['vanilla_xgptscore'] = -float(score_reduction)
201 |                 else:
202 |                     cand['vanilla_xgptscore'] = None
203 |                 cand_idx += 1
204 | 
205 |         with open(output_file, 'w') as f:
206 |             json.dump(input_data, f, indent=4, ensure_ascii=False)
207 |         logging.info("Saved eval results to {}".format(output_file))
208 |     else:
209 |         with open(output_file, 'r') as f:
210 |             input_data = json.load(f)
211 |         for ex in input_data:
212 |             for cand in ex['candidates']:
213 |                 score_reduction = find_first_float(cand["eval_output"])
214 |                 if score_reduction is not None:
215 |                     cand['vanilla_xgptscore'] = -float(score_reduction)
216 |                 else:
217 |                     cand['vanilla_xgptscore'] = None
218 |         with open(output_file, 'w') as f:
219 |             json.dump(input_data, f, indent=4, ensure_ascii=False)
220 |         logging.info("Loaded eval results from {}".format(output_file))
221 |     # Compute correlation
222 |     human_score_names = args.human_score_names.split(',')
223 | 
224 |     for h_name in human_score_names:
225 |         human_scores = []
226 |         xgptscores = []
227 |         for item in input_data:
228 |             for cand in item['candidates']:
229 |                 for s_name, score in cand['scores'].items():
230 |                     if s_name == h_name:
231 |                         xgptscores.append(cand['vanilla_xgptscore'])
232 |                         human_scores.append(score)
233 |                         break
234 |         corr = MyCorrelation(1, human_scores, xgptscores)
235 |         print("Human score: {}".format(h_name))
236 |         print("Pearson correlation: {}".format(corr.Pearson()))
237 |         print("Spearman correlation: {}".format(corr.Spearman()))
238 |         print("Kendall correlation: {}".format(corr.Kendall()))
239 | 
240 | 
241 | if __name__ == "__main__":
242 | 
243 |     logging.basicConfig(level=logging.INFO)
244 |     parser = argparse.ArgumentParser()
245 |     parser.add_argument("--model_name_or_path", type=str, default=None)
246 |     parser.add_argument("--data_path", type=str, default=None)
247 |     parser.add_argument("--output_path", type=str, default=None)
248 |     parser.add_argument("--overwrite", action="store_true")
249 |     parser.add_argument("--task", type=str, default="summarization")
250 |     parser.add_argument("--batch_size", type=int, default=1)
251 |     parser.add_argument("--human_score_names", type=str, default="score")
252 |     args = parser.parse_args()
253 |     main(args)
254 | 
--------------------------------------------------------------------------------
/tigerscore/finetune/trainer.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | import os
3 | from transformers.trainer import *
4 | from peft import PeftModel
5 | 
6 | 
7 | class CustomLoraTrainer(Trainer):
8 |     def _save(self, output_dir: Optional[str] = None, state_dict=None):
9 |         # If we are executing this function, we are the process zero, so we don't check for that.
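        # This override follows transformers' Trainer._save(), with an extra branch so that
        # PeftModel (LoRA) checkpoints are written via save_pretrained() instead of dumping
        # the full base-model state dict.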
10 | output_dir = output_dir if output_dir is not None else self.args.output_dir 11 | os.makedirs(output_dir, exist_ok=True) 12 | logger.info(f"Saving model checkpoint to {output_dir}") 13 | # Save a trained model and configuration using `save_pretrained()`. 14 | # They can then be reloaded using `from_pretrained()` 15 | if not isinstance(self.model, PreTrainedModel) and not isinstance(self.model, PeftModel): 16 | if state_dict is None: 17 | state_dict = self.model.state_dict() 18 | 19 | if isinstance(unwrap_model(self.model), PreTrainedModel): 20 | unwrap_model(self.model).save_pretrained( 21 | output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors 22 | ) 23 | else: 24 | logger.info( 25 | "Trainer.model is not a `PreTrainedModel`, only saving its state dict.") 26 | if self.args.save_safetensors: 27 | safetensors.torch.save_file( 28 | state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME)) 29 | else: 30 | torch.save(state_dict, os.path.join( 31 | output_dir, WEIGHTS_NAME)) 32 | else: 33 | print("Saving LoRA model...") 34 | self.model.save_pretrained( 35 | output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors 36 | ) 37 | 38 | if self.tokenizer is not None: 39 | self.tokenizer.save_pretrained(output_dir) 40 | 41 | # Good practice: save your training arguments together with the trained model 42 | torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) 43 | -------------------------------------------------------------------------------- /tigerscore/finetune/utils.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | import logging 3 | import math 4 | import os 5 | import io 6 | import sys 7 | import time 8 | import json 9 | from typing import Optional, Sequence, Union, Dict 10 | 11 | import openai 12 | import tqdm 13 | from openai import openai_object 14 | import copy 15 | 16 | StrOrOpenAIObject = Union[str, openai_object.OpenAIObject] 17 | 18 | openai_org = os.getenv("OPENAI_ORG") 19 | if openai_org is not None: 20 | openai.organization = openai_org 21 | logging.warning( 22 | f"Switching to organization: {openai_org} for OAI API key.") 23 | 24 | 25 | @dataclasses.dataclass 26 | class OpenAIDecodingArguments(object): 27 | max_tokens: int = 1800 28 | temperature: float = 0.2 29 | top_p: float = 1.0 30 | n: int = 1 31 | stream: bool = False 32 | stop: Optional[Sequence[str]] = None 33 | presence_penalty: float = 0.0 34 | frequency_penalty: float = 0.0 35 | suffix: Optional[str] = None 36 | logprobs: Optional[int] = None 37 | echo: bool = False 38 | 39 | 40 | def openai_completion( 41 | prompts: Union[str, Sequence[str], Sequence[Dict[str, str]], Dict[str, str]], 42 | decoding_args: OpenAIDecodingArguments, 43 | model_name="text-davinci-003", 44 | sleep_time=2, 45 | batch_size=1, 46 | max_instances=sys.maxsize, 47 | max_batches=sys.maxsize, 48 | return_text=False, 49 | **decoding_kwargs, 50 | ) -> Union[Union[StrOrOpenAIObject, StrOrOpenAIObject], Sequence[StrOrOpenAIObject], Sequence[Sequence[StrOrOpenAIObject]],]: 51 | """Decode with OpenAI API. 52 | 53 | Args: 54 | prompts: A string or a list of strings to complete. If it is a chat model the strings should be formatted 55 | as explained here: https://github.com/openai/openai-python/blob/main/chatml.md. 
If it is a chat model 56 | it can also be a dictionary (or list thereof) as explained here: 57 | https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb 58 | decoding_args: Decoding arguments. 59 | model_name: Model name. Can be either in the format of "org/model" or just "model". 60 | sleep_time: Time to sleep once the rate-limit is hit. 61 | batch_size: Number of prompts to send in a single request. Only for non chat model. 62 | max_instances: Maximum number of prompts to decode. 63 | max_batches: Maximum number of batches to decode. This argument will be deprecated in the future. 64 | return_text: If True, return text instead of full completion object (which contains things like logprob). 65 | decoding_kwargs: Additional decoding arguments. Pass in `best_of` and `logit_bias` if you need them. 66 | 67 | Returns: 68 | A completion or a list of completions. 69 | Depending on return_text, return_openai_object, and decoding_args.n, the completion type can be one of 70 | - a string (if return_text is True) 71 | - an openai_object.OpenAIObject object (if return_text is False) 72 | - a list of objects of the above types (if decoding_args.n > 1) 73 | """ 74 | is_single_prompt = isinstance(prompts, (str, dict)) 75 | if is_single_prompt: 76 | prompts = [prompts] 77 | 78 | if max_batches < sys.maxsize: 79 | logging.warning( 80 | "`max_batches` will be deprecated in the future, please use `max_instances` instead." 81 | "Setting `max_instances` to `max_batches * batch_size` for now." 82 | ) 83 | max_instances = max_batches * batch_size 84 | 85 | prompts = prompts[:max_instances] 86 | num_prompts = len(prompts) 87 | prompt_batches = [ 88 | prompts[batch_id * batch_size: (batch_id + 1) * batch_size] 89 | for batch_id in range(int(math.ceil(num_prompts / batch_size))) 90 | ] 91 | 92 | completions = [] 93 | for batch_id, prompt_batch in tqdm.tqdm( 94 | enumerate(prompt_batches), 95 | desc="prompt_batches", 96 | total=len(prompt_batches), 97 | ): 98 | batch_decoding_args = copy.deepcopy( 99 | decoding_args) # cloning the decoding_args 100 | 101 | while True: 102 | try: 103 | shared_kwargs = dict( 104 | model=model_name, 105 | **batch_decoding_args.__dict__, 106 | **decoding_kwargs, 107 | ) 108 | completion_batch = openai.Completion.create( 109 | prompt=prompt_batch, **shared_kwargs) 110 | choices = completion_batch.choices 111 | 112 | for choice in choices: 113 | choice["total_tokens"] = completion_batch.usage.total_tokens 114 | completions.extend(choices) 115 | break 116 | except openai.error.OpenAIError as e: 117 | logging.warning(f"OpenAIError: {e}.") 118 | if "Please reduce your prompt" in str(e): 119 | batch_decoding_args.max_tokens = int( 120 | batch_decoding_args.max_tokens * 0.8) 121 | logging.warning( 122 | f"Reducing target length to {batch_decoding_args.max_tokens}, Retrying...") 123 | else: 124 | logging.warning("Hit request rate limit; retrying...") 125 | time.sleep(sleep_time) # Annoying rate limit on requests. 126 | 127 | if return_text: 128 | completions = [completion.text for completion in completions] 129 | if decoding_args.n > 1: 130 | # make completions a nested list, where each entry is a consecutive decoding_args.n of original entries. 131 | completions = [completions[i: i + decoding_args.n] 132 | for i in range(0, len(completions), decoding_args.n)] 133 | if is_single_prompt: 134 | # Return non-tuple if only 1 input and 1 generation. 
135 | (completions,) = completions 136 | return completions 137 | 138 | 139 | def _make_w_io_base(f, mode: str): 140 | if not isinstance(f, io.IOBase): 141 | f_dirname = os.path.dirname(f) 142 | if f_dirname != "": 143 | os.makedirs(f_dirname, exist_ok=True) 144 | f = open(f, mode=mode) 145 | return f 146 | 147 | 148 | def _make_r_io_base(f, mode: str): 149 | if not isinstance(f, io.IOBase): 150 | f = open(f, mode=mode) 151 | return f 152 | 153 | 154 | def jdump(obj, f, mode="w", indent=4, default=str): 155 | """Dump a str or dictionary to a file in json format. 156 | 157 | Args: 158 | obj: An object to be written. 159 | f: A string path to the location on disk. 160 | mode: Mode for opening the file. 161 | indent: Indent for storing json dictionaries. 162 | default: A function to handle non-serializable entries; defaults to `str`. 163 | """ 164 | f = _make_w_io_base(f, mode) 165 | if isinstance(obj, (dict, list)): 166 | json.dump(obj, f, indent=indent, default=default) 167 | elif isinstance(obj, str): 168 | f.write(obj) 169 | else: 170 | raise ValueError(f"Unexpected type: {type(obj)}") 171 | f.close() 172 | 173 | 174 | def jload(f, mode="r"): 175 | """Load a .json file into a dictionary.""" 176 | f = _make_r_io_base(f, mode) 177 | jdict = json.load(f) 178 | f.close() 179 | return jdict 180 | -------------------------------------------------------------------------------- /tigerscore/get_error_types/get_error_types.py: -------------------------------------------------------------------------------- 1 | # Example usage 2 | """ 3 | This file isn't used in final version. 4 | """ 5 | import os 6 | import sys 7 | import fire 8 | import json 9 | from pathlib import Path 10 | os.environ["OPENAI_API_KEY"] = "" 11 | os.environ["OPENAI_API_BASE"] = "" 12 | os.environ["OPENAI_API_TYPE"] = "azure" 13 | os.environ["OPENAI_API_VERSION"] = "2023-03-15-preview" 14 | sys.path.append("../") 15 | from xgptscore.openai_utils import openai_completions, _chatml_to_prompt 16 | from xgptscore.constants import EVAL_ASPECTS 17 | from string import Template 18 | 19 | TEMPLATE = """\ 20 | You are evaluating an ${task} task. Some errors in an incorrect output could be attributed to the following aspects: 21 | ${aspects_descriptions} 22 | 23 | Please elaborate 10 specific error types for each aspect above. Each error type should represent a specific error that falls under the aspect. 
Error types should be mutually exclusive and collectively exhaustive.\
24 | """
25 | 
26 | 
27 | def main(
28 |     task: str,
29 | ):
30 | 
31 |     task_aspects = EVAL_ASPECTS[task]
32 |     prompt = Template(TEMPLATE).substitute(
33 |         task=task,
34 |         aspects_descriptions="\n".join([f"- {aspect}: {description}" for aspect, description in task_aspects.items()])
35 |     )
36 |     prompts = [prompt]
37 |     chatmls = [[{"role": "system",
38 |                  "content": " You are an AI assistant that helps people find information."},
39 |                 {"role": "user",
40 |                  "content": prompt}] for prompt in prompts[:1]]
41 | 
42 |     chatml_prompts = [_chatml_to_prompt(chatml) for chatml in chatmls]
43 |     results = openai_completions(chatml_prompts, model_name="gpt-4")
44 |     output_file = Path("./error_types/" + task + ".txt")
45 |     output_file.parent.mkdir(parents=True, exist_ok=True)
46 |     results['prompts'] = prompts
47 |     with open(output_file, "w") as f:
48 |         json.dump(results, f, indent=4, ensure_ascii=False)
49 | 
50 | 
51 | if __name__ == "__main__":
52 |     fire.Fire(main)
--------------------------------------------------------------------------------
/tigerscore/scorer/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/TIGER-AI-Lab/TIGERScore/5133420066ee65a875d5b64cfd20ab35cfce0112/tigerscore/scorer/__init__.py
--------------------------------------------------------------------------------
/tigerscore/xgptscore/README.md:
--------------------------------------------------------------------------------
1 | ## XGPTScore Overview
2 | This folder contains all the templates we use to query ChatGPT or GPT-4 for the errors they identify in a hypothesis output, for every task that TIGERScore covers. We call this query method XGPTScore, short for an e**X**plainable **Scoring** method that works by querying **GPT** models.
3 | 
4 | The overall pipeline of XGPTScore is:
5 | 
6 | 1. We define a query template that asks the GPT models to identify errors in the hypothesis output based on the task instruction, the source text, and the reference text.
7 | 2. We manually construct the evaluation aspects to focus on for each task, as listed in [./constants.py](./constants.py).
8 | 3. Then, by applying the template and specifying the aspects to focus on, the GPT models are asked to return the identified errors in a predefined format (e.g., JSON).
9 | 
10 | GPT models sometimes produce noticeably lower-quality evaluations when they are forced to answer in a rigid format. To reduce the effect of the required format on response quality, we run a two-round evaluation. In the first round we focus on the evaluation itself and let the GPT models produce a free-form assessment of the hypothesis output. In the second round we ask them to reformat that free-form response into the required structure and fill in the elaborated fields, which is a much easier task for them.
11 | 
12 | ## Quick start
13 | 
14 | We provide a single function, `xgptscore()`, as the interface. It takes the `xgptitems` to evaluate, the template mode, and the OpenAI model name, and runs the queries.
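The per-round completions can also be collapsed into a single numeric score. The sketch below is only an illustration: it assumes you have already run the example under *Example Usage* below (so each candidate carries a `responses` list) and that the final round of the chosen mode is post-processed into the JSON error format consumed by `get_xgptscore_from_json` in [./process_utils.py](./process_utils.py); the import path may need to be adjusted to your setup.

```python
from xgptscore.process_utils import get_xgptscore_from_json  # adjust the import to your setup

for item in items:
    for cand in item['candidates']:
        final_round = cand['responses'][-1]                        # last-round (formatted) completion
        cand['xgptscore'] = get_xgptscore_from_json(final_round)   # None if the JSON could not be parsed
```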
15 | 
16 | Example Usage:
17 | ```python
18 | task = "translation"
19 | with open("example.json", "r") as f:
20 |     items = json.load(f)
21 | xgptitems = []
22 | for item in items:
23 |     for cand in item['candidates']:
24 |         xgptitems.append(XPGTItem(
25 |             task=task,
26 |             instruction=item['instruction'],
27 |             input=item['input'],
28 |             ref_output=item['output'],
29 |             hypo_output=cand['text']
30 |         ))
31 | result = xgptscore(xgptitems, "ea", "ChatGPT")
32 | idx = 0
33 | for item in items:
34 |     for cand in item['candidates']:
35 |         cand['responses'] = result['round_completions'][idx]
36 |         cand['messages_records'] = result['messages_records'][idx]
37 |         idx += 1
38 | json.dump(items, open("example_result.json", "w"), indent=4, ensure_ascii=False)
39 | ```
40 | 
41 | Please check out the input file `example.json` and the result file `example_result.json` to better understand how it actually works.
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/align_score.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "align_score",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 60,
8 |         "request_timeout": 60
9 |     },
10 |     "max_lengths": {
11 |         "inst": null,
12 |         "input": 600,
13 |         "hypo_output": 400,
14 |         "ref_output": 400
15 |     }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/default.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "default",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 60
8 |     },
9 |     "max_lengths": {
10 |         "inst": null,
11 |         "input": 512,
12 |         "hypo_output": 400,
13 |         "ref_output": 400
14 |     }
15 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/kb_txt.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "default",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 120,
8 |         "request_timeout": 120
9 |     },
10 |     "max_lengths": {
11 |         "inst": null,
12 |         "input": 512,
13 |         "hypo_output": 400,
14 |         "ref_output": 400
15 |     }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/mode_configs/wmt_mqm.json:
--------------------------------------------------------------------------------
1 | {
2 |     "mode": "wmt_mqm",
3 |     "decoding": {
4 |         "max_tokens": 3600,
5 |         "temperature": 0.0,
6 |         "top_p": 1.0,
7 |         "timeout": 60,
8 |         "request_timeout": 60
9 |     },
10 |     "max_lengths": {
11 |         "inst": null,
12 |         "input": 400,
13 |         "hypo_output": 400,
14 |         "ref_output": 400
15 |     }
16 | }
--------------------------------------------------------------------------------
/tigerscore/xgptscore/openai_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 | if os.environ.get('OPENAI_API_TYPE', None) == 'azure':
3 |     # pip install openai<=0.28.1, fire, numpy, tiktoken
4 |     from .openai_utils_azure import (
5 |         openai_completions,
6 |         _prompt_to_chatml,
7 |         _chatml_to_prompt,
8 |     )
9 |     import openai
10 |     assert openai.VERSION <= "0.28.1", "Azure API is only supported in openai-python 0.28.1 or earlier."
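# Both non-Azure backends below require the openai-python 1.0.0+ client.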
11 | elif os.environ.get('OPENAI_UTILS_TYPE', None) == 'curl': 12 | # pip install openai>=1.0.0, fire, numpy, tiktoken 13 | from .openai_utils_curl import ( 14 | openai_completions, 15 | _prompt_to_chatml, 16 | _chatml_to_prompt, 17 | ) 18 | import openai 19 | assert openai.VERSION >= "1.0.0", "OpenAI API is only supported in openai-python 1.0.0 or later." 20 | else: 21 | # pip install openai>=1.0.0, fire, numpy, tiktoken 22 | from .openai_utils_openAI import ( 23 | openai_completions, 24 | _prompt_to_chatml, 25 | _chatml_to_prompt, 26 | ) 27 | import openai 28 | assert openai.VERSION >= "1.0.0", "OpenAI API is only supported in openai-python 1.0.0 or later." 29 | -------------------------------------------------------------------------------- /tigerscore/xgptscore/process_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import json5 4 | import logging 5 | from dataclasses import dataclass 6 | from transformers import AutoTokenizer 7 | from tqdm import tqdm 8 | from typing import List, Union 9 | from itertools import chain 10 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 11 | 12 | 13 | @dataclass 14 | class XPGTItem(): 15 | task: str 16 | instruction: str 17 | input: str 18 | ref_output: Union[str, List[str]] 19 | hypo_output: str 20 | 21 | # Message map functions 22 | 23 | 24 | def default_msg_map(cur_message: dict, messages: List[dict]): 25 | """ Map the text and old messages to the new messages for query 26 | Args: 27 | text (str): the prompt text 28 | messages (List[dict]): the messages list before this query 29 | Returns: 30 | prompt (str): the prompt text 31 | """ 32 | new_messages = messages + [{ 33 | "role": cur_message['role'], 34 | "content": cur_message['content']} 35 | ] 36 | return new_messages 37 | 38 | # Postprocess functions 39 | 40 | 41 | def default_postprocess(content: str): 42 | return content 43 | 44 | 45 | def json_postprocess(content: str): 46 | try: 47 | # find the json content 48 | json_content = content[content.find("{"):content.rfind("}") + 1] 49 | json_content = json.loads(json_content) 50 | return json_content 51 | except json.decoder.JSONDecodeError: 52 | try: 53 | json_content = json5.loads(json_content) 54 | return json_content 55 | except Exception: 56 | return content 57 | 58 | 59 | tokenizer = None 60 | 61 | 62 | def truncate_texts(texts: Union[List[str], List[List[str]]], max_length: int = None): 63 | """ 64 | Truncate the texts to the max length. 65 | Args: 66 | texts (List[str] or List[List[str]]): The list of texts. 67 | max_length (int): The max length. 68 | Returns: 69 | List[str]: The truncated texts. 70 | """ 71 | if max_length is None: 72 | return texts 73 | if isinstance(texts[0], list) and \ 74 | ( 75 | all([len(x) == 0 for x in texts]) or 76 | all([x is None for x in list(chain(*texts))]) 77 | ) or isinstance(texts[0], str) and \ 78 | all([x is None for x in list(chain(texts))]): 79 | logging.warning("All texts are None, skip truncating") 80 | return texts 81 | # using llama tokenizer by default 82 | global tokenizer 83 | disable_tqdm = len(texts) < 1000 84 | logging.warning(f"Truncating texts to max length {max_length}") 85 | if tokenizer is None: 86 | tokenizer = AutoTokenizer.from_pretrained( 87 | "meta-llama/Llama-2-7b-hf", use_auth_token=True) 88 | # ... 
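    # Tokenize each text (or list of texts), cut it to max_length tokens, and decode
    # back to a string, appending " ..." to anything that was truncated.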
89 | token_ids = [] 90 | for text in tqdm(texts, desc="Truncating texts (tokenizing)", disable=disable_tqdm): 91 | if isinstance(text, list): 92 | token_ids.append( 93 | [tokenizer.encode(x, add_special_tokens=False) for x in text]) 94 | else: 95 | token_ids.append(tokenizer.encode(text, add_special_tokens=False)) 96 | # ... 97 | truncated_texts = [] 98 | for i, _token_ids in tqdm(enumerate(token_ids), desc="Truncating texts (truncating)", disable=disable_tqdm): 99 | if (len(_token_ids)) and isinstance(_token_ids[0], list): 100 | truncated_texts.append([]) 101 | for _token_id in _token_ids: 102 | if len(_token_id) > max_length: 103 | truncated_text = tokenizer.decode( 104 | _token_id[:max_length], skip_special_tokens=True) 105 | truncated_text = truncated_text + " ..." 106 | else: 107 | truncated_text = tokenizer.decode( 108 | _token_id, skip_special_tokens=True) 109 | truncated_texts[i].append(truncated_text) 110 | else: 111 | if len(_token_ids) > max_length: 112 | truncated_text = tokenizer.decode( 113 | _token_ids[:max_length], skip_special_tokens=True) 114 | truncated_text = truncated_text + " ..." 115 | else: 116 | truncated_text = tokenizer.decode( 117 | _token_ids, skip_special_tokens=True) 118 | 119 | truncated_texts.append(truncated_text) 120 | return truncated_texts 121 | 122 | 123 | def truncate_items(items: List[XPGTItem], max_lengths): 124 | """ 125 | Truncate the texts in the items to the max length. 126 | Args: 127 | items (List[XPGTItem]): The list of items. 128 | max_length (int): The max length. 129 | Returns: 130 | List[XPGTItem]: The truncated items. 131 | """ 132 | truncated_inputs = truncate_texts( 133 | [item.input for item in items], max_lengths.get("input", None)) 134 | truncated_insts = truncate_texts( 135 | [item.instruction for item in items], max_lengths.get("instruction", None)) 136 | truncated_ref_outputs = truncate_texts( 137 | [item.ref_output for item in items], max_lengths.get("ref_output", None)) 138 | truncated_hypo_outputs = truncate_texts( 139 | [item.hypo_output for item in items], max_lengths.get("hypo_output", None)) 140 | for i, item in enumerate(items): 141 | item.instruction = truncated_insts[i] 142 | item.input = truncated_inputs[i] 143 | item.ref_output = truncated_ref_outputs[i] 144 | item.hypo_output = truncated_hypo_outputs[i] 145 | return items 146 | 147 | 148 | def get_query_messages(messages: List[dict], queried_messages: List[dict]): 149 | """ 150 | Args: 151 | messages (List[dict]): the messages list to add for query 152 | queried_messages (List[dict]): the messages list already queried, which contains the query responses also, 153 | Returns: 154 | new_messages (List[dict]): the new messages list to query 155 | postprocess (function): the postprocess function for the query response 156 | """ 157 | if len(queried_messages) == 0: 158 | last_prompt_idx = -1 159 | else: 160 | assert len( 161 | queried_messages) >= 2, "queried_messages should have at least 2 messages, i.e., the user (system) and the response" 162 | last_prompt = queried_messages[-2]['content'] 163 | prompt_texts = [x['content'] for x in messages] 164 | last_prompt_idx = prompt_texts.index(last_prompt) 165 | if last_prompt_idx == len(messages) - 1: 166 | return None 167 | new_messages = queried_messages.copy() 168 | for idx in range(last_prompt_idx + 1, len(messages)): 169 | new_messages = messages[idx]["map_func"](messages[idx], new_messages) 170 | if messages[idx]["do_query"]: 171 | break 172 | return new_messages, messages[idx]["postprocess"] 173 | 174 | 175 | def 
get_xgptscore_from_json(json_content: dict): 176 | """ 177 | Args: 178 | json_content (dict): the json content 179 | Returns: 180 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors 181 | """ 182 | if isinstance(json_content, str): 183 | return None 184 | try: 185 | xgptscore = 0 186 | for error in json_content['errors'].values(): 187 | if error['score_reduction'] == "N/A": 188 | continue 189 | xgptscore -= error['score_reduction'] 190 | return xgptscore 191 | except Exception: 192 | return None 193 | 194 | 195 | def get_xgptscore_from_json_star(json_content: dict): 196 | """ 197 | Args: 198 | json_content (dict): the json content 199 | Returns: 200 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors 201 | """ 202 | xgptscore = 0 203 | res = {} 204 | for aspect_key, aspect in json_content.items(): 205 | if isinstance(aspect, dict): 206 | score = aspect['Score'] 207 | try: 208 | score = float(score) 209 | except Exception: 210 | score = 0 211 | xgptscore += score 212 | res["xgptscore_" + aspect_key] = score 213 | res["xgptscore"] = xgptscore 214 | return res 215 | 216 | 217 | def get_xgptscore_from_json_per_aspect(json_content: dict): 218 | """ 219 | Args: 220 | json_content (dict): the json content 221 | Returns: 222 | xgptscore (float): the xgptscore, i.e. the sum of the reduction scores for all the errors 223 | """ 224 | if not isinstance(json_content, dict): 225 | return None 226 | xgptscore = 0 227 | res = {} 228 | for error in json_content['errors'].values(): 229 | if error['error_aspect'] is not None: 230 | if ("xgptscore_" + error['error_aspect'] not in res): 231 | res["xgptscore_" + error['error_aspect']] = 0 232 | res["xgptscore_" + error['error_aspect']] -= error['score_reduction'] 233 | xgptscore -= error['score_reduction'] 234 | res["xgptscore"] = xgptscore 235 | return res 236 | -------------------------------------------------------------------------------- /tigerscore/xgptscore/xgptscore.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import logging 4 | from .process import MODE_PROCESS_MAP 5 | from .process_utils import XPGTItem, truncate_items, get_query_messages 6 | from .openai_utils import openai_completions, _chatml_to_prompt 7 | from typing import List, Union 8 | from dacite import from_dict 9 | from pathlib import Path 10 | from functools import partial 11 | 12 | 13 | def xgptscore( 14 | items: List[Union[XPGTItem, dict]], 15 | mode: str, 16 | model_name: str, 17 | num_workers: int = None, 18 | batch_size: int = None, 19 | **kwargs, 20 | ): 21 | config_path = os.path.join(os.path.dirname( 22 | __file__), f"mode_configs/{mode}.json") 23 | config_path = Path(config_path) 24 | if not config_path.exists(): 25 | logging.warning( 26 | f"Config file {config_path} does not exist. 
Use default config.") 27 | config_path = config_path.with_name("default.json") 28 | 29 | with open(config_path, "r") as f: 30 | config = json.load(f) 31 | config.update(kwargs) 32 | if "max_lengths" in config: 33 | items = truncate_items(items, config["max_lengths"]) 34 | 35 | if isinstance(items[0], dict): 36 | items = [from_dict(data_class=XPGTItem, data=item) for item in items] 37 | process_func = MODE_PROCESS_MAP[mode] 38 | if "process_kwargs" in config: 39 | process_func = partial(process_func, **config["process_kwargs"]) 40 | process_results = list(map(process_func, items)) 41 | 42 | total_round = len([x for x in process_results[0] if x['do_query']]) 43 | logging.warning(f"Total chat rounds: {total_round}") 44 | logging.warning(f"Total chat messages: {len(items)}") 45 | # query and process 46 | round = 0 47 | queried_messages = [[] for _ in range(len(items))] 48 | total_price = 0 49 | total_time = 0 50 | round_completions = [] 51 | while True: 52 | round += 1 53 | logging.warning(f"Processing chat round {round}/{total_round}") 54 | query_messages = list( 55 | map(get_query_messages, process_results, queried_messages)) 56 | query_messages, postprocess_funcs = list(zip(*query_messages)) 57 | chatml_prompts = list(map(_chatml_to_prompt, query_messages)) 58 | openai_results = openai_completions( 59 | chatml_prompts, 60 | model_name=model_name, 61 | num_procs=num_workers, 62 | batch_size=batch_size, 63 | **config['decoding'], 64 | ) 65 | completions = openai_results['completions'] 66 | total_price += sum(openai_results['price_per_example']) 67 | total_time += sum(openai_results['time_per_example']) 68 | logging.warning(f"Round {round} price: {total_price}$") 69 | logging.warning(f"Round {round} time: {total_time}") 70 | postprocess_completions = [postprocess_funcs[idx]( 71 | completion) for idx, completion in enumerate(completions)] 72 | round_completions.append(postprocess_completions) 73 | for idx, completion in enumerate(completions): 74 | queried_messages[idx] = query_messages[idx] + \ 75 | [{"role": "assistant", "content": completion} 76 | ] # add the assistant response 77 | if round == total_round: 78 | _query_messages = list( 79 | map(get_query_messages, process_results, queried_messages)) 80 | assert all([x is None for x in _query_messages] 81 | ), "All messages should be queried" 82 | break 83 | logging.warning(f"Total price: {total_price}$") 84 | logging.warning(f"Total time: {total_time}") 85 | logging.warning(f"Total time per example: {total_time / len(items)}") 86 | round_completions = list(zip(*round_completions)) 87 | return dict( 88 | round_completions=round_completions, 89 | messages_records=queried_messages, 90 | ) 91 | 92 | 93 | """ 94 | Example Usage: 95 | task = "translation" 96 | with open("example.json", "r") as f: 97 | items = json.load(f) 98 | xgptitems = [] 99 | for item in items: 100 | for cand in item['candidates']: 101 | xgptitems.append(XPGTItem( 102 | task=task, 103 | instruction=item['instruction'], 104 | input=item['input'], 105 | ref_output=item['output'], 106 | hypo_output=cand['text'] 107 | )) 108 | result = xgptscore(xgptitems, "ea", "ChatGPT") 109 | idx = 0 110 | for item in items: 111 | for cand in item['candidates']: 112 | cand['responses'] = result['round_completions'][idx] 113 | cand['messages_records'] = result['messages_records'][idx] 114 | json.dump(items, open("example_result.json", "w"), indent=4, ensure_ascii=False) 115 | """ 116 | --------------------------------------------------------------------------------