├── LICENSE ├── README.md ├── data └── data_final.json ├── eval.sh ├── evaluation ├── aggregation.py ├── con_rule.py ├── llm_based_evaluation.py ├── llm_based_extraction.py ├── prompts │ ├── RAL_evaluator.py │ └── RAL_extractor.py └── rule_based_evaluation.py ├── llm_generations ├── baichuan2_13b.jsonl ├── chatglm3_6b.jsonl ├── claude3_opus.jsonl ├── erniebot_4.jsonl ├── glm4.jsonl ├── gpt3.5_turbo_1106.jsonl ├── gpt4_1106.jsonl ├── internlm2_20b.jsonl ├── internlm2_7b.jsonl ├── llama3_70b.jsonl ├── llama3_8b.jsonl ├── mistral_7b.jsonl ├── qwen1.5_14b.jsonl ├── qwen1.5_72b.jsonl └── qwen1.5_7b.jsonl └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Papers with code 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking Complex Instruction-Following with Multiple Constraints Composition (NeurIPS 2024) 2 | 3 | This repository is the official implementation of Benchmarking Complex Instruction-Following with Multiple Constraints Composition. [Our Paper](https://arxiv.org/abs/2407.03978) 4 | 5 | ## 🔥 Update 6 | 7 | [2025.01.29] We have released the English translation version of ComplexBench in `data/data_final.json`. 8 | 9 | ## ⚙️ Requirements 10 | 11 | To install requirements: 12 | 13 | ```setup 14 | pip install -r requirements.txt 15 | ``` 16 | 17 | ## 📦 Data 18 | 19 | All data of ComplexBench is placed in `data/data_final.json` . The entire dataset is organized in the format of a list, where each element of the list is an instance of the dataset. The format of an instance is as follows. 20 | 21 | - `main_id` (integer): A unique identifier for the instance. 22 | - `group` (string): The task group identifier for The Coherent Test for Selection (Section 5.2.3). Applicable only in instructions that contain Selection. 23 | - `idx_in_group` (integer): Number of the instruction in the task group. Applicable only in instructions that contain Selection. 24 | - `instruction` (string): The actual instruction. 25 | - `instruction_en` (string): The actual instruction (English translation). 26 | - `task_types` (string): The task type of the instruction. 27 | - `constraint_dimensions` (list): All constraint dimensions within the instruction. 28 | - `composition_types` (list): All composition types of constraints within the instruction. 29 | - `category` (list): The category of the instruction based on composition types within it. 30 | - `scoring_questions` (list): All scoring questions of the instruction, which are designed to verify each constraint dimensions and composition types. 
Each element of this list contains: 31 | - `point_id` (integer): Number of the scoring question. 32 | - `question` (string): The actual scoring question. 33 | - `question_en` (string): The actual scoring question (English translation). 34 | - `rule` (string): Verification rule for the score scoring. `null` indicates that the score question cannot be verified by rules. 35 | - `constraint_dimensions` (string): All constraint dimensions that the scoring question verifies. 36 | - `composition_types` (string): All composition types that the scoring question verifies. 37 | - `dep` (list): The numbers of all score questions on which this score question depends. 38 | 39 | Here is an example of ComplexBench. 40 | 41 | ```json 42 | { 43 | "main_id": 899, 44 | "group": "complex_instruction_eval_1285", 45 | "idx_in_group": 1, 46 | "instruction": "依次判断以下两个案例中的国家是否有特别提款权。如果有,请写出一篇为该国申请提款的文章,字数不少于300字,且分点明确。如果没有则解释原因,字数不超过100字。\n\n案例1:\n\n国家A是一个发展中国家,是国际货币基金组织(IMF)的成员国,正在经历一场自然灾害,导致该国经济陷入危机,失去了国际支付能力。\n\n案例2:\n\n国家B是一个富裕国家,面临着国内通货膨胀问题,B是IMF的成员国,拥有充足的外汇储备。", 47 | "instruction_en": "Sequentially determine whether the countries in the following two cases have Special Drawing Rights (SDRs). If they do, please write an article of no less than 300 characters applying for withdrawals for that country, output in bullet points format. If not, explain the reason in no more than 100 characters.\n \nCase 1:\n \nCountry A is a developing country and a member of the International Monetary Fund (IMF). It is currently experiencing a natural disaster, leading to an economic crisis and a loss of international payment capability.\n \nCase 2:\n \nCountry B is a wealthy country facing domestic inflation issues. 
B is a member of the IMF and has ample foreign exchange reserves.", 48 | "task_types": "Professional Writing", 49 | "constraint_dimensions": [ 50 | "Length", 51 | "Helpfulness", 52 | "Bullets Format", 53 | "Factuality" 54 | ], 55 | "composition_types": [ 56 | "And", 57 | "Selection" 58 | ], 59 | "category": "Selection_2", 60 | "scoring_questions": [ 61 | { 62 | "point_id": 0, 63 | "question": "模型是否正确判断国家A有特别提款权?", 64 | "question_en": "Does the model correctly determine that Country A possesses Special Drawing Rights?", 65 | "rule": null, 66 | "constraint_dimensions": [ 67 | "Factuality" 68 | ], 69 | "composition_types": [ 70 | "Selection" 71 | ], 72 | "dep": [] 73 | }, 74 | { 75 | "point_id": 1, 76 | "question": "模型是否正确判断国家B没有特别提款权?", 77 | "question_en": "Does the model correctly determine that Country B does not have Special Drawing Rights?", 78 | "rule": null, 79 | "constraint_dimensions": [ 80 | "Factuality" 81 | ], 82 | "composition_types": [ 83 | "Selection" 84 | ], 85 | "dep": [] 86 | }, 87 | { 88 | "point_id": 2, 89 | "question": "模型是否根据国家A有特别提款权生成申请提款的文章?", 90 | "question_en": "Does the model generate an article on the application for withdrawal based on the Special Drawing Rights of Country A?", 91 | "rule": null, 92 | "constraint_dimensions": [ 93 | "Helpfulness" 94 | ], 95 | "composition_types": [], 96 | "dep": [ 97 | 0 98 | ] 99 | }, 100 | { 101 | "point_id": 3, 102 | "question": "模型生成的申请提款文章是否逻辑合理,符合事实?", 103 | "question_en": "Is the withdrawal application article generated by the model logically sound and factually accurate?", 104 | "rule": null, 105 | "constraint_dimensions": [ 106 | "Factuality" 107 | ], 108 | "composition_types": [], 109 | "dep": [ 110 | 0 111 | ] 112 | }, 113 | { 114 | "point_id": 4, 115 | "question": "模型生成的申请提款文章是否在300字以上?", 116 | "question_en": "Is the withdrawal application article generated by the model over 300 charcters?", 117 | "rule": "model_length:[300,10000]", 118 | "constraint_dimensions": [ 119 | "Length" 120 | ], 121 
| "composition_types": [], 122 | "dep": [ 123 | 0 124 | ] 125 | }, 126 | { 127 | "point_id": 5, 128 | "question": "模型生成的申请提款文章是否分点明确?", 129 | "question_en": "Is the withdrawal application article generated by the model output in bullet points format?", 130 | "rule": null, 131 | "constraint_dimensions": [ 132 | "Bullets Format" 133 | ], 134 | "composition_types": [], 135 | "dep": [ 136 | 0 137 | ] 138 | }, 139 | { 140 | "point_id": 6, 141 | "question": "模型是否生成国家B没有特别提款权的解释?", 142 | "question_en": "Does the model generate an explanation for why Country B does not have Special Drawing Rights?", 143 | "rule": null, 144 | "constraint_dimensions": [ 145 | "Helpfulness" 146 | ], 147 | "composition_types": [], 148 | "dep": [ 149 | 1 150 | ] 151 | }, 152 | { 153 | "point_id": 7, 154 | "question": "模型生成的解释是否逻辑合理,符合事实?", 155 | "question_en": "Are the explanations generated by the model logically sound and factually accurate?", 156 | "rule": null, 157 | "constraint_dimensions": [ 158 | "Factuality" 159 | ], 160 | "composition_types": [], 161 | "dep": [ 162 | 1 163 | ] 164 | }, 165 | { 166 | "point_id": 8, 167 | "question": "模型生成的解释是否不超过100字?", 168 | "question_en": "Does the explanation generated by the model not exceed 100 characters?", 169 | "rule": "model_length:[1,100]", 170 | "constraint_dimensions": [ 171 | "Length" 172 | ], 173 | "composition_types": [], 174 | "dep": [ 175 | 1 176 | ] 177 | } 178 | ] 179 | } 180 | ``` 181 | 182 | ## 🚀 Evaluation 183 | 184 | ### Step1: Generating the responses 185 | 186 | First, you need to deploy your target LLM and generate responses of it (This part is not included in this repository). We have placed the generated responses of 15 LLMs in our paper in `llm_generations`. You can refer the format of one of these files and replace `genereated` field with your target LLM responses. 187 | 188 | We suggest using greedy decoding to avoid the randomness of decoding. 
189 | 190 | ### Step2: Evaluation 191 | 192 | Then, you can evaluate any desired model via script `eval.sh`: 193 | 194 | ```bash 195 | bash eval.sh "" "" "" "" "" "" "" 196 | ``` 197 | 198 | Here is an example: 199 | 200 | ```bash 201 | bash eval.sh "data/data_final.json" "llm_generations/glm4.jsonl" "evaluation_results" "" "" "glm4" "zh" 202 | ``` 203 | 204 | In this example, you can find the results of each scored question in `evaluation_results/glm4_final_results.json`. The statistical data of the final results will be saved in `evaluation_results/glm4_statistics.json`. 205 | 206 | ## 👏 Citation 207 | 208 | ``` 209 | @article{wen2024benchmarking, 210 | title={Benchmarking Complex Instruction-Following with Multiple Constraints Composition}, 211 | author={Wen, Bosi and Ke, Pei and Gu, Xiaotao and Wu, Lindong and Huang, Hao and Zhou, Jinfeng and Li, Wenchuang and Hu, Binxin and Gao, Wendy and Xu, Jiaxin and others}, 212 | journal={arXiv preprint arXiv:2407.03978}, 213 | year={2024} 214 | } 215 | ``` 216 | 217 | Please kindly cite our paper if this paper and the codes are helpful. 
218 | -------------------------------------------------------------------------------- /eval.sh: -------------------------------------------------------------------------------- 1 | data_path=$1 2 | llm_output_path=$2 3 | output_dir=$3 4 | api_key=$4 5 | api_base=$5 6 | model_name=$6 7 | language=$7 8 | 9 | 10 | python3 evaluation/llm_based_extraction.py \ 11 | --data_path $data_path \ 12 | --llm_output_path $llm_output_path \ 13 | --output_path "${output_dir}/${model_name}_llm_extraction_results.jsonl" \ 14 | --api_key $api_key \ 15 | --api_base $api_base \ 16 | --language $language 17 | 18 | 19 | python3 evaluation/llm_based_evaluation.py \ 20 | --data_path $data_path \ 21 | --llm_output_path $llm_output_path \ 22 | --output_path "${output_dir}/${model_name}_llm_evaluation_results.jsonl" \ 23 | --api_key $api_key \ 24 | --api_base $api_base \ 25 | --language $language 26 | 27 | 28 | python3 evaluation/rule_based_evaluation.py \ 29 | --data_path $data_path \ 30 | --extraction_path "${output_dir}/${model_name}_llm_extraction_results.jsonl" \ 31 | --output_path "${output_dir}/${model_name}_rule_evaluation_results.jsonl" 32 | 33 | 34 | python3 evaluation/aggregation.py \ 35 | --data_path $data_path \ 36 | --llm_evaluation_path "${output_dir}/${model_name}_llm_evaluation_results.jsonl" \ 37 | --rule_evaluation_path "${output_dir}/${model_name}_rule_evaluation_results.jsonl" \ 38 | --model $model_name \ 39 | --output_path $output_dir -------------------------------------------------------------------------------- /evaluation/aggregation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import os 4 | from copy import deepcopy 5 | 6 | def load_jsonl(file_path): 7 | _data = [] 8 | with open(file_path, 'r') as f: 9 | for data in f: 10 | jline = json.loads(data) 11 | _data.append(jline) 12 | return _data 13 | 14 | 15 | def get_rely_judgment(entry): 16 | point_judges_rely = 
deepcopy(entry['point_judges']) 17 | for i, q in enumerate(entry['scoring_questions']): 18 | for dep in q['dep']: 19 | if entry['point_judges'][dep] == False: 20 | point_judges_rely[i] = False 21 | entry['point_judges_rely'] = point_judges_rely 22 | return entry 23 | 24 | 25 | def get_selection_depth(entry): 26 | depth = 0 27 | for q in entry['scoring_questions']: 28 | if 'Selection' in q['composition_types']: 29 | depth += 1 30 | 31 | return depth 32 | 33 | 34 | def get_nested_method(entry): 35 | if "_".join(sorted(entry['composition_types'])) == "And_And_Selection": 36 | print(entry) 37 | return "_".join(sorted(entry['composition_types'])) 38 | 39 | 40 | Lexical = ["Word Matching", "Keywords"] 41 | Structure = ["JSON Format", "Markdown Format", "Bullets Format", "Length", "Start with", "End with", "Punctuation", "Template"] 42 | Semantic = ["Language Style", "Personalization", "Topic", "Sentiment"] 43 | Utillity = ["Helpfulness", "Target Language", "Supportiveness", "Consistency", "Factuality"] 44 | 45 | Tasks = ["Fundamental Language Ability", "Advanced Chinese Understanding", "Open-ended Questions", "Practical Writing", "Creative Writing", "Professional Writing", "Custom Writing", "Logical Reasoning", "Task-oriented Role Play", "Professional Knowledge"] 46 | 47 | 48 | def get_type(text): 49 | if text in Lexical: 50 | return "Lexical" 51 | if text in Structure: 52 | return "Structure" 53 | if text in Semantic: 54 | return "Semantic" 55 | if text in Utillity: 56 | return "Utillity" 57 | return "Others" 58 | 59 | 60 | def aggregation(_data): 61 | categories_cnt = {} 62 | categories_acc = {} 63 | 64 | categories_avg_cnt = {} 65 | categories_avg_acc = {} 66 | 67 | constraint_types_and_composition_types_cnt = {} 68 | constraint_types_and_composition_types_acc = {} 69 | 70 | each_cnt = {} 71 | each_acc = {} 72 | 73 | task_types_cnt = {} 74 | task_types_acc = {} 75 | 76 | nested_methods_cnt = {} 77 | nested_methods_acc = {} 78 | 79 | count_true = 0 80 | count_false = 0 
81 | 82 | for entry in _data: 83 | for i, p in enumerate(entry['scoring_questions']): 84 | for constraint in p['constraint_dimensions']: 85 | type = get_type(constraint) 86 | constraint_types_and_composition_types_cnt[type] = constraint_types_and_composition_types_cnt.get(type, 0) + 1 87 | each_cnt[constraint] = each_cnt.get(constraint, 0) + 1 88 | if entry['point_judges_rely'][i]: 89 | constraint_types_and_composition_types_acc[type] = constraint_types_and_composition_types_acc.get(type, 0) + 1 90 | each_acc[constraint] = each_acc.get(constraint, 0) + 1 91 | 92 | for composition in p['composition_types']: 93 | constraint_types_and_composition_types_cnt[composition] = constraint_types_and_composition_types_cnt.get(composition, 0) + 1 94 | each_cnt[composition] = each_cnt.get(composition, 0) + 1 95 | if entry['point_judges_rely'][i]: 96 | constraint_types_and_composition_types_acc[composition] = constraint_types_and_composition_types_acc.get(composition, 0) + 1 97 | each_acc[composition] = each_acc.get(composition, 0) + 1 98 | 99 | category_avg = entry['category'].split("_")[0] 100 | nested_method = get_nested_method(entry) 101 | categories_cnt[entry['category']] = categories_cnt.get(entry['category'], 0) + len(entry['scoring_questions']) 102 | categories_avg_cnt[category_avg] = categories_avg_cnt.get(category_avg, 0) + len(entry['scoring_questions']) 103 | task_types_cnt[entry['task_types']] = task_types_cnt.get(entry['task_types'], 0) + len(entry['scoring_questions']) 104 | nested_methods_cnt[nested_method] = nested_methods_cnt.get(nested_method, 0) + len(entry['scoring_questions']) 105 | 106 | for i in entry['point_judges_rely']: 107 | if i == True: 108 | categories_acc[entry['category']] = categories_acc.get(entry['category'], 0) + 1 109 | categories_avg_acc[category_avg] = categories_avg_acc.get(category_avg, 0) + 1 110 | task_types_acc[entry['task_types']] = task_types_acc.get(entry['task_types'], 0) + 1 111 | nested_methods_acc[nested_method] = 
nested_methods_acc.get(nested_method, 0) + 1 112 | count_true += 1 113 | else: 114 | count_false += 1 115 | 116 | for t in constraint_types_and_composition_types_cnt.keys(): 117 | constraint_types_and_composition_types_acc[t] = constraint_types_and_composition_types_acc.get(t, 0) / constraint_types_and_composition_types_cnt[t] 118 | 119 | for t in categories_cnt.keys(): 120 | categories_acc[t] = categories_acc.get(t, 0) / categories_cnt[t] 121 | 122 | for t in categories_avg_cnt.keys(): 123 | categories_avg_acc[t] = categories_avg_acc.get(t, 0) / categories_avg_cnt[t] 124 | 125 | for t in nested_methods_cnt.keys(): 126 | nested_methods_acc[t] = nested_methods_acc.get(t, 0) / nested_methods_cnt[t] 127 | 128 | categories_acc = {k: categories_acc[k] for k in sorted(categories_acc)} 129 | categories_avg_acc = {k: categories_avg_acc[k] for k in sorted(categories_avg_acc)} 130 | nested_methods_acc = {k: nested_methods_acc[k] for k in sorted(nested_methods_acc)} 131 | 132 | for t in each_cnt.keys(): 133 | each_acc[t] = each_acc.get(t, 0) / each_cnt[t] 134 | 135 | for t in task_types_cnt.keys(): 136 | task_types_acc[t] = task_types_acc.get(t, 0) / task_types_cnt[t] 137 | 138 | # Coherent Test 139 | data_by_id = {} 140 | tree_1 = [] 141 | tree_2 = [] 142 | 143 | for entry in _data: 144 | data_by_id[entry['group']] = data_by_id.get(entry['group'], []) 145 | data_by_id[entry['group']].append(entry) 146 | 147 | cnt = 0 148 | for k in data_by_id.keys(): 149 | depth = get_selection_depth(data_by_id[k][0]) 150 | if depth == 1: 151 | tree_1.append(data_by_id[k]) 152 | elif depth >= 2: 153 | tree_2.append(data_by_id[k]) 154 | 155 | 156 | tree_1_acc = 0 157 | tree_2_acc = 0 158 | 159 | total_1_acc = 0 160 | total_2_acc = 0 161 | 162 | total_1_cnt = 0 163 | total_2_cnt = 0 164 | 165 | for t in tree_1: 166 | flag = True 167 | for sample in t: 168 | total_1_cnt += 1 169 | fg = True 170 | for q in sample['point_judges_rely']: 171 | if q == False: 172 | flag = False 173 | fg = False 174 
| 175 | if fg == True: 176 | total_1_acc += 1 177 | if flag == True: 178 | tree_1_acc += 1 179 | 180 | 181 | for t in tree_2: 182 | flag = True 183 | for sample in t: 184 | total_2_cnt += 1 185 | fg = True 186 | for q in sample['point_judges_rely']: 187 | if q == False: 188 | flag = False 189 | fg = False 190 | 191 | if fg == True: 192 | total_2_acc += 1 193 | if flag == True: 194 | tree_2_acc += 1 195 | 196 | return { 197 | "overall_drfr" : count_true / (count_true + count_false), 198 | "categorized_drfr" : categories_acc, 199 | "categorized_avg_drfr" : categories_avg_acc, 200 | "task_types_drfr" : task_types_acc, 201 | "nested_methods_drfr" : nested_methods_acc, 202 | "constraint_types_and_composition_types_acc" : constraint_types_and_composition_types_acc, 203 | "each_constraint_dimensions_and_composition_types_acc" : each_acc, 204 | "single_origin_test" : total_1_acc / total_1_cnt, 205 | "single_coherent_test" : tree_1_acc / len(tree_1), 206 | "single_relative_drop" : (total_1_acc / total_1_cnt - tree_1_acc / len(tree_1)) / (total_1_acc / total_1_cnt), 207 | "multiple_origin_test" : total_2_acc / total_2_cnt, 208 | "multiple_coherent_test" : tree_2_acc / len(tree_2), 209 | "miltiple_relative_drop" : (total_2_acc / total_2_cnt - tree_2_acc / len(tree_2)) / (total_2_acc / total_2_cnt), 210 | } 211 | 212 | 213 | 214 | if __name__ == "__main__": 215 | parser = argparse.ArgumentParser() 216 | parser.add_argument("--data_path", type=str, default="") 217 | parser.add_argument("--llm_evaluation_path", type=str, default="") 218 | parser.add_argument("--rule_evaluation_path", type=str, default="") 219 | parser.add_argument("--output_path", type=str, default="") 220 | parser.add_argument("--model", type=str, default="") 221 | args = parser.parse_args() 222 | model = args.model 223 | 224 | with open(args.data_path, 'r', encoding='utf-8') as f: 225 | data = json.load(f) 226 | 227 | outputs = {} 228 | point_judges = {} 229 | point_explanation = {} 230 | 231 | llm_evaluations 
= load_jsonl(args.llm_evaluation_path) 232 | for l in llm_evaluations: 233 | outputs[l['main_id']] = l['output'] 234 | point_judges[(l['main_id'], l['point_id'])] = l['point_judge'] 235 | point_explanation[(l['main_id'], l['point_id'])] = l['point_explanation'] 236 | 237 | rule_evaluations = load_jsonl(args.rule_evaluation_path) 238 | for r in rule_evaluations: 239 | if r['point_judge'] == 0: 240 | r['point_judge'] = False 241 | if r['point_judge'] == 1: 242 | r['point_judge'] = True 243 | outputs[l['main_id']] = l['output'] 244 | point_judges[(r['main_id'], r['point_id'])] = r['point_judge'] 245 | point_explanation[(r['main_id'], r['point_id'])] = r['point_explanation'] 246 | 247 | outs = [] 248 | for d in data: 249 | d['output'] = outputs[d['main_id']] 250 | d['point_judges'] = [] 251 | d['point_explanations'] = [] 252 | 253 | for i in range(len(d['scoring_questions'])): 254 | d['point_judges'].append(point_judges[(d['main_id'], i)]) 255 | d['point_explanations'].append(point_explanation[(d['main_id'], i)]) 256 | 257 | d = get_rely_judgment(d) 258 | outs.append(d) 259 | 260 | 261 | with open(os.path.join(args.output_path, f"{model}_final_results.json"), "w", encoding='utf-8') as f: 262 | json.dump(outs, f, ensure_ascii=False, indent=4) 263 | 264 | results = {model : aggregation(outs)} 265 | 266 | with open(os.path.join(args.output_path, f"{model}_statistics.json"), "w", encoding='utf-8') as f: 267 | json.dump(results, f, ensure_ascii=False, indent=4) 268 | 269 | -------------------------------------------------------------------------------- /evaluation/con_rule.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | import string 4 | 5 | keyword_pattern = r"keyword:(\[.*?\])$" 6 | forbidden_word_pattern = r"forbidden_word:(\[.*?\])$" 7 | startswith_pattern = r"startswith:(.*?)$" 8 | endswith_pattern = r"endswith:(.*?)$" 9 | startswith_and_endswith_pattern = r"startswith:(.*?)\nendswith:(.*?)$" 10 | 
not_endswith_pattern = r"not endswith:(.*?)$"
length_pattern = r"length:(\[.*?\])$"
length_word_pattern = r"length_word:(\[.*?\])$"
match_pattern = r"match:(.*?)$"
keys_pattern = r"keys:(\[.*?\])$"
json_pattern = r"Json:(\[.*?\])$"
constraint_punc_pattern = r"constraint_punc:(\[.*?\])$"

# 'model_*' variants apply the same checks to the model's full/extracted
# response; '_each' variants are applied to every item of a list of spans.
model_startswith_pattern = r"model_startswith:(.*?)$"
model_endswith_pattern = r"model_endswith:(.*?)$"
model_length_word_pattern = r"model_length_word:(\[.*?\])$"
model_keyword_num_pattern = r"model_keyword_num:(\[.*?\])$"
model_forbidden_word_each_pattern = r"model_forbidden_word_each:(\[.*?\])$"
model_forbidden_word_pattern = r"model_forbidden_word:(\[.*?\])$"
# Combined rule: two payloads on separate lines of the same rule string.
model_keyword_num_and_forbidden_word_each_pattern = r"model_keyword_num_each:(\[.*?\])\nmodel_forbidden_word_each:(\[.*?\])$"
model_keyword_num_each_pattern = r"model_keyword_num_each:(\[.*?\])$"

# Negated checks.
model_not_startswith_pattern = r"model_not_startswith:(.*?)$"
model_not_endswith_pattern = r"model_not_endswith:(.*?)$"
model_not_endswith_each_pattern = r"model_not_endswith_each:(.*?)$"


model_startswith_and_endswith_pattern = r"model_startswith:(.*?)\nmodel_endswith:(.*?)$"
model_constraint_punc_pattern = r"model_constraint_punc:(\[.*?\])$"
model_constraint_punc_each_pattern = r"model_constraint_punc_each:(\[.*?\])$"
model_startswith_each_pattern = r"model_startswith_each:(.*?)$"
model_endswith_each_pattern = r"model_endswith_each:(.*?)$"
model_starts_with_list_each_pattern = r"model_starts_with_list_each:(\[.*?\])$"
model_keyword_each_pattern = r"model_keyword_each:(\[.*?\])$"
model_keyword_pattern = r"model_keyword:(\[.*?\])$"
model_number_each_pattern = r"model_number_each:(\[.*?\])$"
model_length_each_pattern = r"model_length_each:(\[.*?\])$"
model_length_word_each_pattern = r"model_length_word_each:(\[.*?\])$"
model_length_pattern = r"model_length:(\[.*?\])$"
| 45 | 46 | 47 | def decom(s): 48 | if s == None or s == "": 49 | return "" 50 | while s[0] == "\n": 51 | s = s[1:] 52 | if s[0] in ['"', '“', '`'] and s[-1] in ['"', '”', '`']: 53 | return s[1:-1] 54 | else: 55 | return s 56 | 57 | 58 | def extract_json(s): 59 | stack = 0 60 | json_start = 0 61 | 62 | if '{' not in s: 63 | return "" 64 | 65 | for i, char in enumerate(s): 66 | if char == '{': 67 | if stack == 0: 68 | json_start = i 69 | stack += 1 70 | elif char == '}': 71 | stack -= 1 72 | if stack == 0: 73 | return s[json_start:i+1] 74 | 75 | return "" 76 | 77 | 78 | def extract_and_replace_punctuation_zh(text): 79 | en_punctuation = string.punctuation 80 | cn_punctuation = "!" + "\"" + "#" + '$' + '%' + '&' + '\'' + '('+ ')' + '*' \ 81 | + '+' + ',' + '-' + '。' + '/' + ':' + ';' + '<' + '=' + '>' + '?' + '@' + '[' + '\' \ 82 | + ']' + '^' + '_' + '`' + '{' + '|' + '}' + "~" 83 | en_to_cn_punctuation = str.maketrans(en_punctuation, cn_punctuation) 84 | text = text.translate(en_to_cn_punctuation) 85 | text = text.replace("“", "\"") 86 | text = text.replace("”", "\"") 87 | return text 88 | 89 | 90 | def keyword(s, keywords): 91 | flag = True 92 | for keyword in keywords: 93 | keyword = extract_and_replace_punctuation_zh(keyword) 94 | if keyword not in s: 95 | flag = False 96 | 97 | return flag 98 | 99 | 100 | def forbidden_word(s, forbidden_words): 101 | flag = True 102 | for forbidden_word in forbidden_words: 103 | forbidden_word = extract_and_replace_punctuation_zh(forbidden_word) 104 | if forbidden_word in s: 105 | flag = False 106 | 107 | return flag 108 | 109 | 110 | def keyword_and_forbidden_word(s, keywords, forbidden_words): 111 | return keyword(s, keywords) and forbidden_word(s, forbidden_words) 112 | 113 | 114 | def startswith(s, starts): 115 | starts = extract_and_replace_punctuation_zh(starts) 116 | if s.startswith(starts): 117 | return True 118 | if s.startswith(starts + "。"): 119 | return True 120 | if s.startswith(starts + "?"): 121 | return True 122 
| return False 123 | 124 | 125 | def endswith(s, ends): 126 | ends = extract_and_replace_punctuation_zh(ends) 127 | if s.endswith(ends): 128 | return True 129 | if s.endswith(ends + "。"): 130 | return True 131 | if s.endswith(ends + "?"): 132 | return True 133 | return False 134 | 135 | 136 | def not_endswith(s, ends): 137 | ends = extract_and_replace_punctuation_zh(ends) 138 | if s.endswith(ends): 139 | return False 140 | if s.endswith(ends + "。"): 141 | return False 142 | if s.endswith(ends + "?"): 143 | return False 144 | return True 145 | 146 | 147 | def length(s, l): 148 | if len(s) >= l[0] and len(s) <= l[1]: 149 | return True 150 | return False 151 | 152 | 153 | def length_word(s, l): 154 | s = s.split() 155 | if len(s) >= l[0] and len(s) <= l[1]: 156 | return True 157 | return False 158 | 159 | 160 | def json_test(s, keys): 161 | try: 162 | s = json.loads(s) 163 | except: 164 | return False 165 | if keys == None: 166 | return True 167 | 168 | for key in keys: 169 | if key not in s.keys(): 170 | return False 171 | return True 172 | 173 | 174 | def check_json(s, keys): 175 | s = extract_json(s) 176 | keys = [k for k in keys if k != ""] 177 | try: 178 | s = json.loads(s) 179 | if type(s) == dict: 180 | for k in keys: 181 | if k not in s.keys(): 182 | return False 183 | if len(s.keys()) != len(keys) and len(keys) != 0: 184 | return False 185 | return True 186 | else: 187 | return False 188 | except: 189 | return False 190 | 191 | 192 | def constraint_punc(s, punc): 193 | punc = [extract_and_replace_punctuation_zh(p) for p in punc] 194 | for p in string.punctuation: 195 | p = extract_and_replace_punctuation_zh(p) 196 | if p in s and p not in punc: 197 | return False 198 | return True 199 | 200 | 201 | def decompose_judge(point, rs): 202 | if 'model_keyword_each' in point: 203 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 204 | return False 205 | flag = True 206 | for r in rs: 207 | if r == "": 208 | continue 209 | r = extract_and_replace_punctuation_zh(r) 
210 | k_matches = re.findall(model_keyword_each_pattern, point) 211 | k_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in k_matches] 212 | if keyword(r, k_matches[0]) == False: 213 | flag = False 214 | return flag 215 | elif 'model_keyword_num_each' in point and "model_forbidden_word_each" in point: 216 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 217 | return False 218 | k_matches = re.search(model_keyword_num_and_forbidden_word_each_pattern, point, re.DOTALL).group(1) 219 | f_matches = re.search(model_keyword_num_and_forbidden_word_each_pattern, point, re.DOTALL).group(2) 220 | k_matches = k_matches.strip('[]').split(',') 221 | f_matches = f_matches.strip('[]').split(',') 222 | L = int(k_matches[0]) 223 | R = int(k_matches[1]) 224 | 225 | keys = k_matches[2:] 226 | flag = True 227 | for r in rs: 228 | if r == "": 229 | continue 230 | 231 | cnt_1 = 0 232 | cnt_2 = 0 233 | for k in keys: 234 | cnt_1 += keyword(r, [k]) 235 | for f in f_matches: 236 | cnt_2 += keyword(r, [f]) 237 | 238 | if cnt_1 < L or cnt_1 > R: 239 | flag = False 240 | if cnt_2 > 0: 241 | flag = False 242 | return flag 243 | elif 'model_keyword_num_each' in point: 244 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 245 | return False 246 | k_matches = re.search(model_keyword_num_each_pattern, point, re.DOTALL).group(1) 247 | k_matches = k_matches.strip('[]').split(',') 248 | L = int(k_matches[0]) 249 | R = int(k_matches[1]) 250 | 251 | keys = k_matches[2:] 252 | flag = True 253 | for r in rs: 254 | if r == "": 255 | continue 256 | cnt_1 = 0 257 | for k in keys: 258 | cnt_1 += keyword(r, [k]) 259 | if cnt_1 < L or cnt_1 > R: 260 | flag = False 261 | return flag 262 | elif 'model_keyword_num' in point: 263 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 264 | return False 265 | 266 | k_matches = re.findall(model_keyword_num_pattern, point) 267 | k_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in k_matches] 268 | L = int(k_matches[0][0]) 269 | R = 
int(k_matches[0][1]) 270 | keys = k_matches[0][2:] 271 | 272 | cnt = 0 273 | for k in keys: 274 | cnt += keyword(rs[0], [k]) 275 | 276 | return cnt >= L and cnt <= R 277 | elif 'model_keyword' in point: 278 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 279 | return False 280 | k_matches = re.findall(model_keyword_pattern, point) 281 | k_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in k_matches] 282 | return keyword(rs[0], k_matches[0]) 283 | elif 'model_forbidden_word_each' in point: 284 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 285 | return True 286 | f_matches = re.findall(model_forbidden_word_each_pattern, point) 287 | f_matches = [[x.strip() for x in f.strip('[]').split(',')] for f in f_matches] 288 | flag = True 289 | for r in rs: 290 | if r == "": 291 | continue 292 | r = extract_and_replace_punctuation_zh(r) 293 | if forbidden_word(r, f_matches[0]) == False: 294 | flag = False 295 | 296 | return flag 297 | elif 'model_forbidden_word' in point: 298 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 299 | return True 300 | f_matches = re.findall(model_forbidden_word_pattern, point) 301 | f_matches = [[x.strip() for x in f.strip('[]').split(',')] for f in f_matches] 302 | return forbidden_word(rs[0], f_matches[0]) 303 | elif 'model_number_each' in point: 304 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 305 | return False 306 | n_matches = re.findall(model_number_each_pattern, point) 307 | number = eval(n_matches[0]) 308 | 309 | return len(rs) >= number[0] and len(rs) <= number[1] 310 | elif 'model_length_each' in point: 311 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 312 | return False 313 | l_matches = re.findall(model_length_each_pattern, point) 314 | l = eval(l_matches[0]) 315 | flag = True 316 | for r in rs: 317 | if r == "": 318 | continue 319 | if length(r, l) == False and length(decom(r), l) == False: 320 | flag = False 321 | return flag 322 | elif 'model_length_word_each' in point: 323 | if len(rs) 
== 0 or (len(rs) == 1 and rs[0] == ""): 324 | return False 325 | l_matches = re.findall(model_length_word_each_pattern, point) 326 | l = eval(l_matches[0]) 327 | flag = True 328 | for r in rs: 329 | if r == "": 330 | continue 331 | if length_word(r, l) == False and length_word(decom(r), l) == False: 332 | flag = False 333 | return flag 334 | elif 'model_length_word' in point: 335 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 336 | return False 337 | l_matches = re.findall(model_length_word_pattern, point) 338 | l = eval(l_matches[0]) 339 | return length_word(rs[0], l) or length_word(decom(rs[0]), l) 340 | elif 'model_length:' in point: 341 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 342 | return False 343 | l_matches = re.findall(model_length_pattern, point) 344 | l = eval(l_matches[0]) 345 | return length(decom(rs[0]), l) or length(rs[0], l) 346 | elif 'model_startswith_each' in point: 347 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 348 | return False 349 | flag = True 350 | s_matches = re.findall(model_startswith_each_pattern, point, flags=re.DOTALL) 351 | for r in rs: 352 | if r == "": 353 | continue 354 | r = extract_and_replace_punctuation_zh(r) 355 | if startswith(r, s_matches[0]) == False: 356 | flag = False 357 | return flag 358 | elif 'model_starts_with_list_each' in point: 359 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 360 | return False 361 | s_matches = re.findall(model_starts_with_list_each_pattern, point) 362 | ks = [[x.strip() for x in s.strip('[]').split(',')] for s in s_matches][0] 363 | 364 | if len(rs) != len(ks): 365 | return False 366 | 367 | flag = True 368 | for i, r in enumerate(rs): 369 | if r == "": 370 | continue 371 | r = extract_and_replace_punctuation_zh(r) 372 | if startswith(r, ks[i]) == False: 373 | flag = False 374 | return flag 375 | elif 'model_startswith' in point and "model_endswith" in point: 376 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 377 | return False 378 | rs[0] = 
extract_and_replace_punctuation_zh(rs[0]) 379 | s_matches = re.search(model_startswith_and_endswith_pattern, point, flags=re.DOTALL).group(1) 380 | e_matches = re.search(model_startswith_and_endswith_pattern, point, flags=re.DOTALL).group(2) 381 | return startswith(rs[0], s_matches) and endswith(rs[0], e_matches) 382 | elif 'model_startswith' in point: 383 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 384 | return False 385 | rs[0] = extract_and_replace_punctuation_zh(rs[0]) 386 | s_matches = re.findall(model_startswith_pattern, point, flags=re.DOTALL) 387 | return startswith(rs[0], s_matches[0]) 388 | elif 'model_endswith_each' in point: 389 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 390 | return False 391 | flag = True 392 | e_matches = re.findall(model_endswith_each_pattern, point, flags=re.DOTALL) 393 | if len(rs) != 1: 394 | rs = rs[:-1] 395 | for r in rs: 396 | if r == "": 397 | continue 398 | r = extract_and_replace_punctuation_zh(r) 399 | if endswith(decom(r), e_matches[0]) == False and endswith(r, e_matches[0]) == False: 400 | flag = False 401 | return flag 402 | elif 'model_endswith' in point: 403 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 404 | return False 405 | rs[0] = extract_and_replace_punctuation_zh(rs[0]) 406 | e_matches = re.findall(model_endswith_pattern, point, flags=re.DOTALL) 407 | return endswith(rs[0], e_matches[0]) 408 | elif 'model_constraint_punc_each' in point: 409 | if len(rs) == 0 or (len(rs) == 1 and rs[0] == ""): 410 | return False 411 | flag = True 412 | p_matches = re.findall(model_constraint_punc_each_pattern, point) 413 | p_matches = [[x.strip() for x in p.strip('[]').split(',')] for p in p_matches] 414 | for r in rs: 415 | if r == "": 416 | continue 417 | r = extract_and_replace_punctuation_zh(r) 418 | if constraint_punc(decom(r), p_matches[0]) == False and constraint_punc(r, p_matches[0]) == False: 419 | flag = False 420 | return flag 421 | elif 'model_constraint_punc' in point: 422 | if len(rs) == 0 
def rule(s, point):
    """Deterministically check one extracted span against a rule spec string.

    s     -- the text span extracted from the model response (may be "").
    point -- the rule spec, e.g. "keyword: [a, b]" or "length: [50, 100]".
             The branch taken depends on which marker substrings `point`
             contains, and branch order matters (e.g. the combined
             keyword+forbidden_word case is tested before either alone, and
             "not endswith" before "endswith").

    Returns True/False for rules decidable here, or the sentinel 2 when the
    rule requires model-based judging ('model' rules) or is unrecognized.

    The helper predicates (keyword, forbidden_word, startswith, length, ...)
    and the *_pattern regexes are defined elsewhere in this module.
    """
    # 'model' rules cannot be decided by string matching alone.
    if 'model' in point:
        return 2
    if "keyword" in point and "forbidden_word" in point:
        # Missing/empty extraction fails a positive constraint.
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        k_matches = re.findall(keyword_pattern, point)
        f_matches = re.findall(forbidden_word_pattern, point)
        # Each match is a "[a, b, c]" list literal; split into clean items.
        k_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in k_matches]
        f_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in f_matches]
        return keyword_and_forbidden_word(s, k_matches[0], f_matches[0])
    elif "keyword" in point and "keyword_num" not in point:
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        k_matches = re.findall(keyword_pattern, point)
        k_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in k_matches]
        return keyword(s, k_matches[0])

    elif "match" in point:
        # Exact-match rule; tolerate a trailing full stop (Chinese or ASCII).
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        m_matches = re.findall(match_pattern, point)
        return s == m_matches[0] or s == m_matches[0] + "。" or s == m_matches[0] + "."

    elif "constraint_punc" in point:
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        p_matches = re.findall(constraint_punc_pattern, point)
        p_matches = [[x.strip() for x in p.strip('[]').split(',')] for p in p_matches]
        return constraint_punc(s, p_matches[0])

    elif "forbidden_word" in point:
        # A missing extraction vacuously satisfies a negative constraint.
        if s == "" or "None" in s:
            return True
        s = extract_and_replace_punctuation_zh(s)
        f_matches = re.findall(forbidden_word_pattern, point)
        f_matches = [[x.strip() for x in k.strip('[]').split(',')] for k in f_matches]
        return forbidden_word(s, f_matches[0])

    elif "startswith" in point and "endswith" in point:
        # Combined prefix+suffix rule: group(1) is the prefix, group(2) the suffix.
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        s_matches = re.search(startswith_and_endswith_pattern, point, flags=re.DOTALL).group(1)
        e_matches = re.search(startswith_and_endswith_pattern, point, flags=re.DOTALL).group(2)
        return startswith(s, s_matches) and endswith(s, e_matches)
    elif "startswith" in point:
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        s_matches = re.findall(startswith_pattern, point, flags=re.DOTALL)
        return startswith(s, s_matches[0])

    elif "not endswith" in point:
        if s == "" or "None" in s:
            return True
        s = extract_and_replace_punctuation_zh(s)
        e_matches = re.findall(not_endswith_pattern, point, flags=re.DOTALL)
        return not_endswith(s, e_matches[0])

    elif "endswith" in point:
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        e_matches = re.findall(endswith_pattern, point, flags=re.DOTALL)
        return endswith(s, e_matches[0])

    elif "length_word:" in point:
        # NOTE(review): eval() on a regex capture of the rule spec; rules are
        # repo-authored so this is trusted input, but ast.literal_eval would
        # be safer -- confirm before accepting external rule files.
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        l_w_matches = re.findall(length_word_pattern, point, flags=re.DOTALL)
        return length_word(s, eval(l_w_matches[0]))
    elif "length:" in point:
        if s == "" or "None" in s:
            return False
        s = extract_and_replace_punctuation_zh(s)
        l_matches = re.findall(length_pattern, point, flags=re.DOTALL)
        return length(s, eval(l_matches[0]))
    elif "Json" in point:
        if s == "" or "None" in s:
            return False
        j_matches = re.findall(json_pattern, point.strip(), flags=re.DOTALL)
        return check_json(s, eval(j_matches[0]))
    else:
        # Unknown rule marker: defer to model-based judging.
        return 2
def get_payload(line):
    """Build the GPT-4 chat-completion request for one scoring question.

    The instruction and the model output are truncated (6000 / 4000 chars)
    so the evaluation prompt stays within the judge model's context window.
    """
    instruction = line['instruction'][:6000]
    question = line['question']
    if line['output'] is not None:
        output = line['output'][:4000]
    else:
        output = 'None'
    content = SYS_MSG.format(input=instruction, output=output, question=question)
    payload = {
        "model": "gpt-4-1106-preview",
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "max_tokens": 8192,
        "temperature": 0.0,
        "top_p": 0.3,
        "stream": True
    }
    return payload


def save_jsonl(entry, save_path):
    """Append one record to `save_path` in JSON Lines format."""
    with open(save_path, 'a', encoding='utf-8') as file:
        file.write(json.dumps(entry, ensure_ascii=False) + "\n")


def get_answer(input_data: dict, retry=30):
    """Judge one scoring question with the LLM and persist the verdict.

    Parses the judge's "答案:是/否" line into entry['point_judge'] and appends
    the annotated entry to the save path.  API errors and empty generations
    are retried; once `retry` is exhausted the entry is saved once as a
    failed judgement (point_judge=False) and the function returns.

    Fixes over the original:
      * an empty generation now raises (and goes through the retry path)
        instead of falling through to re.findall(None, ...);
      * recursive retry calls are returned, so callers get the entry;
      * exhausting the retries stops the recursion instead of looping
        forever and writing duplicate failure rows;
      * `payload` is pre-initialized so the except-branch cannot hit an
        UnboundLocalError when get_payload itself raised.
    """
    entry, save_path = input_data['data'], input_data['save_path']
    payload = None  # defined up-front for the except-branch
    try:
        payload = get_payload(entry)
        chat_completion = openai.ChatCompletion.create(model=payload['model'], temperature=0, messages=payload['messages'])
        generation = chat_completion.choices[0].message.content

        if generation is None or generation == "":
            # Treat an empty answer as a transient failure; the shared retry
            # logic below decides whether to re-invoke or give up.
            raise ValueError("empty generation from judge model")

        re_result = re.findall(r'答案:是|答案:否', generation)
        if len(re_result) == 1:
            # Exactly one explicit verdict line -- trust it.
            entry['point_judge'] = "是" in re_result[0]
        else:
            # No single verdict line; fall back to a whole-text scan.
            entry['point_judge'] = "是" in generation and "否" not in generation

        entry['point_explanation'] = generation
        entry['payload'] = payload
        save_jsonl(entry, save_path)
        return entry
    except Exception as e:
        time.sleep(1.2)
        retry -= 1
        if retry < 0:
            # Out of retries: record the failure once and stop recursing.
            entry['point_judge'] = False
            entry['point_explanation'] = "None"
            entry['payload'] = payload
            save_jsonl(entry, save_path)
            return entry
        print(f"retry:剩余{retry}次")
        print(e)
        print(traceback.format_exc())
        return get_answer(input_data, retry=retry)


def run_evaluation(save_path, datas, num_pool):
    """Run `get_answer` over all pending questions with a thread pool."""
    _input = [{"data": i, "eval_model": "gpt-4-1106-preview", "save_path": save_path} for i in datas if i]
    with ThreadPoolExecutor(max_workers=num_pool) as executor:
        # Executor.map is lazy: without draining it the tqdm bar never moves
        # and exceptions raised inside workers are silently discarded.
        list(tqdm(executor.map(get_answer, _input), total=len(_input), desc='Processing', ncols=100))


def get_data(data_path, llm_output_path, language="zh"):
    """Pair dataset instances with model generations.

    Keeps only the scoring questions WITHOUT a rule -- those require
    LLM-based judging (rule-based ones go through the extraction pipeline).
    Returns a flat list of work items carrying main_id / point_id
    back-references into the dataset.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with open(llm_output_path, 'r', encoding='utf-8') as f:
        outputs = [json.loads(line) for line in f.readlines()]

    datas = []
    for i, (d, o) in enumerate(zip(data, outputs)):
        for j, q in enumerate(d['scoring_questions']):
            if q['rule'] is not None:
                continue

            if language == "zh":
                datas.append({
                    "main_id": i,
                    "point_id": j,
                    "instruction": d['instruction'],
                    "question": q['question'],
                    "output": o['generated'],
                })
            elif language == "en":
                datas.append({
                    "main_id": i,
                    "point_id": j,
                    "instruction": d['instruction_en'],
                    "question": q['question_en'],
                    "output": o['generated'],
                })
    return datas


def main_run(args):
    """Entry point: build the work list and evaluate it."""
    datas = get_data(data_path=args.data_path, llm_output_path=args.llm_output_path, language=args.language)
    run_evaluation(args.output_path, datas, args.num_pool)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="")
    parser.add_argument("--llm_output_path", type=str, default="")
    parser.add_argument("--num_pool", type=int, default=40)
    parser.add_argument("--output_path", type=str, default="")
    parser.add_argument("--api_key", type=str, default="")
    parser.add_argument("--api_base", type=str, default="")
    parser.add_argument("--language", type=str, default="zh")
    args = parser.parse_args()
    openai.api_key = args.api_key
    openai.api_base = args.api_base
    SYS_MSG = EVALUATION_PROMPT
    main_run(args)
def run_extraction(save_path, datas, num_pool):
    """Extract the evaluation target of every rule-based question concurrently."""
    _input = [{"data": i, "eval_model": "gpt-4-1106-preview", "save_path": save_path} for i in datas if i]
    with ThreadPoolExecutor(max_workers=num_pool) as executor:
        # Executor.map is lazy: without draining the iterator the tqdm bar
        # never advances and worker exceptions are silently dropped.
        list(tqdm(executor.map(get_answer, _input), total=len(_input), desc='Processing', ncols=100))


def get_data(data_path, llm_output_path, language="zh"):
    """Pair dataset instances with model generations.

    Keeps only the scoring questions WITH a rule -- those go through the
    extraction + rule-based checking pipeline (rule-less ones are judged by
    the LLM directly).  Returns a flat list of work items carrying
    main_id / point_id back-references into the dataset.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    with open(llm_output_path, 'r', encoding='utf-8') as f:
        outputs = [json.loads(line) for line in f.readlines()]

    datas = []
    for i, (d, o) in enumerate(zip(data, outputs)):
        for j, q in enumerate(d['scoring_questions']):
            if q['rule'] is None:
                continue

            if language == "zh":
                datas.append({
                    "main_id": i,
                    "point_id": j,
                    "instruction": d['instruction'],
                    "rule": q['rule'],
                    "question": q['question'],
                    "output": o['generated'],
                })
            elif language == "en":
                datas.append({
                    "main_id": i,
                    "point_id": j,
                    "instruction": d['instruction_en'],
                    "rule": q['rule'],
                    "question": q['question_en'],
                    "output": o['generated'],
                })

    return datas


def main_run(args):
    """Entry point: build the work list and run extraction."""
    datas = get_data(data_path=args.data_path, llm_output_path=args.llm_output_path, language=args.language)
    run_extraction(args.output_path, datas, args.num_pool)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, default="")
    parser.add_argument("--llm_output_path", type=str, default="")
    parser.add_argument("--num_pool", type=int, default=40)
    parser.add_argument("--output_path", type=str, default="")
    parser.add_argument("--api_key", type=str, default="")
    parser.add_argument("--api_base", type=str, default="")
    parser.add_argument("--language", type=str, default="zh")
    args = parser.parse_args()
    openai.api_key = args.api_key
    openai.api_base = args.api_base
    SYS_MSG = EXTRACTION_PROMPT
    SYS_MSG_EACH = EXTRACTION_PROMPT_EACH
    main_run(args)
(2)评价模型回复是否使用某种语言(如中文/英文)输出时,除非**输入指令**中提到了需要使用多种语言,必须是只使用该语言才能评价为“是”,出现多语混杂的情况(即出现其他语言的单词),应该评价为“否”。 21 | (3)如果某个**问题**出现“每个”,“都”等描述,需要考虑每个对象的满足情况,只要有任何一个对象不满足要求,就应该评价为“否”;只有所有对象均满足要求,才可以评价为“是”。 22 | (4)对于以“模型是否正确判断”开头的**问题**,评价模型是否正确判断某一选择条件时,需要根据**输入指令**中的不同选择分支的指令要求,判断模型回复是否正确选择了对应的分支: 23 | 24 | 例: 25 | **输入指令**:1.如果用户问题和旅游有关,可以从相关信息中找出答案则依据相关信息原文,总结、润色后输出攻略,没找出答案只能回答“这个问题我还在学习中呢。”攻略的整体语言风格要轻松愉悦。并且攻略只能参考所给的相关信息来回答,不能使用你本身任何已知的知识、信息、新闻来回答问题。 26 | 2.如果用户问题和旅游无关,请直接回答“这个问题我不太了解呢。” 27 | **问题**:模型是否正确判断用户问题和旅游相关? 28 | 对于上述情况,当模型根据相关信息中输出攻略,或者输出“这个问题我还在学习中呢。”,都说明模型正确判断了用户问题和旅游相关,评价为“是”;只有当模型回答“这个问题我不太了解呢。”,才说明模型错误判断了“用户问题和旅游无关”,评价为“否”。 29 | 当模型选择了错误的执行分支时,应当认定为模型回答错误,评价为“否”。 30 | 31 | 32 | 33 | 34 | ## 输出格式 35 | 分析:xxx 36 | 答案:是/否 37 | 38 | ## 评判信息 39 | **输入指令** 40 | {input} 41 | 42 | **模型回复** 43 | {output} 44 | 45 | **问题** 46 | {question} 47 | 48 | 请进行分析回答**模型回复**是否满足**问题**: 49 | """ -------------------------------------------------------------------------------- /evaluation/prompts/RAL_extractor.py: -------------------------------------------------------------------------------- 1 | EXTRACTION_PROMPT = """你是一个信息抽取专家。以下将会给你提供一个【输入指令】和对应的【模型回复】。此外,还会给你提供一个【评分问题】,设计该问题的目的是评测【模型回复】是否满足【输入指令】的某个要求。现在,你需要抽取出【模型回复】中,【评分问题】针对的评测对象。 2 | 3 | 例如,【模型回复】中包含两篇作文,若【评分问题】为“模型生成的第一篇作文是否不少于500字?”,则你只需要输出【模型回复】中的第一篇作文即可; 4 | 若【评分问题】为“模型生成的第二篇作文是否使用了生动的语言风格?”,则你只需要输出【模型回复】中的第二篇作文即可; 5 | 又例如,【评分问题】为“输出是否以“彭博社记者”作为输出的结尾?“,则你只需要输出【模型回复】的结尾**最后一句**。 6 | 7 | **注意:你应该完全复制【模型回复】中连续的原文片段,严禁做任何修改、增添、删除、拼接,只能进行复制。同时强调,你的任务不是提取或构造出【模型回复】符合【评分问题】的部分,而是提取出【模型回复】中,【评分问题】针对的评测对象,即使其不符合【评分问题】要求。你不需要关注【评分问题】的具体要求是什么,更不需要评测【模型回复】是否满足【评分问题】要求。** 8 | 如果评分对象是整个【模型回复】,请你直接输出"all"。 9 | 如果评分对象在【模型回复】中不存在,请你直接输出"None"。 10 | 11 | **注意**:通常而言,“开头”指的是文段的最开始一句话(第一个句号、感叹号、问号等为结尾标志),“结尾”指的是文段最末尾的一句话。 12 | 13 | 请先给出你对该任务的分析解释,然后输出你抽取的评测对象结果。 14 | 15 | ### 输出格式: 16 | 【解释】 17 | XXX 18 | 19 | 【模型回复中评分问题的评测对象】 20 | 评分对象:XXX 21 | 22 | 你必须完全按照上述格式进行输出,禁止输出任何多余解释或多余标点。 23 | 24 | ### 例一 25 | 26 | 【输入指令】 27 | 分析下列艺术家及其代表作品: 
28 | 1. 梵高——《星空》 29 | 2. 奥古斯特·罗丹——《思想者》 30 | 当该艺术家与作品匹配时,判断他属于画家还是雕塑家,如果是画家,生成一篇关于他作品的介绍,要求包含“文艺复兴时期”一词。如果是雕刻家,生成一篇关于这位艺术家的简介,要求在50字左右。 31 | 当该艺术家与作品不匹配时,判断他属于画家还是雕塑家,如果是画家,修正错误的作品名称,以“以下为修正内容”为开头进行输出。如果是雕塑家,则输出“请重新输入”。 32 | 33 | 【模型回复】 34 | 梵高——《星空》 35 | 梵高是荷兰后印象派画家,代表作品有《星空》、《向日葵》等。《星空》是他的成名作,展现了梵高独特的画风和色彩运用,是后印象派绘画的代表作之一。 36 | 37 | 2. 奥古斯特·罗丹——《思想者》 38 | 奥古斯特·罗丹是法国雕塑家,代表作品有《思想者》、《思想者》等。《思想者》是他的代表作之一,采用了立体主义风格,表现了人类思考和探索的精神内涵。 39 | 40 | 当该艺术家与作品匹配时: 41 | 梵高是画家。 42 | 43 | 当该艺术家与作品不匹配时: 44 | 需要重新输入作品名称。 45 | 46 | 【评分问题】 47 | 模型生成的关于奥古斯特·罗丹的简介文本是否在50字左右? 48 | 49 | 50 | 【解释】 51 | 该评分问题的评分对象是“关于奥古斯特·罗丹的简介文本”,所以需要直接输出【模型回复】中,奥古斯特·罗丹的简介文本对应部分。 52 | 53 | 【模型回复中评分问题的评测对象】 54 | 评分对象:奥古斯特·罗丹是法国雕塑家,代表作品有《思想者》、《思想者》等。《思想者》是他的代表作之一,采用了立体主义风格,表现了人类思考和探索的精神内涵。 55 | 56 | 57 | 58 | ### 例二 59 | 60 | 【输入指令】 61 | “小明正在玩游戏,妈妈走过来说,‘你这样下去,将来一事无成。’小明回应说,‘我知道妈妈,我会注意时间管理。’”\n描述上面这段话的情景,要求分点明确,包括下列关键词中的1个:1. 时不我待 2. 杀鸡儆猴 3. 知错能改 4. 自知之明,字数在300字左右,以“完成。”作为结尾。 62 | 63 | 【模型回复】 64 | 首先,我们来看“时不我待”。在这个场景中,小明的母亲提醒他要珍惜时间,因为时间是无法等待的。她可能担心如果小明继续沉迷于游戏,他会错过很多学习和工作机会,最终一事无成。这种对时间的紧迫感反映了“时不我待”的关键词。\n\n接下来,我们看到“知错能改”。当小明的母亲警告他不要浪费时间时,小明承认了他的错误并表示会改进。他承诺会更加注意时间管理,这表明他已经意识到了自己的错误,并愿意采取措施来改正。这种承认错误并采取行动的态度正是“知错能改”的体现。\n\n总的来说,这段对话展示了小明在母亲的提醒下意识到时间的重要性,并表示会改变自己的行为。这个情景可以用“时不我待”、“知错能改”这两个关键词来描述。同时,我们也可以看到小明具有“自知之明”,能够认识到自己的问题并在母亲的提醒下采取行动。 65 | 66 | 【评分问题】 67 | 模型生成的描述结尾处是否为“完成。”? 68 | 69 | 70 | 【解释】 71 | 该评分问题的评分对象是【模型回复】的结尾部分。根据要求,需要提取出【模型回复】中结尾最后一句话。注意不应该输出“完成。”,它不是【模型回复】的结尾最后一句话。 72 | 73 | 【模型回复中评分问题的评测对象】 74 | 评分对象:同时,我们也可以看到小明具有“自知之明”,能够认识到自己的问题并在母亲的提醒下采取行动。 75 | 76 | 77 | 78 | ### 例三 79 | 80 | 【输入指令】 81 | 分析下列艺术家及其代表作品:梵高 -《蒙娜丽莎》 82 | 当该艺术家与作品匹配时,判断他属于画家还是雕塑家,如果是画家,生成一篇关于他作品的介绍,要求包含“文艺复兴时期”一词。如果是雕刻家,生成一篇关于这位艺术家的简介,要求在50字左右。 83 | 当该艺术家与作品不匹配时,判断他属于画家还是雕塑家,如果是画家,修正错误的作品名称,以“以下为修正内容”为开头进行输出。如果是雕塑家,则输出“请重新输入”。 84 | 85 | 【模型回复】 86 | 很抱歉,根据您提供的信息,无法确定艺术家梵高是否与《蒙娜丽莎》匹配。请您提供更多关于该艺术家的信息,以便我们进行更准确的判断。谢谢! 87 | 88 | 【评分问题】 89 | 模型对作品名称的修正是否以“以下为修正内容”为开头? 
90 | 91 | 92 | 【解释】 93 | 该评分问题的评分对象是“模型对作品名称的修正”的开头部分,所以应该输出【模型回复】的开头第一句,即“很抱歉,根据您提供的信息,无法确定艺术家梵高是否与《蒙娜丽莎》匹配。” 94 | 95 | 【模型回复中评分问题的评测对象】 96 | 很抱歉,根据您提供的信息,无法确定艺术家梵高是否与《蒙娜丽莎》匹配。 97 | 98 | 99 | 100 | ### 例四 101 | 102 | 【输入指令】 103 | 福田区政务局副局长李东升将于 12月30日为成都七中初三的小朋友们介绍智慧政务的发展情况,激发孩子们的学习热情,请为他写一篇演讲稿。\n要求:\n1. 开头要感谢四川省委书记孙大强的支持,结尾要呼吁同学们努力。\n2. 全文不能出现”福田“两个字。\n3. 用一句英文的名言警句鼓励他们。\n最后为这篇演讲稿起一个不超过10个字的标题。 104 | 105 | 【模型回复】 106 | ### 演讲稿标题:智能未来的探索者\n\n尊敬的各位老师,亲爱的同学们,大家下午好!\n\n在这个美好的时刻,我有幸来到这里,和大家一起探讨智慧政务的未来。首先,请允许我代表区政务局,向长期以来关心和支持智慧政务发展的四川省委书记孙大强表示最诚挚的感谢!是他的卓见和推动,使我们得以在这条光明大道上稳步前行。\n\n智慧政务,简而言之,就是利用现代科技手段,如大数据、云计算等信息技术,来优化政府的服务和管理。想象一下,你是否有过等待许久才能办理某项政务服务的经历?智慧政务的目标,就是要让这样的等待成为历史,让政府服务像网购一样便捷,让数据多跑路,群众少跑腿。 107 | 108 | 【评分问题】 109 | 模型是否为这篇演讲稿起的标题是否不超过10个字? 110 | 111 | 112 | 【解释】 113 | 该评分问题是评测【模型回复】中,给演讲稿起的标题的字数,所以应该输出演讲稿的标题“智能未来的探索者”。注意不应该输出“演讲稿标题:”,这个仅仅是一些说明装饰文本,不是演讲稿真正的标题。 114 | 115 | 【模型回复中评分问题的评测对象】 116 | 评分对象:智能未来的探索者 117 | 118 | 119 | 120 | ### 例五 121 | 122 | 【输入指令】 123 | 请先告诉我贝多芬的第九交响曲中的合唱部分是基于哪位诗人的作品创作的?以“贝多芬的第九交响曲中的合唱部分是基于”开头回答。 124 | 然后对这首交响曲进行200字以内的简要评析,评析部分要分点作答,并且包含关键词"赏析内容如下:"。 125 | 126 | 【模型回复】 127 | 贝多芬的第九交响曲中的合唱部分是基于德国诗人席勒的《欢乐颂》创作的。 128 | 129 | 简要评析: 130 | 131 | 1. 主题思想:贝多芬的第九交响曲以德国诗人席勒的《欢乐颂》为歌词,表达了人类对自由、平等和友爱的追求。这首交响曲被认为是人类精神的象征,展现了人类对美好未来的向往。 132 | 133 | 2. 音乐形式:贝多芬的第九交响曲采用了古典交响乐的形式,结构严谨、逻辑清晰。在音乐中,贝多芬巧妙地运用了和声、节奏和旋律等元素,展现了丰富的音乐风格和表现力。 134 | 135 | 【评分问题】 136 | 模型回复的评析部分是否包含关键词"赏析内容如下:"? 
137 | 138 | 139 | 【解释】 140 | 该评分问题是评测【模型回复】全文中是否包含“"赏析内容如下:"”这个关键词,所以评测对象是【模型回复】整体,因此应该输出“all”。 141 | 142 | 【模型回复中评分问题的评测对象】 143 | 评分对象:all 144 | 145 | 146 | ### 例六 147 | 148 | 【输入指令】 149 | 邯郸市委网信办主办邯郸电信协办邀请各县(市、区)委网信办网络安全主管领导和科室人员2人。市直各有关单位网络安全处室负责同志及科室人员2人,参会地点中国电信邯郸分公司2楼大会议室,共计300余人参加,会议主题网络安全和参观电信展示厅,让各单位了解电信能力。 150 | 151 | 请你根据以上内容,写一篇不少于500字的信息稿,不可以使用分点格式,开头要求用“邀请函:邯郸市网络安全会议暨电信展示厅参观”来开头,结尾用“邯郸市委网信办”进行署名。 152 | 153 | 【模型回复】 154 | 邀请函:邯郸市网络安全会议暨电信展示厅参观 155 | 156 | 尊敬的各位领导、科室人员: 157 | 158 | 我们诚挚地邀请您参加由邯郸市委网信办主办,邯郸电信协办的邯郸市网络安全会议暨电信展示厅参观活动。此次会议旨在加强网络安全意识,提升网络安全防护能力,同时让您们更加了解电信能力。 159 | 160 | 一、会议时间:2022年10月15日(星期六)上午9:00-11:30 161 | 二、会议地点:中国电信邯郸分公司2楼大会议室 162 | 三、参会人员:各县(市、区)委网信办网络安全主管领导和科室人员2人,市直各有关单位网络安全处室负责同志及科室人员2人。 163 | 164 | 四、会议内容: 165 | 1. 网络安全主题演讲,邀请相关专家就当前网络安全形势、防护策略等方面进行深入讲解。 166 | 2. 参观电信展示厅,了解电信能力,感受高科技的魅力。 167 | 168 | 我们期待您的参加,共同提升网络安全防护水平,共创美好未来。 169 | 170 | 邯郸市委网信办 171 | 2022年10月10日 172 | 173 | 【评分问题】 174 | 输出是否以“邯郸市委网信办”结尾? 175 | 176 | 177 | 【解释】 178 | 该评分问题的评分对象是【模型回复】的结尾部分,所以应该输出【模型回复】的结尾最后一句,即“邯郸市委网信办\n2022年10月10日”。 179 | 180 | 【模型回复中评分问题的评测对象】 181 | 评分对象:邯郸市委网信办\n2022年10月10日 182 | 183 | 184 | ### 参考示例,提取如下【模型回复】中的【评分问题】针对的评测对象。 185 | **再次强调:你应该完全复制【模型回复】中连续的原文片段,禁止做任何修改、增添、删除、拼接,只能进行复制。同时强调,你的任务不是提取或构造出【模型回复】符合【评分问题】的部分,而是提取出【模型回复】中,【评分问题】针对的评测对象,即使其不符合【评分问题】要求。** 186 | **给出你的评测对象时,必须确保每一个字都是【模型回复】中真实存在的。 187 | 188 | 【输入指令】 189 | {instruction} 190 | 191 | 【模型回复】 192 | {response} 193 | 194 | 【评分问题】 195 | {question} 196 | 197 | 198 | 【解释】""" 199 | 200 | 201 | EXTRACTION_PROMPT_EACH = """你是一个信息抽取专家。以下将会给你提供一个【输入指令】和对应的【模型回复】。此外,还会给你提供一个【评分问题】,设计该问题的目的是评测【模型回复】是否满足【输入指令】的某个要求。现在,你需要抽取出【模型回复】中,【评分问题】针对的评测对象。 202 | 203 | 例如,【模型回复】中包含两篇作文,若【评分问题】为“模型生成的第一篇作文是否不少于500字?”,则你只需要输出【模型回复】中的第一篇作文即可; 204 | 若【评分问题】为“模型生成的第二篇作文是否使用了生动的语言风格?”,则你只需要输出【模型回复】中的第二篇作文即可; 205 | 又例如,【评分问题】为“输出是否使用了XX作为文章的结尾?“,则你只需要输出【模型回复】的结尾**最后一句**。 206 | 207 | 
**注意:你应该完全复制【模型回复】中连续的原文片段,严禁做任何修改、增添、删除、拼接,只能进行复制。同时强调,你的任务不是提取或构造出【模型回复】符合【评分问题】的部分,而是提取出【模型回复】中,【评分问题】针对的评测对象,即使其不符合【评分问题】要求。你不需要关注【评分问题】的具体要求是什么,更不需要评测【模型回复】是否满足【评分问题】要求。** 208 | 已知评分对象在【模型回复】中存在多个,请你使用||将不同的评分对象分隔开。 209 | 如果评分对象在【模型回复】中不存在,请你直接输出"None"。 210 | 211 | **注意**:通常而言,“开头”指的是文段的最开始一句话(第一个句号、感叹号、问号等为结尾标志),“结尾”指的是文段最末尾的一句话。 212 | 213 | 请先给出你对该任务的分析解释,然后输出你抽取的评测对象结果。 214 | 215 | ### 输出格式: 216 | 【解释】 217 | XXX 218 | 219 | 【模型回复中评分问题的评测对象】 220 | 评分对象:XXX||YYY 221 | 222 | 你必须完全按照上述格式进行输出,禁止输出任何多余解释或多余标点。 223 | 224 | ### 例一 225 | 226 | 【输入指令开始】 227 | 在什么样的场景下会产生“怎么才能让头发看着蓬松很多”这样的用户痛点 ?请用简短的语句写一些日常的场景,每个场景用一句话概括,不允许除了句号外的其它标点。每句话以“场景1/场景2/场景X”开头。写8句话,每句话的字数不超过15个字。着重要求:以markdown表格的形式呈现,表格只需要一列“场景”。 228 | 229 | 【模型回复开始】 230 | | 场景1 |\n| ---- |\n| 洗完头发后发现头发扁塌无精神。 |\n| 头发稀疏,看起来毫无蓬松感。 |\n| 想要在特殊场合展现蓬松发型。 |\n| 头发容易油腻,缺乏蓬松感。 |\n| 想要尝试新的蓬松发型。 |\n| 头发干燥无光泽,缺乏蓬松感。 |\n| 想要让头发看起来更加丰盈。 |\n| 想要让头发看起来更加有质感。 | 231 | 232 | 【评分问题开始】 233 | 模型生成的每个场景是否是以“场景X”开头? 234 | 235 | 236 | 【解释】 237 | 该评分问题的评分对象是【模型回复】中对每个场景的描述的开头,所以应该提取每个场景的描述的开头。 238 | 239 | 【模型回复中评分问题的评测对象】 240 | 评分对象:洗完头发后发现头发扁塌无精神。||头发稀疏,看起来毫无蓬松感。||想要在特殊场合展现蓬松发型。||头发容易油腻,缺乏蓬松感。||想要尝试新的蓬松发型。||头发干燥无光泽,缺乏蓬松感。||想要让头发看起来更加丰盈。||想要让头发看起来更加有质感。 241 | 242 | 243 | 244 | ### 例二 245 | 246 | 【输入指令开始】 247 | 为“松下剃须刀(松下小方盒)安萊精钢刀头,小巧机身便于携带”写一条带货短视频脚本。 248 | 以markdown表格输出,包括分镜序号、分镜内容、台词文本、运镜。 249 | 每个分镜的台词都要以疑问句开头,以感叹句结尾;每个分镜的台词不超过50字。 250 | 251 | 【模型回复开始】 252 | 253 | 254 | | 分镜序号 | 分镜内容 | 台词文本 | 运镜 | 255 | | -------- | -------------------------------------- | ------------------------------------------ | ------ | 256 | | 1 | 手持松下剃须刀,展示刀头和机身 | “这是松下剃须刀,安萊精钢刀头,看多小巧!” | 静态 | 257 | | 2 | 拍摄手持剃须刀的人正在旅行或出门的场景 | “小巧的机身,便于携带,真是出门旅行的好选择!” | 跟随拍 | 258 | | 3 | 展示剃须刀在使用时的效果 | “而且刮得干净利落,真是太方便了!” | 特写 | 259 | | 4 | 手持剃须刀,展示产品包装和价格 | “赶紧入手吧,这么好用的剃须刀,价格还这么实惠!” | 静态 | 260 | 261 | 【评分问题开始】 262 | 模型输出的每个分镜的台词是否以疑问句开头? 
263 | 264 | 265 | 【解释】 266 | 该评分问题的评分对象是【模型回复】中每个分镜台词的开头,所以应该提取每个分镜台词的开头第一句话。 267 | 268 | 【模型回复中评分问题的评测对象】 269 | 评分对象:“这是松下剃须刀,安萊精钢刀头,看多小巧!”||“小巧的机身,便于携带,真是出门旅行的好选择!”||“小巧的机身,便于携带,真是出门旅行的好选择!”|| “而且刮得干净利落,真是太方便了!”||“赶紧入手吧,这么好用的剃须刀,价格还这么实惠!” 270 | 271 | 272 | 273 | ### 例三 274 | 275 | 【输入指令开始】 276 | 撰写一篇关于时尚新型法官的博客文章,要求字数在300字左右。文章中必须包含3个小标题,每个小标题前标有星号(*)。文章必须以导语开头,以结语结尾。在撰写时,请确保每个小标题下的内容都能够紧扣主题,提供有关时尚新型法官的有趣见解或信息。 277 | 278 | 【模型回复开始】 279 | 时尚新型法官:法庭上的时尚ICON 280 | 281 | 一、时尚与正义的完美结合 282 | 时尚与法律似乎是两个截然不同的领域,但是随着社会的不断发展,时尚新型法官正逐渐成为法庭上的新宠。这些法官不仅在法庭上展现出专业的法律素养,更以他们独特的时尚品味成为了法庭上的时尚ICON。他们的着装不仅展现了个人魅力,更传递出了一种自信和权威,让人们对法律和正义有了全新的认识。 283 | 284 | 二、时尚新型法官的着装风格 285 | 与传统的法官黑袍相比,时尚新型法官们的着装风格更加多样化和个性化。他们可能会选择搭配一件时尚的西装外套,或是配上一条别致的领带,甚至可能会在法庭上展现出一些时尚的配饰,如手表、眼镜等。这种与众不同的着装风格不仅让他们在法庭上脱颖而出,更让法庭变得更加生动有趣。 286 | 287 | 三、时尚新型法官的形象建设 288 | 时尚新型法官们的形象建设不仅仅停留在外表,更多的是通过自己的言行举止展现出对法律的尊重和对正义的追求。他们可能会在社交媒体上分享自己的时尚穿搭,与公众分享法律知识,甚至可能会参与一些公益活动,让更多的人了解法律,关注正义。通过这种形象建设,时尚新型法官们不仅仅是法庭上的执法者,更是社会上的时尚偶像和价值引领者。 289 | 290 | 结语: 291 | 时尚新型法官的出现,不仅让法庭变得更加有趣和生动,更让人们对法律和正义有了全新的认识。他们的时尚形象不仅仅是外在的展示,更是对法律和正义的一种内在追求。让我们一起期待更多时尚新型法官的出现,让法庭变得更加时尚与有趣。 292 | 293 | 【评分问题开始】 294 | 博客文章是否包含3个小标题? 295 | 296 | 297 | 【解释】 298 | 该评分问题的评分对象是【模型回复】中每个小标题,所以应该提取出其中的每个小标题。 299 | 300 | 【模型回复中评分问题的评测对象】 301 | 评分对象:一、时尚与正义的完美结合||二、时尚新型法官的着装风格||三、时尚新型法官的形象建设 302 | 303 | 304 | 305 | ### 例四 306 | 307 | 【输入指令开始】 308 | 帮我为一只黑白色的狗起三个名字,以json格式输出,只有一个字段“名字”,对应值为一个包含所有名字的list 309 | 310 | 【模型回复开始】 311 | ```json 312 | {{ 313 | "名字": ["奥利奥", "斑点", "墨墨"] 314 | }} 315 | ``` 316 | 317 | 【评分问题开始】 318 | 模型是否给出了三个名字? 319 | 320 | 321 | 【解释】 322 | 该评分问题的评分对象是【模型回复】中起的所有名字,所以应该提取出其中的所有名字。 323 | 324 | 【模型回复中评分问题的评测对象】 325 | 评分对象:"奥利奥"||"斑点"||"墨墨" 326 | 327 | 328 | 329 | ### 例四 330 | 331 | 【输入指令开始】 332 | 想要推广一个农药科研新产品,可以采取哪五项推广措施?要求传播速度快、效果好、推广扎实。五项措施的内容请使用分号分隔。 333 | 334 | 【模型回复开始】 335 | 1. 网络营销与社交媒体推广:利用微博、微信、抖音等社交平台发布产品信息,利用短视频、直播等形式展示产品效果和使用方法;\n\n2. 与农业专业媒体合作:在农业杂志、专业网站、行业论坛等媒体上发布广告和专题报道,提高产品知名度;\n\n3. 
参加农业展会和研讨会:在各类农业展览会上设立展位,进行现场演示和讲解,同时参与研讨会分享科研成果,与行业内人士建立联系;\n\n4. 建立试点示范基地:在不同地区建立试验示范点,邀请农户参观,亲眼见证产品效果,通过口碑传播增强信任度;\n\n5. 培训销售和技术服务团队:组织专业的销售和技术服务团队,对农户进行产品知识和使用技巧的培训,提供优质的售后服务,确保用户满意度。 336 | 337 | 【评分问题开始】 338 | 模型回复是否将五项措施的内容使用分号分隔? 339 | 340 | 341 | 【解释】 342 | 该评分问题的评分对象是【模型回复】中关于“想要推广一个农药科研新产品”的推广措施,所以应该将【模型回复】中提到的所有推广措施提取出来。 343 | 344 | 【模型回复中评分问题的评测对象】 345 | 评分对象:1. 网络营销与社交媒体推广:利用微博、微信、抖音等社交平台发布产品信息,利用短视频、直播等形式展示产品效果和使用方法;||2. 与农业专业媒体合作:在农业杂志、专业网站、行业论坛等媒体上发布广告和专题报道,提高产品知名度;||3. 参加农业展会和研讨会:在各类农业展览会上设立展位,进行现场演示和讲解,同时参与研讨会分享科研成果,与行业内人士建立联系;||4. 建立试点示范基地:在不同地区建立试验示范点,邀请农户参观,亲眼见证产品效果,通过口碑传播增强信任度;||5. 培训销售和技术服务团队:组织专业的销售和技术服务团队,对农户进行产品知识和使用技巧的培训,提供优质的售后服务,确保用户满意度。 346 | 347 | 348 | 349 | ### 参考示例,提取如下【模型回复】中的【评分问题】针对的评测对象。 350 | **再次强调:你应该完全复制【模型回复】中连续的原文片段,禁止做任何修改、增添、删除、拼接,只能进行复制。同时强调,你的任务不是提取或构造出【模型回复】符合【评分问题】的部分,而是提取出【模型回复】中,【评分问题】针对的评测对象,即使其不符合【评分问题】要求。** 351 | **给出你的评测对象时,必须确保每一个字都是【模型回复】中真实存在的。 352 | 353 | 【输入指令开始】 354 | {instruction} 355 | 356 | 【模型回复开始】 357 | {response} 358 | 359 | 【评分问题开始】 360 | {question} 361 | 362 | 363 | 【解释】""" -------------------------------------------------------------------------------- /evaluation/rule_based_evaluation.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | from con_rule import decompose_judge 4 | 5 | def load_jsonl(file_path): 6 | _data = [] 7 | with open(file_path, 'r') as f: 8 | for data in f: 9 | jline = json.loads(data) 10 | _data.append(jline) 11 | return _data 12 | 13 | 14 | def preprocess(s, ass): 15 | if ass == None or s == None: 16 | return None, [] 17 | if "【模型回复中评分问题的评测对象】\n" in ass: 18 | ass = ass.rsplit("【模型回复中评分问题的评测对象】\n", maxsplit=1)[1] 19 | ass = ass.rsplit("\n\n请注意", maxsplit=1)[0] 20 | ass = ass.rsplit("\n\n**注意", maxsplit=1)[0] 21 | ass = ass.rsplit("\n\n注意", maxsplit=1)[0] 22 | if "评分对象:" in ass: 23 | decompose = ass.rsplit("评分对象:", maxsplit=1)[1] 24 | elif "评测对象:" in ass: 25 | decompose = ass.rsplit("评测对象:", 
maxsplit=1)[1] 26 | elif ":" in ass: 27 | decompose = ass.rsplit(":", maxsplit=1)[1] 28 | else: 29 | decompose = ass 30 | 31 | if decompose.startswith('all'): 32 | rs = [s] 33 | elif 'None' in decompose: 34 | rs = [] 35 | else: 36 | rs = decompose.split('||') 37 | rs = [r.strip() for r in rs] 38 | return decompose, rs 39 | 40 | 41 | if __name__ == "__main__": 42 | parser = argparse.ArgumentParser() 43 | parser.add_argument("--data_path", type=str, default="") 44 | parser.add_argument("--extraction_path", type=str, default="") 45 | parser.add_argument("--output_path", type=str, default="") 46 | args = parser.parse_args() 47 | 48 | extractions = load_jsonl(args.extraction_path) 49 | rule_evaluation_results = [] 50 | for e in extractions: 51 | decompose, rs = preprocess(e['output'], e['ass']) 52 | result = decompose_judge(e['rule'], rs) 53 | e['point_judge'] = result 54 | e['point_explanation'] = 'RAL' 55 | rule_evaluation_results.append(e) 56 | 57 | with open(args.output_path, 'w') as f: 58 | for r in rule_evaluation_results: 59 | f.write(json.dumps(r, ensure_ascii=False) + '\n') -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai==0.28.0 2 | Requests==2.32.3 3 | tqdm==4.66.1 4 | --------------------------------------------------------------------------------