\n<|Bot|>:"""
137 | return prompt, tokenizer.encode(prompt)
138 |
139 |
140 | if __name__ == '__main__':
141 | query = '你好'
142 | system = '请帮助我'
143 | tokenizer = '/mnt/llm/devopspal/model/Qwen-7B'
144 |
145 |
--------------------------------------------------------------------------------
/resources/tutorial.md:
--------------------------------------------------------------------------------
1 | ## Evaluation Tutorial
2 |
3 | ## 🚀 How to Evaluate
4 | If you need to test your own huggingface-formatted model, the overall steps are as follows:
5 | 1. Write the loader function for the model.
6 | 2. Write the context_builder function for the model.
7 | 3. Register the model in the configuration file.
8 | 4. Run the testing script.
9 | If the model does not require any special processing after loading, and the input does not need to be converted to a specific format (e.g. chatml format or other human-bot formats), you can directly proceed to step 4 to initiate the testing.
10 |
11 | #### 1. Write the loader function
12 | If the model requires additional processing after loading (e.g. adjusting the tokenizer), you need to inherit the `ModelAndTokenizerLoader` class in `src.context_builder.context_builder_family.py` and override the corresponding `load_model` and `load_tokenizer` functions. You can refer to the following example:
13 | ```python
14 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader):
15 | def __init__(self):
16 | super().__init__()
17 | pass
18 |
19 | def load_model(self, model_path: str):
20 | model = super().load_model(model_path)
21 | model.generation_config = GenerationConfig.from_pretrained(model_path)
22 | return model
23 |
24 | def load_tokenizer(self, model_path: str):
25 | tokenizer = super().load_tokenizer(model_path)
26 |
27 | # read generation config
28 | with open(model_path + '/generation_config.json', 'r') as f:
29 | generation_config = json.load(f)
30 | tokenizer.pad_token_id = generation_config['pad_token_id']
31 | tokenizer.eos_token_id = generation_config['eos_token_id']
32 | return tokenizer
33 | ```
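The snippet above assumes that `json` and `GenerationConfig` are already imported in your loader module. If you start from a fresh file, the imports would look roughly like this (a minimal sketch, assuming the standard `transformers` package):

```python
import json

# GenerationConfig ships with transformers and is used to attach generation settings to the model
from transformers import GenerationConfig
```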
34 |
35 | #### 2. Write the context_builder function for the Model
36 | If the input needs to be converted to a specific format (e.g. chatml or another human-bot format), you need to inherit the `ContextBuilder` class in `src.context_builder.context_builder_family` and override the `make_context` function, which converts the input into the format the model expects. An example is shown below:
37 | ```python
38 | class QwenChatContextBuilder(ContextBuilder):
39 | def __init__(self):
40 | super().__init__()
41 |
42 | def make_context(
43 | self,
44 | model,
45 | tokenizer,
46 | query: str,
47 | system: str = "you are a helpful assistant"
48 | ):
49 | '''
50 | model: PretrainedModel
51 | tokenizer: PretrainedTokenzier
52 | query: Input string
53 | system: System prompt if needed
54 | '''
55 | im_start, im_end = "<|im_start|>", "<|im_end|>"
56 | im_start_tokens = [tokenizer.im_start_id]
57 | im_end_tokens = [tokenizer.im_end_id]
58 | nl_tokens = tokenizer.encode("\n")
59 |
60 | def _tokenize_str(role, content):
61 | return f"{role}\n{content}", tokenizer.encode(
62 | role, allowed_special=set()
63 | ) + nl_tokens + tokenizer.encode(content, allowed_special=set())
64 |
65 | system_text, system_tokens_part = _tokenize_str("system", system)
66 | system_tokens = im_start_tokens + system_tokens_part + im_end_tokens
67 |
68 | raw_text = ""
69 | context_tokens = []
70 |
71 | context_tokens = system_tokens + context_tokens
72 | raw_text = f"{im_start}{system_text}{im_end}" + raw_text
73 | context_tokens += (
74 | nl_tokens
75 | + im_start_tokens
76 | + _tokenize_str("user", query)[1]
77 | + im_end_tokens
78 | + nl_tokens
79 | + im_start_tokens
80 | + tokenizer.encode("assistant")
81 | + nl_tokens
82 | )
83 | raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
84 | return raw_text, context_tokens
85 | ```
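For reference, the `raw_text` returned by `make_context` follows the standard chatml layout. The snippet below is a minimal, self-contained sketch (no model or tokenizer involved; the query and system values are made up) that rebuilds the same string so you can check what your own context_builder should emit:

```python
# Minimal sketch: reproduce the raw_text layout that make_context returns
im_start, im_end = "<|im_start|>", "<|im_end|>"
system = "you are a helpful assistant"  # hypothetical system prompt
query = "hello"                         # hypothetical user query

raw_text = f"{im_start}system\n{system}{im_end}"
raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n"
print(raw_text)
# <|im_start|>system
# you are a helpful assistant<|im_end|>
# <|im_start|>user
# hello<|im_end|>
# <|im_start|>assistant
```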
86 |
87 | #### 3. Register the model in the configuration file
88 | Open the `model_conf.json` file in the conf directory and register the model name together with the loader and context_builder it should use; the values are simply the class names defined in steps 1 and 2. Here is an example:
89 | ```json
90 | {
91 | "Qwen-Chat": {
92 | "loader": "QwenModelAndTokenizerLoader",
93 | "context_builder": "QwenChatContextBuilder"
94 | }
95 | }
96 | ```
97 |
98 | #### 4. Execute the testing script
99 | Run the following code to initiate the test:
100 | ```Bash
101 | # model_path: path to the model for testing
102 | # model_name: the model name corresponding to the model in the configuration file, default is Default, which represents using the default loader and context_builder
103 | # model_conf_path: path to the model configuration file, usually the model_conf.json file in the conf directory
104 | # eval_dataset_list: the names of the datasets to be tested, default is all to test all datasets, if you need to test one or more datasets, use the # symbol to connect them, for example: dataset1#dataset2
105 | # eval_dataset_fp_conf_path: path to the dataset configuration file
106 | # eval_dataset_type: the type of evaluation; only the default test split is currently supported
107 | # data_path: path to the downloaded evaluation dataset
108 | # k_shot: number of few-shot examples prepended to the prompt (0-5 supported)
109 |
110 | python src/run_eval.py \
111 | --model_path path_to_model \
112 | --model_name model_name_in_conf \
113 | --model_conf_path path_to_model_conf \
114 | --eval_dataset_list all \
115 | --eval_dataset_fp_conf_path path_to_dataset_conf \
116 | --eval_dataset_type test \
117 | --data_path path_to_downloaded_devops_eval_data \
118 | --k_shot 0
119 | ```
120 |
121 | For example, suppose the evaluation dataset has been downloaded to `folder1`, the code lives in `folder2`, and the model is in `folder3`. If the model needs no custom loader or context_builder and you want zero-shot scores on all datasets, you can start the test with the following script:
122 | ```Bash
123 | python folder2/src/run_eval.py \
124 | --model_path folder3 \
125 | --model_name Default \
126 | --model_conf_path folder2/conf/model_conf.json \
127 | --eval_dataset_list all \
128 | --eval_dataset_fp_conf_path folder2/conf/devopseval_dataset_fp.json \
129 | --eval_dataset_type test \
130 | --data_path folder1 \
131 | --k_shot 0
132 | ```
133 |
--------------------------------------------------------------------------------
/resources/tool_learning_evalution.md:
--------------------------------------------------------------------------------
1 | ## tool learning 数据集评测教程
2 |
3 | ### chatml接入方式
4 | 如果需要在自己的 huggingface 格式的模型上进行测试的话,总的步骤分为如下几步:
5 | 1. 编写 ~/evals/FuncCallEvalution 的 create_prompts 函数
6 | 2. 编写 ~/models/base_model 的相关函数
7 | 3. 注册模型和评估函数
8 | 4. 执行测试脚本
9 | 如果模型在加载进来后不需要特殊的处理,而且输入也不需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),请直接跳转到第四步发起测试。
10 |
11 | #### 1. 编写 create_prompts 函数
12 | 需要在 ~/evals/FuncCallEvalution 中继承 `ToolEvalution` 类并覆写 `create_prompts` 函数,把原始的 function call 数据拼装成评测所需的 prompt,具体可以参照以下示例:
13 | ```python
14 | class FuncCallEvalution(ToolEvalution):
15 |
16 | def create_prompts(self, func_call_datas):
17 | '''
18 | datas: [
19 | {
20 | "instruction": history[his_idx],
21 | "input": "",
22 | "output": output,
23 | "history": [(human_content, ai_content), (), ()],
24 | "functions": tools
25 | }
26 | ]
27 | '''
28 | system_content = '''CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。
29 | 你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。'''
30 | function_format = '''You are ToolGPT, you have access to the following APIs:\n{tools}'''
31 |
32 | func_call_train_datas = []
33 | history_error_cnt = 0
34 | funccall_error_cnt = 0
35 |
36 | for data in func_call_datas:
37 | tools = data["functions"]
38 | chatrounds = data["chatrounds"]
39 |
40 | function_content = ""
41 | if len(tools) > 0:
42 | function_content = function_format.format(tools=json.dumps(tools, ensure_ascii=False, sort_keys=True))
43 |
44 | history = []
45 | for i in chatrounds:
46 | if i["role"]=="system":
47 | continue
48 |
49 | if i["role"]=="user":
50 | history.append(("user", i["content"]))
51 |
52 | if i["role"] == "assistant":
53 | if "function_call" in i:
54 | if not isinstance(i["function_call"], dict):
55 | funccall_error_cnt+=1
56 | continue
57 | content = "#function" + json.dumps({**{"content": i["content"]}, **i["function_call"]}, ensure_ascii=False)
58 | else:
59 | content = i["content"]
60 | history.append(("assistant", content))
61 |
62 |
63 | if i["role"] == "function":
64 | content = json.dumps({**{"content": i["content"]}, **{"name": i["name"]}}, ensure_ascii=False)
65 | history.append(("user", content))
66 |
67 |
68 | history = [i[1] for i in history]
69 | history[0] = "\n".join([system_content,function_content, history[0]])
70 |
71 | for his_idx in range(0, len(history), 2):
72 | output = history[his_idx+1]
73 |
74 | if "#function" in output:
75 | output = output.split("#function")[-1]
76 |
77 | try:
78 | output = json.loads(output)
79 | except:
80 | output = {"content": output}
81 |
82 |
83 | func_call_train_datas.append(
84 | {
85 | "instruction": history[his_idx],
86 | "input": "",
87 | "output": output,
88 | "history": [history[:his_idx+2][i:i+2] for i in range(0, len(history[:his_idx]), 2)],
89 | "functions": tools
90 | },
91 | )
92 | return func_call_train_datas
93 | ```
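上面的 `create_prompts` 期望的输入是带有 functions 和 chatrounds 字段的记录列表。下面给出一条最小的示例数据(字段取值为假设,仅用于说明数据结构):

```python
# 假设的示例输入,仅用于说明 create_prompts 期望的字段结构
func_call_datas = [
    {
        "functions": [
            {"name": "get_weather", "parameters": {"type": "object", "properties": {"city": {"type": "string"}}}}
        ],
        "chatrounds": [
            {"role": "system", "content": "你是一个工具助手"},
            {"role": "user", "content": "北京今天天气怎么样?"},
            {"role": "assistant", "content": "", "function_call": {"name": "get_weather", "arguments": "{\"city\": \"北京\"}"}},
            {"role": "function", "name": "get_weather", "content": "{\"weather\": \"晴\"}"},
            {"role": "assistant", "content": "北京今天是晴天。"},
        ],
    }
]
```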
94 |
95 | #### 2. 编写 Model 的相关函数
96 | 需要在 ~/models/base_model 中继承 `ToolModel` 类,并按自己模型的加载和推理方式覆写 `load_model`、`generate` 等函数(例如在 generate 中把输入拼成 chatml 格式后再生成),一个示例如下:
97 | ```python
98 | class ToolModel:
99 | def __init__(self, model_path: str, template: str, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25):
100 | self.model_path = model_path
101 | self.trust_remote_code = trust_remote_code
102 | self.tensor_parallel_size = tensor_parallel_size
103 | self.gpu_memory_utilization = gpu_memory_utilization
104 | self.load_model(self.model_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization)
105 |
106 | def generate(self, prompts: str, template: str = None, generate_configs: GenerateConfigs = None) -> list:
107 | '''产出对应结果'''
108 | pass
109 |
110 | def generate_params(
111 | self, generate_configs: GenerateConfigs,
112 | ):
113 | '''generate param'''
114 | kargs = generate_configs.dict()
115 | return kargs
116 |
117 | def load_model(self, model_path, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25):
118 | '''加载模型'''
119 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code)
120 | self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval()
121 |
122 | # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization)
123 | ```
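下面再给出一个继承 `ToolModel` 并覆写 `generate` 的最小示意(假设底层是 huggingface 的因果语言模型,且 `load_model` 已经准备好 `self.model` 和 `self.tokenizer`;prompt 拼接和生成参数请按自己的模型调整):

```python
# 最小示意:继承 ToolModel 并覆写 generate,直接把 prompt 喂给模型并返回解码后的文本
class MyToolModel(ToolModel):
    def generate(self, prompts, template=None, generate_configs=None, history=None):
        inputs = self.tokenizer(prompts, return_tensors="pt").to(self.model.device)
        output_ids = self.model.generate(**inputs, max_new_tokens=512)
        # 只解码新生成的部分,去掉输入 prompt 对应的 token
        return self.tokenizer.decode(
            output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )
```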
124 |
125 | #### 3. 注册模型和eval函数即可
126 | 在 ~/models/__init__.py 中注册即可
127 | ```python
128 | from .base_model import ToolModel
129 |
130 | __all__ = [
131 | "ToolModel",
132 | ]
133 | ```
134 | 在 ~/evals/__init__.py 中注册即可
135 | ```python
136 | from .base_evalution import ToolEvalution
137 | from .toolfill_evalution import ToolFillEvalution
138 | from .toolparser_evalution import ToolParserEvalution
139 | from .toolsummary_evalution import ToolSummaryEvalution
140 | from .func_call_evalution import FuncCallEvalution
141 |
142 |
143 | __all__ = [
144 | "ToolEvalution", "ToolFillEvalution", "ToolParserEvalution", "ToolSummaryEvalution", "FuncCallEvalution"
145 | ]
146 | ```
147 |
148 |
149 | #### 4. 执行测试脚本
150 | 修改 ~/src/qwen_eval_main.py 中的 datainfos 和 model_infos
151 | ```python
152 | model_infos = [
153 | {"model_name": "", "template": "chatml", "model_path": "",
154 | "peft_path": "", "model_class": QwenModel}]
155 |
156 | datainfos = [
157 | {"dataset_path": "~/fcdata_luban_zh_test.jsonl", "dataset_name": "fcdata_luban_zh", "tool_task": "func_call"},
158 | {"dataset_path": "~/test_datas/fcdata_zh_test_v1.jsonl", "dataset_name": "fcdata_zh", "tool_task": "func_call"},
159 | ]
160 | ```
161 |
162 | 运行下述命令即可
163 | ```Bash
164 | python qwen_eval_main.py
165 | ```
166 |
167 |
168 |
169 | ### 非chatml接入
170 | 如果需要在自己的 huggingface 格式的模型上进行测试的话,总的步骤分为如下几步:
171 | 1. 编写 ~/getAssistantAns.py 相关代码
172 | 2. 执行测试脚本
173 |
174 |
175 | #### 1、编写 getAssistantAns 示例
176 | ```python
177 | class GetAssistantAns():
178 | # 按照自己推理需求自己修改代码
179 |
180 | def __init__(self, gpu_num=1):
181 | model = AutoModelForCausalLM.from_pretrained(model_name)
182 | device_list = []
183 | for gpu_idx in range(gpu_num):
184 |             device_list.append(torch.device(f"cuda:{gpu_idx}"))
185 | 
186 |         # 将模型移动到指定的GPU设备
187 |         model.to(device_list[0])
188 |
189 |
190 | def gen_answer(self, chat_dict, gpu_index):
191 | # 这里实际根据自己推理逻辑 然后转为标准格式返回
192 | # 以下仅仅是样例
193 | import time
194 | print(os.environ["CUDA_VISIBLE_DEVICES"])
195 | time.sleep(1)
196 | rtn_dict1 = {
197 | "role": "assistant",
198 | "content": None,
199 | "function_call":
200 | {
201 | "name": "get_fudan_university_scoreline",
202 | "arguments": "{\n \"year\": \"2020\"\n}"
203 | }
204 | }
205 |
206 | rtn_dict2 = {
207 | "role": "assistant",
208 | "content": "2020年复旦大学的分数线如下:\n\n- 文科一批:630分\n- 文科二批:610分\n- 理科一批:650分\n- 理科二批:630分"
209 | }
210 |
211 | return random.choice([rtn_dict1, rtn_dict2])
212 | ```
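评测脚本会先用 GPU 数量实例化该类,再把包含 functions 和 chatrounds 的对话字典传给 gen_answer。下面是一个调用示意(chat_dict 为假设的示例数据):

```python
# 示意:评测脚本中对 GetAssistantAns 的调用方式(chat_dict 为假设数据)
get_assistant_ans = GetAssistantAns(gpu_num=1)
chat_dict = {
    "functions": [{"name": "get_fudan_university_scoreline",
                   "parameters": {"type": "object", "properties": {"year": {"type": "string"}}}}],
    "chatrounds": [{"role": "user", "content": "2020年复旦大学的分数线是多少?"}],
}
ans = get_assistant_ans.gen_answer(chat_dict, gpu_index=0)
print(ans)  # 标准格式的 assistant 回复字典,可能带 function_call 字段
```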
213 | #### 2、执行测试脚本
214 | 修改 ~/src/opensource_functioncall_evalution.py 中的 test_ans_file_list
215 | ```python
216 | test_ans_file_list = [
217 | "fcdata_zh_test.jsonl"
218 | ]
219 | ```
220 |
221 | 运行下述命令即可
222 | ```Bash
223 | python opensource_functioncall_evalution.py
224 | ```
225 |
--------------------------------------------------------------------------------
/src/opensource_functioncall_evalution.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/python
2 | # -*- coding: utf-8 -*-
3 |
4 | ############################################
5 | # File: opensource_functioncall_evalution.py
6 | # create by youmi
7 | # Time: 2023-11-23 13:10
8 | ############################################
9 |
10 |
11 | import os
12 | import sys
13 | import random
14 | import time
15 | import shutil
16 | import json
17 | import jieba
18 | import re
19 | import copy
20 | import numpy as np
21 | from tqdm import tqdm
22 | from collections import Counter
23 | from concurrent.futures import ThreadPoolExecutor, as_completed
24 | from getAssistantAns import GetAssistantAns
25 |
26 |
27 | test_ans_file_list = [
28 | "fcdata_zh_test.jsonl"
29 | ]
30 |
31 | # 多线程评测加速
32 | GPU_NUM = 1
33 |
34 | # function call 回复测试总数
35 | function_call_sum = 0
36 | # function call 回复正确数
37 | function_call_correct = 0
38 | # function call 回复失败数
39 | function_call_fail = 0
40 | # function call 回复失败中,本应该调用工具但是模型没有调用,即无工具调用识别错误数
41 | function_call_fail_functioncall = 0
42 | # function call 回复失败数中,因为函数名不对导致的失败数, 这部分包括模型幻觉出错
43 | function_call_fail_name = 0
44 | # function call 回复失败数中,工具名对了,但是参数不对导致的失败数
45 | function_call_fail_param = 0
46 | # function call 回复失败中 函数名幻觉的失败数
47 | function_call_fail_name_illusion = 0
48 |
49 | # assistant ans 回复相关度列表
50 | assistant_ans_relevancy_list = []
51 |
52 | # 推理结果
53 | test_result_lines = []
54 |
55 | get_assistant_ans = GetAssistantAns(gpu_num=GPU_NUM)
56 |
57 | def remove_punctuation(text):
58 | pattern = r'[^\w\s]'
59 | return re.sub(pattern, '', text)
60 |
61 |
62 | def cmp_arguments(args_str1, args_str2):
63 | rtn_flag = False
64 | try:
65 | args_dict1 = json.loads(args_str1)
66 | args_dict2 = json.loads(args_str2)
67 | # 比较两个字典是否一致
68 | if args_dict1 == args_dict2:
69 | rtn_flag = True
70 | except Exception as e:
71 | print("json.loads error: ", e)
72 | return rtn_flag
73 | return rtn_flag
74 |
75 |
76 | # 计算两个答案的相关度
77 | # 要是预测回复的是functioncall类型的,相似为0
78 | # 要是预测回复的包含了所有要点,相似度为1
79 | # 相似度保存在assistant_ans_relevancy_list中
80 | def calc_relevancy(ass_predict, ass_truth, chatrounds):
81 | global assistant_ans_relevancy_list
82 | if "function_call" in ass_predict:
83 | assistant_ans_relevancy_list.append(0)
84 | return
85 | # 将user 和 function 的部分组合
86 | content_msg = ""
87 | for chatround in chatrounds["chatrounds"]:
88 | if chatround["role"] == "user":
89 | content_msg += chatround["content"]
90 | elif chatround["role"] == "function":
91 | content_msg += chatround["content"]
92 | content_msg_counter = Counter(jieba.cut(remove_punctuation(content_msg)))
93 | ass_truth_counter = Counter(jieba.cut(remove_punctuation(ass_truth["content"])))
94 | ass_predict_counter = Counter(jieba.cut(remove_punctuation(ass_predict["content"])))
95 | relative_counter = content_msg_counter & ass_truth_counter
96 | len_relative = sum(relative_counter.values())
97 | predict_relative = ass_predict_counter & relative_counter
98 |
99 | if len_relative == 0:
100 | # 要是标准答案和问题相关词都无 直接给1
101 | assistant_ans_relevancy_list.append(1)
102 | else:
103 | # 交集与相关词的占比
104 | assistant_ans_relevancy_list.append(sum(predict_relative.values())/len_relative)
105 |
106 |
107 |
108 |
109 | def calc_llm_index(ass_predict, ass_truth, chatrounds):
110 | global function_call_sum, function_call_correct, function_call_fail, function_call_fail_functioncall, function_call_fail_name, function_call_fail_name_illusion, function_call_fail_param
111 |
112 | chatrounds_functionname_list = []
113 | for function_dict in chatrounds.get("functions", []):
114 | chatrounds_functionname_list.append(function_dict["name"])
115 |
116 | if "function_call" in ass_truth:
117 | function_call_sum += 1
118 | if "function_call" not in ass_predict:
119 | function_call_fail += 1
120 | function_call_fail_functioncall += 1
121 | elif ass_predict["function_call"]["name"] not in chatrounds_functionname_list:
122 | # 模型幻觉
123 | function_call_fail += 1
124 | function_call_fail_name += 1
125 | function_call_fail_name_illusion += 1
126 | else:
127 | function_call_name_label = False
128 | function_call_args_label = False
129 | if ass_predict["function_call"]["name"] == ass_truth["function_call"]["name"]:
130 | function_call_name_label = True
131 | if cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]):
132 | function_call_args_label = True
133 | else:
134 | function_call_fail_param += 1
135 | else:
136 | function_call_fail_name += 1
137 |
138 | if function_call_name_label and function_call_args_label:
139 | function_call_correct += 1
140 | else:
141 | function_call_fail += 1
142 | else:
143 | calc_relevancy(ass_predict, ass_truth, chatrounds)
144 |
145 |
146 | def print_result():
147 | # 打印指标结果
148 | print("=============统计数据=========================")
149 | print(f"function_call_sum: {function_call_sum}")
150 | print(f"function_call_correct: {function_call_correct}")
151 | print(f"function_call_fail: {function_call_fail}")
152 | print(f"function_call_fail_functioncall: {function_call_fail_functioncall}")
153 | print(f"function_call_fail_name: {function_call_fail_name}")
154 | print(f"function_call_fail_param: {function_call_fail_param}")
155 | print(f"function_call_fail_name_illusion: {function_call_fail_name_illusion}")
156 | print(f"assistant_ans_sum: {len(assistant_ans_relevancy_list)}")
157 | print(f"assistant_ans_relevancy: {np.mean(assistant_ans_relevancy_list)}")
158 | print("=============实验结果=========================")
159 | function_call_correct_rate = function_call_correct/function_call_sum
160 | function_call_fail_rate = function_call_fail/function_call_sum
161 | function_call_fail_functioncall_rate = function_call_fail_functioncall/function_call_fail if function_call_fail else 0
162 | function_call_fail_name_rate = function_call_fail_name/function_call_fail if function_call_fail else 0
163 | function_call_fail_param_rate = function_call_fail_param/function_call_fail if function_call_fail else 0
164 | function_call_fail_name_illusion_rate = function_call_fail_name_illusion/function_call_fail if function_call_fail else 0
165 | print(f"工具识别正确率fccr: {function_call_correct_rate}")
166 | print(f"工具识别失败率fcfr: {function_call_fail_rate}")
167 | print(f"工具调用识别失败占比fcffr: {function_call_fail_functioncall_rate}")
168 | print(f"工具名识别失败占比fcfnr: {function_call_fail_name_rate}")
169 | print(f"工具参数识别失败占比fcfpr: {function_call_fail_param_rate}")
170 | print(f"工具幻觉识别失败占比fcfnir: {function_call_fail_name_illusion_rate}")
171 | print(f"助手回复答案相关度aar: {np.mean(assistant_ans_relevancy_list)}")
172 | print("==============================================")
173 | # 保存数据
174 | with open("test_result_data.jsonl","w") as fw:
175 | for line in test_result_lines:
176 | print(line, file=fw)
177 |
178 |
179 | def test_process(test_lines, gpu_index):
180 | global test_result_lines
181 | for line in tqdm(test_lines, desc="Process%02d"%(gpu_index)):
182 | chat_dict = json.loads(line)
183 | test_dict = {}
184 | test_dict["functions"] = chat_dict["functions"]
185 | test_dict["chatrounds"] = []
186 | for chatround in chat_dict["chatrounds"]:
187 | if chatround["role"] == "assistant":
188 | ass_predict = get_assistant_ans.gen_answer(test_dict, gpu_index=gpu_index)
189 | save_dict = copy.deepcopy(test_dict)
190 | save_dict["chatrounds"].append(ass_predict)
191 | test_result_lines.append(json.dumps(save_dict, ensure_ascii=False))
192 | calc_llm_index(ass_predict, chatround, test_dict)
193 | test_dict["chatrounds"].append(chatround)
194 |
195 |
196 | def main():
197 | pool = ThreadPoolExecutor(max_workers=GPU_NUM)
198 |
199 | test_lines = []
200 | for test_ans_file in test_ans_file_list:
201 | print(test_ans_file)
202 | with open(test_ans_file, "r") as f:
203 | lines = f.readlines()
204 | test_lines += lines
205 |
206 | batch_num = len(test_lines)//GPU_NUM + int(len(test_lines)%GPU_NUM>0)
207 |
208 | obj_list = []
209 | for idx in range(GPU_NUM):
210 | batch_test_lines = test_lines[idx*batch_num:(idx+1)*batch_num]
211 | obj = pool.submit(test_process, batch_test_lines, gpu_index=idx)
212 | obj_list.append(obj)
213 |
214 | for future in as_completed(obj_list):
215 | # 暂时留在这里,但是其实没有返回数据
216 | data = future.result()
217 |
218 | print_result()
219 |
220 | if __name__ == "__main__":
221 | main()
222 |
--------------------------------------------------------------------------------
/resources/categroy_mapping.json:
--------------------------------------------------------------------------------
1 | {
2 | "Visualization.csv":[
3 | "visualization",
4 | "可视化",
5 | {
6 | "dev":5,
7 | "test":44
8 | },
9 | "Visualization.csv"
10 | ],
11 | "Logging.csv":[
12 | "logging",
13 | "日志",
14 | {
15 | "dev":5,
16 | "test":100
17 | },
18 | "Logging.csv"
19 | ],
20 | "Storage.csv":[
21 | "storage",
22 | "存储",
23 | {
24 | "dev":5,
25 | "test":36
26 | },
27 | "Storage.csv"
28 | ],
29 | "DataAcquisition.csv":[
30 | "data acquisition",
31 | "数据采集",
32 | {
33 | "dev":5,
34 | "test":36
35 | },
36 | "DataAcquisition.csv"
37 | ],
38 | "IntegrationTesting.csv":[
39 | "integration testing",
40 | "集成测试",
41 | {
42 | "dev":5,
43 | "test":31
44 | },
45 | "IntegrationTesting.csv"
46 | ],
47 | "UserAcceptanceTesting.csv":[
48 | "user acceptance testing",
49 | "用户验收测试",
50 | {
51 | "dev":5,
52 | "test":39
53 | },
54 | "UserAcceptanceTesting.csv"
55 | ],
56 | "SecurityTesting.csv":[
57 | "security testing",
58 | "安全测试",
59 | {
60 | "dev":5,
61 | "test":38
62 | },
63 | "SecurityTesting.csv"
64 | ],
65 | "UnitTesting.csv":[
66 | "unit testing",
67 | "单元测试",
68 | {
69 | "dev":5,
70 | "test":32
71 | },
72 | "UnitTesting.csv"
73 | ],
74 | "PerformanceTesting.csv":[
75 | "performance testing",
76 | "性能测试",
77 | {
78 | "dev":5,
79 | "test":36
80 | },
81 | "PerformanceTesting.csv"
82 | ],
83 | "SystemTesting.csv":[
84 | "system testing",
85 | "系统测试",
86 | {
87 | "dev":5,
88 | "test":52
89 | },
90 | "SystemTesting.csv"
91 | ],
92 | "ProgM.csv":[
93 | "programme management",
94 | "进度管理",
95 | {
96 | "dev":5,
97 | "test":21
98 | },
99 | "ProgM.csv"
100 | ],
101 | "REQM.csv":[
102 | "requirements management",
103 | "需求管理",
104 | {
105 | "dev":5,
106 | "test":24
107 | },
108 | "REQM.csv"
109 | ],
110 | "RiskMgmt.csv":[
111 | "risk management",
112 | "风险管理",
113 | {
114 | "dev":5,
115 | "test":21
116 | },
117 | "RiskMgmt.csv"
118 | ],
119 | "InfrastructureAsCode.csv":[
120 | "infrastructure as code",
121 | "基础设施即代码",
122 | {
123 | "dev":5,
124 | "test":34
125 | },
126 | "InfrastructureAsCode.csv"
127 | ],
128 | "Provisioning.csv":[
129 | "provisioning",
130 | "置备",
131 | {
132 | "dev":5,
133 | "test":19
134 | },
135 | "Provisioning.csv"
136 | ],
137 | "ConfigMgmt.csv":[
138 | "config management",
139 | "配置管理",
140 | {
141 | "dev":5,
142 | "test":100
143 | },
144 | "ConfigMgmt.csv"
145 | ],
146 | "Azure.csv":[
147 | "microsoft azure",
148 | "微软云服务",
149 | {
150 | "dev":5,
151 | "test":27
152 | },
153 | "Azure.csv"
154 | ],
155 | "GoogleCloud.csv":[
156 | "google cloud",
157 | "谷歌云服务",
158 | {
159 | "dev":5,
160 | "test":31
161 | },
162 | "GoogleCloud.csv"
163 | ],
164 | "AWS.csv":[
165 | "amazon web services",
166 | "亚马逊云服务",
167 | {
168 | "dev":5,
169 | "test":44
170 | },
171 | "AWS.csv"
172 | ],
173 | "LogDesign.csv":[
174 | "log design",
175 | "日志设计",
176 | {
177 | "dev":5,
178 | "test":33
179 | },
180 | "LogDesign.csv"
181 | ],
182 | "ServiceDesign.csv":[
183 | "service design",
184 | "服务设计",
185 | {
186 | "dev":5,
187 | "test":44
188 | },
189 | "ServiceDesign.csv"
190 | ],
191 | "CapabilityDesign.csv":[
192 | "capability design",
193 | "容量设计",
194 | {
195 | "dev":5,
196 | "test":33
197 | },
198 | "CapabilityDesign.csv"
199 | ],
200 | "CloudNativeDesign.csv":[
201 | "cloud native design",
202 | "云原生设计",
203 | {
204 | "dev":5,
205 | "test":44
206 | },
207 | "CloudNativeDesign.csv"
208 | ],
209 | "CacheDesign.csv":[
210 | "cache design",
211 | "缓存设计",
212 | {
213 | "dev":5,
214 | "test":28
215 | },
216 | "CacheDesign.csv"
217 | ],
218 | "DBDesign.csv":[
219 | "database design",
220 | "数据库设计",
221 | {
222 | "dev":5,
223 | "test":38
224 | },
225 | "DBDesign.csv"
226 | ],
227 | "ArtificialIntelligence.csv":[
228 | "artificial intelligence",
229 | "人工智能",
230 | {
231 | "dev":5,
232 | "test":45
233 | },
234 | "ArtificialIntelligence.csv"
235 | ],
236 | "ComputerBasics.csv":[
237 | "computer basics",
238 | "计算机基础",
239 | {
240 | "dev":5,
241 | "test":100
242 | },
243 | "ComputerBasics.csv"
244 | ],
245 | "DataBase.csv":[
246 | "database",
247 | "数据库",
248 | {
249 | "dev":5,
250 | "test":75
251 | },
252 | "DataBase.csv"
253 | ],
254 | "ComputerNetwork.csv":[
255 | "computer network",
256 | "计算机网络",
257 | {
258 | "dev":5,
259 | "test":88
260 | },
261 | "ComputerNetwork.csv"
262 | ],
263 | "OperatingSystem.csv":[
264 | "operating system",
265 | "操作系统",
266 | {
267 | "dev":5,
268 | "test":36
269 | },
270 | "OperatingSystem.csv"
271 | ],
272 | "Go.csv":[
273 | "go",
274 | "go语言",
275 | {
276 | "dev":5,
277 | "test":100
278 | },
279 | "Go.csv"
280 | ],
281 | "Java.csv":[
282 | "java",
283 | "java语言",
284 | {
285 | "dev":5,
286 | "test":100
287 | },
288 | "Java.csv"
289 | ],
290 | "C:C++.csv":[
291 | "c/c++",
292 | "c/c++语言",
293 | {
294 | "dev":5,
295 | "test":100
296 | },
297 | "C:C++.csv"
298 | ],
299 | "Python.csv":[
300 | "python",
301 | "python语言",
302 | {
303 | "dev":5,
304 | "test":73
305 | },
306 | "Python.csv"
307 | ],
308 | "BigData.csv":[
309 | "big data",
310 | "大数据",
311 | {
312 | "dev":5,
313 | "test":15
314 | },
315 | "BigData.csv"
316 | ],
317 | "Front-end.csv":[
318 | "front-end",
319 | "前端",
320 | {
321 | "dev":5,
322 | "test":100
323 | },
324 | "Front-end.csv"
325 | ],
326 | "MobileApp.csv":[
327 | "mobile app",
328 | "移动应用",
329 | {
330 | "dev":5,
331 | "test":100
332 | },
333 | "MobileApp.csv"
334 | ],
335 | "MachineLearning.csv":[
336 | "machine learning",
337 | "机器学习",
338 | {
339 | "dev":5,
340 | "test":69
341 | },
342 | "MachineLearning.csv"
343 | ],
344 | "Back-end.csv":[
345 | "back-end",
346 | "后端",
347 | {
348 | "dev":5,
349 | "test":100
350 | },
351 | "Back-end.csv"
352 | ],
353 | "ArtifactMgmt.csv":[
354 | "artifact management",
355 | "产出物管理",
356 | {
357 | "dev":5,
358 | "test":12
359 | },
360 | "ArtifactMgmt.csv"
361 | ],
362 | "CI:CD.csv":[
363 |         "ci/cd",
364 | "持续集成/持续部署",
365 | {
366 | "dev":5,
367 | "test":100
368 | },
369 | "CI:CD.csv"
370 | ],
371 | "Linux.csv":[
372 | "linux",
373 | "linux操作系统",
374 | {
375 | "dev":5,
376 | "test":100
377 | },
378 | "Linux.csv"
379 | ],
380 | "ContainerOrchestration.csv":[
381 | "container orchestration",
382 | "容器编排",
383 | {
384 | "dev":5,
385 | "test":100
386 | },
387 | "ContainerOrchestration.csv"
388 | ],
389 | "Virtualization.csv":[
390 | "virtualization",
391 | "虚拟化技术",
392 | {
393 | "dev":5,
394 | "test":34
395 | },
396 | "Virtualization.csv"
397 | ],
398 | "TimeSeriesAnomalyDetection.csv":[
399 | "time series anomaly detection",
400 | "时序异常检测",
401 | {
402 | "dev":5,
403 | "test":300
404 | },
405 | "TimeSeriesAnomalyDetection.csv"
406 | ],
407 | "TimeSeriesClassification.csv":[
408 | "time series classification",
409 | "时序分类",
410 | {
411 | "dev":5,
412 | "test":200
413 | },
414 | "TimeSeriesClassification.csv"
415 | ],
416 | "RootCauseAnalysis.csv":[
417 | "root cause analysis",
418 | "根因分析",
419 | {
420 | "dev":5,
421 | "test":250
422 | },
423 | "RootCauseAnalysis.csv"
424 | ],
425 | "LogParser.csv":[
426 | "log parser",
427 | "日志解析",
428 | {
429 | "dev":5,
430 | "test":350
431 | },
432 | "LogParser.csv"
433 | ],
434 | "VersionControl.csv":[
435 | "version control",
436 | "版本控制",
437 | {
438 | "dev":5,
439 | "test":100
440 | },
441 | "VersionControl.csv"
442 | ],
443 | "DBMgnt.csv":[
444 | "database management",
445 | "数据库管理",
446 | {
447 | "dev":5,
448 | "test":19
449 | },
450 | "DBMgnt.csv"
451 | ],
452 | "Dependency.csv":[
453 | "dependency",
454 | "依赖管理",
455 | {
456 | "dev":5,
457 | "test":44
458 | },
459 | "Dependency.csv"
460 | ],
461 | "Compile.csv":[
462 | "compile",
463 | "编译",
464 | {
465 | "dev":5,
466 | "test":31
467 | },
468 | "Compile.csv"
469 | ],
470 | "Package.csv":[
471 | "package",
472 | "包管理",
473 | {
474 | "dev":5,
475 | "test":24
476 | },
477 | "Package.csv"
478 | ]
479 | }
--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
1 | Copyright [2023] [Ant Group]
2 | Licensed under the Apache License, Version 2.0 (the "License");
3 | you may not use this file except in compliance with the License.
4 | You may obtain a copy of the License at
5 | http://www.apache.org/licenses/LICENSE-2.0
6 |
7 | Unless required by applicable law or agreed to in writing, software
8 | distributed under the License is distributed on an "AS IS" BASIS,
9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
10 | See the License for the specific language governing permissions and
11 | limitations under the License.
12 |
13 |
14 | Apache License
15 | Version 2.0, January 2004
16 | http://www.apache.org/licenses/
17 |
18 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
19 |
20 | 1. Definitions.
21 |
22 | "License" shall mean the terms and conditions for use, reproduction,
23 | and distribution as defined by Sections 1 through 9 of this document.
24 |
25 | "Licensor" shall mean the copyright owner or entity authorized by
26 | the copyright owner that is granting the License.
27 |
28 | "Legal Entity" shall mean the union of the acting entity and all
29 | other entities that control, are controlled by, or are under common
30 | control with that entity. For the purposes of this definition,
31 | "control" means (i) the power, direct or indirect, to cause the
32 | direction or management of such entity, whether by contract or
33 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
34 | outstanding shares, or (iii) beneficial ownership of such entity.
35 |
36 | "You" (or "Your") shall mean an individual or Legal Entity
37 | exercising permissions granted by this License.
38 |
39 | "Source" form shall mean the preferred form for making modifications,
40 | including but not limited to software source code, documentation
41 | source, and configuration files.
42 |
43 | "Object" form shall mean any form resulting from mechanical
44 | transformation or translation of a Source form, including but
45 | not limited to compiled object code, generated documentation,
46 | and conversions to other media types.
47 |
48 | "Work" shall mean the work of authorship, whether in Source or
49 | Object form, made available under the License, as indicated by a
50 | copyright notice that is included in or attached to the work
51 | (an example is provided in the Appendix below).
52 |
53 | "Derivative Works" shall mean any work, whether in Source or Object
54 | form, that is based on (or derived from) the Work and for which the
55 | editorial revisions, annotations, elaborations, or other modifications
56 | represent, as a whole, an original work of authorship. For the purposes
57 | of this License, Derivative Works shall not include works that remain
58 | separable from, or merely link (or bind by name) to the interfaces of,
59 | the Work and Derivative Works thereof.
60 |
61 | "Contribution" shall mean any work of authorship, including
62 | the original version of the Work and any modifications or additions
63 | to that Work or Derivative Works thereof, that is intentionally
64 | submitted to Licensor for inclusion in the Work by the copyright owner
65 | or by an individual or Legal Entity authorized to submit on behalf of
66 | the copyright owner. For the purposes of this definition, "submitted"
67 | means any form of electronic, verbal, or written communication sent
68 | to the Licensor or its representatives, including but not limited to
69 | communication on electronic mailing lists, source code control systems,
70 | and issue tracking systems that are managed by, or on behalf of, the
71 | Licensor for the purpose of discussing and improving the Work, but
72 | excluding communication that is conspicuously marked or otherwise
73 | designated in writing by the copyright owner as "Not a Contribution."
74 |
75 | "Contributor" shall mean Licensor and any individual or Legal Entity
76 | on behalf of whom a Contribution has been received by Licensor and
77 | subsequently incorporated within the Work.
78 |
79 | 2. Grant of Copyright License. Subject to the terms and conditions of
80 | this License, each Contributor hereby grants to You a perpetual,
81 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
82 | copyright license to reproduce, prepare Derivative Works of,
83 | publicly display, publicly perform, sublicense, and distribute the
84 | Work and such Derivative Works in Source or Object form.
85 |
86 | 3. Grant of Patent License. Subject to the terms and conditions of
87 | this License, each Contributor hereby grants to You a perpetual,
88 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
89 | (except as stated in this section) patent license to make, have made,
90 | use, offer to sell, sell, import, and otherwise transfer the Work,
91 | where such license applies only to those patent claims licensable
92 | by such Contributor that are necessarily infringed by their
93 | Contribution(s) alone or by combination of their Contribution(s)
94 | with the Work to which such Contribution(s) was submitted. If You
95 | institute patent litigation against any entity (including a
96 | cross-claim or counterclaim in a lawsuit) alleging that the Work
97 | or a Contribution incorporated within the Work constitutes direct
98 | or contributory patent infringement, then any patent licenses
99 | granted to You under this License for that Work shall terminate
100 | as of the date such litigation is filed.
101 |
102 | 4. Redistribution. You may reproduce and distribute copies of the
103 | Work or Derivative Works thereof in any medium, with or without
104 | modifications, and in Source or Object form, provided that You
105 | meet the following conditions:
106 |
107 | (a) You must give any other recipients of the Work or
108 | Derivative Works a copy of this License; and
109 |
110 | (b) You must cause any modified files to carry prominent notices
111 | stating that You changed the files; and
112 |
113 | (c) You must retain, in the Source form of any Derivative Works
114 | that You distribute, all copyright, patent, trademark, and
115 | attribution notices from the Source form of the Work,
116 | excluding those notices that do not pertain to any part of
117 | the Derivative Works; and
118 |
119 | (d) If the Work includes a "NOTICE" text file as part of its
120 | distribution, then any Derivative Works that You distribute must
121 | include a readable copy of the attribution notices contained
122 | within such NOTICE file, excluding those notices that do not
123 | pertain to any part of the Derivative Works, in at least one
124 | of the following places: within a NOTICE text file distributed
125 | as part of the Derivative Works; within the Source form or
126 | documentation, if provided along with the Derivative Works; or,
127 | within a display generated by the Derivative Works, if and
128 | wherever such third-party notices normally appear. The contents
129 | of the NOTICE file are for informational purposes only and
130 | do not modify the License. You may add Your own attribution
131 | notices within Derivative Works that You distribute, alongside
132 | or as an addendum to the NOTICE text from the Work, provided
133 | that such additional attribution notices cannot be construed
134 | as modifying the License.
135 |
136 | You may add Your own copyright statement to Your modifications and
137 | may provide additional or different license terms and conditions
138 | for use, reproduction, or distribution of Your modifications, or
139 | for any such Derivative Works as a whole, provided Your use,
140 | reproduction, and distribution of the Work otherwise complies with
141 | the conditions stated in this License.
142 |
143 | 5. Submission of Contributions. Unless You explicitly state otherwise,
144 | any Contribution intentionally submitted for inclusion in the Work
145 | by You to the Licensor shall be under the terms and conditions of
146 | this License, without any additional terms or conditions.
147 | Notwithstanding the above, nothing herein shall supersede or modify
148 | the terms of any separate license agreement you may have executed
149 | with Licensor regarding such Contributions.
150 |
151 | 6. Trademarks. This License does not grant permission to use the trade
152 | names, trademarks, service marks, or product names of the Licensor,
153 | except as required for reasonable and customary use in describing the
154 | origin of the Work and reproducing the content of the NOTICE file.
155 |
156 | 7. Disclaimer of Warranty. Unless required by applicable law or
157 | agreed to in writing, Licensor provides the Work (and each
158 | Contributor provides its Contributions) on an "AS IS" BASIS,
159 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
160 | implied, including, without limitation, any warranties or conditions
161 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
162 | PARTICULAR PURPOSE. You are solely responsible for determining the
163 | appropriateness of using or redistributing the Work and assume any
164 | risks associated with Your exercise of permissions under this License.
165 |
166 | 8. Limitation of Liability. In no event and under no legal theory,
167 | whether in tort (including negligence), contract, or otherwise,
168 | unless required by applicable law (such as deliberate and grossly
169 | negligent acts) or agreed to in writing, shall any Contributor be
170 | liable to You for damages, including any direct, indirect, special,
171 | incidental, or consequential damages of any character arising as a
172 | result of this License or out of the use or inability to use the
173 | Work (including but not limited to damages for loss of goodwill,
174 | work stoppage, computer failure or malfunction, or any and all
175 | other commercial damages or losses), even if such Contributor
176 | has been advised of the possibility of such damages.
177 |
178 | 9. Accepting Warranty or Additional Liability. While redistributing
179 | the Work or Derivative Works thereof, You may choose to offer,
180 | and charge a fee for, acceptance of support, warranty, indemnity,
181 | or other liability obligations and/or rights consistent with this
182 | License. However, in accepting such obligations, You may act only
183 | on Your own behalf and on Your sole responsibility, not on behalf
184 | of any other Contributor, and only if You agree to indemnify,
185 | defend, and hold each Contributor harmless for any liability
186 | incurred by, or claims asserted against, such Contributor by reason
187 | of your accepting any such warranty or additional liability.
188 |
189 | END OF TERMS AND CONDITIONS
190 |
191 | APPENDIX: How to apply the Apache License to your work.
192 |
193 | To apply the Apache License to your work, attach the following
194 | boilerplate notice, with the fields enclosed by brackets "[]"
195 | replaced with your own identifying information. (Don't include
196 | the brackets!) The text should be enclosed in the appropriate
197 | comment syntax for the file format. We also recommend that a
198 | file or class name and description of purpose be included on the
199 | same "printed page" as the copyright notice for easier
200 | identification within third-party archives.
201 |
202 | Copyright [yyyy] [name of copyright owner]
203 |
204 | Licensed under the Apache License, Version 2.0 (the "License");
205 | you may not use this file except in compliance with the License.
206 | You may obtain a copy of the License at
207 |
208 | http://www.apache.org/licenses/LICENSE-2.0
209 |
210 | Unless required by applicable law or agreed to in writing, software
211 | distributed under the License is distributed on an "AS IS" BASIS,
212 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
213 | See the License for the specific language governing permissions and
214 | limitations under the License.
--------------------------------------------------------------------------------
/src/evals/func_call_evalution.py:
--------------------------------------------------------------------------------
1 | from src.models.base_model import ToolModel
2 | from src.models.generate_configs import GenerateConfigs
3 | from src.datasets import FuncCallDataset
4 | from src.utils.jsonl_utils import read_jsonl_file
5 | from .base_evalution import ToolEvalution
6 |
7 | from collections import Counter
8 | import jieba, re, json, os
9 | import numpy as np
10 | from loguru import logger
11 |
12 |
13 | def remove_punctuation(text):
14 | pattern = r'[^\w\s]'
15 | return re.sub(pattern, '', text)
16 |
17 |
18 | def cmp_arguments(args_str1, args_str2):
19 | rtn_flag = False
20 | try:
21 | args_dict1 = json.loads(args_str1)
22 | args_dict2 = json.loads(args_str2)
23 | # 比较两个字典是否一致
24 | if args_dict1 == args_dict2:
25 | rtn_flag = True
26 | except Exception as e:
27 | print("json.loads error: ", e)
28 | return rtn_flag
29 | return rtn_flag
30 |
31 |
32 | class FuncCallEvalution(ToolEvalution):
33 | def __init__(
34 | self,
35 | model: ToolModel,
36 | dataset: FuncCallDataset,
37 | base_prompt: str = '',
38 | template: str = 'default',
39 | generate_configs: GenerateConfigs = None,
40 | ):
41 | self.model = model
42 | self.dataset = dataset
43 | self.base_prompt = base_prompt
44 | self.template = template
45 | self.generate_configs = generate_configs
46 |
47 | if not isinstance(model, ToolModel):
48 |             raise TypeError(f"model must be a ToolModel instance, got {type(model)}")
49 |
50 | def calc(self):
51 | '''开始计算结果'''
52 | self.predicts = []
53 | func_call_train_datas = self.create_prompts(self.dataset)
54 |
55 | for idx, data in enumerate(func_call_train_datas):
56 | print(f"总共 {len(func_call_train_datas)} 条prompt,当前运行到第 {idx} 条prompt", end="\r")
57 | prompt = data["instruction"]
58 | history = data["history"]
59 | answer = data["output"]
60 | functions = data["functions"]
61 | predict = self.generate(prompt, self.template, self.generate_configs, history)
62 |
63 | if "arguments" in answer:
64 | answer = {"content": answer["content"], "function_call": {"name": answer["name"], "arguments": answer["arguments"]}}
65 |
66 | if "#function" in predict:
67 | try:
68 | predict_param = json.loads(predict.split("#function")[-1])
69 | if "arguments" in predict_param:
70 | predict_param = {
71 | "content": predict_param["content"],
72 | "function_call": {"name": predict_param["name"], "arguments": predict_param["arguments"]}
73 | }
74 | predict = {**predict_param, **{"role": "assistant"}}
75 | except Exception as e:
76 |                     logger.error(f"failed to parse function call from predict: {e}")
77 |                     predict = {"role": "assistant", "content": predict}
78 | else:
79 | predict = {
80 | "role": "assistant",
81 | "content": predict
82 | }
83 |
84 | self.predicts.append({
85 | "prompt": prompt, "history": history,
86 | "predict": predict, "answer": answer,
87 | "functions": functions
88 | })
89 |
90 | metric = self.eval_metric(self.predicts)
91 | return metric
92 |
93 | def calc_from_predicts(self, file_path):
94 | if os.path.exists(file_path):
95 | self.predicts = read_jsonl_file(file_path)
96 | metric = self.eval_metric(self.predicts)
97 | return metric
98 | else:
99 | return self.calc()
100 |
101 | def create_prompts(self, func_call_datas):
102 | system_content = '''CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。
103 | 你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。'''
104 | function_format = '''You are ToolGPT, you have access to the following APIs:\n{tools}'''
105 |
106 | func_call_train_datas = []
107 | history_error_cnt = 0
108 | funccall_error_cnt = 0
109 |
110 | for data in func_call_datas:
111 | tools = data["functions"]
112 | chatrounds = data["chatrounds"]
113 |
114 | function_content = ""
115 | if len(tools) > 0:
116 | function_content = function_format.format(tools=json.dumps(tools, ensure_ascii=False, sort_keys=True))
117 |
118 | history = []
119 | for i in chatrounds:
120 | if i["role"]=="system":
121 | continue
122 |
123 | if i["role"]=="user":
124 | history.append(("user", i["content"]))
125 |
126 | if i["role"] == "assistant":
127 | if "function_call" in i:
128 | if not isinstance(i["function_call"], dict):
129 | funccall_error_cnt+=1
130 | continue
131 | content = "#function" + json.dumps({**{"content": i["content"]}, **i["function_call"]}, ensure_ascii=False)
132 | else:
133 | content = i["content"]
134 | history.append(("assistant", content))
135 |
136 |
137 | if i["role"] == "function":
138 | content = json.dumps({**{"content": i["content"]}, **{"name": i["name"]}}, ensure_ascii=False)
139 | history.append(("user", content))
140 |
141 |
142 | history = [i[1] for i in history]
143 | history[0] = "\n".join([system_content,function_content, history[0]])
144 |
145 | for his_idx in range(0, len(history), 2):
146 | output = history[his_idx+1]
147 |
148 | if "#function" in output:
149 | output = output.split("#function")[-1]
150 |
151 | try:
152 | output = json.loads(output)
153 | except:
154 | output = {"content": output}
155 |
156 |
157 | func_call_train_datas.append(
158 | {
159 | "instruction": history[his_idx],
160 | "input": "",
161 | "output": output,
162 | "history": [history[:his_idx+2][i:i+2] for i in range(0, len(history[:his_idx]), 2)],
163 | "functions": tools
164 | },
165 | )
166 | return func_call_train_datas
167 |
168 | def generate(self, prompt, template, generate_configs, history=None):
169 | '''返回结果'''
170 | return self.model.generate(prompt, template, generate_configs, history)
171 |
172 | def eval_metric(self, datas):
173 |         '''计算 function call 正确率及回复相关度等指标并打印结果'''
174 | # function call 回复测试总数
175 | self.function_call_sum = 0
176 | # function call 回复正确数
177 | self.function_call_correct = 0
178 | # function call 回复失败数
179 | self.function_call_fail = 0
180 |         # function call 回复失败中,本应该调用工具但是模型没有调用,即无工具调用识别错误数
181 | self.function_call_fail_functioncall = 0
182 | # function call 回复失败数中,因为函数名不对导致的失败数
183 | self.function_call_fail_name = 0
184 | # function call 回复失败数中,因为参数不对导致的失败数
185 | self.function_call_fail_param = 0
186 | # function call 回复失败中 函数名幻觉的失败数
187 | self.function_call_fail_name_illusion = 0
188 |
189 | # assistant ans 回复相关度列表
190 | self.assistant_ans_relevancy_list = []
191 |
192 | for data in datas:
193 | ass_predict = data["predict"]
194 | ass_truth = data["answer"]
195 | functions = data["functions"]
196 | history = data["history"]
197 | # 将user 和 function 的部分组合
198 | content_msg = ""
199 | for user_msg, assistant_msg in history:
200 | content_msg += user_msg
201 |
202 | # if "#function" in ass_truth:
203 | if "function_call" in ass_truth:
204 | self.calc_func_params(ass_predict, ass_truth, functions)
205 | else:
206 | self.calc_relevancy(ass_predict, ass_truth, content_msg)
207 |
208 | self.print_result()
209 | return {
210 | "function_call_correct_rate": self.function_call_correct_rate,
211 | "function_call_fail_rate": self.function_call_fail_rate,
212 | "function_call_fail_functioncall_rate": self.function_call_fail_functioncall_rate,
213 | "function_call_fail_name_rate": self.function_call_fail_name_rate,
214 | "function_call_fail_param_rate": self.function_call_fail_param_rate,
215 | "function_call_fail_name_illusion_rate": self.function_call_fail_name_illusion_rate
216 | }
217 |
218 | def calc_func_params(self, ass_predict, ass_truth, functions):
219 | self.function_call_sum += 1
220 |
221 | function_names = [i["name"] for i in functions]
222 | # ass_predict_param = json.loads(ass_predict.split("#function")[-1])
223 | # ass_truth_param = json.loads(ass_truth.split("#function")[-1])
224 |
225 | if "function_call" not in ass_predict:
226 | self.function_call_fail += 1
227 | self.function_call_fail_functioncall += 1
228 | elif ass_predict["function_call"]["name"] not in function_names:
229 | # 模型幻觉
230 | self.function_call_fail += 1
231 | self.function_call_fail_name += 1
232 | self.function_call_fail_name_illusion += 1
233 | else:
234 | function_call_name_label = False
235 | function_call_args_label = False
236 | if ass_predict["function_call"]["name"] == ass_truth["function_call"]["name"]:
237 | function_call_name_label = True
238 | if cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]):
239 | function_call_args_label = True
240 | else:
241 | self.function_call_fail_param += 1
242 | else:
243 | self.function_call_fail_name += 1
244 | # # 是否可能存在名字错误参数正确的情况?
245 | # if self.cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]):
246 | # function_call_args_label = True
247 | # else:
248 | # self.function_call_fail_param += 1
249 |
250 | if function_call_name_label and function_call_args_label:
251 | self.function_call_correct += 1
252 | else:
253 | self.function_call_fail += 1
254 |
255 | def calc_relevancy(self, ass_predict, ass_truth, content_msg):
256 | if "function_call" in ass_predict:
257 | self.assistant_ans_relevancy_list.append(0)
258 | return
259 |
260 | content_msg_counter = Counter(jieba.cut(remove_punctuation(content_msg)))
261 | ass_truth_counter = Counter(jieba.cut(remove_punctuation(ass_truth["content"])))
262 | ass_predict_counter = Counter(jieba.cut(remove_punctuation(ass_predict["content"])))
263 | relative_counter = content_msg_counter & ass_truth_counter
264 | len_relative = sum(relative_counter.values())
265 | predict_relative = ass_predict_counter & relative_counter
266 |
267 | if len_relative == 0:
268 | # 要是标准答案和问题相关词都无 直接给1
269 | self.assistant_ans_relevancy_list.append(1)
270 | else:
271 | # 交集与相关词的占比
272 | self.assistant_ans_relevancy_list.append(sum(predict_relative.values())/len_relative)
273 |
274 | def print_result(self, ):
275 | # 打印指标结果
276 | print("=============统计数据=========================")
277 | print(f"function_call_sum: {self.function_call_sum}")
278 | print(f"function_call_correct: {self.function_call_correct}")
279 | print(f"function_call_fail: {self.function_call_fail}")
280 | print(f"function_call_fail_name: {self.function_call_fail_name}")
281 | print(f"function_call_fail_param: {self.function_call_fail_param}")
282 | print(f"function_call_fail_name_illusion: {self.function_call_fail_name_illusion}")
283 | print(f"assistant_ans_sum: {len(self.assistant_ans_relevancy_list)}")
284 | print(f"assistant_ans_relevancy: {np.mean(self.assistant_ans_relevancy_list)}")
285 | print("=============实验结果=========================")
286 | self.function_call_correct_rate = self.function_call_correct/self.function_call_sum
287 | self.function_call_fail_rate = self.function_call_fail/self.function_call_sum
288 | self.function_call_fail_functioncall_rate = self.function_call_fail_functioncall/self.function_call_sum
289 | self.function_call_fail_name_rate = self.function_call_fail_name/self.function_call_sum
290 | self.function_call_fail_param_rate = self.function_call_fail_param/self.function_call_sum
291 | self.function_call_fail_name_illusion_rate = self.function_call_fail_name_illusion/self.function_call_sum
292 |
293 | # self.function_call_fail_functioncall_rate = self.function_call_fail_functioncall/self.function_call_fail if self.function_call_fail else 0
294 | # self.function_call_fail_name_rate = self.function_call_fail_name/self.function_call_fail if self.function_call_fail else 0
295 | # self.function_call_fail_param_rate = self.function_call_fail_param/self.function_call_fail if self.function_call_fail else 0
296 | # self.function_call_fail_name_illusion_rate = self.function_call_fail_name_illusion/self.function_call_fail if self.function_call_fail else 0
297 | print(f"工具识别正确率fccr: {self.function_call_correct_rate}")
298 | print(f"工具识别失败率fcfr: {self.function_call_fail_rate}")
299 | print(f"工具调用识别失败占比fcffr: {self.function_call_fail_functioncall_rate}")
300 | print(f"工具名识别失败占比fcfnr: {self.function_call_fail_name_rate}")
301 | print(f"工具参数识别失败占比fcfpr: {self.function_call_fail_param_rate}")
302 | print(f"工具幻觉识别失败占比fcfnir: {self.function_call_fail_name_illusion_rate}")
303 | print(f"助手回复答案相关度aar: {np.mean(self.assistant_ans_relevancy_list)}")
304 | print("==============================================")
305 |
--------------------------------------------------------------------------------
/README_zh.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 🤗 Hugging Face • ⏬ 数据 • 📖 教程
5 |
6 | English | 中文
7 |
8 |
9 | DevOps-Eval是一个专门为DevOps领域大模型设计的综合评估数据集。我们希望DevOps-Eval能够帮助开发者,尤其是DevOps领域的开发者,追踪进展并分析他们拥有的DevOps大模型的优势和不足之处。
10 |
11 | 📚 该仓库包含与DevOps和AIOps相关的问题和练习,还添加了ToolLearning相关的样本。
12 |
13 | 💥 目前有 **7486** 个多项选择题,根据DevOps的通用流程将其归纳为8个模块,如[下图](images/data_info.png)所示。
14 |
15 | 🔥 AIOps样本总计 **2840** 个,覆盖的场景包括**日志解析**、**时序异常检测**、**时序分类**、**时序预测**和**根因分析**。
16 |
17 | 🔧 ToolLearning样本 **1509** 个,涵盖59个领域,总计 239 种工具类别。
18 |
19 | 
20 |
21 |
22 | ## 🔔 更新
23 | * **[2023.12.27]** 新增1509个ToolLearning样本,发布了相应的评测排行榜
24 | * **[2023.11.27]** 增加运维场景样本487例、时序预测样本640例;同步更新评测排行榜
25 | * **[2023.10.30]** 增加针对AIOps场景的评测排行榜
26 | * **[2023.10.25]** 增加AIOps样本,包含日志解析、时序异常检测、时序分类和根因分析
27 | * **[2023.10.18]** DevOps-Eval发布大模型评测排行榜
28 |
29 |
30 | ## 📜 目录
31 |
32 | - [🏆 排行榜](#-排行榜)
33 | - [👀 DevOps](#-devops)
34 | - [🔥 AIOps](#-aiops)
35 | - [🔧 ToolLearning](#-toollearning)
36 | - [⏬ 数据](#-数据)
37 | - [👀 说明](#-说明)
38 | - [🔥 AIOps样本示例](#-AIOps样本示例)
39 | - [🔧 ToolLearning样本示例](#-toollearning样本示例)
40 | - [🚀 如何进行测试](#-如何进行测试)
41 | - [🧭 TODO](#-todo)
42 | - [🏁 Licenses](#-licenses)
43 | - [😃 引用](#-引用)
44 | - [🗂 Miscellaneous](#-miscellaneous)
45 | - [✨ Star History](#-star-history)
46 | - [🤝 Friendship Links](#-friendship-links)
47 |
48 | ## 🏆 排行榜
49 | 以下是我们获得的初版评测结果,包括多个开源模型的zero-shot和five-shot准确率。我们注意到,对于大多数指令模型来说,five-shot的准确率要优于zero-shot。
50 |
51 | ### 👀 DevOps
52 | #### Zero Shot
53 |
54 | | **模型** | plan | code | build | test | release | deploy | operate | monitor | **平均分** |
55 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:---------:|
56 | | DevOpsPal-14B-Chat | 60.61 | 78.35 | 84.86 | 84.65 | 87.26 | 82.75 | 69.89 | 79.17 | 78.23 |
57 | | DevOpsPal-14B-Base | 54.55 | 77.82 | 83.49 | 85.96 | 86.32 | 81.96 | 71.18 | 82.41 | 78.23 |
58 | | Qwen-14B-Chat | 60.61 | 75.4 | 85.32 | 84.21 | 89.62 | 82.75 | 69.57 | 80.56 | 77.18 |
59 | | Qwen-14B-Base | 57.58 | 73.81 | 84.4 | 85.53 | 86.32 | 81.18 | 70.05 | 80.09 | 76.19 |
60 | | Baichuan2-13B-Base | 60.61 | 69.42 | 79.82 | 79.82 | 82.55 | 81.18 | 70.37 | 83.8 | 73.73 |
61 | | Baichuan2-13B-Chat | 60.61 | 68.43 | 77.98 | 80.7 | 81.6 | 83.53 | 67.63 | 84.72 | 72.9 |
62 | | DevOpsPal-7B-Chat | 54.55 | 69.11 | 83.94 | 82.02 | 76.89 | 80 | 64.73 | 77.78 | 71.92 |
63 | | DevOpsPal-7B-Base | 54.55 | 68.96 | 82.11 | 78.95 | 80.66 | 76.47 | 65.54 | 78.7 | 71.69 |
64 | | Qwen-7B-Base | 53.03 | 68.13 | 78.9 | 75.44 | 80.19 | 80 | 65.06 | 80.09 | 71.09 |
65 | | Qwen-7B-Chat | 57.58 | 66.01 | 80.28 | 79.82 | 76.89 | 77.65 | 62.64 | 79.17 | 69.75 |
66 | | Baichuan2-7B-Chat | 54.55 | 63.66 | 77.98 | 76.32 | 71.7 | 73.33 | 59.42 | 79.63 | 66.97 |
67 | | Internlm-7B-Chat | 60.61 | 62.15 | 77.06 | 76.32 | 66.98 | 74.51 | 60.39 | 78.24 | 66.27 |
68 | | Baichuan2-7B-Base | 56.06 | 62.45 | 75.69 | 70.61 | 74.06 | 69.8 | 61.67 | 75.93 | 66.21 |
69 | | Internlm-7B-Base | 54.55 | 58.29 | 79.36 | 78.95 | 77.83 | 70.59 | 65.86 | 75.93 | 65.99 |
70 |
71 |
72 | #### Five Shot
73 |
74 | | **模型** | plan | code | build | test | release | deploy | operate | monitor | **平均分** |
75 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:---------:|
76 | | DevOpsPal-14B-Chat | 63.64 | 79.49 | 81.65 | 85.96 | 86.79 | 86.67 | 72.95 | 81.48 | 79.69 |
77 | | DevOpsPal-14B-Base | 62.12 | 80.55 | 82.57 | 85.53 | 85.85 | 84.71 | 71.98 | 80.09 | 79.63 |
78 | | Qwen-14B-Chat | 65.15 | 76 | 82.57 | 85.53 | 84.91 | 84.31 | 70.85 | 81.48 | 77.81 |
79 | | Qwen-14B-Base | 66.67 | 76.15 | 84.4 | 85.53 | 86.32 | 80.39 | 72.46 | 80.56 | 77.56 |
80 | | Baichuan2-13B-Base | 63.64 | 71.39 | 80.73 | 82.46 | 81.13 | 84.31 | 73.75 | 85.19 | 75.8 |
81 | | Qwen-7B-Base | 75.76 | 72.52 | 78.9 | 81.14 | 83.96 | 81.18 | 70.37 | 81.94 | 75.36 |
82 | | Baichuan2-13B-Chat | 62.12 | 69.95 | 76.61 | 84.21 | 83.49 | 79.61 | 71.98 | 80.56 | 74.12 |
83 | | DevOpsPal-7B-Chat | 66.67 | 69.95 | 83.94 | 81.14 | 80.19 | 82.75 | 68.6 | 76.85 | 73.61 |
84 | | DevOpsPal-7B-Base | 69.7 | 69.49 | 82.11 | 81.14 | 82.55 | 82.35 | 67.15 | 79.17 | 73.35 |
85 | | Qwen-7B-Chat | 65.15 | 66.54 | 82.57 | 81.58 | 81.6 | 81.18 | 65.38 | 81.02 | 71.69 |
86 | | Baichuan2-7B-Base | 60.61 | 67.22 | 76.61 | 75 | 77.83 | 78.43 | 67.31 | 79.63 | 70.8 |
87 | | Internlm-7B-Chat | 60.61 | 63.06 | 79.82 | 80.26 | 67.92 | 75.69 | 60.06 | 77.31 | 69.21 |
88 | | Baichuan2-7B-Chat | 60.61 | 64.95 | 81.19 | 75.88 | 71.23 | 75.69 | 64.9 | 79.17 | 69.05 |
89 | | Internlm-7B-Base | 62.12 | 65.25 | 77.52 | 80.7 | 74.06 | 78.82 | 63.45 | 75.46 | 67.17 |
90 |
91 |
92 | ### 🔥 AIOps
93 |
94 |
95 |
96 | #### Zero Shot
97 | | **模型** | 日志解析 | 根因分析 | 时序异常检测 | 时序分类 | 时序预测 | **平均分** |
98 | |:-------------------:|:-----:|:----:|:------:|:----:|:-----:|:-------:|
99 | | Qwen-14B-Base | 66.29 | 58.8 | 25.33 | 43.5 | 62.5 | 52.25 |
100 | | DevOpsPal-14B—Base | 63.14 | 53.6 | 23.33 | 43.5 | 64.06 | 50.49 |
101 | | Qwen-14B-Chat | 64.57 | 51.6 | 22.67 | 36 | 62.5 | 48.94 |
102 | | DevOpsPal-14B—Chat | 60 | 56 | 24 | 43 | 57.81 | 48.8 |
103 | | Qwen-7B-Base | 50 | 39.2 | 22.67 | 54 | 43.75 | 41.48 |
104 | | DevOpsPal-7B—Chat | 56.57 | 30.4 | 25.33 | 45 | 44.06 | 40.92 |
105 | | Baichuan2-13B-Chat | 64 | 18 | 21.33 | 37.5 | 46.88 | 39.3 |
106 | | Qwen-7B-Chat | 57.43 | 38.8 | 22.33 | 39.5 | 25.31 | 36.97 |
107 | | Internlm-7B—Chat | 58.86 | 8.8 | 22.33 | 28.5 | 51.25 | 36.34 |
108 | | Baichuan2-7B-Chat | 60.86 | 10 | 28 | 34.5 | 39.06 | 36.34 |
109 | | Baichuan2-7B-Base | 53.43 | 12.8 | 27.67 | 36.5 | 40.31 | 35.49 |
110 | | Baichuan2-13B-Base | 54 | 12.4 | 23 | 34.5 | 42.81 | 34.86 |
111 | | DevOpsPal-7B—Base | 46.57 | 20.8 | 25 | 34 | 38.75 | 33.94 |
112 | | Internlm-7B—Base | 48.57 | 18.8 | 23.33 | 37.5 | 33.75 | 33.1 |
113 |
114 | #### One Shot
115 | | **模型** | 日志解析 | 根因分析 | 时序异常检测 | 时序分类 | 时序预测 | **平均分** |
116 | |:-------------------:|:-----:|:----:|:------:|:----:|:-----:|:-------:|
117 | | DevOpsPal-14B—Chat | 66.29 | 80.8 | 23.33 | 44.5 | 56.25 | 54.44 |
118 | | DevOpsPal-14B—Base | 60 | 74 | 25.33 | 43.5 | 52.5 | 51.13 |
119 | | Qwen-14B-Base | 64.29 | 74.4 | 28 | 48.5 | 40.31 | 50.77 |
120 | | Qwen-7B-Base | 56 | 60.8 | 27.67 | 44 | 57.19 | 49.44 |
121 | | Qwen-14B-Chat | 49.71 | 65.6 | 28.67 | 48 | 42.19 | 46.13 |
122 | | Baichuan2-13B-Base | 56 | 43.2 | 24.33 | 41 | 46.88 | 42.89 |
123 | | Baichuan2-7B-Chat | 58.57 | 31.6 | 27 | 31.5 | 51.88 | 41.83 |
124 | | DevOpsPal-7B—Base | 52.86 | 44.4 | 28 | 44.5 | 36.25 | 41.2 |
125 | | Baichuan2-7B-Base | 48.29 | 40.4 | 27 | 42 | 40.94 | 39.86 |
126 | | Qwen-7B-Chat | 54.57 | 52 | 29.67 | 26.5 | 27.19 | 38.73 |
127 | | Baichuan2-13B-Chat | 57.43 | 44.4 | 25 | 25.5 | 30.63 | 37.75 |
128 | | DevOpsPal-7B—Chat | 56.57 | 27.2 | 25.33 | 41.5 | 33.44 | 37.46 |
129 | | Internlm-7B—Chat | 62.57 | 12.8 | 22.33 | 21 | 50.31 | 36.69 |
130 | | Internlm-7B—Base | 48 | 33.2 | 29 | 35 | 31.56 | 35.85 |
131 |
132 |
133 |
134 | ### 🔧 ToolLearning
135 |
136 | 指标说明:fccr 为工具识别正确率,fcffr、fcfnr、fcfpr、fcfnir 分别为工具调用、工具名、工具参数、工具名幻觉的识别失败占比(表中列出的是 1 - 失败占比),aar 为助手回复答案相关度。
137 | | **FuncCall-Filler** | dataset_name | fccr | 1-fcffr | 1-fcfnr | 1-fcfpr | 1-fcfnir | aar |
138 | |:-------------------:| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
139 | | Qwen-14b-chat | luban | 61 | 100 | 97.68 | 63.32 | 100 | 69.46 |
140 | | Qwen-7b-chat | luban | 50.58 | 100 | 98.07 | 52.51 | 100 | 63.59 |
141 | | Baichuan-7b-chat | luban | 60.23 | 100 | 97.3 | 62.93 | 99.61 | 61.12 |
142 | | Internlm-chat-7b | luban | 47.88 | 100 | 96.14 | 51.74 | 99.61 | 61.85 |
143 | | Qwen-14b-chat | fc_data | 98.37 | 99.73 | 99.86 | 98.78 | 100 | 81.58 |
144 | | Qwen-7b-chat | fc_data | 99.46 | 99.86 | 100 | 99.59 | 100 | 79.25 |
145 | | Baichuan-7b-chat | fc_data | 97.96 | 99.32 | 100 | 98.64 | 100 | 89.53 |
146 | | Internlm-chat-7b | fc_data | 94.29 | 95.78 | 100 | 98.5 | 100 | 88.19 |
147 | | CodeLLaMa-7b | fc_data | 98.78 | 99.73 | 100 | 99.05 | 100 | 94.7 |
148 | | CodeLLaMa-7b-16 | fc_data | 98.1 | 99.87 | 99.73 | 98.5 | 100 | 93.14 |
149 | | CodeFuse-7b-4k | fc_data | 98.91 | 99.87 | 99.87 | 99.18 | 100 | 89.5 |
150 |
151 |
152 |
153 |
154 | ## ⏬ 数据
155 | #### 下载
156 | * 方法一:下载zip压缩文件(你也可以直接用浏览器打开下面的链接):
157 | ```
158 | wget https://huggingface.co/datasets/codefuse-admin/devopseval-exam/resolve/main/devopseval-exam.zip
159 | ```
160 | 然后可以使用 pandas加载数据:
161 |
162 | ```python
163 | import os
164 | import pandas as pd
165 |
166 | File_Dir="devopseval-exam"
167 | test_df=pd.read_csv(os.path.join(File_Dir,"test","UnitTesting.csv"))
168 | ```
169 | * 方法二:使用[Hugging Face datasets](https://huggingface.co/datasets/codefuse-admin/devopseval-exam)直接加载数据集。示例如下:
170 | ```python
171 | from datasets import load_dataset
172 | dataset=load_dataset(r"DevOps-Eval/devopseval-exam",name="UnitTesting")
173 |
174 | print(dataset['val'][0])
175 | # {"id": 1, "question": "单元测试应该覆盖以下哪些方面?", "A": "正常路径", "B": "异常路径", "C": "边界值条件","D": 所有以上,"answer": "D", "explanation": ""} ```
176 |
177 | * 方法三:使用 [ModelScope datasets](https://modelscope.cn/datasets/codefuse-ai/devopseval-exam/files) 下载所有相关数据。示例如下:
178 | ```python
179 | from modelscope.msdatasets import MsDataset
180 | MsDataset.clone_meta(dataset_work_dir='./xxx', dataset_id='codefuse-ai/devopseval-exam')
181 | ```
182 |
183 | #### 👀 说明
184 | 为了方便使用,我们已经整理出了 55 个细分类别以及它们的中英文名称。具体细节请查看 [category_mapping.json](resources/categroy_mapping.json) 。格式如下:
185 |
186 | ```
187 | {
188 | "UnitTesting.csv": [
189 | "unit testing",
190 | "单元测试",
191 | {"dev": 5, "test": 32}
192 | "TEST"
193 | ],
194 | ...
195 | "file_name":[
196 | "英文名称",
197 | "中文名称",
198 | "样本数量",
199 | "类别(PLAN,CODE,BUILD,TEST,RELEASE,DEPOLY,OPERATE,MONITOR八选一)"
200 | ]
201 | }
202 | ```
203 | 每个细分类别由两个部分组成:dev 和 test。每个细分类别的 dev 集包含五个示范实例以及为 few-shot 评估提供的解释。而 test 集则用于模型评估,并且test数据已包含准确标签。
204 |
205 | 下面是 dev 数据的示例,来自"版本控制"细分类别:
206 | ```
207 | id: 4
208 | question: 如何找到Git特定提交中已更改的文件列表?
209 | A: 使用命令 `git diff --name-only SHA`
210 | B: 使用命令 `git log --name-only SHA`
211 | C: 使用命令 `git commit --name-only SHA`
212 | D: 使用命令 `git clone --name-only SHA`
213 | answer: A
214 | explanation:
215 | 分析原因:
216 | git diff --name-only SHA命令会显示与SHA参数对应的提交中已修改的文件列表。参数--name-only让命令只输出文件名,而忽略其他信息。其它选项中的命令并不能实现此功能。
217 | ```
218 | #### 🔥 AIOps样本示例
219 | 👀 👀 此处以日志解析和时序异常检测为例,对AIOps样本做一些简要的展示:
220 |
221 | 日志解析
222 | ```
223 | id: 0
224 | question:
225 | 下面是一些运行日志
226 | 0 04:21:15,429 WARN Cannot open channel to 2 at election address /10.10.34.12:3888
227 | 1 19:18:56,377 WARN ******* GOODBYE /10.10.34.11:52703 ********
228 | 2 19:13:46,128 WARN ******* GOODBYE /10.10.34.11:52308 ********
229 | 3 19:16:26,268 WARN ******* GOODBYE /10.10.34.11:52502 ********
230 | 4 09:11:16,012 WARN Cannot open channel to 3 at election address /10.10.34.13:3888
231 | 5 16:37:13,837 WARN Cannot open channel to 2 at election address /10.10.34.12:3888
232 | 6 09:09:16,008 WARN Cannot open channel to 3 at election address /10.10.34.13:3888
233 | 7 15:27:03,681 WARN Cannot open channel to 3 at election address /10.10.34.13:3888
234 | 日志最前面三部分分别为序号、时间戳和日志Level,在不考虑这三部分内容的情况下,此处我们设定日志的变量用'<*>'代替,token与token之间用空格分隔,那么请问上述日志的日志模板具体是什么?
235 | A: Notification time out: <*> 和 Connection broken for id <*>, my id = <*>, error =
236 | B: Send worker leaving thread 和 Connection broken for id <*>, my id = <*>, error =
237 | C: Received connection request /<*>:<*> 和 Interrupting SendWorker
238 | D: Cannot open channel to <*> at election address /<*>:<*> 和 ******* GOODBYE /<*>:<*> ********
239 | answer: D
240 | explanation: 根据日志中的内容,选项D是最符合日志模板的。日志中包含了"Cannot open channel to <*> at election address /<*>:<*>"和"******* GOODBYE /<*>:<*> ********"这两个固定的模板片段,它们都在选项D中出现了。同时,其他选项中的模板片段与日志中的内容不匹配。因此,选项D是最符合日志模板的。
241 | ```
242 | 时序异常检测
243 | ```
244 | id: 0
245 | question:
246 | 分析如下时间序列
247 | [50,62,74,84,92,97,99,98,94,87,77,65,265,40,28,17,8,3,0,0,4,10,20,31,43,56,68,79,89,95,99,99,96,91,82,71,59,46,34,22,12,5,1,0,2,7,15,25,37,49]
248 | 请找出其中明显异常点的下标。所谓的异常点一般指的是明显与数据整体趋势不符的点。
249 | A: 46
250 | B: 0
251 | C: 37
252 | D: 12
253 | answer: D
254 | explanation: 根据分析,题目中的时间序列在下标12处的值265明显大于周围数据,存在突增现象,因此选择D是正确的。
255 | ```
256 | #### 🔧 ToolLearning样本示例
257 | 工具学习样本的数据格式与OpenAI的函数调用格式兼容。
258 | 详情请参阅[tool_learning_info_zh.md](resources/tool_learning_info_zh.md)。
259 | 工具学习评测流程详情请参阅 [tool_learning_evalution.md](resources/tool_learning_evalution.md)。
260 |
261 |
262 | ## 🚀 如何进行测试
263 | 如果需要在自己的 HuggingFace 格式的模型上进行测试的话,总的步骤分为如下几步:
264 | 1. 编写 Model 的 loader 函数
265 | 2. 编写 Model 的 context_builder 函数
266 | 3. 注册模型到配置文件中
267 | 4. 执行测试脚本
268 | 如果模型在加载后不需要特殊的处理,而且输入也不需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),请直接跳转到第四步发起测试。
269 |
270 | #### 1. 编写 loader 函数
271 | 如果模型加载后还需要做一些额外的处理(e.g. tokenizer 调整),则需要继承 `ModelAndTokenizerLoader` 类来覆写对应的 `load_model` 和 `load_tokenizer` 函数,如下所示:
272 | ```python
273 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader):
274 | def __init__(self):
275 | super().__init__()
276 | pass
277 |
278 | @override
279 | def load_model(self, model_path: str):
280 | # Implementation of the method
281 | pass
282 |
283 | @override
284 | def load_tokenizer(self, model_path: str):
285 | # Implementation of the method
286 | pass
287 | ```
288 | #### 2. 编写 Model 的 context_builder 函数
289 | 如果输入需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),则需要继承 ContextBuilder 类来覆写 make_context 函数,如下所示:
290 | ```python
291 | class QwenChatContextBuilder(ContextBuilder):
292 | def __init__(self):
293 | super().__init__()
294 |
295 | @override
296 | def make_context(self, model, tokenizer, query: str, system: str = "hello!"):
297 | # Implementation of the method
298 | pass
299 | ```
300 | #### 3. 注册模型到配置文件中
301 | 去 conf 中的 `model_conf.json`,注册对应的模型名和这个模型将要使用的 loader 和 context_builder,示例如下:
302 | ```json
303 | {
304 | "Qwen-Chat": {
305 | "loader": "QwenModelAndTokenizerLoader",
306 | "context_builder": "QwenChatContextBuilder"
307 | }
308 | }
309 | ```
310 |
311 | #### 4. 执行测试脚本
312 | 直接运行以下代码发起测试
313 | ```Bash
314 | python src/run_eval.py \
315 | --model_path path_to_model \
316 | --model_name model_name_in_conf \
317 | --model_conf_path path_to_model_conf \
318 | --eval_dataset_list all \
319 | --eval_dataset_fp_conf_path path_to_dataset_conf \
320 | --eval_dataset_type test \
321 | --data_path path_to_downloaded_devops_eval_data \
322 | --k_shot 0
323 | ```
324 | 👀 👀 具体评测流程见📖 [**数据集评测教程**](resources/tutorial_zh.md)
325 |
326 |
327 | ## 🧭 TODO
328 | - [x] 添加AIOps样本
329 | - [x] 添加AIOps场景,比如**时序预测**
330 | - [x] 增加 **ToolLearning** 样本
331 | - [ ] 当前各类别样本量不平均,后续进一步增加样本数量
332 | - [ ] 增加困难程度的样本集
333 | - [ ] 增加样本的英文版本
334 |
335 |
336 |
337 |
338 | ## 🏁 Licenses
339 | This project is licensed under the [Apache License (Version 2.0)](LICENSE.md).
340 |
341 |
342 |
343 | ## 😃 引用
344 |
345 | 如果您使用了我们的数据集,请引用我们的论文。
346 | Coming soon...
347 |
348 |
349 |
350 |
351 |
352 | ## 🗂 Miscellaneous
353 |
354 | ### ✨ Star History
355 | [](https://star-history.com/#codefuse-ai/codefuse-devops-eval&Date)
356 |
357 | ### 🤝 Friendship Links
358 | - [Codefuse-ChatBot](https://github.com/codefuse-ai/codefuse-chatbot)
359 | - Codefuse-ChatBot is an open-source AI smart assistant designed to support the software development lifecycle with conversational access to tools, knowledge, and platform integration.
360 | - [Awesome AIGC Tutorials](https://github.com/luban-agi/Awesome-AIGC-Tutorials)
361 | - Awesome AIGC Tutorials houses a curated collection of tutorials and resources spanning across Large Language Models, AI Painting, and related fields.
362 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | 🤗 Hugging Face • ⏬ Data • 📖 Tutorial
5 |
6 | 中文 | English
7 |
8 |
9 | DevOps-Eval is a comprehensive evaluation suite specifically designed for foundation models in the DevOps field. We hope DevOps-Eval could help developers, especially in the DevOps field, track the progress and analyze the important strengths/shortcomings of their models.
10 |
11 |
12 | 📚 This repo contains questions and exercises related to DevOps, including AIOps and ToolLearning samples.
13 |
14 | 💥️ There are currently **7486** multiple-choice questions spanning 8 diverse general categories, as shown [below](images/data_info.png).
15 |
16 | 🔥 There are a total of **2840** samples in the AIOps subcategory, covering scenarios such as **log parsing**, **time series anomaly detection**, **time series classification**, **time series forecasting**, and **root cause analysis**.
17 |
18 | 🔧 There are a total of **1509** samples in the ToolLearning subcategory, covering 239 tool categories across 59 fields.
19 |
20 | 
21 |
22 |
23 | ## 🔔 News
24 | * **[2023.12.27]** Add 1509 **ToolLearning** samples, covering 239 tool categories across 59 fields; Release the associated evaluation leaderboard;
25 | * **[2023.11.27]** Add 487 operation scene samples and 640 time series forecasting samples; Update the Leaderboard;
26 | * **[2023.10.30]** Add the AIOps Leaderboard.
27 | * **[2023.10.25]** Add the AIOps samples, including log parsing, time series anomaly detection, time series classification and root cause analysis.
28 | * **[2023.10.18]** Release the initial leaderboard.
29 |
30 |
31 | ## 📜 Table of Contents
32 |
33 | - [🏆 Leaderboard](#-leaderboard)
34 | - [👀 DevOps](#-devops)
35 | - [🔥 AIOps](#-aiops)
36 | - [🔧 ToolLearning](#-toollearning)
37 | - [⏬ Data](#-data)
38 | - [👀 Notes](#-notes)
39 | - [🔥 AIOps Sample Example](#-aiops-sample-example)
40 | - [🔧 ToolLearning Sample Example](#-toollearning-sample-example)
41 | - [🚀 How to Evaluate](#-how-to-evaluate)
42 | - [🧭 TODO](#-todo)
43 | - [🏁 Licenses](#-licenses)
44 | - [😃 Citation](#-citation)
45 | - [🗂 Miscellaneous](#-miscellaneous)
46 | - [📱 Contact Us](#-contact-us)
47 | - [✨ Star History](#-star-history)
48 | - [🤝 Friendship Links](#-friendship-links)
49 | ## 🏆 Leaderboard
50 | Below are zero-shot and five-shot accuracies from the models that we evaluate in the initial release. We note that five-shot performance is better than zero-shot for many instruction-tuned models.
51 | ### 👀 DevOps
52 | #### Zero Shot
53 |
54 | | **ModelName** | plan | code | build | test | release | deploy | operate | monitor | **AVG** |
55 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:-----------:|
56 | | DevOpsPal-14B-Chat | 60.61 | 78.35 | 84.86 | 84.65 | 87.26 | 82.75 | 69.89 | 79.17 | 78.23 |
57 | | DevOpsPal-14B-Base | 54.55 | 77.82 | 83.49 | 85.96 | 86.32 | 81.96 | 71.18 | 82.41 | 78.23 |
58 | | Qwen-14B-Chat | 60.61 | 75.4 | 85.32 | 84.21 | 89.62 | 82.75 | 69.57 | 80.56 | 77.18 |
59 | | Qwen-14B-Base | 57.58 | 73.81 | 84.4 | 85.53 | 86.32 | 81.18 | 70.05 | 80.09 | 76.19 |
60 | | Baichuan2-13B-Base | 60.61 | 69.42 | 79.82 | 79.82 | 82.55 | 81.18 | 70.37 | 83.8 | 73.73 |
61 | | Baichuan2-13B-Chat | 60.61 | 68.43 | 77.98 | 80.7 | 81.6 | 83.53 | 67.63 | 84.72 | 72.9 |
62 | | DevOpsPal-7B-Chat | 54.55 | 69.11 | 83.94 | 82.02 | 76.89 | 80 | 64.73 | 77.78 | 71.92 |
63 | | DevOpsPal-7B-Base | 54.55 | 68.96 | 82.11 | 78.95 | 80.66 | 76.47 | 65.54 | 78.7 | 71.69 |
64 | | Qwen-7B-Base | 53.03 | 68.13 | 78.9 | 75.44 | 80.19 | 80 | 65.06 | 80.09 | 71.09 |
65 | | Qwen-7B-Chat | 57.58 | 66.01 | 80.28 | 79.82 | 76.89 | 77.65 | 62.64 | 79.17 | 69.75 |
66 | | Baichuan2-7B-Chat | 54.55 | 63.66 | 77.98 | 76.32 | 71.7 | 73.33 | 59.42 | 79.63 | 66.97 |
67 | | Internlm-7B-Chat | 60.61 | 62.15 | 77.06 | 76.32 | 66.98 | 74.51 | 60.39 | 78.24 | 66.27 |
68 | | Baichuan2-7B-Base | 56.06 | 62.45 | 75.69 | 70.61 | 74.06 | 69.8 | 61.67 | 75.93 | 66.21 |
69 | | Internlm-7B-Base | 54.55 | 58.29 | 79.36 | 78.95 | 77.83 | 70.59 | 65.86 | 75.93 | 65.99 |
70 |
71 |
72 | #### Five Shot
73 |
74 | | **ModelName** | plan | code | build | test | release | deploy | operate | monitor | **AVG** |
75 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:---------:|
76 | | DevOpsPal-14B-Chat | 63.64 | 79.49 | 81.65 | 85.96 | 86.79 | 86.67 | 72.95 | 81.48 | 79.69 |
77 | | DevOpsPal-14B-Base | 62.12 | 80.55 | 82.57 | 85.53 | 85.85 | 84.71 | 71.98 | 80.09 | 79.63 |
78 | | Qwen-14B-Chat | 65.15 | 76 | 82.57 | 85.53 | 84.91 | 84.31 | 70.85 | 81.48 | 77.81 |
79 | | Qwen-14B-Base | 66.67 | 76.15 | 84.4 | 85.53 | 86.32 | 80.39 | 72.46 | 80.56 | 77.56 |
80 | | Baichuan2-13B-Base | 63.64 | 71.39 | 80.73 | 82.46 | 81.13 | 84.31 | 73.75 | 85.19 | 75.8 |
81 | | Qwen-7B-Base | 75.76 | 72.52 | 78.9 | 81.14 | 83.96 | 81.18 | 70.37 | 81.94 | 75.36 |
82 | | Baichuan2-13B-Chat | 62.12 | 69.95 | 76.61 | 84.21 | 83.49 | 79.61 | 71.98 | 80.56 | 74.12 |
83 | | DevOpsPal-7B-Chat | 66.67 | 69.95 | 83.94 | 81.14 | 80.19 | 82.75 | 68.6 | 76.85 | 73.61 |
84 | | DevOpsPal-7B-Base | 69.7 | 69.49 | 82.11 | 81.14 | 82.55 | 82.35 | 67.15 | 79.17 | 73.35 |
85 | | Qwen-7B-Chat | 65.15 | 66.54 | 82.57 | 81.58 | 81.6 | 81.18 | 65.38 | 81.02 | 71.69 |
86 | | Baichuan2-7B-Base | 60.61 | 67.22 | 76.61 | 75 | 77.83 | 78.43 | 67.31 | 79.63 | 70.8 |
87 | | Internlm-7B-Chat | 60.61 | 63.06 | 79.82 | 80.26 | 67.92 | 75.69 | 60.06 | 77.31 | 69.21 |
88 | | Baichuan2-7B-Chat | 60.61 | 64.95 | 81.19 | 75.88 | 71.23 | 75.69 | 64.9 | 79.17 | 69.05 |
89 | | Internlm-7B-Base | 62.12 | 65.25 | 77.52 | 80.7 | 74.06 | 78.82 | 63.45 | 75.46 | 67.17 |
90 |
91 | ### 🔥 AIOps
92 |
93 |
94 |
95 | #### Zero Shot
96 | | **ModelName** | LogParsing | RootCauseAnalysis | TimeSeriesAnomalyDetection | TimeSeriesClassification | TimeSeriesForecasting | **AVG** |
97 | |:-------------------:|:------------:|:------------------:|:---------------------------:|:-----------------------------------------:|:---------------------------:|:-------:|
98 | | Qwen-14B-Base | 66.29 | 58.8 | 25.33 | 43.5 | 62.5 | 52.25 |
99 | | DevOpsPal-14B—Base | 63.14 | 53.6 | 23.33 | 43.5 | 64.06 | 50.49 |
100 | | Qwen-14B-Chat | 64.57 | 51.6 | 22.67 | 36 | 62.5 | 48.94 |
101 | | DevOpsPal-14B—Chat | 60 | 56 | 24 | 43 | 57.81 | 48.8 |
102 | | Qwen-7B-Base | 50 | 39.2 | 22.67 | 54 | 43.75 | 41.48 |
103 | | DevOpsPal-7B—Chat | 56.57 | 30.4 | 25.33 | 45 | 44.06 | 40.92 |
104 | | Baichuan2-13B-Chat | 64 | 18 | 21.33 | 37.5 | 46.88 | 39.3 |
105 | | Qwen-7B-Chat | 57.43 | 38.8 | 22.33 | 39.5 | 25.31 | 36.97 |
106 | | Internlm-7B—Chat | 58.86 | 8.8 | 22.33 | 28.5 | 51.25 | 36.34 |
107 | | Baichuan2-7B-Chat | 60.86 | 10 | 28 | 34.5 | 39.06 | 36.34 |
108 | | Baichuan2-7B-Base | 53.43 | 12.8 | 27.67 | 36.5 | 40.31 | 35.49 |
109 | | Baichuan2-13B-Base | 54 | 12.4 | 23 | 34.5 | 42.81 | 34.86 |
110 | | DevOpsPal-7B—Base | 46.57 | 20.8 | 25 | 34 | 38.75 | 33.94 |
111 | | Internlm-7B—Base | 48.57 | 18.8 | 23.33 | 37.5 | 33.75 | 33.1 |
112 |
113 | #### One Shot
114 | | **ModelName** | LogParsing | RootCauseAnalysis | TimeSeriesAnomalyDetection | TimeSeriesClassification | TimeSeriesForecasting | **AVG** |
115 | |:-------------------:|:------------:|:------------------:|:---------------------------:|:-----------------------------------------:|:---------------------------:|:-------:|
116 | | DevOpsPal-14B—Chat | 66.29 | 80.8 | 23.33 | 44.5 | 56.25 | 54.44 |
117 | | DevOpsPal-14B—Base | 60 | 74 | 25.33 | 43.5 | 52.5 | 51.13 |
118 | | Qwen-14B-Base | 64.29 | 74.4 | 28 | 48.5 | 40.31 | 50.77 |
119 | | Qwen-7B-Base | 56 | 60.8 | 27.67 | 44 | 57.19 | 49.44 |
120 | | Qwen-14B-Chat | 49.71 | 65.6 | 28.67 | 48 | 42.19 | 46.13 |
121 | | Baichuan2-13B-Base | 56 | 43.2 | 24.33 | 41 | 46.88 | 42.89 |
122 | | Baichuan2-7B-Chat | 58.57 | 31.6 | 27 | 31.5 | 51.88 | 41.83 |
123 | | DevOpsPal-7B—Base | 52.86 | 44.4 | 28 | 44.5 | 36.25 | 41.2 |
124 | | Baichuan2-7B-Base | 48.29 | 40.4 | 27 | 42 | 40.94 | 39.86 |
125 | | Qwen-7B-Chat | 54.57 | 52 | 29.67 | 26.5 | 27.19 | 38.73 |
126 | | Baichuan2-13B-Chat | 57.43 | 44.4 | 25 | 25.5 | 30.63 | 37.75 |
127 | | DevOpsPal-7B—Chat | 56.57 | 27.2 | 25.33 | 41.5 | 33.44 | 37.46 |
128 | | Internlm-7B—Chat | 62.57 | 12.8 | 22.33 | 21 | 50.31 | 36.69 |
129 | | Internlm-7B—Base | 48 | 33.2 | 29 | 35 | 31.56 | 35.85 |
130 |
131 |
132 |
133 |
134 | ### 🔧 ToolLearning
135 |
136 | Metric legend: fccr is the function-call correct rate; fcffr, fcfnr, fcfpr and fcfnir are the failure shares attributed to function-call recognition, function name, function parameters and hallucinated function names, respectively (the table reports 1 - failure share); aar is the assistant answer relevancy.
137 | | **FuncCall-Filler** | dataset_name | fccr | 1-fcffr | 1-fcfnr | 1-fcfpr | 1-fcfnir | aar |
138 | |:-------------------:| :---: | :---: | :---: | :---: | :---: | :---: | :---: |
139 | | Qwen-14b-chat | luban | 61 | 100 | 97.68 | 63.32 | 100 | 69.46 |
140 | | Qwen-7b-chat | luban | 50.58 | 100 | 98.07 | 52.51 | 100 | 63.59 |
141 | | Baichuan-7b-chat | luban | 60.23 | 100 | 97.3 | 62.93 | 99.61 | 61.12 |
142 | | Internlm-chat-7b | luban | 47.88 | 100 | 96.14 | 51.74 | 99.61 | 61.85 |
143 | | Qwen-14b-chat | fc_data | 98.37 | 99.73 | 99.86 | 98.78 | 100 | 81.58 |
144 | | Qwen-7b-chat | fc_data | 99.46 | 99.86 | 100 | 99.59 | 100 | 79.25 |
145 | | Baichuan-7b-chat | fc_data | 97.96 | 99.32 | 100 | 98.64 | 100 | 89.53 |
146 | | Internlm-chat-7b | fc_data | 94.29 | 95.78 | 100 | 98.5 | 100 | 88.19 |
147 | | CodeLLaMa-7b | fc_data | 98.78 | 99.73 | 100 | 99.05 | 100 | 94.7 |
148 | | CodeLLaMa-7b-16 | fc_data | 98.1 | 99.87 | 99.73 | 98.5 | 100 | 93.14 |
149 | | CodeFuse-7b-4k | fc_data | 98.91 | 99.87 | 99.87 | 99.18 | 100 | 89.5 |
150 |
151 |
152 |
153 |
154 |
155 | ## ⏬ Data
156 | #### Download
157 | * Method 1: Download the zip file (you can also simply open the following link with the browser):
158 | ```
159 | wget https://huggingface.co/datasets/codefuse-admin/devopseval-exam/resolve/main/devopseval-exam.zip
160 | ```
161 | then unzip it and you may load the data with pandas:
162 | ```python
163 | import os
164 | import pandas as pd
165 |
166 | File_Dir="devopseval-exam"
167 | test_df=pd.read_csv(os.path.join(File_Dir,"test","UnitTesting.csv"))
168 | ```
169 | * Method 2: Directly load the dataset using [Hugging Face datasets](https://huggingface.co/datasets/codefuse-admin/devopseval-exam):
170 | ```python
171 | from datasets import load_dataset
172 | dataset=load_dataset(r"DevOps-Eval/devopseval-exam",name="UnitTesting")
173 |
174 | print(dataset['val'][0])
175 | # {"id": 1, "question": "单元测试应该覆盖以下哪些方面?", "A": "正常路径", "B": "异常路径", "C": "边界值条件","D": 所有以上,"answer": "D", "explanation": ""} ```
176 |
177 | * Method 3: Directly load the dataset using [ModelScope datasets](https://modelscope.cn/datasets/codefuse-ai/devopseval-exam/files):
178 | ```python
179 | from modelscope.msdatasets import MsDataset
180 | MsDataset.clone_meta(dataset_work_dir='./xxx', dataset_id='codefuse-ai/devopseval-exam')
181 | ```
182 |
183 | #### 👀 Notes
184 | To facilitate usage, we have organized the 55 subcategories along with their English and Chinese names. Please refer to [category_mapping.json](resources/categroy_mapping.json) for details. The format is:
185 |
186 | ```
187 | {
188 | "UnitTesting.csv": [
189 | "unit testing",
190 | "单元测试",
191 | {"dev": 5, "test": 32}
192 | "TEST"
193 | ],
194 | ...
195 | "file_name":[
196 | "English Name",
197 | "Chinese Name",
198 | "Sample Number",
199 | "Supercatagory Label(PLAN,CODE,BUILD,TEST,RELEASE,DEPOLY,OPERATE,MONITOR choose 1 out of 8)"
200 | ]
201 | }
202 | ```
203 | Each subcategory consists of two splits: dev and test. The dev set for each subcategory contains five exemplars with explanations for few-shot evaluation, while the test set is used for model evaluation. Labels for the test split are also released.
204 |
205 | Below is a dev example from 'version control':
206 |
207 | ```
208 | id: 4
209 | question: 如何找到Git特定提交中已更改的文件列表?
210 | A: 使用命令 `git diff --name-only SHA`
211 | B: 使用命令 `git log --name-only SHA`
212 | C: 使用命令 `git commit --name-only SHA`
213 | D: 使用命令 `git clone --name-only SHA`
214 | answer: A
215 | explanation:
216 | 分析原因:
217 | git diff --name-only SHA命令会显示与SHA参数对应的提交中已修改的文件列表。参数--name-only让命令只输出文件名,而忽略其他信息。其它选项中的命令并不能实现此功能。
218 | ```
219 | #### 🔥 AIOps Sample Example
220 | 👀 👀 Taking **log parsing** and **time series anomaly detection** as examples, here is a brief showcase of the AIOps samples:
221 |
222 | LogParsing
223 | ```
224 | id: 0
225 | question:
226 | Here are some running logs
227 | 0 04:21:15,429 WARN Cannot open channel to 2 at election address /10.10.34.12:3888
228 | 1 19:18:56,377 WARN ******* GOODBYE /10.10.34.11:52703 ********
229 | 2 19:13:46,128 WARN ******* GOODBYE /10.10.34.11:52308 ********
230 | 3 19:16:26,268 WARN ******* GOODBYE /10.10.34.11:52502 ********
231 | 4 09:11:16,012 WARN Cannot open channel to 3 at election address /10.10.34.13:3888
232 | 5 16:37:13,837 WARN Cannot open channel to 2 at election address /10.10.34.12:3888
233 | 6 09:09:16,008 WARN Cannot open channel to 3 at election address /10.10.34.13:3888
234 | 7 15:27:03,681 WARN Cannot open channel to 3 at election address /10.10.34.13:3888
235 | The first three parts of each log line are the index, timestamp, and log level. Ignoring these three parts, and assuming that the variables in the logs are replaced by '<*>' with tokens separated by spaces, what is the log template of the above logs?
236 | A: Notification time out: <*> 和 Connection broken for id <*>, my id = <*>, error =
237 | B: Send worker leaving thread 和 Connection broken for id <*>, my id = <*>, error =
238 | C: Received connection request /<*>:<*> 和 Interrupting SendWorker
239 | D: Cannot open channel to <*> at election address /<*>:<*> 和 ******* GOODBYE /<*>:<*> ********
240 | answer: D
241 | explanation: The logs contain the fixed template fragments "Cannot open channel to <*> at election address /<*>:<*>" and "******* GOODBYE /<*>:<*> ********", both of which appear in option D. Meanwhile, the template fragments in the other options do not match the content in the logs. Therefore, option D is the most consistent with the log template.
242 | ```
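For intuition only, below is a minimal sketch (not part of the benchmark or its evaluation code) of how the variable fields in these logs could be masked to recover the template in option D. The regular expressions are illustrative assumptions, not the rules used to construct the samples.

```python
import re

logs = [
    "Cannot open channel to 2 at election address /10.10.34.12:3888",
    "******* GOODBYE /10.10.34.11:52703 ********",
]

def to_template(message: str) -> str:
    # Mask IP:port fields such as /10.10.34.12:3888, then any remaining bare numbers.
    message = re.sub(r"/\d+(?:\.\d+){3}:\d+", "/<*>:<*>", message)
    message = re.sub(r"\b\d+\b", "<*>", message)
    return message

for line in logs:
    print(to_template(line))
# Cannot open channel to <*> at election address /<*>:<*>
# ******* GOODBYE /<*>:<*> ********
```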
243 | TimeSeriesAnomalyDetection
244 | ```
245 | id: 0
246 | question:
247 | Analyze the following time series
248 | [50,62,74,84,92,97,99,98,94,87,77,65,265,40,28,17,8,3,0,0,4,10,20,31,43,56,68,79,89,95,99,99,96,91,82,71,59,46,34,22,12,5,1,0,2,7,15,25,37,49]
249 | Please identify the indices of obvious outlier points. Outlier points generally refer to points that significantly deviate from the overall trend of the data.
250 | A: 46
251 | B: 0
252 | C: 37
253 | D: 12
254 | answer: D
255 | explanation: According to the analysis, the value 265 at index 12 of the given time series is significantly larger than the surrounding data, indicating a sudden spike. Therefore, option D is correct.
256 | ```
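As a quick sanity check (again, not part of the benchmark itself), a simple z-score test on the series above flags only index 12, which matches the reference answer:

```python
import numpy as np

series = np.array([50, 62, 74, 84, 92, 97, 99, 98, 94, 87, 77, 65, 265, 40, 28,
                   17, 8, 3, 0, 0, 4, 10, 20, 31, 43, 56, 68, 79, 89, 95, 99, 99,
                   96, 91, 82, 71, 59, 46, 34, 22, 12, 5, 1, 0, 2, 7, 15, 25, 37, 49])

# Flag points that deviate from the mean by more than 3 standard deviations.
z_scores = (series - series.mean()) / series.std()
print(np.where(np.abs(z_scores) > 3)[0])  # [12]
```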
257 | #### 🔧 ToolLearning Sample Example
258 |
259 | 👀 👀 The data format of ToolLearning samples is compatible with OpenAI's function calling format.
260 |
261 | Please refer to [tool_learning_info.md](resources/tool_learning_info.md) for details.
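For orientation, here is a rough sketch of what an OpenAI-function-calling-style sample can look like. The `get_weather` function and the field names below are illustrative assumptions; the authoritative schema used by the benchmark is described in [tool_learning_info.md](resources/tool_learning_info.md).

```json
{
  "functions": [
    {
      "name": "get_weather",
      "description": "Query the current weather for a city (hypothetical example function)",
      "parameters": {
        "type": "object",
        "properties": {
          "city": {"type": "string", "description": "City name"}
        },
        "required": ["city"]
      }
    }
  ],
  "messages": [
    {"role": "user", "content": "What is the weather like in Hangzhou today?"},
    {
      "role": "assistant",
      "content": null,
      "function_call": {"name": "get_weather", "arguments": "{\"city\": \"Hangzhou\"}"}
    }
  ]
}
```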
262 |
263 |
264 | ## 🚀 How to Evaluate
265 | If you need to test your own huggingface-formatted model, the overall steps are as follows:
266 | 1. Write the loader function for the model.
267 | 2. Write the context_builder function for the model.
268 | 3. Register the model in the configuration file.
269 | 4. Run the testing script.
270 | If the model does not require any special processing after loading, and the input does not need to be converted to a specific format (e.g. chatml format or other human-bot formats), you can directly proceed to step 4 to initiate the testing.
271 |
272 | #### 1. Write the loader function
273 | If the model requires additional processing after loading (e.g. adjusting the tokenizer), you need to inherit the `ModelAndTokenizerLoader` class in `src.context_builder.context_builder_family.py` and override the corresponding `load_model` and `load_tokenizer` functions. You can refer to the following example:
274 | ```python
275 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader):
276 | def __init__(self):
277 | super().__init__()
278 | pass
279 |
280 | @override
281 | def load_model(self, model_path: str):
282 | # Implementation of the method
283 | pass
284 |
285 | @override
286 | def load_tokenizer(self, model_path: str):
287 | # Implementation of the method
288 | pass
289 | ```
290 |
291 | #### 2. Write the context_builder function for the Model
292 | If the input needs to be converted to a specific format (e.g. chatml format or other human-bot formats), you need to inherit the ContextBuilder class in `src.context_builder.context_builder_family` and override the make_context function. This function is used to convert the input to the corresponding required format. An example is shown below:
293 | ```python
294 | class QwenChatContextBuilder(ContextBuilder):
295 | def __init__(self):
296 | super().__init__()
297 |
298 | @override
299 | def make_context(self, model, tokenizer, query: str, system: str = "hello!"):
300 | # Implementation of the method
301 | pass
302 | ```
303 |
304 | #### 3. Register the model in the configuration file
305 | Go to the `model_conf.json` file in the conf directory and register the corresponding model name and the loader and context_builder that will be used for this model. Simply write the class names defined in the first and second steps for the loader and context_builder. Here is an example:
306 | ```json
307 | {
308 | "Qwen-Chat": {
309 | "loader": "QwenModelAndTokenizerLoader",
310 | "context_builder": "QwenChatContextBuilder"
311 | }
312 | }
313 | ```
314 |
315 | #### 4. Execute the testing script
316 | Run the following code to initiate the test:
317 | ```Bash
318 | python src/run_eval.py \
319 | --model_path path_to_model \
320 | --model_name model_name_in_conf \
321 | --model_conf_path path_to_model_conf \
322 | --eval_dataset_list all \
323 | --eval_dataset_fp_conf_path path_to_dataset_conf \
324 | --eval_dataset_type test \
325 | --data_path path_to_downloaded_devops_eval_data \
326 | --k_shot 0
327 | ```
328 | 👀 👀 For the detailed evaluation process, see the 📖 [**Evaluate Tutorial**](resources/tutorial.md).
329 |
330 |
331 |
332 | ## 🧭 TODO
333 | - [x] add AIOps samples.
334 | - [x] add AIOps scenario **time series forecasting**.
335 | - [x] add **ToolLearning** samples.
336 | - [ ] balance the sample distribution across categories and increase the overall sample size.
337 | - [ ] add samples with the difficulty level set to hard.
338 | - [ ] add the English version of the samples.
339 |
340 |
341 |
342 |
343 | ## 🏁 Licenses
344 | This project is licensed under the [Apache License (Version 2.0)](LICENSE.md).
345 |
346 |
347 |
348 | ## 😃 Citation
349 |
350 | Please cite our paper if you use our dataset.
351 |
352 | Coming Soon...
353 |
354 |
355 |
356 | ## 🗂 Miscellaneous
357 |
358 | ### 📱 Contact Us
359 |
360 | 
361 |
362 |
363 | ### ✨ Star History
364 | [](https://star-history.com/#codefuse-ai/codefuse-devops-eval&Date)
365 |
366 | ### 🤝 Friendship Links
367 | - [Codefuse-ChatBot](https://github.com/codefuse-ai/codefuse-chatbot)
368 | - Codefuse-ChatBot is an open-source AI smart assistant designed to support the software development lifecycle with conversational access to tools, knowledge, and platform integration.
369 | - [Awesome AIGC Tutorials](https://github.com/luban-agi/Awesome-AIGC-Tutorials)
370 | - Awesome AIGC Tutorials houses a curated collection of tutorials and resources spanning across Large Language Models, AI Painting, and related fields.
371 |
372 |
--------------------------------------------------------------------------------