├── .gitignore ├── LEGAL.md ├── LICENSE.md ├── README.md ├── README_zh.md ├── conf ├── dataset_fp.json └── model_conf.json ├── images ├── data_info.png ├── devops_eval_logo.png └── toolLearning_performance_metrics.png ├── requirements.txt ├── resources ├── __init__.py ├── categroy_mapping.json ├── devops_diagram_zh.jpg ├── tool_learning_evalution.md ├── tool_learning_info.md ├── tool_learning_info_zh.md ├── tutorial.md ├── tutorial_zh.md └── wechat.png ├── scripts ├── run_eval_example.sh └── run_fc_example.sh ├── src ├── context_builder │ ├── context_builder.py │ └── context_builder_family.py ├── data │ ├── data_load.py │ └── data_preprocess.py ├── datasets │ ├── __init__.py │ ├── base_dataset.py │ ├── funccall_dataset.py │ ├── toolfill_dataset.py │ ├── toolparser_dataset.py │ └── toolsummary_dataset.py ├── evals │ ├── __init__.py │ ├── base_evalution.py │ ├── func_call_evalution.py │ ├── toolfill_evalution.py │ ├── toolparser_evalution.py │ ├── toolsummary_evalution.py │ └── utils.py ├── evaluate │ └── evaluate.py ├── getAssistantAns.py ├── hparams │ ├── evaluate_args.py │ └── parser.py ├── metric │ └── metric_score.py ├── model_and_tokenizer_loader │ ├── model_and_tokenizer_loader.py │ └── model_and_tokenizer_loader_family.py ├── models │ ├── __init__.py │ ├── baichuan_model.py │ ├── base_model.py │ ├── generate_configs.py │ ├── internlm_model.py │ ├── openai_model.py │ ├── qwen_model.py │ └── test.py ├── opensource_functioncall_evalution.py ├── prompts │ ├── __init__.py │ └── base_prompts_config.py ├── qwen_eval_main.py ├── run_eval.py └── utils │ ├── json_utils.py │ ├── jsonl_utils.py │ └── set_seed.py └── tests ├── context_builder_test.py └── data_preprocess_test.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | .DS_Store 3 | data/ 4 | !src/data/ 5 | .pyc 6 | __pycache__ 7 | start_job.py 8 | start_job2.py 9 | start_job3.py 10 | run_eval.sh 11 | run_eval2.sh 12 | run_eval3.sh 13 | dataset_fp_tech_risk.json 14 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | Legal Disclaimer 2 | 3 | Within this source code, the comments in Chinese shall be the original, governing version. Any comment in other languages are for reference only. In the event of any conflict between the Chinese language version comments and other language version comments, the Chinese language version shall prevail. 4 | 5 | 法律免责声明 6 | 7 | 关于代码注释部分,中文注释为官方版本,其它语言注释仅做参考。中文注释可能与其它语言注释存在不一致,当中文注释与其它语言注释存在不一致时,请以中文注释为准。 -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright [2023] [Ant Group] 2 | Licensed under the Apache License, Version 2.0 (the "License"); 3 | you may not use this file except in compliance with the License. 4 | You may obtain a copy of the License at 5 | http://www.apache.org/licenses/LICENSE-2.0 6 | 7 | Unless required by applicable law or agreed to in writing, software 8 | distributed under the License is distributed on an "AS IS" BASIS, 9 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | See the License for the specific language governing permissions and 11 | limitations under the License. 
12 | 13 | 14 | Apache License 15 | Version 2.0, January 2004 16 | http://www.apache.org/licenses/ 17 | 18 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 19 | 20 | 1. Definitions. 21 | 22 | "License" shall mean the terms and conditions for use, reproduction, 23 | and distribution as defined by Sections 1 through 9 of this document. 24 | 25 | "Licensor" shall mean the copyright owner or entity authorized by 26 | the copyright owner that is granting the License. 27 | 28 | "Legal Entity" shall mean the union of the acting entity and all 29 | other entities that control, are controlled by, or are under common 30 | control with that entity. For the purposes of this definition, 31 | "control" means (i) the power, direct or indirect, to cause the 32 | direction or management of such entity, whether by contract or 33 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 34 | outstanding shares, or (iii) beneficial ownership of such entity. 35 | 36 | "You" (or "Your") shall mean an individual or Legal Entity 37 | exercising permissions granted by this License. 38 | 39 | "Source" form shall mean the preferred form for making modifications, 40 | including but not limited to software source code, documentation 41 | source, and configuration files. 42 | 43 | "Object" form shall mean any form resulting from mechanical 44 | transformation or translation of a Source form, including but 45 | not limited to compiled object code, generated documentation, 46 | and conversions to other media types. 47 | 48 | "Work" shall mean the work of authorship, whether in Source or 49 | Object form, made available under the License, as indicated by a 50 | copyright notice that is included in or attached to the work 51 | (an example is provided in the Appendix below). 52 | 53 | "Derivative Works" shall mean any work, whether in Source or Object 54 | form, that is based on (or derived from) the Work and for which the 55 | editorial revisions, annotations, elaborations, or other modifications 56 | represent, as a whole, an original work of authorship. For the purposes 57 | of this License, Derivative Works shall not include works that remain 58 | separable from, or merely link (or bind by name) to the interfaces of, 59 | the Work and Derivative Works thereof. 60 | 61 | "Contribution" shall mean any work of authorship, including 62 | the original version of the Work and any modifications or additions 63 | to that Work or Derivative Works thereof, that is intentionally 64 | submitted to Licensor for inclusion in the Work by the copyright owner 65 | or by an individual or Legal Entity authorized to submit on behalf of 66 | the copyright owner. For the purposes of this definition, "submitted" 67 | means any form of electronic, verbal, or written communication sent 68 | to the Licensor or its representatives, including but not limited to 69 | communication on electronic mailing lists, source code control systems, 70 | and issue tracking systems that are managed by, or on behalf of, the 71 | Licensor for the purpose of discussing and improving the Work, but 72 | excluding communication that is conspicuously marked or otherwise 73 | designated in writing by the copyright owner as "Not a Contribution." 74 | 75 | "Contributor" shall mean Licensor and any individual or Legal Entity 76 | on behalf of whom a Contribution has been received by Licensor and 77 | subsequently incorporated within the Work. 78 | 79 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 80 | this License, each Contributor hereby grants to You a perpetual, 81 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 82 | copyright license to reproduce, prepare Derivative Works of, 83 | publicly display, publicly perform, sublicense, and distribute the 84 | Work and such Derivative Works in Source or Object form. 85 | 86 | 3. Grant of Patent License. Subject to the terms and conditions of 87 | this License, each Contributor hereby grants to You a perpetual, 88 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 89 | (except as stated in this section) patent license to make, have made, 90 | use, offer to sell, sell, import, and otherwise transfer the Work, 91 | where such license applies only to those patent claims licensable 92 | by such Contributor that are necessarily infringed by their 93 | Contribution(s) alone or by combination of their Contribution(s) 94 | with the Work to which such Contribution(s) was submitted. If You 95 | institute patent litigation against any entity (including a 96 | cross-claim or counterclaim in a lawsuit) alleging that the Work 97 | or a Contribution incorporated within the Work constitutes direct 98 | or contributory patent infringement, then any patent licenses 99 | granted to You under this License for that Work shall terminate 100 | as of the date such litigation is filed. 101 | 102 | 4. Redistribution. You may reproduce and distribute copies of the 103 | Work or Derivative Works thereof in any medium, with or without 104 | modifications, and in Source or Object form, provided that You 105 | meet the following conditions: 106 | 107 | (a) You must give any other recipients of the Work or 108 | Derivative Works a copy of this License; and 109 | 110 | (b) You must cause any modified files to carry prominent notices 111 | stating that You changed the files; and 112 | 113 | (c) You must retain, in the Source form of any Derivative Works 114 | that You distribute, all copyright, patent, trademark, and 115 | attribution notices from the Source form of the Work, 116 | excluding those notices that do not pertain to any part of 117 | the Derivative Works; and 118 | 119 | (d) If the Work includes a "NOTICE" text file as part of its 120 | distribution, then any Derivative Works that You distribute must 121 | include a readable copy of the attribution notices contained 122 | within such NOTICE file, excluding those notices that do not 123 | pertain to any part of the Derivative Works, in at least one 124 | of the following places: within a NOTICE text file distributed 125 | as part of the Derivative Works; within the Source form or 126 | documentation, if provided along with the Derivative Works; or, 127 | within a display generated by the Derivative Works, if and 128 | wherever such third-party notices normally appear. The contents 129 | of the NOTICE file are for informational purposes only and 130 | do not modify the License. You may add Your own attribution 131 | notices within Derivative Works that You distribute, alongside 132 | or as an addendum to the NOTICE text from the Work, provided 133 | that such additional attribution notices cannot be construed 134 | as modifying the License. 
135 | 136 | You may add Your own copyright statement to Your modifications and 137 | may provide additional or different license terms and conditions 138 | for use, reproduction, or distribution of Your modifications, or 139 | for any such Derivative Works as a whole, provided Your use, 140 | reproduction, and distribution of the Work otherwise complies with 141 | the conditions stated in this License. 142 | 143 | 5. Submission of Contributions. Unless You explicitly state otherwise, 144 | any Contribution intentionally submitted for inclusion in the Work 145 | by You to the Licensor shall be under the terms and conditions of 146 | this License, without any additional terms or conditions. 147 | Notwithstanding the above, nothing herein shall supersede or modify 148 | the terms of any separate license agreement you may have executed 149 | with Licensor regarding such Contributions. 150 | 151 | 6. Trademarks. This License does not grant permission to use the trade 152 | names, trademarks, service marks, or product names of the Licensor, 153 | except as required for reasonable and customary use in describing the 154 | origin of the Work and reproducing the content of the NOTICE file. 155 | 156 | 7. Disclaimer of Warranty. Unless required by applicable law or 157 | agreed to in writing, Licensor provides the Work (and each 158 | Contributor provides its Contributions) on an "AS IS" BASIS, 159 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 160 | implied, including, without limitation, any warranties or conditions 161 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 162 | PARTICULAR PURPOSE. You are solely responsible for determining the 163 | appropriateness of using or redistributing the Work and assume any 164 | risks associated with Your exercise of permissions under this License. 165 | 166 | 8. Limitation of Liability. In no event and under no legal theory, 167 | whether in tort (including negligence), contract, or otherwise, 168 | unless required by applicable law (such as deliberate and grossly 169 | negligent acts) or agreed to in writing, shall any Contributor be 170 | liable to You for damages, including any direct, indirect, special, 171 | incidental, or consequential damages of any character arising as a 172 | result of this License or out of the use or inability to use the 173 | Work (including but not limited to damages for loss of goodwill, 174 | work stoppage, computer failure or malfunction, or any and all 175 | other commercial damages or losses), even if such Contributor 176 | has been advised of the possibility of such damages. 177 | 178 | 9. Accepting Warranty or Additional Liability. While redistributing 179 | the Work or Derivative Works thereof, You may choose to offer, 180 | and charge a fee for, acceptance of support, warranty, indemnity, 181 | or other liability obligations and/or rights consistent with this 182 | License. However, in accepting such obligations, You may act only 183 | on Your own behalf and on Your sole responsibility, not on behalf 184 | of any other Contributor, and only if You agree to indemnify, 185 | defend, and hold each Contributor harmless for any liability 186 | incurred by, or claims asserted against, such Contributor by reason 187 | of your accepting any such warranty or additional liability. 188 | 189 | END OF TERMS AND CONDITIONS 190 | 191 | APPENDIX: How to apply the Apache License to your work. 
192 | 193 | To apply the Apache License to your work, attach the following 194 | boilerplate notice, with the fields enclosed by brackets "[]" 195 | replaced with your own identifying information. (Don't include 196 | the brackets!) The text should be enclosed in the appropriate 197 | comment syntax for the file format. We also recommend that a 198 | file or class name and description of purpose be included on the 199 | same "printed page" as the copyright notice for easier 200 | identification within third-party archives. 201 | 202 | Copyright [yyyy] [name of copyright owner] 203 | 204 | Licensed under the Apache License, Version 2.0 (the "License"); 205 | you may not use this file except in compliance with the License. 206 | You may obtain a copy of the License at 207 | 208 | http://www.apache.org/licenses/LICENSE-2.0 209 | 210 | Unless required by applicable law or agreed to in writing, software 211 | distributed under the License is distributed on an "AS IS" BASIS, 212 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 213 | See the License for the specific language governing permissions and 214 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 🤗 Hugging Face • ⏬ Data • 📖 Tutorial 5 |
6 | 中文 | English 7 |

8 | 9 | DevOps-Eval is a comprehensive evaluation suite specifically designed for foundation models in the DevOps field. We hope DevOps-Eval can help developers, especially those in the DevOps field, track progress and analyze the strengths and shortcomings of their models. 10 | 11 | 12 | 📚 This repo contains questions and exercises related to DevOps, including AIOps and ToolLearning samples. 13 | 14 | 💥️ There are currently **7486** multiple-choice questions spanning 8 diverse general categories, as shown [below](images/data_info.png). 15 | 16 | 🔥 There are a total of **2840** samples in the AIOps subcategory, covering scenarios such as **log parsing**, **time series anomaly detection**, **time series classification**, **time series forecasting**, and **root cause analysis**. 17 | 18 | 🔧 There are a total of **1509** samples in the ToolLearning subcategory, covering 239 tool categories across 59 fields. 19 | 20 |

21 | 22 | 23 | ## 🔔 News 24 | * **[2023.12.27]** Add 1509 **ToolLearning** samples, covering 239 tool categories across 59 fields; Release the associated evaluation leaderboard; 25 | * **[2023.11.27]** Add 487 operation scene samples and 640 time series forecasting samples; Update the Leaderboard; 26 | * **[2023.10.30]** Add the AIOps Leaderboard. 27 | * **[2023.10.25]** Add the AIOps samples, including log parsing, time series anomaly detection, time series classification and root cause analysis. 28 | * **[2023.10.18]** Update the initial Leaderboard... 29 |
30 | 31 | ## 📜 Table of Contents 32 | 33 | - [🏆 Leaderboard](#-leaderboard) 34 | - [👀 DevOps](#-devops) 35 | - [🔥 AIOps](#-aiops) 36 | - [🔧 ToolLearning](#-toollearning) 37 | - [⏬ Data](#-data) 38 | - [👀 Notes](#-notes) 39 | - [🔥 AIOps Sample Example](#-aiops-sample-example) 40 | - [🔧 ToolLearning Sample Example](#-toollearning-sample-example) 41 | - [🚀 How to Evaluate](#-how-to-evaluate) 42 | - [🧭 TODO](#-todo) 43 | - [🏁 Licenses](#-licenses) 44 | - [😃 Citation](#-citation) 45 | - [🗂 Miscellaneous](#-miscellaneous) 46 | - [📱 Contact Us](#-contact-us) 47 | - [✨ Star History](#-star-history) 48 | - [🤝 Friendship Links](#-friendship-links) 49 | ## 🏆 Leaderboard 50 | Below are zero-shot and five-shot accuracies from the models that we evaluate in the initial release. We note that five-shot performance is better than zero-shot for many instruction-tuned models. 51 | ### 👀 DevOps 52 | #### Zero Shot 53 | 54 | | **ModelName** | plan | code | build | test | release | deploy | operate | monitor | **AVG** | 55 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:-----------:| 56 | | DevOpsPal-14B-Chat | 60.61 | 78.35 | 84.86 | 84.65 | 87.26 | 82.75 | 69.89 | 79.17 | 78.23 | 57 | | DevOpsPal-14B-Base | 54.55 | 77.82 | 83.49 | 85.96 | 86.32 | 81.96 | 71.18 | 82.41 | 78.23 | 58 | | Qwen-14B-Chat | 60.61 | 75.4 | 85.32 | 84.21 | 89.62 | 82.75 | 69.57 | 80.56 | 77.18 | 59 | | Qwen-14B-Base | 57.58 | 73.81 | 84.4 | 85.53 | 86.32 | 81.18 | 70.05 | 80.09 | 76.19 | 60 | | Baichuan2-13B-Base | 60.61 | 69.42 | 79.82 | 79.82 | 82.55 | 81.18 | 70.37 | 83.8 | 73.73 | 61 | | Baichuan2-13B-Chat | 60.61 | 68.43 | 77.98 | 80.7 | 81.6 | 83.53 | 67.63 | 84.72 | 72.9 | 62 | | DevOpsPal-7B-Chat | 54.55 | 69.11 | 83.94 | 82.02 | 76.89 | 80 | 64.73 | 77.78 | 71.92 | 63 | | DevOpsPal-7B-Base | 54.55 | 68.96 | 82.11 | 78.95 | 80.66 | 76.47 | 65.54 | 78.7 | 71.69 | 64 | | Qwen-7B-Base | 53.03 | 68.13 | 78.9 | 75.44 | 80.19 | 80 | 65.06 | 80.09 | 71.09 | 65 | | Qwen-7B-Chat | 57.58 | 66.01 | 80.28 | 79.82 | 76.89 | 77.65 | 62.64 | 79.17 | 69.75 | 66 | | Baichuan2-7B-Chat | 54.55 | 63.66 | 77.98 | 76.32 | 71.7 | 73.33 | 59.42 | 79.63 | 66.97 | 67 | | Internlm-7B-Chat | 60.61 | 62.15 | 77.06 | 76.32 | 66.98 | 74.51 | 60.39 | 78.24 | 66.27 | 68 | | Baichuan2-7B-Base | 56.06 | 62.45 | 75.69 | 70.61 | 74.06 | 69.8 | 61.67 | 75.93 | 66.21 | 69 | | Internlm-7B-Base | 54.55 | 58.29 | 79.36 | 78.95 | 77.83 | 70.59 | 65.86 | 75.93 | 65.99 | 70 | 71 | 72 | #### Five Shot 73 | 74 | | **ModelName** | plan | code | build | test | release | deploy | operate | monitor | **AVG** | 75 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:---------:| 76 | | DevOpsPal-14B-Chat | 63.64 | 79.49 | 81.65 | 85.96 | 86.79 | 86.67 | 72.95 | 81.48 | 79.69 | 77 | | DevOpsPal-14B-Base | 62.12 | 80.55 | 82.57 | 85.53 | 85.85 | 84.71 | 71.98 | 80.09 | 79.63 | 78 | | Qwen-14B-Chat | 65.15 | 76 | 82.57 | 85.53 | 84.91 | 84.31 | 70.85 | 81.48 | 77.81 | 79 | | Qwen-14B-Base | 66.67 | 76.15 | 84.4 | 85.53 | 86.32 | 80.39 | 72.46 | 80.56 | 77.56 | 80 | | Baichuan2-13B-Base | 63.64 | 71.39 | 80.73 | 82.46 | 81.13 | 84.31 | 73.75 | 85.19 | 75.8 | 81 | | Qwen-7B-Base | 75.76 | 72.52 | 78.9 | 81.14 | 83.96 | 81.18 | 70.37 | 81.94 | 75.36 | 82 | | Baichuan2-13B-Chat | 62.12 | 69.95 | 76.61 | 84.21 | 83.49 | 79.61 | 71.98 | 80.56 | 74.12 | 83 | | DevOpsPal-7B-Chat | 66.67 | 69.95 | 83.94 | 81.14 | 80.19 | 82.75 | 68.6 | 76.85 | 73.61 | 84 | | DevOpsPal-7B-Base | 69.7 
| 69.49 | 82.11 | 81.14 | 82.55 | 82.35 | 67.15 | 79.17 | 73.35 | 85 | | Qwen-7B-Chat | 65.15 | 66.54 | 82.57 | 81.58 | 81.6 | 81.18 | 65.38 | 81.02 | 71.69 | 86 | | Baichuan2-7B-Base | 60.61 | 67.22 | 76.61 | 75 | 77.83 | 78.43 | 67.31 | 79.63 | 70.8 | 87 | | Internlm-7B-Chat | 60.61 | 63.06 | 79.82 | 80.26 | 67.92 | 75.69 | 60.06 | 77.31 | 69.21 | 88 | | Baichuan2-7B-Chat | 60.61 | 64.95 | 81.19 | 75.88 | 71.23 | 75.69 | 64.9 | 79.17 | 69.05 | 89 | | Internlm-7B-Base | 62.12 | 65.25 | 77.52 | 80.7 | 74.06 | 78.82 | 63.45 | 75.46 | 67.17 | 90 | 91 | ### 🔥 AIOps 92 | 93 |
94 | 95 | #### Zero Shot 96 | | **ModelName** | LogParsing | RootCauseAnalysis | TimeSeriesAnomalyDetection | TimeSeriesClassification | TimeSeriesForecasting | **AVG** | 97 | |:-------------------:|:------------:|:------------------:|:---------------------------:|:-----------------------------------------:|:---------------------------:|:-------:| 98 | | Qwen-14B-Base | 66.29 | 58.8 | 25.33 | 43.5 | 62.5 | 52.25 | 99 | | DevOpsPal-14B—Base | 63.14 | 53.6 | 23.33 | 43.5 | 64.06 | 50.49 | 100 | | Qwen-14B-Chat | 64.57 | 51.6 | 22.67 | 36 | 62.5 | 48.94 | 101 | | DevOpsPal-14B—Chat | 60 | 56 | 24 | 43 | 57.81 | 48.8 | 102 | | Qwen-7B-Base | 50 | 39.2 | 22.67 | 54 | 43.75 | 41.48 | 103 | | DevOpsPal-7B—Chat | 56.57 | 30.4 | 25.33 | 45 | 44.06 | 40.92 | 104 | | Baichuan2-13B-Chat | 64 | 18 | 21.33 | 37.5 | 46.88 | 39.3 | 105 | | Qwen-7B-Chat | 57.43 | 38.8 | 22.33 | 39.5 | 25.31 | 36.97 | 106 | | Internlm-7B—Chat | 58.86 | 8.8 | 22.33 | 28.5 | 51.25 | 36.34 | 107 | | Baichuan2-7B-Chat | 60.86 | 10 | 28 | 34.5 | 39.06 | 36.34 | 108 | | Baichuan2-7B-Base | 53.43 | 12.8 | 27.67 | 36.5 | 40.31 | 35.49 | 109 | | Baichuan2-13B-Base | 54 | 12.4 | 23 | 34.5 | 42.81 | 34.86 | 110 | | DevOpsPal-7B—Base | 46.57 | 20.8 | 25 | 34 | 38.75 | 33.94 | 111 | | Internlm-7B—Base | 48.57 | 18.8 | 23.33 | 37.5 | 33.75 | 33.1 | 112 | 113 | #### One Shot 114 | | **ModelName** | LogParsing | RootCauseAnalysis | TimeSeriesAnomalyDetection | TimeSeriesClassification | TimeSeriesForecasting | **AVG** | 115 | |:-------------------:|:------------:|:------------------:|:---------------------------:|:-----------------------------------------:|:---------------------------:|:-------:| 116 | | DevOpsPal-14B—Chat | 66.29 | 80.8 | 23.33 | 44.5 | 56.25 | 54.44 | 117 | | DevOpsPal-14B—Base | 60 | 74 | 25.33 | 43.5 | 52.5 | 51.13 | 118 | | Qwen-14B-Base | 64.29 | 74.4 | 28 | 48.5 | 40.31 | 50.77 | 119 | | Qwen-7B-Base | 56 | 60.8 | 27.67 | 44 | 57.19 | 49.44 | 120 | | Qwen-14B-Chat | 49.71 | 65.6 | 28.67 | 48 | 42.19 | 46.13 | 121 | | Baichuan2-13B-Base | 56 | 43.2 | 24.33 | 41 | 46.88 | 42.89 | 122 | | Baichuan2-7B-Chat | 58.57 | 31.6 | 27 | 31.5 | 51.88 | 41.83 | 123 | | DevOpsPal-7B—Base | 52.86 | 44.4 | 28 | 44.5 | 36.25 | 41.2 | 124 | | Baichuan2-7B-Base | 48.29 | 40.4 | 27 | 42 | 40.94 | 39.86 | 125 | | Qwen-7B-Chat | 54.57 | 52 | 29.67 | 26.5 | 27.19 | 38.73 | 126 | | Baichuan2-13B-Chat | 57.43 | 44.4 | 25 | 25.5 | 30.63 | 37.75 | 127 | | DevOpsPal-7B—Chat | 56.57 | 27.2 | 25.33 | 41.5 | 33.44 | 37.46 | 128 | | Internlm-7B—Chat | 62.57 | 12.8 | 22.33 | 21 | 50.31 | 36.69 | 129 | | Internlm-7B—Base | 48 | 33.2 | 29 | 35 | 31.56 | 35.85 | 130 | 131 |
132 | 133 | 134 | ### 🔧 ToolLearning 135 |
136 | 137 | | **FuncCall-Filler** | dataset_name | fccr | 1-fcffr | 1-fcfnr | 1-fcfpr | 1-fcfnir | aar | 138 | |:-------------------:| :---: | :---: | :---: | :---: | :---: | :---: | :---: | 139 | | Qwen-14b-chat | luban | 61 | 100 | 97.68 | 63.32 | 100 | 69.46 | 140 | | Qwen-7b-chat | luban | 50.58 | 100 | 98.07 | 52.51 | 100 | 63.59 | 141 | | Baichuan-7b-chat | luban | 60.23 | 100 | 97.3 | 62.93 | 99.61 | 61.12 | 142 | | Internlm-chat-7b | luban | 47.88 | 100 | 96.14 | 51.74 | 99.61 | 61.85 | 143 | | Qwen-14b-chat | fc_data | 98.37 | 99.73 | 99.86 | 98.78 | 100 | 81.58 | 144 | | Qwen-7b-chat | fc_data | 99.46 | 99.86 | 100 | 99.59 | 100 | 79.25 | 145 | | Baichuan-7b-chat | fc_data | 97.96 | 99.32 | 100 | 98.64 | 100 | 89.53 | 146 | | Internlm-chat-7b | fc_data | 94.29 | 95.78 | 100 | 98.5 | 100 | 88.19 | 147 | | CodeLLaMa-7b | fc_data | 98.78 | 99.73 | 100 | 99.05 | 100 | 94.7 | 148 | | CodeLLaMa-7b-16 | fc_data | 98.1 | 99.87 | 99.73 | 98.5 | 100 | 93.14 | 149 | | CodeFuse-7b-4k | fc_data | 98.91 | 99.87 | 99.87 | 99.18 | 100 | 89.5 | 150 | 151 | 152 |
153 | 154 | 155 | ## ⏬ Data 156 | #### Download 157 | * Method 1: Download the zip file (you can also simply open the following link with the browser): 158 | ```bash 159 | wget https://huggingface.co/datasets/codefuse-admin/devopseval-exam/resolve/main/devopseval-exam.zip 160 | ``` 161 | then unzip it and load the data with pandas: 162 | ```python 163 | import os 164 | import pandas as pd 165 | 166 | File_Dir="devopseval-exam" 167 | test_df=pd.read_csv(os.path.join(File_Dir,"test","UnitTesting.csv")) 168 | ``` 169 | * Method 2: Directly load the dataset using [Hugging Face datasets](https://huggingface.co/datasets/codefuse-admin/devopseval-exam): 170 | ```python 171 | from datasets import load_dataset 172 | dataset=load_dataset(r"DevOps-Eval/devopseval-exam",name="UnitTesting") 173 | 174 | print(dataset['val'][0]) 175 | # {"id": 1, "question": "单元测试应该覆盖以下哪些方面?", "A": "正常路径", "B": "异常路径", "C": "边界值条件","D": 所有以上,"answer": "D", "explanation": ""} 176 | ``` 177 | * Method 3: Directly load the dataset using [ModelScope datasets](https://modelscope.cn/datasets/codefuse-ai/devopseval-exam/files): 178 | ```python 179 | from modelscope.msdatasets import MsDataset 180 | MsDataset.clone_meta(dataset_work_dir='./xxx', dataset_id='codefuse-ai/devopseval-exam') 181 | ``` 182 | 183 | #### 👀 Notes 184 | To facilitate usage, we have organized the file names, English/Chinese names, sample counts, and supercategory labels of the 55 subcategories. Please refer to [category_mapping.json](resources/categroy_mapping.json) for details. The format is: 185 | 186 | ``` 187 | { 188 | "UnitTesting.csv": [ 189 | "unit testing", 190 | "单元测试", 191 | {"dev": 5, "test": 32}, 192 | "TEST" 193 | ], 194 | ... 195 | "file_name":[ 196 | "English Name", 197 | "Chinese Name", 198 | "Sample Number", 199 | "Supercategory Label (choose 1 out of 8: PLAN, CODE, BUILD, TEST, RELEASE, DEPLOY, OPERATE, MONITOR)" 200 | ] 201 | } 202 | ``` 203 | Each subcategory consists of two splits: dev and test. The dev split of each subcategory provides five exemplars with explanations for few-shot evaluation, while the test split is used for model evaluation. Labels on the test split are also released. 204 | 205 | Below is a dev example from 'version control': 206 | 207 | ``` 208 | id: 4 209 | question: 如何找到Git特定提交中已更改的文件列表?
210 | A: 使用命令 `git diff --name-only SHA` 211 | B: 使用命令 `git log --name-only SHA` 212 | C: 使用命令 `git commit --name-only SHA` 213 | D: 使用命令 `git clone --name-only SHA` 214 | answer: A 215 | explanation: 216 | 分析原因: 217 | git diff --name-only SHA命令会显示与SHA参数对应的提交中已修改的文件列表。参数--name-only让命令只输出文件名,而忽略其他信息。其它选项中的命令并不能实现此功能。 218 | ``` 219 | #### 🔥 AIOps Sample Example 220 | 👀 👀 Taking **log parsing** and **time series anomaly detection** as examples, here is a brief showcase of the AIOps samples: 221 | 222 | LogParsing 223 | ``` 224 | id: 0 225 | question: 226 | Here are some running logs 227 | 0 04:21:15,429 WARN Cannot open channel to 2 at election address /10.10.34.12:3888 228 | 1 19:18:56,377 WARN ******* GOODBYE /10.10.34.11:52703 ******** 229 | 2 19:13:46,128 WARN ******* GOODBYE /10.10.34.11:52308 ******** 230 | 3 19:16:26,268 WARN ******* GOODBYE /10.10.34.11:52502 ******** 231 | 4 09:11:16,012 WARN Cannot open channel to 3 at election address /10.10.34.13:3888 232 | 5 16:37:13,837 WARN Cannot open channel to 2 at election address /10.10.34.12:3888 233 | 6 09:09:16,008 WARN Cannot open channel to 3 at election address /10.10.34.13:3888 234 | 7 15:27:03,681 WARN Cannot open channel to 3 at election address /10.10.34.13:3888 235 | The first three parts of the log are index, timestamp, and log level. Without considering these three parts, Here we assume that the variables in the logs are represented as '<*>', separated by spaces between tokens. What is the specific log template for the above logs? 236 | A: Notification time out: <*> 和 Connection broken for id <*>, my id = <*>, error = 237 | B: Send worker leaving thread 和 Connection broken for id <*>, my id = <*>, error = 238 | C: Received connection request /<*>:<*> 和 Interrupting SendWorker 239 | D: Cannot open channel to <*> at election address /<*>:<*> 和 ******* GOODBYE /<*>:<*> ******** 240 | answer: D 241 | explanation: The log includes the fixed template fragments "Cannot open channel to <> at election address /<>:<>" and "****** GOODBYE /<>:<> ********," both of which appear in option D. Meanwhile, the template fragments in the other options do not match the content in the log. Therefore, option D is the most consistent with the log template. 242 | ``` 243 | TimeSeriesAnomalyDetection 244 | ``` 245 | id: 0 246 | question: 247 | Analyze the following time series 248 | [50,62,74,84,92,97,99,98,94,87,77,65,265,40,28,17,8,3,0,0,4,10,20,31,43,56,68,79,89,95,99,99,96,91,82,71,59,46,34,22,12,5,1,0,2,7,15,25,37,49] 249 | Please identify the indices of obvious outlier points. Outlier points generally refer to points that significantly deviate from the overall trend of the data. 250 | A: 46 251 | B: 0 252 | C: 37 253 | D: 12 254 | answer: D 255 | explanation: According to the analysis, the value 265 in the given time series at 12 o'clock is significantly larger than the surrounding data, indicating a sudden increase phenomenon. Therefore, selecting option D is correct. 256 | ``` 257 | #### 🔧 ToolLearning Sample Example 258 | 259 | 👀 👀The data format of ToolLearning samples is compatible with OpenAI's Function Calling. 260 | 261 | Please refer to [tool_learning_info.md](resources/tool_learning_info.md) for details. 262 |
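To make the format concrete, here is a small, hypothetical sample sketched against the `functions`/`chatrounds` structure consumed by the evaluation code (see [tool_learning_evalution.md](resources/tool_learning_evalution.md)). The tool, arguments, and replies below are invented for illustration, and [tool_learning_info.md](resources/tool_learning_info.md) remains the authoritative reference:

```python
# Hypothetical ToolLearning sample in the OpenAI function-calling style.
# Field names follow the functions/chatrounds structure used by create_prompts;
# the concrete tool, arguments, and replies are invented for illustration only.
sample = {
    "functions": [
        {
            "name": "get_service_status",  # made-up tool
            "description": "Query the health status of a given service",
            "parameters": {
                "type": "object",
                "properties": {
                    "service": {"type": "string", "description": "service name"}
                },
                "required": ["service"],
            },
        }
    ],
    "chatrounds": [
        {"role": "system", "content": "You are a DevOps assistant."},
        {"role": "user", "content": "Is the payment service healthy right now?"},
        {
            "role": "assistant",
            "content": "",
            # arguments encoded as a JSON string, following OpenAI's convention
            "function_call": {
                "name": "get_service_status",
                "arguments": "{\"service\": \"payment\"}",
            },
        },
        {"role": "function", "name": "get_service_status", "content": "{\"status\": \"healthy\"}"},
        {"role": "assistant", "content": "The payment service is currently healthy."},
    ],
}
```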
263 | 264 | ## 🚀 How to Evaluate 265 | If you need to test your own huggingface-formatted model, the overall steps are as follows: 266 | 1. Write the loader function for the model. 267 | 2. Write the context_builder function for the model. 268 | 3. Register the model in the configuration file. 269 | 4. Run the testing script. 270 | If the model does not require any special processing after loading, and the input does not need to be converted to a specific format (e.g. chatml format or other human-bot formats), you can proceed directly to step 4 and start the test. 271 | 272 | #### 1. Write the loader function 273 | If the model requires additional processing after loading (e.g. adjusting the tokenizer), you need to inherit the `ModelAndTokenizerLoader` class in `src.model_and_tokenizer_loader.model_and_tokenizer_loader_family.py` and override the corresponding `load_model` and `load_tokenizer` functions. You can refer to the following example: 274 | ```python 275 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader): 276 | def __init__(self): 277 | super().__init__() 278 | pass 279 | 280 | @override 281 | def load_model(self, model_path: str): 282 | # Implementation of the method 283 | pass 284 | 285 | @override 286 | def load_tokenizer(self, model_path: str): 287 | # Implementation of the method 288 | pass 289 | ``` 290 | 291 | #### 2. Write the context_builder function for the model 292 | If the input needs to be converted to a specific format (e.g. chatml format or other human-bot formats), you need to inherit the ContextBuilder class in `src.context_builder.context_builder_family` and override the make_context function, which converts the input into the format the model expects (a filled-in illustration is given at the end of this section). The skeleton looks as follows: 293 | ```python 294 | class QwenChatContextBuilder(ContextBuilder): 295 | def __init__(self): 296 | super().__init__() 297 | 298 | @override 299 | def make_context(self, model, tokenizer, query: str, system: str = "hello!"): 300 | # Implementation of the method 301 | pass 302 | ``` 303 | 304 | #### 3. Register the model in the configuration file 305 | Go to the `model_conf.json` file in the conf directory and register the model name together with the loader and context_builder it should use; for these, simply write the class names defined in steps 1 and 2. Here is an example: 306 | ```json 307 | { 308 | "Qwen-Chat": { 309 | "loader": "QwenModelAndTokenizerLoader", 310 | "context_builder": "QwenChatContextBuilder" 311 | } 312 | } 313 | ``` 314 | 315 | #### 4. Execute the testing script 316 | Run the following command to start the evaluation: 317 | ```bash 318 | python src/run_eval.py \ 319 | --model_path path_to_model \ 320 | --model_name model_name_in_conf \ 321 | --model_conf_path path_to_model_conf \ 322 | --eval_dataset_list all \ 323 | --eval_dataset_fp_conf_path path_to_dataset_conf \ 324 | --eval_dataset_type test \ 325 | --data_path path_to_downloaded_devops_eval_data \ 326 | --k_shot 0 327 | ``` 328 | 👀 👀 The specific evaluation process is described in the 📖 [**Evaluate Tutorial**](resources/tutorial.md) 329 | 330 |
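As a filled-in illustration of step 2, here is a minimal sketch of what a chat-style `make_context` might look like. The import path is assumed from the repository layout, the chatml-style markers are assumptions for demonstration only, and the return value is assumed to be the prompt string handed to the model; substitute the template your model was actually trained with:

```python
from src.context_builder.context_builder_family import ContextBuilder  # assumed import path


class MyChatContextBuilder(ContextBuilder):
    """Illustrative only: wraps a plain query into an assumed chatml-style template."""

    def make_context(self, model, tokenizer, query: str, system: str = "you are a helpful assistant"):
        # Build the single prompt string used for generation during evaluation.
        return (
            f"<|im_start|>system\n{system}<|im_end|>\n"
            f"<|im_start|>user\n{query}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
```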
331 | 332 | ## 🧭 TODO 333 | - [x] add AIOps samples. 334 | - [x] add the AIOps scenario **time series forecasting**. 335 | - [x] add **ToolLearning** samples. 336 | - [ ] increase the sample size. 337 | - [ ] add samples with the difficulty level set to hard. 338 | - [ ] add the English version of the samples. 339 |
340 |
341 | 342 | 343 | ## 🏁 Licenses 344 | This project is licensed under the [Apache License (Version 2.0)](LICENSE.md). 345 |
346 |
347 | 348 | ## 😃 Citation 349 | 350 | Please cite our paper if you use our dataset. 351 | 352 | Coming Soon... 353 |
354 |
355 | 356 | ## 🗂 Miscellaneous 357 | 358 | ### 📱 Contact Us 359 |
360 | 图片 361 |
362 | 363 | ### ✨ Star History 364 | [![Star History Chart](https://api.star-history.com/svg?repos=codefuse-ai/codefuse-devops-eval&type=Date)](https://star-history.com/#codefuse-ai/codefuse-devops-eval&Date) 365 | 366 | ### 🤝 Friendship Links 367 | - [Codefuse-ChatBot](https://github.com/codefuse-ai/codefuse-chatbot) 368 | - Codefuse-ChatBot is an open-source AI smart assistant designed to support the software development lifecycle with conversational access to tools, knowledge, and platform integration. 369 | - [Awesome AIGC Tutorials](https://github.com/luban-agi/Awesome-AIGC-Tutorials) 370 | - Awesome AIGC Tutorials houses a curated collection of tutorials and resources spanning across Large Language Models, AI Painting, and related fields. 371 | 372 | -------------------------------------------------------------------------------- /README_zh.md: -------------------------------------------------------------------------------- 1 |

2 | 3 |

4 | 🤗 Hugging Face • ⏬ 数据 • 📖 教程 5 |
6 | English | 中文 7 |

8 | 9 | DevOps-Eval是一个专门为DevOps领域大模型设计的综合评估数据集。我们希望DevOps-Eval能够帮助开发者,尤其是DevOps领域的开发者,追踪进展并分析他们拥有的DevOps大模型的优势和不足之处。 10 | 11 | 📚 该仓库包含与DevOps和AIOps相关的问题和练习,还添加了ToolLearning相关的样本。 12 | 13 | 💥 目前有 **7486** 个多项选择题,根据DevOps的通用流程将其归纳为8个模块,如[下图](images/data_info.png)所示。 14 | 15 | 🔥 AIOps样本总计 **2840** 个,覆盖的场景包括**日志解析**、**时序异常检测**、**时序分类**、**时序预测**和**根因分析**。 16 | 17 | 🔧 ToolLearning样本 **1509** 个,涵盖59个领域,总计 239 种工具类别。 18 | 19 |

20 | 21 | 22 | ## 🔔 更新 23 | * **[2023.12.27]** 新增1509个ToolLearning样本,发布了相应的评测排行榜 24 | * **[2023.11.27]** 增加运维场景样本487例、时序预测样本640例;同步更新评测排行榜 25 | * **[2023.10.30]** 增加针对AIOps场景的评测排行榜 26 | * **[2023.10.25]** 增加AIOps样本,包含日志解析、时序异常检测、时序分类和根因分析 27 | * **[2023.10.18]** DevOps-Eval发布大模型评测排行版 28 |
29 | 30 | ## 📜 目录 31 | 32 | - [🏆 排行榜](#-排行榜) 33 | - [👀 DevOps](#-devops) 34 | - [🔥 AIOps](#-aiops) 35 | - [🔧 ToolLearning](#-toollearning) 36 | - [⏬ 数据](#-数据) 37 | - [👀 说明](#-说明) 38 | - [🔥 AIOps样本示例](#-AIOps样本示例) 39 | - [🔧 ToolLearning样本示例](#-toollearning样本示例) 40 | - [🚀 如何进行测试](#-如何进行测试) 41 | - [🧭 TODO](#-todo) 42 | - [🏁 Licenses](#-licenses) 43 | - [😃 引用](#-引用) 44 | - [🗂 Miscellaneous](#-miscellaneous) 45 | - [✨ Star History](#-star-history) 46 | - [🤝 Friendship Links](#-friendship-links) 47 | 48 | ## 🏆 排行榜 49 | 以下是我们获得的初版评测结果,包括多个开源模型的zero-shot和five-shot准确率。我们注意到,对于大多数指令模型来说,five-shot的准确率要优于zero-shot。 50 | 51 | ### 👀 DevOps 52 | #### Zero Shot 53 | 54 | | **模型** | plan | code | build | test | release | deploy | operate | monitor | **平均分** | 55 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:---------:| 56 | | DevOpsPal-14B-Chat | 60.61 | 78.35 | 84.86 | 84.65 | 87.26 | 82.75 | 69.89 | 79.17 | 78.23 | 57 | | DevOpsPal-14B-Base | 54.55 | 77.82 | 83.49 | 85.96 | 86.32 | 81.96 | 71.18 | 82.41 | 78.23 | 58 | | Qwen-14B-Chat | 60.61 | 75.4 | 85.32 | 84.21 | 89.62 | 82.75 | 69.57 | 80.56 | 77.18 | 59 | | Qwen-14B-Base | 57.58 | 73.81 | 84.4 | 85.53 | 86.32 | 81.18 | 70.05 | 80.09 | 76.19 | 60 | | Baichuan2-13B-Base | 60.61 | 69.42 | 79.82 | 79.82 | 82.55 | 81.18 | 70.37 | 83.8 | 73.73 | 61 | | Baichuan2-13B-Chat | 60.61 | 68.43 | 77.98 | 80.7 | 81.6 | 83.53 | 67.63 | 84.72 | 72.9 | 62 | | DevOpsPal-7B-Chat | 54.55 | 69.11 | 83.94 | 82.02 | 76.89 | 80 | 64.73 | 77.78 | 71.92 | 63 | | DevOpsPal-7B-Base | 54.55 | 68.96 | 82.11 | 78.95 | 80.66 | 76.47 | 65.54 | 78.7 | 71.69 | 64 | | Qwen-7B-Base | 53.03 | 68.13 | 78.9 | 75.44 | 80.19 | 80 | 65.06 | 80.09 | 71.09 | 65 | | Qwen-7B-Chat | 57.58 | 66.01 | 80.28 | 79.82 | 76.89 | 77.65 | 62.64 | 79.17 | 69.75 | 66 | | Baichuan2-7B-Chat | 54.55 | 63.66 | 77.98 | 76.32 | 71.7 | 73.33 | 59.42 | 79.63 | 66.97 | 67 | | Internlm-7B-Chat | 60.61 | 62.15 | 77.06 | 76.32 | 66.98 | 74.51 | 60.39 | 78.24 | 66.27 | 68 | | Baichuan2-7B-Base | 56.06 | 62.45 | 75.69 | 70.61 | 74.06 | 69.8 | 61.67 | 75.93 | 66.21 | 69 | | Internlm-7B-Base | 54.55 | 58.29 | 79.36 | 78.95 | 77.83 | 70.59 | 65.86 | 75.93 | 65.99 | 70 | 71 | 72 | #### Five Shot 73 | 74 | | **模型** | plan | code | build | test | release | deploy | operate | monitor | **平均分** | 75 | |:------------------------:|:-----:|:-----:|:-----:|:------:|:--------:|:------:|:-------:|:--------:|:---------:| 76 | | DevOpsPal-14B-Chat | 63.64 | 79.49 | 81.65 | 85.96 | 86.79 | 86.67 | 72.95 | 81.48 | 79.69 | 77 | | DevOpsPal-14B-Base | 62.12 | 80.55 | 82.57 | 85.53 | 85.85 | 84.71 | 71.98 | 80.09 | 79.63 | 78 | | Qwen-14B-Chat | 65.15 | 76 | 82.57 | 85.53 | 84.91 | 84.31 | 70.85 | 81.48 | 77.81 | 79 | | Qwen-14B-Base | 66.67 | 76.15 | 84.4 | 85.53 | 86.32 | 80.39 | 72.46 | 80.56 | 77.56 | 80 | | Baichuan2-13B-Base | 63.64 | 71.39 | 80.73 | 82.46 | 81.13 | 84.31 | 73.75 | 85.19 | 75.8 | 81 | | Qwen-7B-Base | 75.76 | 72.52 | 78.9 | 81.14 | 83.96 | 81.18 | 70.37 | 81.94 | 75.36 | 82 | | Baichuan2-13B-Chat | 62.12 | 69.95 | 76.61 | 84.21 | 83.49 | 79.61 | 71.98 | 80.56 | 74.12 | 83 | | DevOpsPal-7B-Chat | 66.67 | 69.95 | 83.94 | 81.14 | 80.19 | 82.75 | 68.6 | 76.85 | 73.61 | 84 | | DevOpsPal-7B-Base | 69.7 | 69.49 | 82.11 | 81.14 | 82.55 | 82.35 | 67.15 | 79.17 | 73.35 | 85 | | Qwen-7B-Chat | 65.15 | 66.54 | 82.57 | 81.58 | 81.6 | 81.18 | 65.38 | 81.02 | 71.69 | 86 | | Baichuan2-7B-Base | 60.61 | 67.22 | 76.61 | 75 | 77.83 | 78.43 | 67.31 | 79.63 | 70.8 | 87 | | 
Internlm-7B-Chat | 60.61 | 63.06 | 79.82 | 80.26 | 67.92 | 75.69 | 60.06 | 77.31 | 69.21 | 88 | | Baichuan2-7B-Chat | 60.61 | 64.95 | 81.19 | 75.88 | 71.23 | 75.69 | 64.9 | 79.17 | 69.05 | 89 | | Internlm-7B-Base | 62.12 | 65.25 | 77.52 | 80.7 | 74.06 | 78.82 | 63.45 | 75.46 | 67.17 | 90 | 91 | 92 | ### 🔥 AIOps 93 | 94 |
95 | 96 | #### Zero Shot 97 | | **模型** | 日志解析 | 根因分析 | 时序异常检测 | 时序分类 | 时序预测 | **平均分** | 98 | |:-------------------:|:-----:|:----:|:------:|:----:|:-----:|:-------:| 99 | | Qwen-14B-Base | 66.29 | 58.8 | 25.33 | 43.5 | 62.5 | 52.25 | 100 | | DevOpsPal-14B—Base | 63.14 | 53.6 | 23.33 | 43.5 | 64.06 | 50.49 | 101 | | Qwen-14B-Chat | 64.57 | 51.6 | 22.67 | 36 | 62.5 | 48.94 | 102 | | DevOpsPal-14B—Chat | 60 | 56 | 24 | 43 | 57.81 | 48.8 | 103 | | Qwen-7B-Base | 50 | 39.2 | 22.67 | 54 | 43.75 | 41.48 | 104 | | DevOpsPal-7B—Chat | 56.57 | 30.4 | 25.33 | 45 | 44.06 | 40.92 | 105 | | Baichuan2-13B-Chat | 64 | 18 | 21.33 | 37.5 | 46.88 | 39.3 | 106 | | Qwen-7B-Chat | 57.43 | 38.8 | 22.33 | 39.5 | 25.31 | 36.97 | 107 | | Internlm-7B—Chat | 58.86 | 8.8 | 22.33 | 28.5 | 51.25 | 36.34 | 108 | | Baichuan2-7B-Chat | 60.86 | 10 | 28 | 34.5 | 39.06 | 36.34 | 109 | | Baichuan2-7B-Base | 53.43 | 12.8 | 27.67 | 36.5 | 40.31 | 35.49 | 110 | | Baichuan2-13B-Base | 54 | 12.4 | 23 | 34.5 | 42.81 | 34.86 | 111 | | DevOpsPal-7B—Base | 46.57 | 20.8 | 25 | 34 | 38.75 | 33.94 | 112 | | Internlm-7B—Base | 48.57 | 18.8 | 23.33 | 37.5 | 33.75 | 33.1 | 113 | 114 | #### One Shot 115 | | **模型** | 日志解析 | 根因分析 | 时序异常检测 | 时序分类 | 时序预测 | **平均分** | 116 | |:-------------------:|:-----:|:----:|:------:|:----:|:-----:|:-------:| 117 | | DevOpsPal-14B—Chat | 66.29 | 80.8 | 23.33 | 44.5 | 56.25 | 54.44 | 118 | | DevOpsPal-14B—Base | 60 | 74 | 25.33 | 43.5 | 52.5 | 51.13 | 119 | | Qwen-14B-Base | 64.29 | 74.4 | 28 | 48.5 | 40.31 | 50.77 | 120 | | Qwen-7B-Base | 56 | 60.8 | 27.67 | 44 | 57.19 | 49.44 | 121 | | Qwen-14B-Chat | 49.71 | 65.6 | 28.67 | 48 | 42.19 | 46.13 | 122 | | Baichuan2-13B-Base | 56 | 43.2 | 24.33 | 41 | 46.88 | 42.89 | 123 | | Baichuan2-7B-Chat | 58.57 | 31.6 | 27 | 31.5 | 51.88 | 41.83 | 124 | | DevOpsPal-7B—Base | 52.86 | 44.4 | 28 | 44.5 | 36.25 | 41.2 | 125 | | Baichuan2-7B-Base | 48.29 | 40.4 | 27 | 42 | 40.94 | 39.86 | 126 | | Qwen-7B-Chat | 54.57 | 52 | 29.67 | 26.5 | 27.19 | 38.73 | 127 | | Baichuan2-13B-Chat | 57.43 | 44.4 | 25 | 25.5 | 30.63 | 37.75 | 128 | | DevOpsPal-7B—Chat | 56.57 | 27.2 | 25.33 | 41.5 | 33.44 | 37.46 | 129 | | Internlm-7B—Chat | 62.57 | 12.8 | 22.33 | 21 | 50.31 | 36.69 | 130 | | Internlm-7B—Base | 48 | 33.2 | 29 | 35 | 31.56 | 35.85 | 131 | 132 |
133 | 134 | ### 🔧 ToolLearning 135 |
136 | 137 | | **FuncCall-Filler** | dataset_name | fccr | 1-fcffr | 1-fcfnr | 1-fcfpr | 1-fcfnir | aar | 138 | |:-------------------:| :---: | :---: | :---: | :---: | :---: | :---: | :---: | 139 | | Qwen-14b-chat | luban | 61 | 100 | 97.68 | 63.32 | 100 | 69.46 | 140 | | Qwen-7b-chat | luban | 50.58 | 100 | 98.07 | 52.51 | 100 | 63.59 | 141 | | Baichuan-7b-chat | luban | 60.23 | 100 | 97.3 | 62.93 | 99.61 | 61.12 | 142 | | Internlm-chat-7b | luban | 47.88 | 100 | 96.14 | 51.74 | 99.61 | 61.85 | 143 | | Qwen-14b-chat | fc_data | 98.37 | 99.73 | 99.86 | 98.78 | 100 | 81.58 | 144 | | Qwen-7b-chat | fc_data | 99.46 | 99.86 | 100 | 99.59 | 100 | 79.25 | 145 | | Baichuan-7b-chat | fc_data | 97.96 | 99.32 | 100 | 98.64 | 100 | 89.53 | 146 | | Internlm-chat-7b | fc_data | 94.29 | 95.78 | 100 | 98.5 | 100 | 88.19 | 147 | | CodeLLaMa-7b | fc_data | 98.78 | 99.73 | 100 | 99.05 | 100 | 94.7 | 148 | | CodeLLaMa-7b-16 | fc_data | 98.1 | 99.87 | 99.73 | 98.5 | 100 | 93.14 | 149 | | CodeFuse-7b-4k | fc_data | 98.91 | 99.87 | 99.87 | 99.18 | 100 | 89.5 | 150 | 151 |
152 | 153 | 154 | ## ⏬ 数据 155 | #### 下载 156 | * 方法一:下载zip压缩文件(你也可以直接用浏览器打开下面的链接): 157 | ``` 158 | wget https://huggingface.co/datasets/codefuse-admin/devopseval-exam/resolve/main/devopseval-exam.zip 159 | ``` 160 | 然后可以使用 pandas加载数据: 161 | 162 | ``` 163 | import os 164 | import pandas as pd 165 | 166 | File_Dir="devopseval-exam" 167 | test_df=pd.read_csv(os.path.join(File_Dir,"test","UnitTesting.csv")) 168 | ``` 169 | * 方法二:使用[Hugging Face datasets](https://huggingface.co/datasets/codefuse-admin/devopseval-exam)直接加载数据集。示例如下: 170 | ```python 171 | from datasets import load_dataset 172 | dataset=load_dataset(r"DevOps-Eval/devopseval-exam",name="UnitTesting") 173 | 174 | print(dataset['val'][0]) 175 | # {"id": 1, "question": "单元测试应该覆盖以下哪些方面?", "A": "正常路径", "B": "异常路径", "C": "边界值条件","D": 所有以上,"answer": "D", "explanation": ""} ``` 176 | 177 | * 方法三:使用modelscope下载相关所有数据。示例如下: 178 | ```python 179 | from modelscope.msdatasets import MsDataset 180 | MsDataset.clone_meta(dataset_work_dir='./xxx', dataset_id='codefuse-ai/devopseval-exam') 181 | ``` 182 | 183 | #### 👀 说明 184 | 为了方便使用,我们已经整理出了 55 个细分类别以及它们的中英文名称。具体细节请查看 [category_mapping.json](resources/categroy_mapping.json) 。格式如下: 185 | 186 | ``` 187 | { 188 | "UnitTesting.csv": [ 189 | "unit testing", 190 | "单元测试", 191 | {"dev": 5, "test": 32} 192 | "TEST" 193 | ], 194 | ... 195 | "file_name":[ 196 | "英文名称", 197 | "中文名称", 198 | "样本数量", 199 | "类别(PLAN,CODE,BUILD,TEST,RELEASE,DEPOLY,OPERATE,MONITOR八选一)" 200 | ] 201 | } 202 | ``` 203 | 每个细分类别由两个部分组成:dev 和 test。每个细分类别的 dev 集包含五个示范实例以及为 few-shot 评估提供的解释。而 test 集则用于模型评估,并且test数据已包含准确标签。 204 | 205 | 下面是 dev 数据的示例,来自"版本控制"细分类别: 206 | ``` 207 | id: 4 208 | question: 如何找到Git特定提交中已更改的文件列表? 209 | A: 使用命令 `git diff --name-only SHA` 210 | B: 使用命令 `git log --name-only SHA` 211 | C: 使用命令 `git commit --name-only SHA` 212 | D: 使用命令 `git clone --name-only SHA` 213 | answer: A 214 | explanation: 215 | 分析原因: 216 | git diff --name-only SHA命令会显示与SHA参数对应的提交中已修改的文件列表。参数--name-only让命令只输出文件名,而忽略其他信息。其它选项中的命令并不能实现此功能。 217 | ``` 218 | #### 🔥 AIOps样本示例 219 | 👀 👀 此处以日志解析和时序异常检测为例,对AIOps样本做一些简要的展示: 220 | 221 | 日志解析 222 | ``` 223 | id: 0 224 | question: 225 | 下面是一些运行日志 226 | 0 04:21:15,429 WARN Cannot open channel to 2 at election address /10.10.34.12:3888 227 | 1 19:18:56,377 WARN ******* GOODBYE /10.10.34.11:52703 ******** 228 | 2 19:13:46,128 WARN ******* GOODBYE /10.10.34.11:52308 ******** 229 | 3 19:16:26,268 WARN ******* GOODBYE /10.10.34.11:52502 ******** 230 | 4 09:11:16,012 WARN Cannot open channel to 3 at election address /10.10.34.13:3888 231 | 5 16:37:13,837 WARN Cannot open channel to 2 at election address /10.10.34.12:3888 232 | 6 09:09:16,008 WARN Cannot open channel to 3 at election address /10.10.34.13:3888 233 | 7 15:27:03,681 WARN Cannot open channel to 3 at election address /10.10.34.13:3888 234 | 日志最前面三部分别为序号、时间戳和日志Level,在不考虑这三部分内容的情况下,此处我们设定日志的变量用'<*>'代替,token与token之间用空格分隔,那么请问上述日志的日志模版具体是什么? 
235 | A: Notification time out: <*> 和 Connection broken for id <*>, my id = <*>, error = 236 | B: Send worker leaving thread 和 Connection broken for id <*>, my id = <*>, error = 237 | C: Received connection request /<*>:<*> 和 Interrupting SendWorker 238 | D: Cannot open channel to <*> at election address /<*>:<*> 和 ******* GOODBYE /<*>:<*> ******** 239 | answer: D 240 | explanation: 根据日志中的内容,选项D是最符合日志模板的。日志中包含了"Cannot open channel to <*> at election address /<*>:<*>"和"******* GOODBYE /<*>:<*> ********"这两个固定的模板片段,它们都在选项D中出现了。同时,其他选项中的模板片段与日志中的内容不匹配。因此,选项D是最符合日志模板的。 241 | ``` 242 | 时序异常检测 243 | ``` 244 | id: 0 245 | question: 246 | 分析如下时间序列 247 | [50,62,74,84,92,97,99,98,94,87,77,65,265,40,28,17,8,3,0,0,4,10,20,31,43,56,68,79,89,95,99,99,96,91,82,71,59,46,34,22,12,5,1,0,2,7,15,25,37,49] 248 | 请找出其中明显异常点的下标。所谓的异常点一般指的是明显与数据整体趋势不符的点。 249 | A: 46 250 | B: 0 251 | C: 37 252 | D: 12 253 | answer: D 254 | explanation: 根据分析,题目中的时间序列在12点出的值265要明显大于周围数据,存在着突增现象,因此选择D是正确的。 255 | ``` 256 | #### 🔧 ToolLearning样本示例 257 | 工具学习样本的数据格式与OpenAI的函数调用格式兼容。 258 | 详情请参阅[tool_learning_info_zh.md](resources/tool_learning_info_zh.md)。 259 | 工具学习评测过程,详情请参阅见 [tool_learning_evalution.md](resources/tool_learning_evalution.md)。 260 |
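为了更直观地了解格式,下面给出一个假设性的样本草图,字段沿用评测代码所使用的 `functions` / `chatrounds` 结构(见 [tool_learning_evalution.md](resources/tool_learning_evalution.md));其中的工具名、参数与回复均为虚构示例,权威格式请以 [tool_learning_info_zh.md](resources/tool_learning_info_zh.md) 为准:

```python
# 仅作示意:字段名沿用 functions / chatrounds 结构,具体工具与取值均为虚构。
sample = {
    "functions": [
        {
            "name": "get_service_status",  # 虚构的工具
            "description": "查询指定服务的健康状态",
            "parameters": {
                "type": "object",
                "properties": {
                    "service": {"type": "string", "description": "服务名称"}
                },
                "required": ["service"],
            },
        }
    ],
    "chatrounds": [
        {"role": "system", "content": "你是一个DevOps助手。"},
        {"role": "user", "content": "支付服务现在是否健康?"},
        {
            "role": "assistant",
            "content": "",
            # arguments 按 OpenAI 约定编码为 JSON 字符串
            "function_call": {
                "name": "get_service_status",
                "arguments": "{\"service\": \"payment\"}",
            },
        },
        {"role": "function", "name": "get_service_status", "content": "{\"status\": \"healthy\"}"},
        {"role": "assistant", "content": "支付服务当前运行正常。"},
    ],
}
```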
261 | 262 | ## 🚀 如何进行测试 263 | 如果需要在自己的 HuggingFace 格式的模型上进行测试的话,总的步骤分为如下几步: 264 | 1. 编写 Model 的 loader 函数 265 | 2. 编写 Model 的 context_builder 函数 266 | 3. 注册模型到配置文件中 267 | 4. 执行测试脚本 268 | 如果模型在加载进来后不需要特殊的处理,而且输入也不需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),请直接跳转到第四步直接发起测试。 269 | 270 | #### 1. 编写 loader 函数 271 | 模型加载时还需要做一些额外的处理(e.g. tokenizer 调整),需要继承 `ModelAndTokenizerLoader` 类来覆写对应的 `load_model` 和 `load_tokenizer` 函数, 如下所示: 272 | ```python 273 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader): 274 | def __init__(self): 275 | super().__init__() 276 | pass 277 | 278 | @override 279 | def load_model(self, model_path: str): 280 | # Implementation of the method 281 | pass 282 | 283 | @override 284 | def load_tokenizer(self, model_path: str): 285 | # Implementation of the method 286 | pass 287 | ``` 288 | #### 2. 编写 Model 的 context_builder 函数 289 | 如果输入需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),则需要继承 ContextBuilder 类来覆写 make_context 函数,如下所示: 290 | ```python 291 | class QwenChatContextBuilder(ContextBuilder): 292 | def __init__(self): 293 | super().__init__() 294 | 295 | @override 296 | def make_context(self, model, tokenizer, query: str, system: str = "hello!"): 297 | # Implementation of the method 298 | pass 299 | ``` 300 | #### 3. 注册模型到配置文件中 301 | 去 conf 中的 `model_conf.json`,注册对应的模型名和这个模型将要使用的 loader 和 context_builder,示例如下: 302 | ```json 303 | { 304 | "Qwen-Chat": { 305 | "loader": "QwenModelAndTokenizerLoader", 306 | "context_builder": "QwenChatContextBuilder" 307 | } 308 | } 309 | ``` 310 | 311 | #### 4. 执行测试脚本 312 | 直接运行以下代码发起测试 313 | ```Bash 314 | python src/run_eval.py \ 315 | --model_path path_to_model \ 316 | --model_name model_name_in_conf \ 317 | --model_conf_path path_to_model_conf \ 318 | --eval_dataset_list all \ 319 | --eval_dataset_fp_conf_path path_to_dataset_conf \ 320 | --eval_dataset_type test \ 321 | --data_path path_to_downloaded_devops_eval_data \ 322 | --k_shot 0 323 | ``` 324 | 👀 👀 具体评测流程见📖 [**数据集评测教程**](resources/tutorial_zh.md) 325 |
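作为第 2 步的一个补充示意,下面给出一个极简的 `make_context` 草图;其中的 import 路径按仓库目录结构假定,chatml 风格的标记仅为演示用的假设,返回值也假定为直接交给模型的 prompt 字符串,请替换为你的模型实际使用的对话模板:

```python
from src.context_builder.context_builder_family import ContextBuilder  # import 路径为假定值


class MyChatContextBuilder(ContextBuilder):
    """仅作示意:将原始 query 包装成假定的 chatml 风格模板。"""

    def make_context(self, model, tokenizer, query: str, system: str = "you are a helpful assistant"):
        # 返回评测时用于生成的完整 prompt 字符串
        return (
            f"<|im_start|>system\n{system}<|im_end|>\n"
            f"<|im_start|>user\n{query}<|im_end|>\n"
            f"<|im_start|>assistant\n"
        )
```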
326 | 327 | ## 🧭 TODO 328 | - [x] 添加AIOps样本 329 | - [x] 添加AIOps场景,比如**时序预测** 330 | - [x] 增加 **ToolLearning** 样本 331 | - [ ] 当前各类别样本量不平均,后续进一步增加样本数量 332 | - [ ] 增加困难程度的样本集 333 | - [ ] 增加样本的英文版本 334 | 335 |
336 |
337 | 338 | ## 🏁 Licenses 339 | This project is licensed under the [Apache License (Version 2.0)](LICENSE.md). 340 | 341 |
342 | 343 | ## 😃 引用 344 | 345 | 如果您使用了我们的数据集,请引用我们的论文。 346 | Coming soon... 347 | 348 |
349 |
350 | 351 | 352 | ## 🗂 Miscellaneous 353 | 354 | ### ✨ Star History 355 | [![Star History Chart](https://api.star-history.com/svg?repos=codefuse-ai/codefuse-devops-eval&type=Date)](https://star-history.com/#codefuse-ai/codefuse-devops-eval&Date) 356 | 357 | ### 🤝 Friendship Links 358 | - [Codefuse-ChatBot](https://github.com/codefuse-ai/codefuse-chatbot) 359 | - Codefuse-ChatBot is an open-source AI smart assistant designed to support the software development lifecycle with conversational access to tools, knowledge, and platform integration. 360 | - [Awesome AIGC Tutorials](https://github.com/luban-agi/Awesome-AIGC-Tutorials) 361 | - Awesome AIGC Tutorials houses a curated collection of tutorials and resources spanning across Large Language Models, AI Painting, and related fields. 362 | -------------------------------------------------------------------------------- /conf/dataset_fp.json: -------------------------------------------------------------------------------- 1 | { 2 | "computer_science": "computer_science.csv", 3 | "computer_security": "computer_security.csv", 4 | "machine_learning": "machine_learning.csv", 5 | "college_programming": "college_programming.csv", 6 | "computer_architecture": "computer_architecture.csv", 7 | "computer_network": "computer_network.csv", 8 | "tech_risk": "risk.csv" 9 | } -------------------------------------------------------------------------------- /conf/model_conf.json: -------------------------------------------------------------------------------- 1 | { 2 | "Qwen-7b-Base": { 3 | "loader": "QwenModelAndTokenizerLoader", 4 | "context_builder": "ContextBuilder" 5 | }, 6 | "Qwen-7b-Chat": { 7 | "loader": "QwenModelAndTokenizerLoader", 8 | "context_builder": "QwenChatContextBuilder" 9 | }, 10 | "Baichuan-13b-Base": { 11 | "loader": "BaichuanModelAndTokenizerLoader", 12 | "context_builder": "ContextBuilder" 13 | }, 14 | "Baichuan2-13b-Chat": { 15 | "loader": "BaichuanModelAndTokenizerLoader", 16 | "context_builder": "Baichuan2ChatContextBuilder" 17 | }, 18 | "Internlm-7b-Base": { 19 | "loader": "ModelAndTokenizerLoader", 20 | "context_builder": "ContextBuilder" 21 | }, 22 | "Internlm-7b-Chat": { 23 | "loader": "ModelAndTokenizerLoader", 24 | "context_builder": "InternlmChatContextBuilder" 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /images/data_info.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/images/data_info.png -------------------------------------------------------------------------------- /images/devops_eval_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/images/devops_eval_logo.png -------------------------------------------------------------------------------- /images/toolLearning_performance_metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/images/toolLearning_performance_metrics.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=2.0.1 2 | transformers==4.32.0 3 | datasets>=2.12.0 4 | accelerate>=0.21.0 5 
| peft>=0.4.0 6 | trl>=0.5.0 7 | scipy 8 | sentencepiece 9 | tiktoken 10 | jieba 11 | rouge-chinese 12 | nltk 13 | gradio>=3.36.0 14 | uvicorn 15 | pydantic==1.10.11 16 | fastapi==0.95.1 17 | sse-starlette 18 | matplotlib 19 | loguru 20 | jsonlines 21 | transformers_stream_generator==0.0.4 22 | deepspeed>=0.9.3 23 | einops 24 | xformers -------------------------------------------------------------------------------- /resources/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/resources/__init__.py -------------------------------------------------------------------------------- /resources/categroy_mapping.json: -------------------------------------------------------------------------------- 1 | { 2 | "Visualization.csv":[ 3 | "visualization", 4 | "可视化", 5 | { 6 | "dev":5, 7 | "test":44 8 | }, 9 | "Visualization.csv" 10 | ], 11 | "Logging.csv":[ 12 | "logging", 13 | "日志", 14 | { 15 | "dev":5, 16 | "test":100 17 | }, 18 | "Logging.csv" 19 | ], 20 | "Storage.csv":[ 21 | "storage", 22 | "存储", 23 | { 24 | "dev":5, 25 | "test":36 26 | }, 27 | "Storage.csv" 28 | ], 29 | "DataAcquisition.csv":[ 30 | "data acquisition", 31 | "数据采集", 32 | { 33 | "dev":5, 34 | "test":36 35 | }, 36 | "DataAcquisition.csv" 37 | ], 38 | "IntegrationTesting.csv":[ 39 | "integration testing", 40 | "集成测试", 41 | { 42 | "dev":5, 43 | "test":31 44 | }, 45 | "IntegrationTesting.csv" 46 | ], 47 | "UserAcceptanceTesting.csv":[ 48 | "user acceptance testing", 49 | "用户验收测试", 50 | { 51 | "dev":5, 52 | "test":39 53 | }, 54 | "UserAcceptanceTesting.csv" 55 | ], 56 | "SecurityTesting.csv":[ 57 | "security testing", 58 | "安全测试", 59 | { 60 | "dev":5, 61 | "test":38 62 | }, 63 | "SecurityTesting.csv" 64 | ], 65 | "UnitTesting.csv":[ 66 | "unit testing", 67 | "单元测试", 68 | { 69 | "dev":5, 70 | "test":32 71 | }, 72 | "UnitTesting.csv" 73 | ], 74 | "PerformanceTesting.csv":[ 75 | "performance testing", 76 | "性能测试", 77 | { 78 | "dev":5, 79 | "test":36 80 | }, 81 | "PerformanceTesting.csv" 82 | ], 83 | "SystemTesting.csv":[ 84 | "system testing", 85 | "系统测试", 86 | { 87 | "dev":5, 88 | "test":52 89 | }, 90 | "SystemTesting.csv" 91 | ], 92 | "ProgM.csv":[ 93 | "programme management", 94 | "进度管理", 95 | { 96 | "dev":5, 97 | "test":21 98 | }, 99 | "ProgM.csv" 100 | ], 101 | "REQM.csv":[ 102 | "requirements management", 103 | "需求管理", 104 | { 105 | "dev":5, 106 | "test":24 107 | }, 108 | "REQM.csv" 109 | ], 110 | "RiskMgmt.csv":[ 111 | "risk management", 112 | "风险管理", 113 | { 114 | "dev":5, 115 | "test":21 116 | }, 117 | "RiskMgmt.csv" 118 | ], 119 | "InfrastructureAsCode.csv":[ 120 | "infrastructure as code", 121 | "基础设施即代码", 122 | { 123 | "dev":5, 124 | "test":34 125 | }, 126 | "InfrastructureAsCode.csv" 127 | ], 128 | "Provisioning.csv":[ 129 | "provisioning", 130 | "置备", 131 | { 132 | "dev":5, 133 | "test":19 134 | }, 135 | "Provisioning.csv" 136 | ], 137 | "ConfigMgmt.csv":[ 138 | "config management", 139 | "配置管理", 140 | { 141 | "dev":5, 142 | "test":100 143 | }, 144 | "ConfigMgmt.csv" 145 | ], 146 | "Azure.csv":[ 147 | "microsoft azure", 148 | "微软云服务", 149 | { 150 | "dev":5, 151 | "test":27 152 | }, 153 | "Azure.csv" 154 | ], 155 | "GoogleCloud.csv":[ 156 | "google cloud", 157 | "谷歌云服务", 158 | { 159 | "dev":5, 160 | "test":31 161 | }, 162 | "GoogleCloud.csv" 163 | ], 164 | "AWS.csv":[ 165 | "amazon web services", 166 | "亚马逊云服务", 167 | { 168 | "dev":5, 169 | "test":44 170 | }, 171 | "AWS.csv" 172 
| ], 173 | "LogDesign.csv":[ 174 | "log design", 175 | "日志设计", 176 | { 177 | "dev":5, 178 | "test":33 179 | }, 180 | "LogDesign.csv" 181 | ], 182 | "ServiceDesign.csv":[ 183 | "service design", 184 | "服务设计", 185 | { 186 | "dev":5, 187 | "test":44 188 | }, 189 | "ServiceDesign.csv" 190 | ], 191 | "CapabilityDesign.csv":[ 192 | "capability design", 193 | "容量设计", 194 | { 195 | "dev":5, 196 | "test":33 197 | }, 198 | "CapabilityDesign.csv" 199 | ], 200 | "CloudNativeDesign.csv":[ 201 | "cloud native design", 202 | "云原生设计", 203 | { 204 | "dev":5, 205 | "test":44 206 | }, 207 | "CloudNativeDesign.csv" 208 | ], 209 | "CacheDesign.csv":[ 210 | "cache design", 211 | "缓存设计", 212 | { 213 | "dev":5, 214 | "test":28 215 | }, 216 | "CacheDesign.csv" 217 | ], 218 | "DBDesign.csv":[ 219 | "database design", 220 | "数据库设计", 221 | { 222 | "dev":5, 223 | "test":38 224 | }, 225 | "DBDesign.csv" 226 | ], 227 | "ArtificialIntelligence.csv":[ 228 | "artificial intelligence", 229 | "人工智能", 230 | { 231 | "dev":5, 232 | "test":45 233 | }, 234 | "ArtificialIntelligence.csv" 235 | ], 236 | "ComputerBasics.csv":[ 237 | "computer basics", 238 | "计算机基础", 239 | { 240 | "dev":5, 241 | "test":100 242 | }, 243 | "ComputerBasics.csv" 244 | ], 245 | "DataBase.csv":[ 246 | "database", 247 | "数据库", 248 | { 249 | "dev":5, 250 | "test":75 251 | }, 252 | "DataBase.csv" 253 | ], 254 | "ComputerNetwork.csv":[ 255 | "computer network", 256 | "计算机网络", 257 | { 258 | "dev":5, 259 | "test":88 260 | }, 261 | "ComputerNetwork.csv" 262 | ], 263 | "OperatingSystem.csv":[ 264 | "operating system", 265 | "操作系统", 266 | { 267 | "dev":5, 268 | "test":36 269 | }, 270 | "OperatingSystem.csv" 271 | ], 272 | "Go.csv":[ 273 | "go", 274 | "go语言", 275 | { 276 | "dev":5, 277 | "test":100 278 | }, 279 | "Go.csv" 280 | ], 281 | "Java.csv":[ 282 | "java", 283 | "java语言", 284 | { 285 | "dev":5, 286 | "test":100 287 | }, 288 | "Java.csv" 289 | ], 290 | "C:C++.csv":[ 291 | "c/c++", 292 | "c/c++语言", 293 | { 294 | "dev":5, 295 | "test":100 296 | }, 297 | "C:C++.csv" 298 | ], 299 | "Python.csv":[ 300 | "python", 301 | "python语言", 302 | { 303 | "dev":5, 304 | "test":73 305 | }, 306 | "Python.csv" 307 | ], 308 | "BigData.csv":[ 309 | "big data", 310 | "大数据", 311 | { 312 | "dev":5, 313 | "test":15 314 | }, 315 | "BigData.csv" 316 | ], 317 | "Front-end.csv":[ 318 | "front-end", 319 | "前端", 320 | { 321 | "dev":5, 322 | "test":100 323 | }, 324 | "Front-end.csv" 325 | ], 326 | "MobileApp.csv":[ 327 | "mobile app", 328 | "移动应用", 329 | { 330 | "dev":5, 331 | "test":100 332 | }, 333 | "MobileApp.csv" 334 | ], 335 | "MachineLearning.csv":[ 336 | "machine learning", 337 | "机器学习", 338 | { 339 | "dev":5, 340 | "test":69 341 | }, 342 | "MachineLearning.csv" 343 | ], 344 | "Back-end.csv":[ 345 | "back-end", 346 | "后端", 347 | { 348 | "dev":5, 349 | "test":100 350 | }, 351 | "Back-end.csv" 352 | ], 353 | "ArtifactMgmt.csv":[ 354 | "artifact management", 355 | "产出物管理", 356 | { 357 | "dev":5, 358 | "test":12 359 | }, 360 | "ArtifactMgmt.csv" 361 | ], 362 | "CI:CD.csv":[ 363 | "cd/cd", 364 | "持续集成/持续部署", 365 | { 366 | "dev":5, 367 | "test":100 368 | }, 369 | "CI:CD.csv" 370 | ], 371 | "Linux.csv":[ 372 | "linux", 373 | "linux操作系统", 374 | { 375 | "dev":5, 376 | "test":100 377 | }, 378 | "Linux.csv" 379 | ], 380 | "ContainerOrchestration.csv":[ 381 | "container orchestration", 382 | "容器编排", 383 | { 384 | "dev":5, 385 | "test":100 386 | }, 387 | "ContainerOrchestration.csv" 388 | ], 389 | "Virtualization.csv":[ 390 | "virtualization", 391 | "虚拟化技术", 392 | { 393 | "dev":5, 394 | "test":34 
395 | }, 396 | "Virtualization.csv" 397 | ], 398 | "TimeSeriesAnomalyDetection.csv":[ 399 | "time series anomaly detection", 400 | "时序异常检测", 401 | { 402 | "dev":5, 403 | "test":300 404 | }, 405 | "TimeSeriesAnomalyDetection.csv" 406 | ], 407 | "TimeSeriesClassification.csv":[ 408 | "time series classification", 409 | "时序分类", 410 | { 411 | "dev":5, 412 | "test":200 413 | }, 414 | "TimeSeriesClassification.csv" 415 | ], 416 | "RootCauseAnalysis.csv":[ 417 | "root cause analysis", 418 | "根因分析", 419 | { 420 | "dev":5, 421 | "test":250 422 | }, 423 | "RootCauseAnalysis.csv" 424 | ], 425 | "LogParser.csv":[ 426 | "log parser", 427 | "日志解析", 428 | { 429 | "dev":5, 430 | "test":350 431 | }, 432 | "LogParser.csv" 433 | ], 434 | "VersionControl.csv":[ 435 | "version control", 436 | "版本控制", 437 | { 438 | "dev":5, 439 | "test":100 440 | }, 441 | "VersionControl.csv" 442 | ], 443 | "DBMgnt.csv":[ 444 | "database management", 445 | "数据库管理", 446 | { 447 | "dev":5, 448 | "test":19 449 | }, 450 | "DBMgnt.csv" 451 | ], 452 | "Dependency.csv":[ 453 | "dependency", 454 | "依赖管理", 455 | { 456 | "dev":5, 457 | "test":44 458 | }, 459 | "Dependency.csv" 460 | ], 461 | "Compile.csv":[ 462 | "compile", 463 | "编译", 464 | { 465 | "dev":5, 466 | "test":31 467 | }, 468 | "Compile.csv" 469 | ], 470 | "Package.csv":[ 471 | "package", 472 | "包管理", 473 | { 474 | "dev":5, 475 | "test":24 476 | }, 477 | "Package.csv" 478 | ] 479 | } -------------------------------------------------------------------------------- /resources/devops_diagram_zh.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/resources/devops_diagram_zh.jpg -------------------------------------------------------------------------------- /resources/tool_learning_evalution.md: -------------------------------------------------------------------------------- 1 | ## tool learning 数据集评测教程 2 | 3 | ### chatml接入方式 4 | 如果需要在自己的 huggingface 格式的模型上进行测试的话,总的步骤分为如下几步: 5 | 1. 编写 ~/evals/FuncCallEvalution 的 create_prompts 函数 6 | 2. 编写 ~/models/base_model 的 相关函数 7 | 3. 注册模型和评估函数 8 | 4. 执行测试脚本 9 | 如果模型在加载进来后不需要特殊的处理,而且输入也不需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),请直接跳转到第四步直接发起测试。 10 | 11 | #### 1. 编写 loader 函数 12 | 如果模型在加载进来还需要做一些额外的处理(e.g. 
tokenizer 调整),可以放到第二步的 `load_model` 中一并处理;本步骤的重点是编写 `~/evals/FuncCallEvalution` 的 `create_prompts` 函数:继承 `ToolEvalution` 类并覆写该函数,把 function call 原始数据构造成评测所需的 prompt,具体可以参照以下示例: 13 | ```python 14 | class FuncCallEvalution(ToolEvalution): 15 | 16 | def create_prompts(self, func_call_datas): 17 | ''' 18 | datas: [ 19 | { 20 | "instruction": history[his_idx], 21 | "input": "", 22 | "output": output, 23 | "history": [(human_content, ai_content), (), ()], 24 | "functions": tools 25 | } 26 | ] 27 | ''' 28 | system_content = '''CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。 29 | 你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。''' 30 | function_format = '''You are ToolGPT, you have access to the following APIs:\n{tools}''' 31 | 32 | func_call_train_datas = [] 33 | history_error_cnt = 0 34 | funccall_error_cnt = 0 35 | 36 | for data in func_call_datas: 37 | tools = data["functions"] 38 | chatrounds = data["chatrounds"] 39 | 40 | function_content = "" 41 | if len(tools) > 0: 42 | function_content = function_format.format(tools=json.dumps(tools, ensure_ascii=False, sort_keys=True)) 43 | 44 | history = [] 45 | for i in chatrounds: 46 | if i["role"]=="system": 47 | continue 48 | 49 | if i["role"]=="user": 50 | history.append(("user", i["content"])) 51 | 52 | if i["role"] == "assistant": 53 | if "function_call" in i: 54 | if not isinstance(i["function_call"], dict): 55 | funccall_error_cnt+=1 56 | continue 57 | content = "#function" + json.dumps({**{"content": i["content"]}, **i["function_call"]}, ensure_ascii=False) 58 | else: 59 | content = i["content"] 60 | history.append(("assistant", content)) 61 | 62 | 63 | if i["role"] == "function": 64 | content = json.dumps({**{"content": i["content"]}, **{"name": i["name"]}}, ensure_ascii=False) 65 | history.append(("user", content)) 66 | 67 | 68 | history = [i[1] for i in history] 69 | history[0] = "\n".join([system_content,function_content, history[0]]) 70 | 71 | for his_idx in range(0, len(history), 2): 72 | output = history[his_idx+1] 73 | 74 | if "#function" in output: 75 | output = output.split("#function")[-1] 76 | 77 | try: 78 | output = json.loads(output) 79 | except: 80 | output = {"content": output} 81 | 82 | 83 | func_call_train_datas.append( 84 | { 85 | "instruction": history[his_idx], 86 | "input": "", 87 | "output": output, 88 | "history": [history[:his_idx+2][i:i+2] for i in range(0, len(history[:his_idx]), 2)], 89 | "functions": tools 90 | }, 91 | ) 92 | return func_call_train_datas 93 | ``` 94 | 95 | #### 2. 编写 ~/models/base_model 的相关函数 96 | 如果输入需要转换为特定的格式(e.g. 
chatml 格式或者其他的 human-bot 格式),可以在模型的 generate 逻辑中自行拼接;本步骤需要参照 `~/models/base_model` 中的 `ToolModel` 类,实现 `load_model` 和 `generate` 等相关函数,一个示例如下: 97 | ```python 98 | class ToolModel: 99 | def __init__(self, model_path: str, template: str, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 100 | self.model_path = model_path 101 | self.trust_remote_code = trust_remote_code 102 | self.tensor_parallel_size = tensor_parallel_size 103 | self.gpu_memory_utilization = gpu_memory_utilization 104 | self.load_model(self.model_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) 105 | 106 | def generate(self, prompts: str, template: str = None, generate_configs: GenerateConfigs = None) -> list: 107 | '''产出对应结果''' 108 | pass 109 | 110 | def generate_params( 111 | self, generate_configs: GenerateConfigs, 112 | ): 113 | '''generate param''' 114 | kargs = generate_configs.dict() 115 | return kargs 116 | 117 | def load_model(self, model_path, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 118 | '''加载模型''' 119 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) 120 | self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() 121 | 122 | # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) 123 | ``` 124 | 125 | #### 3. 注册模型和评估函数 126 | 在 ~/models/__init__.py 中注册即可 127 | ```python 128 | from .base_model import ToolModel 129 | 130 | __all__ = [ 131 | "ToolModel", 132 | ] 133 | ``` 134 | 在 ~/evals/__init__.py 中注册即可 135 | ```python 136 | from .base_evalution import ToolEvalution 137 | from .toolfill_evalution import ToolFillEvalution 138 | from .toolparser_evalution import ToolParserEvalution 139 | from .toolsummary_evalution import ToolSummaryEvalution 140 | from .func_call_evalution import FuncCallEvalution 141 | 142 | 143 | __all__ = [ 144 | "ToolEvalution", "ToolFillEvalution", "ToolParserEvalution", "ToolSummaryEvalution", "FuncCallEvalution" 145 | ] 146 | ``` 147 | 148 | 149 | #### 4. 执行测试脚本 150 | 修改 ~/src/qwen_eval_main.py 中的 datainfos 和 model_infos 151 | ```python 152 | model_infos = [ 153 | {"model_name": "", "template": "chatml", "model_path": "", 154 | "peft_path": "", "model_class": QwenModel}] 155 | 156 | datainfos = [ 157 | {"dataset_path": "~/fcdata_luban_zh_test.jsonl", "dataset_name": "fcdata_luban_zh", "tool_task": "func_call"}, 158 | {"dataset_path": "~/test_datas/fcdata_zh_test_v1.jsonl", "dataset_name": "fcdata_zh", "tool_task": "func_call"}, 159 | ] 160 | ``` 161 | 162 | 运行下述命令即可 163 | ```Bash 164 | python qwen_eval_main.py 165 | ``` 166 | 167 | 
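如果想了解上述各组件在 qwen_eval_main.py 中大致是如何串起来的,可以参考下面这个简化示意(仅为示意,并非仓库的实际实现;其中 QwenModel 的导入路径、GenerateConfigs 的无参构造以及数据集路径均为假设,请以仓库代码为准):
```python
# 简化示意:FuncCallDataset + 模型 + FuncCallEvalution 的组装流程
# 注意:QwenModel 的导入路径与 GenerateConfigs() 的无参构造均为假设
from src.datasets import FuncCallDataset
from src.evals import FuncCallEvalution
from src.models.generate_configs import GenerateConfigs
from src.models.qwen_model import QwenModel  # 假设 QwenModel(ToolModel 子类)定义在该模块中

# 加载 function call 评测数据(路径仅为示例)
dataset = FuncCallDataset(dataset_name="fcdata_zh", tool_task="func_call",
                          filepath="~/test_datas/fcdata_zh_test_v1.jsonl")

# 加载待评测模型,template 与 model_infos 中保持一致
model = QwenModel(model_path="path_to_model", template="chatml")

# 构造评测器并计算指标(fccr、fcfr、aar 等)
evalution = FuncCallEvalution(model=model, dataset=dataset,
                              template="chatml",
                              generate_configs=GenerateConfigs())
metrics = evalution.calc()
print(metrics)
```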
168 | 169 | ### 非chatml接入 170 | 如果需要在自己的 huggingface 格式的模型上进行测试的话,总的步骤分为如下几步: 171 | 1. 编写 ~/getAssistantAns.py 相关代码 172 | 2. 执行测试脚本 173 | 174 | 175 | #### 1、编写 getAssistantAns 示例 176 | ``` 177 | class GetAssistantAns(): 178 | # 按照自己推理需求自己修改代码 179 | 180 | def __init__(self, gpu_num=1): 181 | model = AutoModelForCausalLM.from_pretrained(model_name) 182 | device_list = [] 183 | for gpu_idx in range(gpu_num): 184 | device_list.append(torch.device("cuda:0")) 185 | 186 | # 将模型移动到指定的GPU设备 187 | model.to(device) 188 | 189 | 190 | def gen_answer(self, chat_dict, gpu_index): 191 | # 这里实际根据自己推理逻辑 然后转为标准格式返回 192 | # 以下仅仅是样例 193 | import time 194 | print(os.environ["CUDA_VISIBLE_DEVICES"]) 195 | time.sleep(1) 196 | rtn_dict1 = { 197 | "role": "assistant", 198 | "content": None, 199 | "function_call": 200 | { 201 | "name": "get_fudan_university_scoreline", 202 | "arguments": "{\n \"year\": \"2020\"\n}" 203 | } 204 | } 205 | 206 | rtn_dict2 = { 207 | "role": "assistant", 208 | "content": "2020年复旦大学的分数线如下:\n\n- 文科一批:630分\n- 文科二批:610分\n- 理科一批:650分\n- 理科二批:630分" 209 | } 210 | 211 | return random.choice([rtn_dict1, rtn_dict2]) 212 | ``` 213 | #### 2、执行测试脚本 214 | 修改 ~/src/opensource_functioncall_evalution.py # test_ans_file_list 215 | ```python 216 | test_ans_file_list = [ 217 | "fcdata_zh_test.jsonl" 218 | ] 219 | ``` 220 | 221 | 运行下述命令即可 222 | ```Bash 223 | python opensource_functioncall_evalution.py 224 | ``` 225 | -------------------------------------------------------------------------------- /resources/tool_learning_info.md: -------------------------------------------------------------------------------- 1 | ### 数据样例 2 | 在数据上我们完全兼容了 OpenAI Function Calling,具体格式如下: 3 | 4 | **Function Call的数据格式** 5 | 6 | | Input Key | Input Type | Input Description | 7 | | --- | --- | --- | 8 | | functions | List[Swagger] | 工具集合 | 9 | | chatrounds | List[chatround] | 多轮对话数据 | 10 | 11 | **chatrounds的数据格式** 12 | 13 | | Input Key | Input Type | Input Description | 14 | | --- | --- | --- | 15 | | role | string | 角色名称,包含三种类别,user、assistant、function | 16 | | name | string | 若role为function,则存在name字段,为function的名称 | 17 | | content | string | role的返回内容 | 18 | | function_call | dict | 工具调用 | 19 | 20 | ``` 21 | { 22 | "functions": 23 | [ 24 | { 25 | "name": "get_fudan_university_scoreline", 26 | "description": "查询复旦大学往年分数线,例如:查询2020年复旦大学的分数线", 27 | "parameters": 28 | { 29 | "type": "object", 30 | "properties": 31 | { 32 | "year": 33 | { 34 | "type": "string", 35 | "description": "年份,例如:2020,2019,2018" 36 | } 37 | }, 38 | "required": 39 | [ 40 | "year" 41 | ] 42 | } 43 | } 44 | ], 45 | "chatrounds": 46 | [ 47 | { 48 | "role": "system", 49 | "content": "CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。\n你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。" 50 | }, 51 | { 52 | "role": "user", 53 | "content": "查询2020年复旦大学的分数线" 54 | }, 55 | { 56 | "role": "assistant", 57 | "content": null, 58 | "function_call": 59 | { 60 | "name": "get_fudan_university_scoreline", 61 | "arguments": "{\n \"year\": \"2020\"\n}" 62 | } 63 | }, 64 | { 65 | "role": "function", 66 | "name": "get_fudan_university_scoreline", 67 | "content": "{\n \"scoreline\":{\n \"文科一批\": 630, \n \"文科二批\": 610, \n \"理科一批\": 650, \n \"理科二批\": 630 \n }\n}" 68 | }, 69 | { 70 | "role": "assistant", 71 | "content": "2020年复旦大学的分数线如下:\n\n- 文科一批:630分\n- 文科二批:610分\n- 理科一批:650分\n- 理科二批:630分" 72 | } 73 | ] 74 | } 75 | ``` 76 | 77 | 上述Function 
Call的数据样例为给定特定工具集后,用于回答用户查询某高校录取分数线的问题。 78 | 79 | 80 | ### 评测指标 81 | 由于一般通用模型无法具备工具调用的能力,因此在进行Tool Learn-Eval评测之前需要对通用模型进行微调,先让模型学会工具使用的基本范式 82 | 83 | 下面,我们定义了几种评估工具使用的指标: 84 | 85 | 86 | 87 | ②③④⑤的和为1,代表工具调用失败的总数,⑤工具幻觉是工具名识别失败的一种特殊情况 -------------------------------------------------------------------------------- /resources/tool_learning_info_zh.md: -------------------------------------------------------------------------------- 1 | ### 数据样例 2 | 在数据上我们完全兼容了 OpenAI Function Calling,具体格式如下: 3 | 4 | **Function Call的数据格式** 5 | 6 | | Input Key | Input Type | Input Description | 7 | | --- | --- | --- | 8 | | functions | List[Swagger] | 工具集合 | 9 | | chatrounds | List[chatround] | 多轮对话数据 | 10 | 11 | **chatrounds的数据格式** 12 | 13 | | Input Key | Input Type | Input Description | 14 | | --- | --- | --- | 15 | | role | string | 角色名称,包含三种类别,user、assistant、function | 16 | | name | string | 若role为function,则存在name字段,为function的名称 | 17 | | content | string | role的返回内容 | 18 | | function_call | dict | 工具调用 | 19 | 20 | ``` 21 | { 22 | "functions": 23 | [ 24 | { 25 | "name": "get_fudan_university_scoreline", 26 | "description": "查询复旦大学往年分数线,例如:查询2020年复旦大学的分数线", 27 | "parameters": 28 | { 29 | "type": "object", 30 | "properties": 31 | { 32 | "year": 33 | { 34 | "type": "string", 35 | "description": "年份,例如:2020,2019,2018" 36 | } 37 | }, 38 | "required": 39 | [ 40 | "year" 41 | ] 42 | } 43 | } 44 | ], 45 | "chatrounds": 46 | [ 47 | { 48 | "role": "system", 49 | "content": "CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。\n你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。" 50 | }, 51 | { 52 | "role": "user", 53 | "content": "查询2020年复旦大学的分数线" 54 | }, 55 | { 56 | "role": "assistant", 57 | "content": null, 58 | "function_call": 59 | { 60 | "name": "get_fudan_university_scoreline", 61 | "arguments": "{\n \"year\": \"2020\"\n}" 62 | } 63 | }, 64 | { 65 | "role": "function", 66 | "name": "get_fudan_university_scoreline", 67 | "content": "{\n \"scoreline\":{\n \"文科一批\": 630, \n \"文科二批\": 610, \n \"理科一批\": 650, \n \"理科二批\": 630 \n }\n}" 68 | }, 69 | { 70 | "role": "assistant", 71 | "content": "2020年复旦大学的分数线如下:\n\n- 文科一批:630分\n- 文科二批:610分\n- 理科一批:650分\n- 理科二批:630分" 72 | } 73 | ] 74 | } 75 | ``` 76 | 77 | 上述Function Call的数据样例为给定特定工具集后,用于回答用户查询某高校录取分数线的问题。 78 | 79 | 80 | ### 评测指标 81 | 由于一般通用模型无法具备工具调用的能力,因此在进行Tool Learn-Eval评测之前需要对通用模型进行微调,先让模型学会工具使用的基本范式 82 | 83 | 下面,我们定义了几种评估工具使用的指标: 84 | 85 | 86 | 87 | ②③④⑤的和为1,代表工具调用失败的总数,⑤工具幻觉是工具名识别失败的一种特殊情况 -------------------------------------------------------------------------------- /resources/tutorial.md: -------------------------------------------------------------------------------- 1 | ## Evaluate Tutorial 2 | 3 | ## 🚀 How to Evaluate 4 | If you need to test your own huggingface-formatted model, the overall steps are as follows: 5 | 1. Write the loader function for the model. 6 | 2. Write the context_builder function for the model. 7 | 3. Register the model in the configuration file. 8 | 4. Run the testing script. 9 | If the model does not require any special processing after loading, and the input does not need to be converted to a specific format (e.g. chatml format or other human-bot formats), you can directly proceed to step 4 to initiate the testing. 10 | 11 | #### 1. Write the loader function 12 | If the model requires additional processing after loading (e.g. 
adjusting the tokenizer), you need to inherit the `ModelAndTokenizerLoader` class in `src.context_builder.context_builder_family.py` and override the corresponding `load_model` and `load_tokenizer` functions. You can refer to the following example: 13 | ```python 14 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader): 15 | def __init__(self): 16 | super().__init__() 17 | pass 18 | 19 | def load_model(self, model_path: str): 20 | model = super().load_model(model_path) 21 | model.generation_config = GenerationConfig.from_pretrained(model_path) 22 | return model 23 | 24 | def load_tokenizer(self, model_path: str): 25 | tokenizer = super().load_tokenizer(model_path) 26 | 27 | # read generation config 28 | with open(model_path + '/generation_config.json', 'r') as f: 29 | generation_config = json.load(f) 30 | tokenizer.pad_token_id = generation_config['pad_token_id'] 31 | tokenizer.eos_token_id = generation_config['eos_token_id'] 32 | return tokenizer 33 | ``` 34 | 35 | #### 2. Write the context_builder function for the Model 36 | If the input needs to be converted to a specific format (e.g. chatml format or other human-bot formats), you need to inherit the ContextBuilder class in `src.context_builder.context_builder_family` and override the make_context function. This function is used to convert the input to the corresponding required format. An example is shown below: 37 | ```python 38 | class QwenChatContextBuilder(ContextBuilder): 39 | def __init__(self): 40 | super().__init__() 41 | 42 | def make_context( 43 | self, 44 | model, 45 | tokenizer, 46 | query: str, 47 | system: str = "you are a helpful assistant" 48 | ): 49 | ''' 50 | model: PretrainedModel 51 | tokenizer: PretrainedTokenzier 52 | query: Input string 53 | system: System prompt if needed 54 | ''' 55 | im_start, im_end = "<|im_start|>", "<|im_end|>" 56 | im_start_tokens = [tokenizer.im_start_id] 57 | im_end_tokens = [tokenizer.im_end_id] 58 | nl_tokens = tokenizer.encode("\n") 59 | 60 | def _tokenize_str(role, content): 61 | return f"{role}\n{content}", tokenizer.encode( 62 | role, allowed_special=set() 63 | ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) 64 | 65 | system_text, system_tokens_part = _tokenize_str("system", system) 66 | system_tokens = im_start_tokens + system_tokens_part + im_end_tokens 67 | 68 | raw_text = "" 69 | context_tokens = [] 70 | 71 | context_tokens = system_tokens + context_tokens 72 | raw_text = f"{im_start}{system_text}{im_end}" + raw_text 73 | context_tokens += ( 74 | nl_tokens 75 | + im_start_tokens 76 | + _tokenize_str("user", query)[1] 77 | + im_end_tokens 78 | + nl_tokens 79 | + im_start_tokens 80 | + tokenizer.encode("assistant") 81 | + nl_tokens 82 | ) 83 | raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" 84 | return raw_text, context_tokens 85 | ``` 86 | 87 | #### 3. Register the model in the configuration file 88 | Go to the `model_conf.json` file in the conf directory and register the corresponding model name and the loader and context_builder that will be used for this model. Simply write the class names defined in the first and second steps for the loader and context_builder. Here is an example: 89 | ```json 90 | { 91 | "Qwen-Chat": { 92 | "loader": "QwenModelAndTokenizerLoader", 93 | "context_builder": "QwenChatContextBuilder" 94 | } 95 | } 96 | ``` 97 | 98 | #### 4. 
Execute the testing script 99 | Run the following command to initiate the test: 100 | ```Bash 101 | # model_path: path to the model for testing 102 | # model_name: the model name corresponding to the model in the configuration file, default is Default, which represents using the default loader and context_builder 103 | # model_conf_path: path to the model configuration file, usually the model_conf.json file in the conf directory 104 | # eval_dataset_list: the names of the datasets to be tested, default is all to test all datasets, if you need to test one or more datasets, use the # symbol to connect them, for example: dataset1#dataset2 105 | # eval_dataset_fp_conf_path: path to the dataset configuration file 106 | # eval_dataset_type: which dataset split to test, only the default test split is supported 107 | # data_path: path to the evaluation dataset, fill in the downloaded dataset address 108 | # k_shot: supports 0-5, represents the number of example prefixes added for few-shot 109 | 110 | python src/run_eval.py \ 111 | --model_path path_to_model \ 112 | --model_name model_name_in_conf \ 113 | --model_conf_path path_to_model_conf \ 114 | --eval_dataset_list all \ 115 | --eval_dataset_fp_conf_path path_to_dataset_conf \ 116 | --eval_dataset_type test \ 117 | --data_path path_to_downloaded_devops_eval_data \ 118 | --k_shot 0 119 | ``` 120 | 121 | For example, suppose the evaluation dataset is downloaded to `folder1`, the code is placed in `folder2`, and the model is in `folder3`. If the model does not require a custom loader or context_builder and you want zero-shot scores on all datasets, you can initiate the test with the following script: 122 | ```Bash 123 | python folder2/src/run_eval.py \ 124 | --model_path folder3 \ 125 | --model_name Default \ 126 | --model_conf_path folder2/conf/model_conf.json \ 127 | --eval_dataset_list all \ 128 | --eval_dataset_fp_conf_path folder2/conf/devopseval_dataset_fp.json \ 129 | --eval_dataset_type test \ 130 | --data_path folder1 \ 131 | --k_shot 0 132 | ``` 133 | 
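As a further, hypothetical example, suppose you only want to evaluate two datasets with 3-shot prompting for a model registered as `Qwen-Chat` in `model_conf.json`. The dataset names below are illustrative and must match the keys in your dataset configuration file:
```Bash
python folder2/src/run_eval.py \
    --model_path folder3 \
    --model_name Qwen-Chat \
    --model_conf_path folder2/conf/model_conf.json \
    --eval_dataset_list LogParser#VersionControl \
    --eval_dataset_fp_conf_path folder2/conf/devopseval_dataset_fp.json \
    --eval_dataset_type test \
    --data_path folder1 \
    --k_shot 3
```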
-------------------------------------------------------------------------------- /resources/tutorial_zh.md: -------------------------------------------------------------------------------- 1 | ## 数据集评测教程 2 | 3 | ## 🚀 如何进行测试 4 | 如果需要在自己的 huggingface 格式的模型上进行测试的话,总的步骤分为如下几步: 5 | 1. 编写 Model 的 loader 函数 6 | 2. 编写 Model 的 context_builder 函数 7 | 3. 注册模型到配置文件中 8 | 4. 执行测试脚本 9 | 如果模型在加载进来后不需要特殊的处理,而且输入也不需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),请直接跳转到第四步直接发起测试。 10 | 11 | #### 1. 编写 loader 函数 12 | 如果模型在加载进来还需要做一些额外的处理(e.g. tokenizer 调整),需要去 `src.context_builder.context_builder_family.py` 中继承 `ModelAndTokenizerLoader` 类来覆写对应的 `load_model` 和 `load_tokenizer` 函数,具体可以参照以下示例: 13 | ```python 14 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader): 15 | def __init__(self): 16 | super().__init__() 17 | pass 18 | 19 | def load_model(self, model_path: str): 20 | model = super().load_model(model_path) 21 | model.generation_config = GenerationConfig.from_pretrained(model_path) 22 | return model 23 | 24 | def load_tokenizer(self, model_path: str): 25 | tokenizer = super().load_tokenizer(model_path) 26 | 27 | # read generation config 28 | with open(model_path + '/generation_config.json', 'r') as f: 29 | generation_config = json.load(f) 30 | tokenizer.pad_token_id = generation_config['pad_token_id'] 31 | tokenizer.eos_token_id = generation_config['eos_token_id'] 32 | return tokenizer 33 | ``` 34 | 35 | #### 2. 编写 Model 的 context_builder 函数 36 | 如果输入需要转换为特定的格式(e.g. chatml 格式或者其他的 human-bot 格式),则需要去 `src.context_builder.context_builder_family` 中继承 ContextBuilder 类来覆写 make_context 函数,这个函数是用来将输入转换格式为对应需要的输出的,一个示例如下: 37 | ```python 38 | class QwenChatContextBuilder(ContextBuilder): 39 | def __init__(self): 40 | super().__init__() 41 | 42 | def make_context( 43 | self, 44 | model, 45 | tokenizer, 46 | query: str, 47 | system: str = "you are a helpful assistant" 48 | ): 49 | ''' 50 | model: PretrainedModel 51 | tokenizer: PretrainedTokenzier 52 | query: Input string 53 | system: System prompt if needed 54 | ''' 55 | im_start, im_end = "<|im_start|>", "<|im_end|>" 56 | im_start_tokens = [tokenizer.im_start_id] 57 | im_end_tokens = [tokenizer.im_end_id] 58 | nl_tokens = tokenizer.encode("\n") 59 | 60 | def _tokenize_str(role, content): 61 | return f"{role}\n{content}", tokenizer.encode( 62 | role, allowed_special=set() 63 | ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) 64 | 65 | system_text, system_tokens_part = _tokenize_str("system", system) 66 | system_tokens = im_start_tokens + system_tokens_part + im_end_tokens 67 | 68 | raw_text = "" 69 | context_tokens = [] 70 | 71 | context_tokens = system_tokens + context_tokens 72 | raw_text = f"{im_start}{system_text}{im_end}" + raw_text 73 | context_tokens += ( 74 | nl_tokens 75 | + im_start_tokens 76 | + _tokenize_str("user", query)[1] 77 | + im_end_tokens 78 | + nl_tokens 79 | + im_start_tokens 80 | + tokenizer.encode("assistant") 81 | + nl_tokens 82 | ) 83 | raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" 84 | return raw_text, context_tokens 85 | ``` 86 | 87 | #### 3. 注册模型到配置文件中 88 | 去 conf 中的 `model_conf.json`,注册对应的模型名和这个模型将要使用的 loader 和 context_builder,其中 loader 和 context_builder 写第一步和第二步中自定义的类名就可以,示例如下: 89 | ```json 90 | { 91 | "Qwen-Chat": { 92 | "loader": "QwenModelAndTokenizerLoader", 93 | "context_builder": "QwenChatContextBuilder" 94 | } 95 | } 96 | ``` 97 | 98 | 99 | #### 4. 
执行测试脚本 100 | 直接运行以下代码发起测试 101 | ```Bash 102 | # model_path: 要测试的模型路径 103 | # model_name: 模型配置文件对应的模型命名,默认为 Default ,代表走默认的 loader 和 context_builder 104 | # model_conf_path: 模型配置文件的地址,一般就为 conf 路径下的 model_conf.json 105 | # eval_dataset_list: 要测试的数据集名称,默认 all,全部测试,如果需要测试单个或者多个,用 # 符号链接,示例:dataset1#dataset2 106 | # eval_dataset_fp_conf_path: 数据集配置地址 107 | # eval_dataset_type: 测试哪种类型,只支持默认 test 类型的测试集 108 | # data_path: 评测数据集地址,填写下载数据集后的地址就可以 109 | # k_shot: 支持 0-5,代表 few-shot 会给模型前缀加的示例数量 110 | 111 | 112 | python src/run_eval.py \ 113 | --model_path path_to_model \ 114 | --model_name model_name_in_conf \ 115 | --model_conf_path path_to_model_conf \ 116 | --eval_dataset_list all \ 117 | --eval_dataset_fp_conf_path path_to_dataset_conf \ 118 | --eval_dataset_type test \ 119 | --data_path path_to_downloaded_devops_eval_data \ 120 | --k_shot 0 121 | ``` 122 | 123 | 举个🌰:比如评测数据集下载到了 `folder1`,代码放在了 `folder2`,模型在 `folder3`,模型不需要自定义 loader 和 context_builder,需要测试所有的数据集的 zero-shot 得分,那可以按照以下脚本发起测试: 124 | ```Bash 125 | python folder2/src/run_eval.py \ 126 | --model_path folder3 \ 127 | --model_name Default \ 128 | --model_conf_path folder2/conf/model_conf.json \ 129 | --eval_dataset_list all \ 130 | --eval_dataset_fp_conf_path folder2/conf/devopseval_dataset_fp.json \ 131 | --eval_dataset_type test \ 132 | --data_path folder1 \ 133 | --k_shot 0 134 | ``` 135 | 
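再举一个(假设性的)例子:如果只想对在 `model_conf.json` 中注册为 `Qwen-Chat` 的模型评测其中两个数据集,并使用 3-shot,可以参考下面的写法;其中数据集名称仅为示意,需与数据集配置文件中的 key 保持一致:
```Bash
python folder2/src/run_eval.py \
    --model_path folder3 \
    --model_name Qwen-Chat \
    --model_conf_path folder2/conf/model_conf.json \
    --eval_dataset_list LogParser#VersionControl \
    --eval_dataset_fp_conf_path folder2/conf/devopseval_dataset_fp.json \
    --eval_dataset_type test \
    --data_path folder1 \
    --k_shot 3
```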
-------------------------------------------------------------------------------- /resources/wechat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/resources/wechat.png -------------------------------------------------------------------------------- /scripts/run_eval_example.sh: -------------------------------------------------------------------------------- 1 | # model_path: 要测试的模型路径 2 | # model_name: 模型配置文件对应的模型命名 3 | # model_conf_path: 模型配置文件的地址,一般就为 conf 路径下的 devopseval_dataset_fp.json 4 | # eval_dataset_list: 要测试的数据集名称,默认 all,全部测试,如果需要测试单个或者多个,用 # 符号链接,示例:dataset1#dataset2 5 | # eval_dataset_fp_conf_path: 数据集配置地址 6 | # eval_dataset_type: 测试哪种类型,只支持默认 test 类型的测试集 7 | # data_path: 评测数据集地址,填写下载数据集后的地址就可以 8 | # k_shot: 支持 0-5,代表 few-shot 会给模型前缀加的示例数量 9 | 10 | python src/run_eval.py \ 11 | --model_path path_to_model \ 12 | --model_name model_name_in_conf \ 13 | --model_conf_path path_to_model_conf \ 14 | --eval_dataset_list all \ 15 | --eval_dataset_fp_conf_path path_to_dataset_conf \ 16 | --eval_dataset_type test \ 17 | --data_path path_to_downloaded_devops_eval_data \ 18 | --k_shot 0 19 | -------------------------------------------------------------------------------- /scripts/run_fc_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | python src/qwen_eval_main.py 5 | -------------------------------------------------------------------------------- /src/context_builder/context_builder.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | from transformers import PreTrainedTokenizer 3 | 4 | 5 | class ContextBuilder: 6 | ''' 7 | Parent class 8 | ''' 9 | def __init__(self): 10 | pass 11 | 12 | def make_context( 13 | self, 14 | model, 15 | tokenizer, 16 | query: str, 17 | system: str = "" 18 | ): 19 | ''' 20 | Make context for query, default is do nothing 21 | ''' 22 | raw_text = query 23 | context_tokens = tokenizer.encode(raw_text) 24 | return raw_text, context_tokens 25 | 26 | if __name__ == '__main__': 27 | pass 28 | 29 | -------------------------------------------------------------------------------- /src/context_builder/context_builder_family.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from loguru import logger 4 | from src.context_builder.context_builder import ContextBuilder 5 | 6 | def get_context_builder(eval_args): 7 | ''' 8 | Load context_builder by model_name 9 | ''' 10 | with open(eval_args.model_conf_path, 'r') as f: 11 | model_conf = json.load(f) 12 | 13 | context_builder = globals()[model_conf[eval_args.model_name]['context_builder']]() 14 | return context_builder 15 | 16 | class QwenChatContextBuilder(ContextBuilder): 17 | def __init__(self): 18 | super().__init__() 19 | 20 | def make_context( 21 | self, 22 | model, 23 | tokenizer, 24 | query: str, 25 | system: str = "you are a helpful assistant" 26 | ): 27 | im_start, im_end = "<|im_start|>", "<|im_end|>" 28 | im_start_tokens = [tokenizer.im_start_id] 29 | im_end_tokens = [tokenizer.im_end_id] 30 | nl_tokens = tokenizer.encode("\n") 31 | 32 | def _tokenize_str(role, content): 33 | return f"{role}\n{content}", tokenizer.encode( 34 | role, allowed_special=set() 35 | ) + nl_tokens + tokenizer.encode(content, allowed_special=set()) 36 | 37 | system_text, system_tokens_part = 
_tokenize_str("system", system) 38 | system_tokens = im_start_tokens + system_tokens_part + im_end_tokens 39 | 40 | raw_text = "" 41 | context_tokens = [] 42 | 43 | context_tokens = system_tokens + context_tokens 44 | raw_text = f"{im_start}{system_text}{im_end}" + raw_text 45 | context_tokens += ( 46 | nl_tokens 47 | + im_start_tokens 48 | + _tokenize_str("user", query)[1] 49 | + im_end_tokens 50 | + nl_tokens 51 | + im_start_tokens 52 | + tokenizer.encode("assistant") 53 | + nl_tokens 54 | ) 55 | raw_text += f"\n{im_start}user\n{query}{im_end}\n{im_start}assistant\n" 56 | return raw_text, context_tokens 57 | 58 | class Baichuan2ChatContextBuilder(ContextBuilder): 59 | def __init__(self): 60 | super().__init__() 61 | 62 | def make_context( 63 | self, 64 | model, 65 | tokenizer, 66 | query: str, 67 | system: str = "you are a helpful assistant" 68 | ): 69 | messages = [] 70 | messages.append({"role": "user", "content": query}) 71 | 72 | raw_text, context_tokens = self.build_chat_input(model, tokenizer, messages) 73 | 74 | return raw_text, context_tokens 75 | 76 | def build_chat_input(self, model, tokenizer, messages, max_new_tokens: int=0): 77 | def _parse_messages(messages, split_role="user"): 78 | system, rounds = "", [] 79 | round = [] 80 | for i, message in enumerate(messages): 81 | if message["role"] == "system": 82 | assert i == 0 83 | system = message["content"] 84 | continue 85 | if message["role"] == split_role and round: 86 | rounds.append(round) 87 | round = [] 88 | round.append(message) 89 | if round: 90 | rounds.append(round) 91 | return system, rounds 92 | 93 | max_new_tokens = max_new_tokens or model.generation_config.max_new_tokens 94 | max_input_tokens = model.config.model_max_length - max_new_tokens 95 | system, rounds = _parse_messages(messages, split_role="user") 96 | system_tokens = tokenizer.encode(system) 97 | max_history_tokens = max_input_tokens - len(system_tokens) 98 | 99 | history_tokens = [] 100 | for round in rounds[::-1]: 101 | round_tokens = [] 102 | for message in round: 103 | if message["role"] == "user": 104 | round_tokens.append(model.generation_config.user_token_id) 105 | else: 106 | round_tokens.append(model.generation_config.assistant_token_id) 107 | round_tokens.extend(tokenizer.encode(message["content"])) 108 | if len(history_tokens) == 0 or len(history_tokens) + len(round_tokens) <= max_history_tokens: 109 | history_tokens = round_tokens + history_tokens # concat left 110 | if len(history_tokens) < max_history_tokens: 111 | continue 112 | break 113 | 114 | input_tokens = system_tokens + history_tokens 115 | if messages[-1]["role"] != "assistant": 116 | input_tokens.append(model.generation_config.assistant_token_id) 117 | input_tokens = input_tokens[-max_input_tokens:] # truncate left 118 | 119 | raw_text = tokenizer.decode(input_tokens) 120 | return raw_text, input_tokens 121 | 122 | class InternlmChatContextBuilder(ContextBuilder): 123 | def __init__(self): 124 | super().__init__() 125 | 126 | def make_context( 127 | self, 128 | model, 129 | tokenizer, 130 | query: str, 131 | system: str = "you are a helpful assistant" 132 | ): 133 | prompt = "" 134 | if len(prompt) == 0: 135 | prompt += "" 136 | prompt += f"""<|User|>:{query}\n<|Bot|>:""" 137 | return prompt, tokenizer.encode(prompt) 138 | 139 | 140 | if __name__ == '__main__': 141 | query = '你好' 142 | system = '请帮助我' 143 | tokenizer = '/mnt/llm/devopspal/model/Qwen-7B' 144 | 145 | -------------------------------------------------------------------------------- /src/data/data_load.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import jsonlines 3 | import os 4 | import pandas as pd 5 | 6 | from loguru import logger 7 | 8 | from src.hparams.evaluate_args import EvaluateArguments 9 | from src.data.data_preprocess import preprocess 10 | 11 | 12 | def load_all_dataset(eval_args: EvaluateArguments): 13 | ''' 14 | Load all eval dataset 15 | ''' 16 | # get fp for eval dataset 17 | dataset_name_list = eval_args.eval_dataset_list 18 | eval_dataset_fp_conf_path = eval_args.eval_dataset_fp_conf_path 19 | 20 | with open(eval_dataset_fp_conf_path, 'r') as f: 21 | dataset_fn_dict = json.load(f) 22 | 23 | data_dir = eval_args.data_path 24 | 25 | logger.info(dataset_name_list) 26 | if len(dataset_name_list) == 1 and dataset_name_list[0] == 'all': 27 | dataset_name_list = dataset_fn_dict.keys() 28 | dataset_fp_list = [data_dir + os.path.sep + eval_args.eval_language + os.path.sep + eval_args.eval_dataset_type + os.path.sep + dataset_fn_dict[i] for i in dataset_name_list] 29 | 30 | logger.info('Start load and preprocess dataset') 31 | all_dataset = {} 32 | for dataset_name in dataset_name_list: 33 | dataset_fp = data_dir + os.path.sep + eval_args.eval_language + os.path.sep + eval_args.eval_dataset_type + os.path.sep + dataset_fn_dict[dataset_name] 34 | df = pd.read_csv(dataset_fp) 35 | 36 | # Read dev data if doing few-shot test 37 | df_dev = None 38 | if eval_args.k_shot > 0: 39 | dev_dataset_fp = data_dir + os.path.sep + eval_args.eval_language + os.path.sep + 'dev' + os.path.sep + dataset_fn_dict[dataset_name] 40 | df_dev = pd.read_csv(dev_dataset_fp) 41 | 42 | all_dataset[dataset_name] = preprocess(df, eval_args, df_dev=df_dev) 43 | logger.info('Load success, dataset_name={}, dataset_file_path={}, dataset question count={}'.format(dataset_name, 44 | dataset_fp, 45 | len(all_dataset[dataset_name]))) 46 | return all_dataset 47 | 48 | if __name__ == '__main__': 49 | a = os.path.split(os.path.realpath(__file__))[0] 50 | b = os.path.abspath(os.path.dirname(a)+os.path.sep+"../data") 51 | logger.debug(b) 52 | -------------------------------------------------------------------------------- /src/data/data_preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from loguru import logger 3 | 4 | 5 | def preprocess(df: pd.DataFrame, eval_args, df_dev: pd.DataFrame = None): 6 | ''' 7 | Preprocess df and generate final dict 8 | ''' 9 | question_prompt = '''以下是关于开发运维领域的单项选择题,请选出其中的正确答案。请直接输出选项。\n''' 10 | 11 | if eval_args.k_shot > 0 and df_dev is not None: 12 | # uppercase to lowercase 13 | df_dev.rename(columns={ 14 | 'Question': 'question', 15 | 'Answer': 'answer' 16 | }, inplace=True) 17 | 18 | prefix = '' 19 | 20 | for idx in range(eval_args.k_shot): 21 | question = df_dev['question'].iloc[idx] 22 | prefix = prefix + question_prompt + '问题:' + question + '\n' 23 | 24 | for option in ['A', 'B', 'C', 'D']: 25 | if df_dev[option].iloc[idx]: 26 | prefix += '{}. 
{}\n'.format(option, df_dev[option].iloc[idx]) 27 | prefix += '答案:{}\n'.format(df_dev['answer'].iloc[idx].strip().upper()) 28 | prefix = prefix + question_prompt 29 | res = preprocess_question(df, prefix) 30 | else: 31 | res = preprocess_question(df, question_prompt) 32 | 33 | return res 34 | 35 | def preprocess_question(df: pd.DataFrame, prefix: str = ''): 36 | ''' 37 | Preprocess df and generate final dict 38 | ''' 39 | res = [] 40 | 41 | # uppercase to lowercase 42 | df.rename(columns={ 43 | 'Question': 'question', 44 | 'Answer': 'answer' 45 | }, inplace=True) 46 | 47 | for idx in range(df.shape[0]): 48 | to_append = { 49 | 'question': df['question'].iloc[idx], 50 | 'options': [], 51 | 'answer': df['answer'].iloc[idx].strip().upper() 52 | } 53 | question = df['question'].iloc[idx] 54 | 55 | query = prefix + '''问题:{question}\n'''.format(question=question) 56 | 57 | for option in ['A', 'B', 'C', 'D']: 58 | if df[option].iloc[idx]: 59 | to_append['options'].append(option) 60 | to_append[option] = df[option].iloc[idx] 61 | to_add = '{}. {}\n'.format(option, df[option].iloc[idx]) 62 | query += to_add 63 | 64 | to_add = '答案:' 65 | query += to_add 66 | to_append['query'] = query 67 | res.append(to_append) 68 | return res 69 | -------------------------------------------------------------------------------- /src/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_dataset import ToolDataset 2 | from .toolfill_dataset import ToolFillDataset 3 | from .toolparser_dataset import ToolParserDataset 4 | from .toolsummary_dataset import ToolSummaryDataset 5 | from .funccall_dataset import FuncCallDataset 6 | 7 | __all__ = [ 8 | "ToolFillDataset", "ToolDataset", "ToolParserDataset", "ToolSummaryDataset", "FuncCallDataset" 9 | ] -------------------------------------------------------------------------------- /src/datasets/base_dataset.py: -------------------------------------------------------------------------------- 1 | from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file 2 | from src.utils.json_utils import read_json_file, save_to_json_file 3 | 4 | 5 | 6 | class ToolDataset: 7 | def __init__(self, dataset_name, tool_task, filepath): 8 | self.dataset_name = dataset_name 9 | self.tool_task = tool_task 10 | self.filepath = filepath 11 | self.datas = self.load_data() 12 | 13 | def load_data(self, ) -> list: 14 | if self.filepath: 15 | return self.load_data_from_local(self.filepath) 16 | elif self.dataset_name and self.tool_task: 17 | return self.load_data_from_hf(self.tool_task) 18 | return [] 19 | 20 | def load_data_from_local(self, filepath): 21 | '''''' 22 | pass 23 | 24 | def load_data_from_hf(self, tool_task): 25 | pass 26 | 27 | def __iter__(self): 28 | self.current_index = 0 29 | return self 30 | 31 | def __next__(self): 32 | if self.current_index < len(self.datas): 33 | current_item = self.datas[self.current_index] 34 | self.current_index += 1 35 | return current_item 36 | else: 37 | raise StopIteration 38 | 39 | def __len__(self): 40 | return len(self.datas) 41 | -------------------------------------------------------------------------------- /src/datasets/funccall_dataset.py: -------------------------------------------------------------------------------- 1 | from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file 2 | from src.utils.json_utils import read_json_file, save_to_json_file 3 | from .base_dataset import ToolDataset 4 | 5 | import os 6 | 7 | 8 | 9 | 10 | class FuncCallDataset(ToolDataset): 11 
| def __init__(self, dataset_name, tool_task, filepath): 12 | self.dataset_name = dataset_name 13 | self.tool_task = tool_task 14 | self.filepath = filepath 15 | self.datas = self.load_data() 16 | 17 | def load_data(self, ) -> list: 18 | if self.filepath: 19 | return self.load_data_from_local(self.filepath) 20 | elif self.dataset_name and self.tool_task: 21 | return self.load_data_from_hf(self.tool_task) 22 | return [] 23 | 24 | def load_data_from_local(self, filepath): 25 | def _load_from_file(filename): 26 | if "jsonl" in filename: 27 | return read_jsonl_file(filename) 28 | elif "json" in filename: 29 | return read_json_file(filename) 30 | 31 | datas = [] 32 | if os.path.isdir(filepath): 33 | for filename in os.listdir(filepath): 34 | datas.extend(_load_from_file(os.path.join(filepath, filename))) 35 | else: 36 | datas = _load_from_file(filepath) 37 | 38 | return datas 39 | 40 | def load_data_from_hf(self, tool_task): 41 | pass -------------------------------------------------------------------------------- /src/datasets/toolfill_dataset.py: -------------------------------------------------------------------------------- 1 | from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file 2 | from src.utils.json_utils import read_json_file, save_to_json_file 3 | from .base_dataset import ToolDataset 4 | 5 | 6 | 7 | 8 | class ToolFillDataset(ToolDataset): 9 | def __init__(self, dataset_name, tool_task, filepath): 10 | self.dataset_name = dataset_name 11 | self.tool_task = tool_task 12 | self.filepath = filepath 13 | self.datas = self.load_data() 14 | 15 | def load_data(self, ) -> list: 16 | if self.filepath: 17 | return self.load_data_from_local(self.filepath) 18 | elif self.dataset_name and self.tool_task: 19 | return self.load_data_from_hf(self.tool_task) 20 | return [] 21 | 22 | def load_data_from_local(self, filepath): 23 | if "jsonl" in filepath: 24 | return read_jsonl_file(filepath) 25 | elif "json" in filepath: 26 | return read_json_file(filepath) 27 | return [] 28 | 29 | def load_data_from_hf(self, tool_task): 30 | pass -------------------------------------------------------------------------------- /src/datasets/toolparser_dataset.py: -------------------------------------------------------------------------------- 1 | from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file 2 | from src.utils.json_utils import read_json_file, save_to_json_file 3 | from .base_dataset import ToolDataset 4 | 5 | 6 | 7 | 8 | class ToolParserDataset(ToolDataset): 9 | def __init__(self, dataset_name, tool_task, filepath): 10 | self.dataset_name = dataset_name 11 | self.tool_task = tool_task 12 | self.filepath = filepath 13 | self.datas = self.load_data() 14 | 15 | def load_data(self, ) -> list: 16 | if self.filepath: 17 | return self.load_data_from_local(self.filepath) 18 | elif self.dataset_name and self.tool_task: 19 | return self.load_data_from_hf(self.tool_task) 20 | return [] 21 | 22 | def load_data_from_local(self, filepath): 23 | if "jsonl" in filepath: 24 | return read_jsonl_file(filepath) 25 | elif "json" in filepath: 26 | return read_json_file(filepath) 27 | return [] 28 | 29 | def load_data_from_hf(self, tool_task): 30 | pass -------------------------------------------------------------------------------- /src/datasets/toolsummary_dataset.py: -------------------------------------------------------------------------------- 1 | from src.utils.jsonl_utils import read_jsonl_file, save_to_jsonl_file 2 | from src.utils.json_utils import read_json_file, save_to_json_file 3 | from 
.base_dataset import ToolDataset 4 | 5 | 6 | class ToolSummaryDataset(ToolDataset): 7 | def __init__(self, dataset_name, tool_task, filepath): 8 | self.dataset_name = dataset_name 9 | self.tool_task = tool_task 10 | self.filepath = filepath 11 | self.datas = self.load_data() 12 | 13 | def load_data(self, ) -> list: 14 | if self.filepath: 15 | return self.load_data_from_local(self.filepath) 16 | elif self.dataset_name and self.tool_task: 17 | return self.load_data_from_hf(self.tool_task) 18 | return [] 19 | 20 | def load_data_from_local(self, filepath): 21 | if "jsonl" in filepath: 22 | return read_jsonl_file(filepath) 23 | elif "json" in filepath: 24 | return read_json_file(filepath) 25 | return [] 26 | 27 | def load_data_from_hf(self, tool_task): 28 | pass -------------------------------------------------------------------------------- /src/evals/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_evalution import ToolEvalution 2 | from .toolfill_evalution import ToolFillEvalution 3 | from .toolparser_evalution import ToolParserEvalution 4 | from .toolsummary_evalution import ToolSummaryEvalution 5 | from .func_call_evalution import FuncCallEvalution 6 | 7 | 8 | __all__ = [ 9 | "ToolEvalution", "ToolFillEvalution", "ToolParserEvalution", "ToolSummaryEvalution", "FuncCallEvalution" 10 | ] -------------------------------------------------------------------------------- /src/evals/base_evalution.py: -------------------------------------------------------------------------------- 1 | from src.models.base_model import ToolModel 2 | from src.models.generate_configs import GenerateConfigs 3 | from src.datasets import ToolFillDataset 4 | 5 | 6 | 7 | class ToolEvalution: 8 | def __init__( 9 | self, 10 | model: ToolModel, 11 | dataset: ToolFillDataset, 12 | base_prompt: str = '', 13 | generate_configs: GenerateConfigs = None, 14 | ): 15 | self.model = model 16 | self.dataset = dataset 17 | self.base_prompt = base_prompt 18 | self.generate_configs = generate_configs 19 | 20 | if not isinstance(model, ToolModel): 21 | raise BaseException(f"must be ToolModel Class! 
not {model}") 22 | 23 | def calc(self): 24 | '''开始计算结果''' 25 | self.predicts = [] 26 | for idx, data in enumerate(self.dataset): 27 | # if idx >= 5: break 28 | prompt = self.base_prompt.format(**data) 29 | answer = data["api_param"] 30 | predict = self.generate(prompt, self.generate_configs) 31 | self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) 32 | 33 | metric = self.eval_metric(self.predicts) 34 | return metric 35 | 36 | def generate(self, prompt, generate_configs): 37 | '''返回结果''' 38 | return self.model.generate(prompt, generate_configs) 39 | 40 | def eval_metric(self, datas): 41 | '''calc custom metric''' 42 | pass 43 | 44 | -------------------------------------------------------------------------------- /src/evals/func_call_evalution.py: -------------------------------------------------------------------------------- 1 | from src.models.base_model import ToolModel 2 | from src.models.generate_configs import GenerateConfigs 3 | from src.datasets import FuncCallDataset 4 | from src.utils.jsonl_utils import read_jsonl_file 5 | from .base_evalution import ToolEvalution 6 | 7 | from collections import Counter 8 | import jieba, re, json, os 9 | import numpy as np 10 | from loguru import logger 11 | 12 | 13 | def remove_punctuation(text): 14 | pattern = r'[^\w\s]' 15 | return re.sub(pattern, '', text) 16 | 17 | 18 | def cmp_arguments(args_str1, args_str2): 19 | rtn_flag = False 20 | try: 21 | args_dict1 = json.loads(args_str1) 22 | args_dict2 = json.loads(args_str2) 23 | # 比较两个字典是否一致 24 | if args_dict1 == args_dict2: 25 | rtn_flag = True 26 | except Exception as e: 27 | print("json.loads error: ", e) 28 | return rtn_flag 29 | return rtn_flag 30 | 31 | 32 | class FuncCallEvalution(ToolEvalution): 33 | def __init__( 34 | self, 35 | model: ToolModel, 36 | dataset: FuncCallDataset, 37 | base_prompt: str = '', 38 | template: str = 'default', 39 | generate_configs: GenerateConfigs = None, 40 | ): 41 | self.model = model 42 | self.dataset = dataset 43 | self.base_prompt = base_prompt 44 | self.template = template 45 | self.generate_configs = generate_configs 46 | 47 | if not isinstance(model, ToolModel): 48 | raise BaseException(f"must be ToolModel Class! 
not {model}") 49 | 50 | def calc(self): 51 | '''开始计算结果''' 52 | self.predicts = [] 53 | func_call_train_datas = self.create_prompts(self.dataset) 54 | 55 | for idx, data in enumerate(func_call_train_datas): 56 | print(f"总共 {len(func_call_train_datas)} 条prompt,当前运行到第 {idx} 条prompt", end="\r") 57 | prompt = data["instruction"] 58 | history = data["history"] 59 | answer = data["output"] 60 | functions = data["functions"] 61 | predict = self.generate(prompt, self.template, self.generate_configs, history) 62 | 63 | if "arguments" in answer: 64 | answer = {"content": answer["content"], "function_call": {"name": answer["name"], "arguments": answer["arguments"]}} 65 | 66 | if "#function" in predict: 67 | try: 68 | predict_param = json.loads(predict.split("#function")[-1]) 69 | if "arguments" in predict_param: 70 | predict_param = { 71 | "content": predict_param["content"], 72 | "function_call": {"name": predict_param["name"], "arguments": predict_param["arguments"]} 73 | } 74 | predict = {**predict_param, **{"role": "assistant"}} 75 | except Exception as e: 76 | logger.error("content: {content}") 77 | predict = {**{"content": predict_param}, **{"role": "assistant"}} 78 | else: 79 | predict = { 80 | "role": "assistant", 81 | "content": predict 82 | } 83 | 84 | self.predicts.append({ 85 | "prompt": prompt, "history": history, 86 | "predict": predict, "answer": answer, 87 | "functions": functions 88 | }) 89 | 90 | metric = self.eval_metric(self.predicts) 91 | return metric 92 | 93 | def calc_from_predicts(self, file_path): 94 | if os.path.exists(file_path): 95 | self.predicts = read_jsonl_file(file_path) 96 | metric = self.eval_metric(self.predicts) 97 | return metric 98 | else: 99 | return self.calc() 100 | 101 | def create_prompts(self, func_call_datas): 102 | system_content = '''CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。 103 | 你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。''' 104 | function_format = '''You are ToolGPT, you have access to the following APIs:\n{tools}''' 105 | 106 | func_call_train_datas = [] 107 | history_error_cnt = 0 108 | funccall_error_cnt = 0 109 | 110 | for data in func_call_datas: 111 | tools = data["functions"] 112 | chatrounds = data["chatrounds"] 113 | 114 | function_content = "" 115 | if len(tools) > 0: 116 | function_content = function_format.format(tools=json.dumps(tools, ensure_ascii=False, sort_keys=True)) 117 | 118 | history = [] 119 | for i in chatrounds: 120 | if i["role"]=="system": 121 | continue 122 | 123 | if i["role"]=="user": 124 | history.append(("user", i["content"])) 125 | 126 | if i["role"] == "assistant": 127 | if "function_call" in i: 128 | if not isinstance(i["function_call"], dict): 129 | funccall_error_cnt+=1 130 | continue 131 | content = "#function" + json.dumps({**{"content": i["content"]}, **i["function_call"]}, ensure_ascii=False) 132 | else: 133 | content = i["content"] 134 | history.append(("assistant", content)) 135 | 136 | 137 | if i["role"] == "function": 138 | content = json.dumps({**{"content": i["content"]}, **{"name": i["name"]}}, ensure_ascii=False) 139 | history.append(("user", content)) 140 | 141 | 142 | history = [i[1] for i in history] 143 | history[0] = "\n".join([system_content,function_content, history[0]]) 144 | 145 | for his_idx in range(0, len(history), 2): 146 | output = history[his_idx+1] 147 | 148 | if "#function" in output: 149 | output = output.split("#function")[-1] 150 | 151 | try: 152 | 
output = json.loads(output) 153 | except: 154 | output = {"content": output} 155 | 156 | 157 | func_call_train_datas.append( 158 | { 159 | "instruction": history[his_idx], 160 | "input": "", 161 | "output": output, 162 | "history": [history[:his_idx+2][i:i+2] for i in range(0, len(history[:his_idx]), 2)], 163 | "functions": tools 164 | }, 165 | ) 166 | return func_call_train_datas 167 | 168 | def generate(self, prompt, template, generate_configs, history=None): 169 | '''返回结果''' 170 | return self.model.generate(prompt, template, generate_configs, history) 171 | 172 | def eval_metric(self, datas): 173 | '''''' 174 | # function call 回复测试总数 175 | self.function_call_sum = 0 176 | # function call 回复正确数 177 | self.function_call_correct = 0 178 | # function call 回复失败数 179 | self.function_call_fail = 0 180 | # function call 回复失败中,本应该调用工具但是模型没有调用, 无工具识别识别错误数 181 | self.function_call_fail_functioncall = 0 182 | # function call 回复失败数中,因为函数名不对导致的失败数 183 | self.function_call_fail_name = 0 184 | # function call 回复失败数中,因为参数不对导致的失败数 185 | self.function_call_fail_param = 0 186 | # function call 回复失败中 函数名幻觉的失败数 187 | self.function_call_fail_name_illusion = 0 188 | 189 | # assistant ans 回复相关度列表 190 | self.assistant_ans_relevancy_list = [] 191 | 192 | for data in datas: 193 | ass_predict = data["predict"] 194 | ass_truth = data["answer"] 195 | functions = data["functions"] 196 | history = data["history"] 197 | # 将user 和 function 的部分组合 198 | content_msg = "" 199 | for user_msg, assistant_msg in history: 200 | content_msg += user_msg 201 | 202 | # if "#function" in ass_truth: 203 | if "function_call" in ass_truth: 204 | self.calc_func_params(ass_predict, ass_truth, functions) 205 | else: 206 | self.calc_relevancy(ass_predict, ass_truth, content_msg) 207 | 208 | self.print_result() 209 | return { 210 | "function_call_correct_rate": self.function_call_correct_rate, 211 | "function_call_fail_rate": self.function_call_fail_rate, 212 | "function_call_fail_functioncall_rate": self.function_call_fail_functioncall_rate, 213 | "function_call_fail_name_rate": self.function_call_fail_name_rate, 214 | "function_call_fail_param_rate": self.function_call_fail_param_rate, 215 | "function_call_fail_name_illusion_rate": self.function_call_fail_name_illusion_rate 216 | } 217 | 218 | def calc_func_params(self, ass_predict, ass_truth, functions): 219 | self.function_call_sum += 1 220 | 221 | function_names = [i["name"] for i in functions] 222 | # ass_predict_param = json.loads(ass_predict.split("#function")[-1]) 223 | # ass_truth_param = json.loads(ass_truth.split("#function")[-1]) 224 | 225 | if "function_call" not in ass_predict: 226 | self.function_call_fail += 1 227 | self.function_call_fail_functioncall += 1 228 | elif ass_predict["function_call"]["name"] not in function_names: 229 | # 模型幻觉 230 | self.function_call_fail += 1 231 | self.function_call_fail_name += 1 232 | self.function_call_fail_name_illusion += 1 233 | else: 234 | function_call_name_label = False 235 | function_call_args_label = False 236 | if ass_predict["function_call"]["name"] == ass_truth["function_call"]["name"]: 237 | function_call_name_label = True 238 | if cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]): 239 | function_call_args_label = True 240 | else: 241 | self.function_call_fail_param += 1 242 | else: 243 | self.function_call_fail_name += 1 244 | # # 是否可能存在名字错误参数正确的情况? 
245 | # if self.cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]): 246 | # function_call_args_label = True 247 | # else: 248 | # self.function_call_fail_param += 1 249 | 250 | if function_call_name_label and function_call_args_label: 251 | self.function_call_correct += 1 252 | else: 253 | self.function_call_fail += 1 254 | 255 | def calc_relevancy(self, ass_predict, ass_truth, content_msg): 256 | if "function_call" in ass_predict: 257 | self.assistant_ans_relevancy_list.append(0) 258 | return 259 | 260 | content_msg_counter = Counter(jieba.cut(remove_punctuation(content_msg))) 261 | ass_truth_counter = Counter(jieba.cut(remove_punctuation(ass_truth["content"]))) 262 | ass_predict_counter = Counter(jieba.cut(remove_punctuation(ass_predict["content"]))) 263 | relative_counter = content_msg_counter & ass_truth_counter 264 | len_relative = sum(relative_counter.values()) 265 | predict_relative = ass_predict_counter & relative_counter 266 | 267 | if len_relative == 0: 268 | # 要是标准答案和问题相关词都无 直接给1 269 | self.assistant_ans_relevancy_list.append(1) 270 | else: 271 | # 交集与相关词的占比 272 | self.assistant_ans_relevancy_list.append(sum(predict_relative.values())/len_relative) 273 | 274 | def print_result(self, ): 275 | # 打印指标结果 276 | print("=============统计数据=========================") 277 | print(f"function_call_sum: {self.function_call_sum}") 278 | print(f"function_call_correct: {self.function_call_correct}") 279 | print(f"function_call_fail: {self.function_call_fail}") 280 | print(f"function_call_fail_name: {self.function_call_fail_name}") 281 | print(f"function_call_fail_param: {self.function_call_fail_param}") 282 | print(f"function_call_fail_name_illusion: {self.function_call_fail_name_illusion}") 283 | print(f"assistant_ans_sum: {len(self.assistant_ans_relevancy_list)}") 284 | print(f"assistant_ans_relevancy: {np.mean(self.assistant_ans_relevancy_list)}") 285 | print("=============实验结果=========================") 286 | self.function_call_correct_rate = self.function_call_correct/self.function_call_sum 287 | self.function_call_fail_rate = self.function_call_fail/self.function_call_sum 288 | self.function_call_fail_functioncall_rate = self.function_call_fail_functioncall/self.function_call_sum 289 | self.function_call_fail_name_rate = self.function_call_fail_name/self.function_call_sum 290 | self.function_call_fail_param_rate = self.function_call_fail_param/self.function_call_sum 291 | self.function_call_fail_name_illusion_rate = self.function_call_fail_name_illusion/self.function_call_sum 292 | 293 | # self.function_call_fail_functioncall_rate = self.function_call_fail_functioncall/self.function_call_fail if self.function_call_fail else 0 294 | # self.function_call_fail_name_rate = self.function_call_fail_name/self.function_call_fail if self.function_call_fail else 0 295 | # self.function_call_fail_param_rate = self.function_call_fail_param/self.function_call_fail if self.function_call_fail else 0 296 | # self.function_call_fail_name_illusion_rate = self.function_call_fail_name_illusion/self.function_call_fail if self.function_call_fail else 0 297 | print(f"工具识别正确率fccr: {self.function_call_correct_rate}") 298 | print(f"工具识别失败率fcfr: {self.function_call_fail_rate}") 299 | print(f"工具调用识别失败占比fcffr: {self.function_call_fail_functioncall_rate}") 300 | print(f"工具名识别失败占比fcfnr: {self.function_call_fail_name_rate}") 301 | print(f"工具参数识别失败占比fcfpr: {self.function_call_fail_param_rate}") 302 | print(f"工具幻觉识别失败占比fcfnir: {self.function_call_fail_name_illusion_rate}") 303 
| print(f"助手回复答案相关度aar: {np.mean(self.assistant_ans_relevancy_list)}") 304 | print("==============================================") 305 | -------------------------------------------------------------------------------- /src/evals/toolfill_evalution.py: -------------------------------------------------------------------------------- 1 | from src.models.base_model import ToolModel 2 | from src.models.generate_configs import GenerateConfigs 3 | from src.datasets import ToolFillDataset 4 | from .base_evalution import ToolEvalution 5 | 6 | 7 | 8 | class ToolFillEvalution(ToolEvalution): 9 | def __init__( 10 | self, 11 | model: ToolModel, 12 | dataset: ToolFillDataset, 13 | base_prompt: str = '', 14 | template: str = 'default', 15 | generate_configs: GenerateConfigs = None, 16 | ): 17 | self.model = model 18 | self.dataset = dataset 19 | self.base_prompt = base_prompt 20 | self.template = template 21 | self.generate_configs = generate_configs 22 | 23 | if not isinstance(model, ToolModel): 24 | raise BaseException(f"must be ToolModel Class! not {model}") 25 | 26 | def calc(self): 27 | '''开始计算结果''' 28 | self.predicts = [] 29 | for idx, data in enumerate(self.dataset): 30 | prompt = self.base_prompt.format(**data) 31 | answer = data["api_param"] 32 | predict = self.generate(prompt, self.template, self.generate_configs) 33 | self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) 34 | 35 | metric = self.eval_metric(self.predicts) 36 | return metric 37 | 38 | def generate(self, prompt, template, generate_configs): 39 | '''返回结果''' 40 | return self.model.generate(prompt, template, generate_configs) 41 | 42 | def eval_metric(self, datas): 43 | '''''' 44 | self.right_predicts = [] 45 | self.wrong_predicts = [] 46 | self.error_predicts = [] 47 | for data in datas: 48 | prompt, predict, answer = data["prompt"], data["predict"], data["answer"] 49 | 50 | try: 51 | predict_json = predict if isinstance(predict, dict) else eval(predict) 52 | answer_json = answer if isinstance(answer, dict) else eval(answer) 53 | if predict_json == answer_json: 54 | # print("prompt: {}\npredict: {}\nanswer: {}".format(prompt, predict, answer)) 55 | self.right_predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) 56 | else: 57 | self.wrong_predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) 58 | except: 59 | self.error_predicts.append({"prompt": prompt, "predict": predict, "answer": answer}) 60 | # 61 | print(len(self.right_predicts), len(self.wrong_predicts), len(self.error_predicts)) 62 | 63 | metric = { 64 | "accuracy": len(self.right_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), 65 | "error": len(self.error_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), 66 | } 67 | return metric 68 | -------------------------------------------------------------------------------- /src/evals/toolparser_evalution.py: -------------------------------------------------------------------------------- 1 | from src.models.base_model import ToolModel 2 | from src.models.generate_configs import GenerateConfigs 3 | from src.datasets import ToolParserDataset 4 | from .base_evalution import ToolEvalution 5 | from .utils import rec_search_key 6 | 7 | 8 | class ToolParserEvalution(ToolEvalution): 9 | def __init__( 10 | self, 11 | model: ToolModel, 12 | dataset: ToolParserDataset, 13 | base_prompt: str = '', 14 | template: str = 'default', 15 | generate_configs: GenerateConfigs = None, 16 | ): 17 | self.model = 
model
18 |         self.dataset = dataset
19 |         self.base_prompt = base_prompt
20 |         self.template = template
21 |         self.generate_configs = generate_configs
22 | 
23 |         if not isinstance(model, ToolModel):
24 |             raise BaseException(f"must be ToolModel Class! not {model}")
25 | 
26 |     def calc(self):
27 |         '''开始计算结果'''
28 |         self.predicts = []
29 |         for idx, data in enumerate(self.dataset):
30 |             # if idx >= 5: break
31 |             prompt = self.base_prompt.format(**data)
32 |             response = data["response"]
33 |             answer = data["selected_keys"]
34 |             predict = self.generate(prompt, self.template, self.generate_configs)
35 |             self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response})
36 | 
37 |         metric = self.eval_metric(self.predicts)
38 |         return metric
39 | 
40 |     def generate(self, prompt, template, generate_configs):
41 |         '''返回结果'''
42 |         return self.model.generate(prompt, template, generate_configs)
43 | 
44 |     def eval_metric(self, datas):
45 |         ''''''
46 |         self.right_predicts = []
47 |         self.wrong_predicts = []
48 |         self.error_predicts = []
49 |         for data in datas:
50 |             prompt, predict, answer, response = data["prompt"], data["predict"], data["answer"], data["response"]
51 |             selected_keys = rec_search_key(response, "", [], predict)
52 |             try:
53 |                 predict_json = selected_keys if isinstance(selected_keys, list) else eval(selected_keys)
54 |                 answer_json = answer if isinstance(answer, list) else eval(answer)
55 | 
56 |                 predict_json = set(predict_json) if isinstance(predict_json, list) else predict_json
57 |                 answer_json = set(answer_json) if isinstance(answer_json, list) else answer_json
58 | 
59 |                 if predict_json == answer_json:
60 |                     # print("prompt: {}\npredict: {}\nanswer: {}".format(prompt, predict, answer))
61 |                     self.right_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response})
62 |                 else:
63 |                     self.wrong_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response})
64 |             except Exception:
65 |                 self.error_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response})
66 |         #
67 |         print(len(self.right_predicts), len(self.wrong_predicts), len(self.error_predicts))
68 | 
69 |         metric = {
70 |             "accuracy": len(self.right_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)),
71 |             "error": len(self.error_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)),
72 |         }
73 |         return metric
74 | 
--------------------------------------------------------------------------------
/src/evals/toolsummary_evalution.py:
--------------------------------------------------------------------------------
1 | from src.models.base_model import ToolModel
2 | from src.models.generate_configs import GenerateConfigs
3 | from src.datasets import ToolSummaryDataset
4 | from .base_evalution import ToolEvalution
5 | from .utils import rec_search_key
6 | 
7 | 
8 | class ToolSummaryEvalution(ToolEvalution):
9 |     def __init__(
10 |         self,
11 |         model: ToolModel,
12 |         dataset: ToolSummaryDataset,
13 |         base_prompt: str = '',
14 |         template: str = 'default',
15 |         generate_configs: GenerateConfigs = None,
16 |     ):
17 |         self.model = model
18 |         self.dataset = dataset
19 |         self.base_prompt = base_prompt
20 | 
self.template = template 21 | self.generate_configs = generate_configs 22 | 23 | if not isinstance(model, ToolModel): 24 | raise BaseException(f"must be ToolModel Class! not {model}") 25 | 26 | def calc(self): 27 | '''开始计算结果''' 28 | self.predicts = [] 29 | for idx, data in enumerate(self.dataset): 30 | # if idx >= 5: break 31 | prompt = self.base_prompt.format(**data) 32 | response = data["response"] 33 | answer = data["selected_keys"] 34 | predict = self.generate(prompt, self.template, self.generate_configs) 35 | self.predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) 36 | 37 | metric = self.eval_metric(self.predicts) 38 | return metric 39 | 40 | def generate(self, prompt, template, generate_configs): 41 | '''返回结果''' 42 | return self.model.generate(prompt, template, generate_configs) 43 | 44 | def eval_metric(self, datas): 45 | '''''' 46 | self.right_predicts = [] 47 | self.wrong_predicts = [] 48 | self.error_predicts = [] 49 | for data in datas: 50 | prompt, predict, answer, response = data["prompt"], data["predict"], data["answer"], data["response"] 51 | selected_keys = rec_search_key(response, "", [], predict) 52 | try: 53 | predict_json = selected_keys if isinstance(selected_keys, list) else eval(selected_keys) 54 | answer_json = answer if isinstance(answer, list) else eval(answer) 55 | 56 | predict_json = set(predict_json) if isinstance(predict_json, list) else predict_json 57 | answer_json = set(answer_json) if isinstance(answer_json, list) else answer_json 58 | 59 | if predict_json == answer_json: 60 | # print("prompt: {}\npredict: {}\nanswer: {}".format(prompt, predict, answer)) 61 | self.right_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) 62 | else: 63 | self.wrong_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response}) 64 | except Exception as e: 65 | self.error_predicts.append({"prompt": prompt, "predict": predict, "answer": answer, "response": response, "error_message": e}) 66 | # 67 | print(len(self.right_predicts), len(self.wrong_predicts), len(self.error_predicts)) 68 | 69 | metric = { 70 | "accuracy": len(self.right_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), 71 | "error": len(self.error_predicts)/(len(self.right_predicts)+len(self.wrong_predicts)+len(self.error_predicts)), 72 | } 73 | return metric 74 | -------------------------------------------------------------------------------- /src/evals/utils.py: -------------------------------------------------------------------------------- 1 | 2 | def rec_search_key(res, k="", skeys: list=[], s=""): 3 | '''递归进行分析是否存在key被获取''' 4 | if isinstance(res, dict): 5 | for new_k, v in res.items(): 6 | try: 7 | skeys = rec_search_key(v, ".".join([str(k), str(new_k)]) if k else new_k, skeys, s) 8 | except Exception as e: 9 | print(res, k, new_k) 10 | raise e 11 | elif isinstance(res, list): 12 | for i in res: 13 | skeys = rec_search_key(i, k + ".list", skeys, s) 14 | else: 15 | if str(res) in str(s): 16 | skeys.append(k[:-5] if k[-5:] == ".list" else k) 17 | return list(set(skeys)) 18 | return list(set(skeys)) 19 | 20 | 21 | -------------------------------------------------------------------------------- /src/evaluate/evaluate.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import copy 3 | 4 | from loguru import logger 5 | 6 | 7 | def evaluate(model, tokenizer, context_builder, all_dataset): 8 | ''' 9 | 
Evaluate on all_dataset
10 |     '''
11 |     all_dataset_pred = copy.deepcopy(all_dataset)
12 | 
13 |     do_verbose = True
14 |     for dataset_name, dataset in all_dataset_pred.items():
15 |         for question in dataset:
16 |             if do_verbose:
17 |                 question['pred'] = get_pred(model, tokenizer, context_builder, question, do_verbose)
18 |                 do_verbose = False
19 |             else:
20 |                 question['pred'] = get_pred(model, tokenizer, context_builder, question, do_verbose)
21 |     return all_dataset_pred
22 | 
23 | def get_pred(model, tokenizer, context_builder, question: dict, verbose: bool = False):
24 |     '''
25 |     Get the prediction for single question
26 |     '''
27 |     options = question['options']
28 |     query = question['query']
29 | 
30 |     option_dict = {}
31 |     for option in options:
32 |         encoded = tokenizer.encode(option)
33 | 
34 |         if len(encoded) == 1:
35 |             option_dict[option] = encoded
36 |         else:
37 |             option_dict[option] = tokenizer._convert_token_to_id(option)
38 | 
39 |     # build context
40 |     raw_text, context_tokens = context_builder.make_context(model, tokenizer, query)
41 |     input_ids = torch.tensor([context_tokens]).to(model.device)
42 | 
43 |     if verbose:
44 |         logger.info('sample raw_text={}\ncontext_tokens={}\nlen of context_tokens={}'.format(raw_text, context_tokens, len(context_tokens)))
45 | 
46 |     # if len(context_tokens) > 900:
47 |     #     return 'A'
48 | 
49 |     # feed to the model
50 |     output = model(input_ids)
51 |     logits = output.logits
52 | 
53 |     # get pred option
54 |     score_dict = {}
55 |     for option in option_dict:
56 |         score = logits[0][-1][option_dict[option]]
57 |         score_dict[option] = float(score)
58 |         # logger.debug('score_dict={}'.format(score_dict))
59 | 
60 |     max_score = float('-inf')
61 |     best_option = None
62 |     for option, score in score_dict.items():
63 |         if score > max_score:
64 |             max_score = score
65 |             best_option = option
66 |     if verbose:
67 |         logger.debug('score_dict={}, max_score={}, best_option={}, answer={}'.format(score_dict, max_score, best_option, question['answer']))
68 |     return best_option
69 | 
70 | 
--------------------------------------------------------------------------------
/src/getAssistantAns.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import random
4 | import torch
5 | from transformers import AutoModelForCausalLM
6 | 
7 | class GetAssistantAns():
8 |     # 按照自己推理需求自己修改代码
9 | 
10 |     def __init__(self, model_name="", gpu_num=1):
11 |         model = AutoModelForCausalLM.from_pretrained(model_name)  # model_name 为占位参数,请按自己的模型路径填写
12 |         device_list = []
13 |         for gpu_idx in range(gpu_num):
14 |             device_list.append(torch.device(f"cuda:{gpu_idx}"))
15 | 
16 |         # 将模型移动到指定的GPU设备(样例仅使用第一张卡,多卡写法可参考文末注释的demo)
17 |         self.model = model.to(device_list[0])
18 | 
19 | 
20 |     def gen_answer(self, chat_dict, gpu_index):
21 |         # 这里实际根据自己推理逻辑 然后转为标准格式返回
22 |         # 以下仅仅是样例
23 |         import time
24 |         print(os.environ["CUDA_VISIBLE_DEVICES"])
25 |         time.sleep(1)
26 |         rtn_dict1 = {
27 |             "role": "assistant",
28 |             "content": None,
29 |             "function_call":
30 |             {
31 |                 "name": "get_fudan_university_scoreline",
32 |                 "arguments": "{\n \"year\": \"2020\"\n}"
33 |             }
34 |         }
35 | 
36 |         rtn_dict2 = {
37 |             "role": "assistant",
38 |             "content": "2020年复旦大学的分数线如下:\n\n- 文科一批:630分\n- 文科二批:610分\n- 理科一批:650分\n- 理科二批:630分"
39 |         }
40 | 
41 |         return random.choice([rtn_dict1, rtn_dict2])
42 | 
43 | # ======================================================================
44 | # 下面注释的部分是一个huggingface推理的多卡的demo
45 | # 备注 线程数量==2,也就是2卡效率最高 多个卡并不会提升效率,存在资源抢占的情况
46 | # 可以采用多卡部署服务,然后调用服务的方式提升效率
47 | 
48 | # import os
49 | # import re
50 | # import json
51 | # import random
52 | # import torch
53 | # import copy
54 | # import 
transformers 55 | # from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, TextStreamer 56 | # end_token_id = 2 57 | # checkpoint = "" 58 | # tokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint) 59 | # torch.manual_seed(random.randint(0, 100000)) 60 | # temperature = 0.2 61 | # top_p = 0.95 62 | # top_k = 40 63 | # repetition_penalty = 1.1 64 | # output_len = 2048 65 | # role_start = "[START]" 66 | # role_end = "[END]" 67 | 68 | 69 | # def change2traindata(fc_dict): 70 | # chatrounds_list = [] 71 | # # insert your code 72 | # # eg 73 | # # chatrounds_list = [ 74 | # # { 75 | # # "role": "system", 76 | # # "content":"你好,我是小助手,我能帮你做什么呢?" 77 | # # }, 78 | # # { 79 | # # "role": "functionapis" 80 | # # "content": "You are tool gpt, you can using following apis []" 81 | # # }, 82 | # # { 83 | # # "role": "user", 84 | # # "content": "我想知道复旦大学的分数线" 85 | # # }, 86 | # # { 87 | # # "role": "bot", 88 | # # "content": "#function{*****}" 89 | # # }, 90 | # # { 91 | # # "role": "function", 92 | # # "content": ****** 93 | # # }, 94 | # # { 95 | # # "role": "bot", 96 | # # "content": "复旦大学分数线640" 97 | # # } 98 | # # ] 99 | # return chatrounds_list 100 | 101 | 102 | # def get_chatrounds_ids(chatrounds_list): 103 | # input_ids = [] 104 | # for chatround in chatrounds_list: 105 | # input_ids += tokenizer.encode(role_start + chatround["role"]+ role_end) + tokenizer.encode(chatround["content"], add_special_tokens=False) + [tokenizer.eos_token_id] 106 | # input_ids += tokenizer.encode(role_start + "bot" + role_end) 107 | # return input_ids 108 | 109 | 110 | # class GetAssistantAns(): 111 | # # 按照自己推理需求自己修改代码 112 | 113 | # def __init__(self, gpu_num=1): 114 | # print(checkpoint) 115 | # print("Loading model") 116 | # model = AutoModelForCausalLM.from_pretrained(checkpoint).half().eval() 117 | # device_list = [torch.device(f"cuda:%d"%(i)) for i in range(gpu_num)] 118 | # self.model_list = [copy.deepcopy(model.to(device)) for device in device_list] 119 | # print("Loading finish") 120 | 121 | # def gen_answer(self, chat_dict, gpu_index=0): 122 | # chatrounds_list = change2traindata(chat_dict) 123 | # input_ids = get_chatrounds_ids(chatrounds_list) 124 | # output_ids = self.model_list[gpu_index].generate(torch.tensor([input_ids]).to(self.model_list[gpu_index].device), max_new_tokens=output_len, num_beams=1, num_return_sequences=1, do_sample=True, temperature=temperature, top_p=top_p, eos_token_id=end_token_id, top_k=top_k, streamer=None, repetition_penalty=repetition_penalty, pad_token_id=10000)[0] 125 | # res = tokenizer.decode(output_ids[len(input_ids):-1]) 126 | # save_dict = {"role": "assistant"} 127 | # if res.startswith("#function"): 128 | # try: 129 | # res_dict = json.loads(re.sub("^#function", "", res)) 130 | # save_dict["content"] = res_dict["content"] 131 | # save_dict["function_call"] = {} 132 | # save_dict["function_call"]["name"] = res_dict["name"] 133 | # save_dict["function_call"]["arguments"] = res_dict["arguments"] 134 | # except Exception as e: 135 | # print(e) 136 | # save_dict = {"role": "assistant"} 137 | # save_dict["content"] = res 138 | # else: 139 | # save_dict["content"] = res 140 | # # print(save_dict) 141 | # return save_dict 142 | -------------------------------------------------------------------------------- /src/hparams/evaluate_args.py: -------------------------------------------------------------------------------- 1 | import os 2 | cur_dir = os.path.split(os.path.realpath(__file__))[0] 3 | root_dir = os.path.abspath(os.path.dirname(cur_dir) + os.path.sep + "..") 
4 | 5 | import json 6 | from typing import List, Literal, Optional 7 | from dataclasses import dataclass, field 8 | 9 | from loguru import logger 10 | 11 | 12 | @dataclass 13 | class EvaluateArguments: 14 | r""" 15 | Arguments 16 | """ 17 | model_path: str = field( 18 | metadata={"help": "Path of the model and tokenizer"} 19 | ) 20 | model_name: str = field( 21 | metadata={"help": "Name of the model, now support qwen, baichuan, ..."} 22 | ) 23 | model_conf_path: str = field( 24 | default = root_dir + os.path.sep + 'conf' + os.path.sep + 'model_conf.json', 25 | metadata={"help": "Path of model's loader and context_builder class"} 26 | ) 27 | eval_dataset_list: str = field( 28 | default = 'all', 29 | metadata={"help": "Which datasets to evaluate on. default is all datasets, if want to test on multiple datasets, use # as seperator"} 30 | ) 31 | eval_dataset_type: str = field( 32 | default = 'test', 33 | metadata={"help": "Which type of datasets to evaluate on. default is test, must be one of (test, valid, dev)"} 34 | ) 35 | eval_dataset_fp_conf_path: str = field( 36 | default = root_dir + os.path.sep + 'conf' + os.path.sep + 'dataset_fp.json', 37 | metadata={"help": "Path of dataset_name and filepath config file"} 38 | ) 39 | k_shot: int = field( 40 | default = 0, 41 | metadata={"help": "k-shot test, k should be in (0, 1,2,3,4,5)"} 42 | ) 43 | seed: int = field( 44 | default = 100, 45 | metadata={"help": "Random seed, default 100"} 46 | ) 47 | data_path: str = field( 48 | default = '/mnt/llm/DevOpsEval/data/devopseval', 49 | metadata={'help': 'Path to the devopseval dataset'} 50 | ) 51 | 52 | def init_for_training(self): 53 | self.eval_dataset_list = self.eval_dataset_list.split('#') 54 | if 'all' in self.eval_dataset_list: 55 | logger.info('Detecting all in eval_dataset_list, evaluating on all dataset') 56 | self.eval_dataset_list = ['all'] 57 | 58 | assert self.eval_dataset_type in ('dev', 'test', 'val') 59 | assert self.k_shot in (0,1,2,3,4,5) -------------------------------------------------------------------------------- /src/hparams/parser.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('/mnt/llm/DevOpsEval') 3 | 4 | from loguru import logger 5 | from transformers import HfArgumentParser 6 | from src.hparams.evaluate_args import EvaluateArguments 7 | 8 | 9 | def get_all_args( 10 | args = None 11 | ) -> EvaluateArguments: 12 | parser = HfArgumentParser(( 13 | EvaluateArguments 14 | )) 15 | eval_args = parser.parse_args_into_dataclasses(args)[0] 16 | eval_args.init_for_training() 17 | return eval_args 18 | 19 | 20 | if __name__ == '__main__': 21 | a = get_all_args() 22 | logger.debug(a) -------------------------------------------------------------------------------- /src/metric/metric_score.py: -------------------------------------------------------------------------------- 1 | def get_acc_score(all_pred): 2 | ''' 3 | Get accuracy score by dataset 4 | ''' 5 | score_dict = {i:None for i in all_pred} 6 | total_corr = 0 7 | total_count = 0 8 | for dataset_name, dataset_pred in all_pred.items(): 9 | corr = 0 10 | 11 | for pred_single in dataset_pred: 12 | if pred_single['answer'] == pred_single['pred']: 13 | corr += 1 14 | score_dict[dataset_name] = { 15 | 'total': len(dataset_pred), 16 | 'corr': corr, 17 | 'score': corr / len(dataset_pred) 18 | } 19 | 20 | total_corr += corr 21 | total_count += len(dataset_pred) 22 | res = { 23 | 'total': total_count, 24 | 'corr': total_corr, 25 | 'score': total_corr / total_count, 26 | 
'detail': score_dict 27 | } 28 | return res 29 | -------------------------------------------------------------------------------- /src/model_and_tokenizer_loader/model_and_tokenizer_loader.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig 4 | from loguru import logger 5 | 6 | 7 | class ModelAndTokenizerLoader: 8 | def __init__(self): 9 | pass 10 | 11 | def load_model_and_tokenizer(self, model_path: str): 12 | model = self.load_model(model_path) 13 | tokenizer = self.load_tokenizer(model_path) 14 | return model, tokenizer 15 | 16 | def load_model(self, model_path: str): 17 | model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=True).eval() 18 | # for name, param in model.named_parameters(): 19 | # logger.debug('param_name={}, param.device={}'.format(name, param.device)) 20 | return model 21 | 22 | def load_tokenizer(self, model_path: str): 23 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 24 | return tokenizer 25 | 26 | -------------------------------------------------------------------------------- /src/model_and_tokenizer_loader/model_and_tokenizer_loader_family.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from loguru import logger 4 | 5 | import torch 6 | import transformers 7 | from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig 8 | 9 | from src.model_and_tokenizer_loader.model_and_tokenizer_loader import ModelAndTokenizerLoader 10 | 11 | def load_model_and_tokenizer(eval_args): 12 | ''' 13 | Load model and tokenizer by model_path and model_name 14 | ''' 15 | with open(eval_args.model_conf_path, 'r') as f: 16 | model_conf = json.load(f) 17 | 18 | loader = globals()[model_conf[eval_args.model_name]['loader']]() 19 | 20 | return loader.load_model_and_tokenizer(eval_args.model_path) 21 | 22 | class QwenModelAndTokenizerLoader(ModelAndTokenizerLoader): 23 | def __init__(self): 24 | super().__init__() 25 | pass 26 | 27 | def load_model(self, model_path: str): 28 | model = super().load_model(model_path) 29 | model.generation_config = GenerationConfig.from_pretrained(model_path) 30 | 31 | return model 32 | 33 | def load_tokenizer(self, model_path: str): 34 | tokenizer = super().load_tokenizer(model_path) 35 | 36 | # read generation config 37 | with open(model_path + '/generation_config.json', 'r') as f: 38 | generation_config = json.load(f) 39 | 40 | tokenizer.pad_token_id = generation_config['pad_token_id'] 41 | tokenizer.eos_token_id = generation_config['eos_token_id'] 42 | return tokenizer 43 | 44 | class BaichuanModelAndTokenizerLoader(ModelAndTokenizerLoader): 45 | def __init__(self): 46 | super().__init__() 47 | pass 48 | 49 | def load_model(self, model_path: str): 50 | model = super().load_model(model_path) 51 | # model.generation_config = GenerationConfig.from_pretrained(model_path) 52 | 53 | return model 54 | 55 | 56 | if __name__ == '__main__': 57 | model_path = '/mnt/llm/devopspal/model/Qwen-7B' 58 | qwen_model_loader = QwenModelAndTokenizerLoader() 59 | tokenizer = qwen_model_loader.load_tokenizer(model_path) 60 | 61 | logger.info(tokenizer) 62 | -------------------------------------------------------------------------------- /src/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_model import 
ToolModel 2 | from .qwen_model import QwenModel 3 | from .internlm_model import InternlmModel 4 | from .openai_model import OpenaiModel 5 | from .baichuan_model import BaiChuanModel 6 | 7 | __all__ = [ 8 | "ToolModel", "QwenModel", "InternlmModel", "OpenaiModel", "BaiChuanModel" 9 | ] -------------------------------------------------------------------------------- /src/models/baichuan_model.py: -------------------------------------------------------------------------------- 1 | # from vllm import LLM, SamplingParams 2 | # from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel 3 | 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig 6 | from peft import PeftModel, PeftConfig 7 | 8 | from .generate_configs import GenerateConfigs 9 | from .base_model import ToolModel 10 | 11 | 12 | from loguru import logger 13 | 14 | 15 | 16 | class BaiChuanModel(ToolModel): 17 | def __init__(self, model_path: str, peft_path: str = None, template: str = "default", trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 18 | self.model_path = model_path 19 | self.peft_path = peft_path 20 | self.template = template 21 | self.trust_remote_code = trust_remote_code 22 | self.tensor_parallel_size = tensor_parallel_size 23 | self.gpu_memory_utilization = gpu_memory_utilization 24 | self.generation_config = GenerationConfig.from_pretrained(model_path) 25 | self.load_model(self.model_path, self.peft_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) 26 | 27 | def generate( 28 | self, prompts: str, 29 | template: str = None, 30 | generate_configs: GenerateConfigs =None, 31 | history: list = None, 32 | ) -> list: 33 | '''产出对应结果''' 34 | template = self.template if template is None else template 35 | 36 | params = self.generate_params(generate_configs) 37 | 38 | if template == "default": 39 | inputs = self.tokenizer(prompts, return_tensors="pt") 40 | inputs["input_ids"] = inputs["input_ids"].cuda() 41 | 42 | inputs.update(params) 43 | output = self.model.generate(**inputs) 44 | predict = self.tokenizer.decode(output[0].tolist())[len(prompts):] 45 | predict = predict.replace("<|endoftext|>", "").replace("", "") 46 | return predict 47 | elif template != "default": 48 | messages = [{"role": "user" if idx==0 else "assistant", "content": ii} for i in history for idx, ii in enumerate(i)] 49 | messages.append({"role": "user", "content": prompts}) 50 | output = self.model.chat(self.tokenizer, messages=messages, generation_config=self.generation_config) 51 | return output 52 | 53 | def generate_params( 54 | self, generate_configs: GenerateConfigs, 55 | ): 56 | '''generate param''' 57 | kargs = generate_configs.dict() 58 | params = { 59 | "max_new_tokens": kargs.get("max_new_tokens", 128), 60 | "top_k": kargs.get("top_k", 50), 61 | "top_p": kargs.get("top_p", 0.95), 62 | "temperature": kargs.get("temperature", 1.0), 63 | } 64 | self.generation_config.max_new_tokens = kargs.get("max_new_tokens", 128) 65 | self.generation_config.top_k = kargs.get("top_k", 50) 66 | self.generation_config.top_p = kargs.get("top_p", 0.95) 67 | self.generation_config.temperature = kargs.get("temperature", 1.0) 68 | 69 | # params = { 70 | # "n": 1, 71 | # "max_tokens": kargs.get("max_new_tokens", 128), 72 | # "best_of": kargs.get("beam_bums", 1), 73 | # "top_k": kargs.get("top_k", 50), 74 | # "top_p": kargs.get("top_p", 0.95), 75 | # "temperature": kargs.get("temperature", 1.0), 76 | # "length_penalty": kargs.get("length_penalty", 1.0), 
77 | # "presence_penalty": kargs.get("presence_penalty", 1.0), 78 | # "stop": kargs.get("stop_words", ["<|endoftext|>"]), 79 | # } 80 | return params 81 | 82 | def load_model(self, model_path, peft_path=None, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 83 | '''加载模型''' 84 | print(f"self.model_path: {self.model_path}") 85 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) 86 | self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval().half() 87 | if peft_path: 88 | print(f"peft_path: {peft_path}") 89 | self.model = PeftModel.from_pretrained(self.model, peft_path) 90 | 91 | # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) -------------------------------------------------------------------------------- /src/models/base_model.py: -------------------------------------------------------------------------------- 1 | # from vllm import LLM, SamplingParams 2 | # from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel 3 | 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | from transformers import AutoTokenizer 7 | from peft import PeftModel, PeftConfig 8 | 9 | from .generate_configs import GenerateConfigs 10 | 11 | 12 | 13 | class ToolModel: 14 | def __init__(self, model_path: str, template: str, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 15 | self.model_path = model_path 16 | self.trust_remote_code = trust_remote_code 17 | self.tensor_parallel_size = tensor_parallel_size 18 | self.gpu_memory_utilization = gpu_memory_utilization 19 | self.load_model(self.model_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) 20 | 21 | def generate(self, prompts: str, template: str = None, generate_configs: GenerateConfigs = None) -> list: 22 | '''产出对应结果''' 23 | pass 24 | 25 | def generate_params( 26 | self, generate_configs: GenerateConfigs, 27 | ): 28 | '''generate param''' 29 | kargs = generate_configs.dict() 30 | return kargs 31 | 32 | def load_model(self, model_path, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 33 | '''加载模型''' 34 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) 35 | self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() 36 | 37 | # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) -------------------------------------------------------------------------------- /src/models/generate_configs.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | 3 | 4 | class GenerateConfigs(BaseModel): 5 | max_new_tokens: int = 128 6 | beam_bums: int = 1 7 | top_k: int = 50 8 | top_p: float = 0.95 9 | temperature: float = 1.0 10 | length_penalty: float = 1.0 11 | presence_penalty: float = 1.0 12 | stop_words: list = [] 13 | template: str = "default" -------------------------------------------------------------------------------- /src/models/internlm_model.py: -------------------------------------------------------------------------------- 1 | # from vllm import LLM, SamplingParams 2 | # from 
vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel 3 | 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | from transformers import AutoTokenizer 7 | from peft import PeftModel, PeftConfig 8 | 9 | from .generate_configs import GenerateConfigs 10 | from .base_model import ToolModel 11 | 12 | 13 | 14 | class InternlmModel(ToolModel): 15 | def __init__(self, model_path: str, peft_path: str = None, template: str = "default", trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 16 | self.model_path = model_path 17 | self.peft_path = peft_path 18 | self.template = template 19 | self.trust_remote_code = trust_remote_code 20 | self.tensor_parallel_size = tensor_parallel_size 21 | self.gpu_memory_utilization = gpu_memory_utilization 22 | self.load_model(self.model_path, self.peft_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) 23 | 24 | def generate( 25 | self, prompts: str, 26 | template: str = None, 27 | generate_configs: GenerateConfigs =None, 28 | ) -> list: 29 | '''产出对应结果''' 30 | 31 | template = self.template if template is None else template 32 | 33 | params = self.generate_params(generate_configs) 34 | 35 | if template == "default": 36 | inputs = self.tokenizer(prompts, return_tensors="pt") 37 | inputs["input_ids"] = inputs["input_ids"].cuda() 38 | inputs["attention_mask"] = inputs["attention_mask"].cuda() 39 | 40 | inputs.update(params) 41 | output = self.model.generate(**inputs) 42 | predict = self.tokenizer.decode(output[0].tolist()) 43 | predict = predict.split("\n")[-1] 44 | predict = predict.replace("<|endoftext|>", "").replace("", "") 45 | return predict 46 | elif template != "default": 47 | output, _ = self.model.chat(self.tokenizer, prompts, history=None, **params) 48 | return output 49 | # params = self.generate_params(generate_configs) 50 | # sampling_params = SamplingParams(**params) 51 | # prompts = [prompts] if isinstance(prompts, str) else prompts 52 | # outputs = self.model.generate(prompts, sampling_params) 53 | # return [i.outputs[0].text for i in outputs] 54 | 55 | def generate_params( 56 | self, generate_configs: GenerateConfigs, 57 | ): 58 | '''generate param''' 59 | kargs = generate_configs.dict() 60 | params = { 61 | "max_new_tokens": kargs.get("max_new_tokens", 128), 62 | "top_k": kargs.get("top_k", 50), 63 | "top_p": kargs.get("top_p", 0.95), 64 | "temperature": kargs.get("temperature", 1.0), 65 | } 66 | 67 | # params = { 68 | # "n": 1, 69 | # "max_tokens": kargs.get("max_new_tokens", 128), 70 | # "best_of": kargs.get("beam_bums", 1), 71 | # "top_k": kargs.get("top_k", 50), 72 | # "top_p": kargs.get("top_p", 0.95), 73 | # "temperature": kargs.get("temperature", 1.0), 74 | # "length_penalty": kargs.get("length_penalty", 1.0), 75 | # "presence_penalty": kargs.get("presence_penalty", 1.0), 76 | # "stop": kargs.get("stop_words", ["<|endoftext|>"]), 77 | # } 78 | return params 79 | 80 | def load_model(self, model_path, peft_path=None, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 81 | '''加载模型''' 82 | print(model_path, peft_path, trust_remote_code) 83 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=trust_remote_code) 84 | self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() 85 | if peft_path: 86 | self.model = PeftModel.from_pretrained(self.model, peft_path) 87 | 88 | # self.model = LLM(model=model_path, 
trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) -------------------------------------------------------------------------------- /src/models/openai_model.py: -------------------------------------------------------------------------------- 1 | # from vllm import LLM, SamplingParams 2 | # from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel 3 | 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | from transformers import AutoTokenizer 7 | from peft import PeftModel, PeftConfig 8 | 9 | from .generate_configs import GenerateConfigs 10 | from .base_model import ToolModel 11 | 12 | import openai, os 13 | 14 | 15 | 16 | class OpenaiModel(ToolModel): 17 | def __init__(self, model_path: str, template: str, system_prompt): 18 | self.model_path = model_path 19 | self.template = template 20 | self.system_prompt = system_prompt 21 | 22 | def generate( 23 | self, prompts: str, template: str = None, 24 | generate_configs: GenerateConfigs =None, 25 | ) -> list: 26 | '''产出对应结果''' 27 | template = self.template if template is None else template 28 | 29 | params = self.generate_params(generate_configs) 30 | 31 | messages = [{"role": "system", "content": self.system_prompt}, {"role": "user", "content": prompts}] 32 | try: 33 | result = openai.ChatCompletion.create(api_base=os.environ["OPENAI_API_BASE"], api_key=os.environ["OPENAI_API_KEY"], model=self.model_path, messages=messages, **params) 34 | # print("prompt_tokens: {}, completion_tokens: {}".format(result["usage"]["prompt_tokens"], result["usage"]["completion_tokens"])) 35 | return result["choices"][0]["message"]["content"] 36 | except Exception as e: 37 | result = str(e) 38 | 39 | def generate_params( 40 | self, generate_configs: GenerateConfigs, 41 | ): 42 | '''generate param''' 43 | kargs = generate_configs.dict() 44 | params = { 45 | "max_new_tokens": kargs.get("max_new_tokens", 128), 46 | "top_k": kargs.get("top_k", 50), 47 | "top_p": kargs.get("top_p", 0.95), 48 | "temperature": kargs.get("temperature", 1.0), 49 | } 50 | return params 51 | 52 | -------------------------------------------------------------------------------- /src/models/qwen_model.py: -------------------------------------------------------------------------------- 1 | # from vllm import LLM, SamplingParams 2 | # from vllm.model_executor.parallel_utils.parallel_state import destroy_model_parallel 3 | 4 | 5 | from transformers import AutoModelForCausalLM, AutoTokenizer 6 | from transformers import AutoTokenizer 7 | from peft import PeftModel, PeftConfig 8 | 9 | from .generate_configs import GenerateConfigs 10 | from .base_model import ToolModel 11 | 12 | 13 | 14 | class QwenModel(ToolModel): 15 | def __init__(self, model_path: str, peft_path: str = None, template: str = "default", trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 16 | self.model_path = model_path 17 | self.peft_path = peft_path 18 | self.template = template 19 | self.trust_remote_code = trust_remote_code 20 | self.tensor_parallel_size = tensor_parallel_size 21 | self.gpu_memory_utilization = gpu_memory_utilization 22 | self.load_model(self.model_path, self.peft_path, self.trust_remote_code, self.tensor_parallel_size, self.gpu_memory_utilization) 23 | 24 | def generate( 25 | self, prompts: str, 26 | template: str = None, 27 | generate_configs: GenerateConfigs =None, 28 | history: list = None, 29 | ) -> list: 30 | '''产出对应结果''' 31 | template = self.template if 
template is None else template 32 | 33 | params = self.generate_params(generate_configs) 34 | 35 | if template == "default": 36 | inputs = self.tokenizer(prompts, return_tensors="pt") 37 | inputs["input_ids"] = inputs["input_ids"].cuda() 38 | 39 | inputs.update(params) 40 | output = self.model.generate(**inputs) 41 | predict = self.tokenizer.decode(output[0].tolist())[len(prompts):] 42 | predict = predict.replace("<|endoftext|>", "").replace("", "") 43 | return predict 44 | elif template != "default": 45 | output, _ = self.model.chat(self.tokenizer, prompts, history=history, **params) 46 | return output 47 | # params = self.generate_params(generate_configs) 48 | # sampling_params = SamplingParams(**params) 49 | # prompts = [prompts] if isinstance(prompts, str) else prompts 50 | # outputs = self.model.generate(prompts, sampling_params) 51 | # return [i.outputs[0].text for i in outputs] 52 | 53 | def generate_params( 54 | self, generate_configs: GenerateConfigs, 55 | ): 56 | '''generate param''' 57 | kargs = generate_configs.dict() 58 | params = { 59 | "max_new_tokens": kargs.get("max_new_tokens", 128), 60 | "top_k": kargs.get("top_k", 50), 61 | "top_p": kargs.get("top_p", 0.95), 62 | "temperature": kargs.get("temperature", 1.0), 63 | } 64 | 65 | # params = { 66 | # "n": 1, 67 | # "max_tokens": kargs.get("max_new_tokens", 128), 68 | # "best_of": kargs.get("beam_bums", 1), 69 | # "top_k": kargs.get("top_k", 50), 70 | # "top_p": kargs.get("top_p", 0.95), 71 | # "temperature": kargs.get("temperature", 1.0), 72 | # "length_penalty": kargs.get("length_penalty", 1.0), 73 | # "presence_penalty": kargs.get("presence_penalty", 1.0), 74 | # "stop": kargs.get("stop_words", ["<|endoftext|>"]), 75 | # } 76 | return params 77 | 78 | def load_model(self, model_path, peft_path=None, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25): 79 | '''加载模型''' 80 | self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=trust_remote_code) 81 | self.model = AutoModelForCausalLM.from_pretrained(self.model_path, device_map="auto", trust_remote_code=trust_remote_code).eval() 82 | if peft_path: 83 | self.model = PeftModel.from_pretrained(self.model, peft_path) 84 | 85 | # self.model = LLM(model=model_path, trust_remote_code=trust_remote_code, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=gpu_memory_utilization) -------------------------------------------------------------------------------- /src/models/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import json 4 | import random 5 | import torch 6 | import transformers 7 | from transformers import AutoModelForCausalLM, CodeLlamaTokenizer, TextStreamer 8 | end_token_id = 2 9 | checkpoint = "/mnt/user/230854/output/vbase-llama-16k-hf/transformers" 10 | print(checkpoint) 11 | print("Loading model") 12 | model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto").half().eval() 13 | tokenizer = CodeLlamaTokenizer.from_pretrained(checkpoint) 14 | print("Loading finish") 15 | streamer = TextStreamer(tokenizer, skip_prompt=True) 16 | torch.manual_seed(random.randint(0, 100000)) 17 | temperature = 0.2 18 | top_p = 0.95 19 | top_k = 40 20 | repetition_penalty = 1.1 21 | output_len = 2048 22 | role_start = "[START]" 23 | role_end = "[END]" 24 | 25 | 26 | def change2chatml(fc_dict): 27 | chatrounds_list = [] 28 | if fc_dict["chatrounds"][0]["role"] == "system": 29 | role = "system" 30 | content = 
fc_dict["chatrounds"][0]["content"] 31 | chatrounds_list.append({"role":role, "content":content}) 32 | else: 33 | role = "system" 34 | content = "CodeFuse是一个面向研发领域的智能助手,旨在中立的、无害的帮助用户解决开发相关的问题,所有的回答均使用Markdown格式返回。\n你能利用许多工具和功能来完成给定的任务,在每一步中,你需要分析当前状态,并通过执行函数调用来确定下一步的行动方向。你可以进行多次尝试。如果你计划连续尝试不同的条件,请每次尝试一种条件。若给定了Finish函数,则以Finish调用结束,若没提供Finish函数,则以不带function_call的对话结束。" 35 | chatrounds_list.append({"role":role, "content":content}) 36 | 37 | if fc_dict.get("functions",[]): 38 | role = "funcionapis" 39 | content = "You are ToolGPT, you have access to the following APIs:" 40 | content += json.dumps(fc_dict["functions"], ensure_ascii=False, sort_keys=True) 41 | chatrounds_list.append({"role":role, "content":content}) 42 | 43 | for chat_dict in fc_dict["chatrounds"]: 44 | if chat_dict["role"] == "user": 45 | role = "human" 46 | content = chat_dict["content"] 47 | chatrounds_list.append({"role":role, "content":content}) 48 | elif chat_dict["role"] == "assistant": 49 | role = "bot" 50 | if "function_call" in chat_dict: 51 | function_call_dict = {} 52 | function_call_dict["content"] = chat_dict["content"] 53 | function_call_dict["name"] = chat_dict["function_call"]["name"] 54 | function_call_dict["arguments"] = chat_dict["function_call"]["arguments"] 55 | content = "#function"+json.dumps(function_call_dict, ensure_ascii=False) 56 | else: 57 | content = chat_dict["content"] 58 | chatrounds_list.append({"role":role, "content":content}) 59 | elif chat_dict["role"] == "function": 60 | role = "function" 61 | function_call_rst = {} 62 | function_call_rst["name"] = chat_dict["name"] 63 | function_call_rst["content"] = chat_dict["content"] 64 | content = json.dumps(function_call_rst, ensure_ascii=False) 65 | chatrounds_list.append({"role":role, "content":content}) 66 | return chatrounds_list 67 | 68 | 69 | def get_chatrounds_ids(chatrounds_list): 70 | input_ids = [] 71 | for chatround in chatrounds_list: 72 | input_ids += tokenizer.encode(role_start + chatround["role"]+ role_end) + tokenizer.encode(chatround["content"], add_special_tokens=False) + [tokenizer.eos_token_id] 73 | input_ids += tokenizer.encode(role_start + "bot" + role_end) 74 | return input_ids 75 | 76 | class GetAssistantAns(): 77 | # 按照自己推理需求自己修改代码 78 | 79 | def __init__(self): 80 | pass 81 | 82 | def gen_answer(self, chat_dict): 83 | chatrounds_list = change2chatml(chat_dict) 84 | input_ids = get_chatrounds_ids(chatrounds_list) 85 | output_ids = model.generate(torch.tensor([input_ids]).to(model.device), max_new_tokens=output_len, num_beams=1, num_return_sequences=1, do_sample=True, temperature=temperature, top_p=top_p, eos_token_id=end_token_id, top_k=top_k, streamer=None, repetition_penalty=repetition_penalty, pad_token_id=10000)[0] 86 | res = tokenizer.decode(output_ids[len(input_ids):-1]) 87 | save_dict = {"role": "assistant"} 88 | if res.startswith("#function"): 89 | try: 90 | res_dict = json.loads(re.sub("^#function", "", res)) 91 | save_dict["content"] = res_dict["content"] 92 | save_dict["function_call"] = {} 93 | save_dict["function_call"]["name"] = res_dict["name"] 94 | save_dict["function_call"]["arguments"] = res_dict["arguments"] 95 | except Exception as e: 96 | print(e) 97 | save_dict = {"role": "assistant"} 98 | save_dict["content"] = res 99 | else: 100 | save_dict["content"] = res 101 | 102 | print(save_dict) 103 | 104 | return save_dict 105 | -------------------------------------------------------------------------------- /src/opensource_functioncall_evalution.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | # -*- coding: utf-8 -*- 3 | 4 | ############################################ 5 | # File: opensource_functioncall_evalution.py 6 | # create by youmi 7 | # Time: 2023-11-23 13:10 8 | ############################################ 9 | 10 | 11 | import os 12 | import sys 13 | import random 14 | import time 15 | import shutil 16 | import json 17 | import jieba 18 | import re 19 | import copy 20 | import numpy as np 21 | from tqdm import tqdm 22 | from collections import Counter 23 | from concurrent.futures import ThreadPoolExecutor, as_completed 24 | from getAssistantAns import GetAssistantAns 25 | 26 | 27 | test_ans_file_list = [ 28 | "fcdata_zh_test.jsonl" 29 | ] 30 | 31 | # 多进程评测加速 32 | GPU_NUM = 1 33 | 34 | # function call 回复测试总数 35 | function_call_sum = 0 36 | # function call 回复正确数 37 | function_call_correct = 0 38 | # function call 回复失败数 39 | function_call_fail = 0 40 | # function call 回复失败中,本应该调用工具但是模型没有调用, 无工具识别识别错误数 41 | function_call_fail_functioncall = 0 42 | # function call 回复失败数中,因为函数名不对导致的失败数, 这部分包括模型幻觉出错 43 | function_call_fail_name = 0 44 | # function call 回复失败数中,工具名对了,但是参数不对导致的失败数 45 | function_call_fail_param = 0 46 | # function call 回复失败中 函数名幻觉的失败数 47 | function_call_fail_name_illusion = 0 48 | 49 | # assistant ans 回复相关度列表 50 | assistant_ans_relevancy_list = [] 51 | 52 | # 推理结果 53 | test_result_lines = [] 54 | 55 | get_assistant_ans = GetAssistantAns(gpu_num=GPU_NUM) 56 | 57 | def remove_punctuation(text): 58 | pattern = r'[^\w\s]' 59 | return re.sub(pattern, '', text) 60 | 61 | 62 | def cmp_arguments(args_str1, args_str2): 63 | rtn_flag = False 64 | try: 65 | args_dict1 = json.loads(args_str1) 66 | args_dict2 = json.loads(args_str2) 67 | # 比较两个字典是否一致 68 | if args_dict1 == args_dict2: 69 | rtn_flag = True 70 | except Exception as e: 71 | print("json.loads error: ", e) 72 | return rtn_flag 73 | return rtn_flag 74 | 75 | 76 | # 计算两个答案的相关度 77 | # 要是预测回复的是functioncall类型的,相似为0 78 | # 要是预测回复的包含了所有要点,相似度为1 79 | # 相似度保存在assistant_ans_relevancy_list中 80 | def calc_relevancy(ass_predict, ass_truth, chatrounds): 81 | global assistant_ans_relevancy_list 82 | if "function_call" in ass_predict: 83 | assistant_ans_relevancy_list.append(0) 84 | return 85 | # 将user 和 function 的部分组合 86 | content_msg = "" 87 | for chatround in chatrounds["chatrounds"]: 88 | if chatround["role"] == "user": 89 | content_msg += chatround["content"] 90 | elif chatround["role"] == "function": 91 | content_msg += chatround["content"] 92 | content_msg_counter = Counter(jieba.cut(remove_punctuation(content_msg))) 93 | ass_truth_counter = Counter(jieba.cut(remove_punctuation(ass_truth["content"]))) 94 | ass_predict_counter = Counter(jieba.cut(remove_punctuation(ass_predict["content"]))) 95 | relative_counter = content_msg_counter & ass_truth_counter 96 | len_relative = sum(relative_counter.values()) 97 | predict_relative = ass_predict_counter & relative_counter 98 | 99 | if len_relative == 0: 100 | # 要是标准答案和问题相关词都无 直接给1 101 | assistant_ans_relevancy_list.append(1) 102 | else: 103 | # 交集与相关词的占比 104 | assistant_ans_relevancy_list.append(sum(predict_relative.values())/len_relative) 105 | 106 | 107 | 108 | 109 | def calc_llm_index(ass_predict, ass_truth, chatrounds): 110 | global function_call_sum, function_call_correct, function_call_fail, function_call_fail_functioncall, function_call_fail_name, function_call_fail_name_illusion, function_call_fail_param 111 | 112 | chatrounds_functionname_list = [] 113 | for 
function_dict in chatrounds.get("functions", []): 114 | chatrounds_functionname_list.append(function_dict["name"]) 115 | 116 | if "function_call" in ass_truth: 117 | function_call_sum += 1 118 | if "function_call" not in ass_predict: 119 | function_call_fail += 1 120 | function_call_fail_functioncall += 1 121 | elif ass_predict["function_call"]["name"] not in chatrounds_functionname_list: 122 | # 模型幻觉 123 | function_call_fail += 1 124 | function_call_fail_name += 1 125 | function_call_fail_name_illusion += 1 126 | else: 127 | function_call_name_label = False 128 | function_call_args_label = False 129 | if ass_predict["function_call"]["name"] == ass_truth["function_call"]["name"]: 130 | function_call_name_label = True 131 | if cmp_arguments(ass_predict["function_call"]["arguments"], ass_truth["function_call"]["arguments"]): 132 | function_call_args_label = True 133 | else: 134 | function_call_fail_param += 1 135 | else: 136 | function_call_fail_name += 1 137 | 138 | if function_call_name_label and function_call_args_label: 139 | function_call_correct += 1 140 | else: 141 | function_call_fail += 1 142 | else: 143 | calc_relevancy(ass_predict, ass_truth, chatrounds) 144 | 145 | 146 | def print_result(): 147 | # 打印指标结果 148 | print("=============统计数据=========================") 149 | print(f"function_call_sum: {function_call_sum}") 150 | print(f"function_call_correct: {function_call_correct}") 151 | print(f"function_call_fail: {function_call_fail}") 152 | print(f"function_call_fail_functioncall: {function_call_fail_functioncall}") 153 | print(f"function_call_fail_name: {function_call_fail_name}") 154 | print(f"function_call_fail_param: {function_call_fail_param}") 155 | print(f"function_call_fail_name_illusion: {function_call_fail_name_illusion}") 156 | print(f"assistant_ans_sum: {len(assistant_ans_relevancy_list)}") 157 | print(f"assistant_ans_relevancy: {np.mean(assistant_ans_relevancy_list)}") 158 | print("=============实验结果=========================") 159 | function_call_correct_rate = function_call_correct/function_call_sum 160 | function_call_fail_rate = function_call_fail/function_call_sum 161 | function_call_fail_functioncall_rate = function_call_fail_functioncall/function_call_fail if function_call_fail else 0 162 | function_call_fail_name_rate = function_call_fail_name/function_call_fail if function_call_fail else 0 163 | function_call_fail_param_rate = function_call_fail_param/function_call_fail if function_call_fail else 0 164 | function_call_fail_name_illusion_rate = function_call_fail_name_illusion/function_call_fail if function_call_fail else 0 165 | print(f"工具识别正确率fccr: {function_call_correct_rate}") 166 | print(f"工具识别失败率fcfr: {function_call_fail_rate}") 167 | print(f"工具调用识别失败占比fcffr: {function_call_fail_functioncall_rate}") 168 | print(f"工具名识别失败占比fcfnr: {function_call_fail_name_rate}") 169 | print(f"工具参数识别失败占比fcfpr: {function_call_fail_param_rate}") 170 | print(f"工具幻觉识别失败占比fcfnir: {function_call_fail_name_illusion_rate}") 171 | print(f"助手回复答案相关度aar: {np.mean(assistant_ans_relevancy_list)}") 172 | print("==============================================") 173 | # 保存数据 174 | with open("test_result_data.jsonl","w") as fw: 175 | for line in test_result_lines: 176 | print(line, file=fw) 177 | 178 | 179 | def test_process(test_lines, gpu_index): 180 | global test_result_lines 181 | for line in tqdm(test_lines, desc="Process%02d"%(gpu_index)): 182 | chat_dict = json.loads(line) 183 | test_dict = {} 184 | test_dict["functions"] = chat_dict["functions"] 185 | test_dict["chatrounds"] = [] 186 
| for chatround in chat_dict["chatrounds"]: 187 | if chatround["role"] == "assistant": 188 | ass_predict = get_assistant_ans.gen_answer(test_dict, gpu_index=gpu_index) 189 | save_dict = copy.deepcopy(test_dict) 190 | save_dict["chatrounds"].append(ass_predict) 191 | test_result_lines.append(json.dumps(save_dict, ensure_ascii=False)) 192 | calc_llm_index(ass_predict, chatround, test_dict) 193 | test_dict["chatrounds"].append(chatround) 194 | 195 | 196 | def main(): 197 | pool = ThreadPoolExecutor(max_workers=GPU_NUM) 198 | 199 | test_lines = [] 200 | for test_ans_file in test_ans_file_list: 201 | print(test_ans_file) 202 | with open(test_ans_file, "r") as f: 203 | lines = f.readlines() 204 | test_lines += lines 205 | 206 | batch_num = len(test_lines)//GPU_NUM + int(len(test_lines)%GPU_NUM>0) 207 | 208 | obj_list = [] 209 | for idx in range(GPU_NUM): 210 | batch_test_lines = test_lines[idx*batch_num:(idx+1)*batch_num] 211 | obj = pool.submit(test_process, batch_test_lines, gpu_index=idx) 212 | obj_list.append(obj) 213 | 214 | for future in as_completed(obj_list): 215 | # 暂时留在这里,但是其实没有返回数据 216 | data = future.result() 217 | 218 | print_result() 219 | 220 | if __name__ == "__main__": 221 | main() 222 | -------------------------------------------------------------------------------- /src/prompts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefuse-ai/codefuse-devops-eval/f0f12d4380cc5bb684bc583d8e4d0a86e4e18c37/src/prompts/__init__.py -------------------------------------------------------------------------------- /src/prompts/base_prompts_config.py: -------------------------------------------------------------------------------- 1 | 2 | TOOL_FILL_BASE_PROMPT = '''你现在是一位参数填充助手,帮助我从历史问题问答中抽取出指定API入参结构所需要的参数信息 3 | HISTORY_QUESTION: {query} 4 | API_SCHEMA: {api_schema} 5 | 返回json结构的API调用参数: 6 | ''' 7 | 8 | 9 | TOOL_PARSER_BASE_PROMPT = '''你现在是一位API调用解析,帮助我生成可解析API_RESPONSE来回答用户问题的代码 10 | HISTORY_QUESTION: {query} 11 | API_SCHEMA: {api_schema} 12 | API_RESPONSE: {response} 13 | 返回解析response的代码: 14 | ''' 15 | 16 | TOOL_SUMMARY_BASE_PROMPT = '''你现在是一位API调用总结助手,帮助我从API的RESPONSE中获取到特定的信息,来回答用户问题 17 | HISTORY_QUESTION: {query} 18 | API_SCHEMA: {api_schema} 19 | API_RESPONSE: {response} 20 | 返回回答结果: 21 | ''' 22 | -------------------------------------------------------------------------------- /src/qwen_eval_main.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | 3 | from src.datasets import ToolFillDataset, ToolParserDataset, ToolSummaryDataset, FuncCallDataset 4 | from src.evals import ToolFillEvalution, ToolParserEvalution, ToolSummaryEvalution, FuncCallEvalution 5 | from src.models import QwenModel, ToolModel, InternlmModel 6 | from src.models.generate_configs import GenerateConfigs 7 | from src.prompts.base_prompts_config import TOOL_FILL_BASE_PROMPT, TOOL_PARSER_BASE_PROMPT, TOOL_SUMMARY_BASE_PROMPT 8 | from src.utils.jsonl_utils import save_to_jsonl_file 9 | 10 | import warnings 11 | import re 12 | 13 | # 定义要过滤的警告消息内容 14 | filtered_content = "for open-end generation" 15 | # 过滤包含特定内容的警告消息 16 | warnings.filterwarnings("ignore", message=re.escape(filtered_content)) 17 | 18 | 19 | model_infos = [ 20 | {"model_name": "", "template": "chatml", "model_path": "", 21 | "peft_path": "", "model_class": QwenModel}] 22 | 23 | datainfos = [ 24 | {"dataset_path": "~/fcdata_luban_zh_test.jsonl", "dataset_name": "fcdata_luban_zh", "tool_task": "func_call"}, 25 | 
{"dataset_path": "~/test_datas/fcdata_zh_test_v1.jsonl", "dataset_name": "fcdata_zh", "tool_task": "func_call"},
26 | ]
27 | 
28 | save_path = ""
29 | 
30 | 
31 | for model_info in model_infos:
32 |     print(f"******** model_name: {model_info['model_name']} *****")
33 |     model_path = model_info["model_path"]
34 |     peft_path = model_info["peft_path"]
35 |     template = model_info["template"]
36 | 
37 |     tool_model = model_info["model_class"](model_path, peft_path, template, trust_remote_code=True, tensor_parallel_size=1, gpu_memory_utilization=0.25)
38 | 
39 |     for datainfo in datainfos:
40 | 
41 |         print(f"******** dataset_name: {datainfo['dataset_name']} *****")
42 | 
43 |         dataset_name = datainfo["dataset_name"]
44 |         tool_task = datainfo["tool_task"]
45 |         dataset_path = datainfo["dataset_path"]
46 |         funccall_dataset = FuncCallDataset(dataset_name, tool_task, dataset_path)
47 | 
48 |         generate_configs = GenerateConfigs(max_new_tokens=256, temperature=0.2, stop_words=["<|endoftext|>"])
49 | 
50 |         funccall_evalution = FuncCallEvalution(
51 |             model=tool_model,
52 |             dataset=funccall_dataset,
53 |             base_prompt=TOOL_FILL_BASE_PROMPT,
54 |             template=model_info["template"],
55 |             generate_configs=generate_configs,
56 |         )
57 |         metric = funccall_evalution.calc()
58 | 
59 |         # save predict results to local
60 |         save_to_jsonl_file(funccall_evalution.predicts, f"{save_path}/{model_info['model_name']}/{datainfo['dataset_name']}/result.jsonl")
--------------------------------------------------------------------------------
/src/run_eval.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 | 
3 | from src.hparams.parser import get_all_args
4 | from src.data.data_load import load_all_dataset
5 | from src.model_and_tokenizer_loader.model_and_tokenizer_loader_family import load_model_and_tokenizer
6 | from src.context_builder.context_builder_family import get_context_builder
7 | from src.evaluate.evaluate import evaluate
8 | from src.metric.metric_score import get_acc_score
9 | from src.utils.set_seed import setup_seed
10 | 
11 | 
12 | def run_eval(args=None):
13 |     # Get eval args
14 |     eval_args = get_all_args(args)
15 |     logger.info('eval_args={}'.format(eval_args))
16 | 
17 |     # Setup seed
18 |     setup_seed(eval_args.seed)
19 | 
20 |     # Get all dataset
21 |     eval_datasets = load_all_dataset(eval_args)
22 |     logger.info('Load all dataset success, total question number={}'.format(sum(len(v) for v in eval_datasets.values())))
23 | 
24 |     # Load model and tokenizer
25 |     model, tokenizer = load_model_and_tokenizer(eval_args)
26 |     logger.info('Load model and tokenizer success')
27 |     logger.info('tokenizer={}'.format(tokenizer))
28 | 
29 |     # load context_builder
30 |     context_builder = get_context_builder(eval_args)
31 |     logger.info('context_builder={}'.format(context_builder))
32 | 
33 |     # run model
34 |     all_pred = evaluate(model, tokenizer, context_builder, eval_datasets)
35 | 
36 |     # get metric
37 |     score_dict = get_acc_score(all_pred)
38 |     logger.info('model_path={} k_shot={} Evaluation result={}'.format(eval_args.model_path, eval_args.k_shot, score_dict))
39 | 
40 |     # save metric
41 | 
42 | 
43 | if __name__ == '__main__':
44 |     run_eval()
45 | 
46 | 
--------------------------------------------------------------------------------
/src/utils/json_utils.py:
--------------------------------------------------------------------------------
1 | import json, re, os
2 | 
3 | 
4 | def flatten_json(nested_json, parent_key='', sep='_'):
5 |     """ Flatten a nested JSON object """
6 |     items = []
7 |     for key, 
8 |         new_key = f"{parent_key}{sep}{key}" if parent_key else key
9 |         if isinstance(value, dict):
10 |             items.extend(flatten_json(value, new_key, sep=sep).items())
11 |         elif isinstance(value, list):
12 |             value_c = sorted(value)  # sort list items so their original order does not affect the flattened keys
13 |             for i, v in enumerate(value_c):
14 |                 new_item = flatten_json(v, f"{new_key}{sep}{i}", sep=sep)
15 |                 items.extend(new_item.items())
16 |         else:
17 |             items.append((new_key, value))
18 |     return dict(items)
19 | 
20 | 
21 | def read_json_file(filename):
22 |     with open(filename, "r", encoding="utf-8") as f:
23 |         return json.load(f)
24 | 
25 | 
26 | def save_to_json_file(data, filename, encoding="utf-8"):
27 |     dir_name = os.path.dirname(filename)
28 |     if not os.path.exists(dir_name): os.makedirs(dir_name)
29 | 
30 |     with open(filename, "w", encoding=encoding) as f:
31 |         json.dump(data, f, indent=2, ensure_ascii=False)
--------------------------------------------------------------------------------
/src/utils/jsonl_utils.py:
--------------------------------------------------------------------------------
1 | import re, json, os, copy, traceback
2 | 
3 | 
4 | def read_jsonl_file(filename):
5 |     data = []
6 |     with open(filename, "r", encoding="utf-8") as f:
7 |         for line in f:
8 |             data.append(json.loads(line))
9 |     return data
10 | 
11 | 
12 | def save_to_jsonl_file(data, filename):
13 |     dir_name = os.path.dirname(filename)
14 |     if not os.path.exists(dir_name): os.makedirs(dir_name)
15 | 
16 |     with open(filename, "w", encoding="utf-8") as f:
17 |         for item in data:
18 |             f.write(json.dumps(item, ensure_ascii=False) + "\n")
19 | 
20 | 
--------------------------------------------------------------------------------
/src/utils/set_seed.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import random
3 | import numpy as np
4 | 
5 | 
6 | def setup_seed(seed):
7 |     torch.manual_seed(seed)
8 |     torch.cuda.manual_seed_all(seed)
9 |     np.random.seed(seed)
10 |     random.seed(seed)
11 |     torch.backends.cudnn.deterministic = True
--------------------------------------------------------------------------------
/tests/context_builder_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('/mnt/llm/DevOpsEval')
3 | 
4 | from loguru import logger
5 | 
6 | from src.context_builder.context_builder_family import QwenContextBuilder
7 | from src.model_and_tokenizer_loader.model_and_tokenizer_loader_family import QwenModelAndTokenizerLoader
8 | 
9 | 
10 | if __name__ == '__main__':
11 |     query = '你好'  # "Hello"
12 |     system = '请帮助我'  # "Please help me"
13 |     tokenizer_path = '/mnt/llm/devopspal/model/Qwen-7B'
14 | 
15 |     model_path = '/mnt/llm/devopspal/model/Qwen-7B'
16 |     qwen_model_loader = QwenModelAndTokenizerLoader()
17 |     tokenizer = qwen_model_loader.load_tokenizer(model_path)
18 | 
19 |     qcb = QwenContextBuilder()
20 |     a, b = qcb.make_context(tokenizer, query, system)
21 |     logger.debug(a)
22 |     logger.debug(b)
23 | 
--------------------------------------------------------------------------------
/tests/data_preprocess_test.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('/mnt/llm/DevOpsEval')
3 | from src.data.data_preprocess import preprocess_zero_shot
4 | 
5 | import pandas as pd
6 | from loguru import logger
7 | 
8 | 
9 | if __name__ == '__main__':
10 |     df = pd.read_csv('/mnt/llm/DevOpsEval/data/devopseval/dev/integration.csv')
11 |     d = preprocess_zero_shot(df)
12 |     logger.info(d[0]['query'])
13 | 
--------------------------------------------------------------------------------
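Usage note: the snippet below is an illustrative sketch, not a file from the repository, showing how the helpers in src/utils/json_utils.py and src/utils/jsonl_utils.py above might be exercised; the nested dictionary, record contents, and output path are made-up examples, and the script assumes it is run from the repository root so that src is importable (the tests achieve the same with sys.path.append).

    # Minimal sketch: flatten a nested dict and dump prediction records as JSONL.
    from src.utils.json_utils import flatten_json
    from src.utils.jsonl_utils import save_to_jsonl_file

    # flatten_json joins nested keys with the separator (default "_").
    nested = {"tool": {"name": "ping", "args": {"host": "127.0.0.1"}}}
    flat = flatten_json(nested)
    print(flat)  # {'tool_name': 'ping', 'tool_args_host': '127.0.0.1'}

    # save_to_jsonl_file writes one JSON object per line and creates the
    # target directory if it does not yet exist.
    records = [{"query": "q1", "predict": "p1"}, {"query": "q2", "predict": "p2"}]
    save_to_jsonl_file(records, "predict_datas/example/result.jsonl")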