├── .gitignore ├── LICENSE.txt ├── README.md ├── README_JA.md ├── README_ZH.md ├── benchmark ├── README.md ├── benchmark_eval.py ├── infer_baichuan.py ├── infer_qwen.py └── requirements.txt ├── blob ├── example-zh.gif ├── example.gif ├── logo.png └── overview.png ├── examples └── custom_tool_example.py ├── kwaiagents ├── agent_start.py ├── agents │ ├── __init__.py │ ├── agent_profile.py │ ├── kagent.py │ └── prompts.py ├── config.py ├── llms │ ├── __init__.py │ └── clients.py ├── tools │ ├── __init__.py │ ├── base.py │ ├── browser.py │ ├── calendars.py │ ├── commons.py │ ├── search.py │ ├── solarterms.py │ ├── timedelta.py │ └── weather.py └── utils │ ├── chain_logger.py │ ├── date_utils.py │ ├── function_utils.py │ ├── html_utils.py │ ├── json_fix_general.py │ ├── nlp_utils.py │ └── selenium_utils.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 105 | __pypackages__/ 106 | 107 | # Celery stuff 108 | celerybeat-schedule 109 | celerybeat.pid 110 | 111 | # SageMath parsed files 112 | *.sage.py 113 | 114 | # Environments 115 | .env 116 | .venv 117 | env/ 118 | venv/ 119 | ENV/ 120 | env.bak/ 121 | venv.bak/ 122 | 123 | # Spyder project settings 124 | .spyderproject 125 | .spyproject 126 | 127 | # Rope project settings 128 | .ropeproject 129 | 130 | # mkdocs documentation 131 | /site 132 | 133 | # mypy 134 | .mypy_cache/ 135 | .dmypy.json 136 | dmypy.json 137 | 138 | # Pyre type checker 139 | .pyre/ 140 | 141 | # pytype static type analyzer 142 | .pytype/ 143 | 144 | # Cython debug symbols 145 | cython_debug/ 146 | 147 | # data 148 | data/ 149 | 150 | # test 151 | test.py 152 | 153 | # querys 154 | querys.txt 155 | 156 | # scripts 157 | scripts/stat.py 158 | scripts/stat1.py 159 | 160 | # PyCharm 161 | # JetBrains specific template is maintainted in a separate JetBrains.gitignore that can 162 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 163 | # and can be added to the global gitignore or merged into this file. For a more nuclear 164 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 165 | #.idea/ -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Attribution-NonCommercial-ShareAlike 4.0 International 2 | 3 | ======================================================================= 4 | 5 | Creative Commons Corporation ("Creative Commons") is not a law firm and 6 | does not provide legal services or legal advice. Distribution of 7 | Creative Commons public licenses does not create a lawyer-client or 8 | other relationship. Creative Commons makes its licenses and related 9 | information available on an "as-is" basis. 
Creative Commons gives no 10 | warranties regarding its licenses, any material licensed under their 11 | terms and conditions, or any related information. Creative Commons 12 | disclaims all liability for damages resulting from their use to the 13 | fullest extent possible. 14 | 15 | Using Creative Commons Public Licenses 16 | 17 | Creative Commons public licenses provide a standard set of terms and 18 | conditions that creators and other rights holders may use to share 19 | original works of authorship and other material subject to copyright 20 | and certain other rights specified in the public license below. The 21 | following considerations are for informational purposes only, are not 22 | exhaustive, and do not form part of our licenses. 23 | 24 | Considerations for licensors: Our public licenses are 25 | intended for use by those authorized to give the public 26 | permission to use material in ways otherwise restricted by 27 | copyright and certain other rights. Our licenses are 28 | irrevocable. Licensors should read and understand the terms 29 | and conditions of the license they choose before applying it. 30 | Licensors should also secure all rights necessary before 31 | applying our licenses so that the public can reuse the 32 | material as expected. Licensors should clearly mark any 33 | material not subject to the license. This includes other CC- 34 | licensed material, or material used under an exception or 35 | limitation to copyright. More considerations for licensors: 36 | wiki.creativecommons.org/Considerations_for_licensors 37 | 38 | Considerations for the public: By using one of our public 39 | licenses, a licensor grants the public permission to use the 40 | licensed material under specified terms and conditions. If 41 | the licensor's permission is not necessary for any reason--for 42 | example, because of any applicable exception or limitation to 43 | copyright--then that use is not regulated by the license. 
Our 44 | licenses grant only permissions under copyright and certain 45 | other rights that a licensor has authority to grant. Use of 46 | the licensed material may still be restricted for other 47 | reasons, including because others have copyright or other 48 | rights in the material. A licensor may make special requests, 49 | such as asking that all changes be marked or described. 50 | Although not required by our licenses, you are encouraged to 51 | respect those requests where reasonable. More considerations 52 | for the public: 53 | wiki.creativecommons.org/Considerations_for_licensees 54 | 55 | ======================================================================= 56 | 57 | Creative Commons Attribution-NonCommercial-ShareAlike 4.0 International 58 | Public License 59 | 60 | By exercising the Licensed Rights (defined below), You accept and agree 61 | to be bound by the terms and conditions of this Creative Commons 62 | Attribution-NonCommercial-ShareAlike 4.0 International Public License 63 | ("Public License"). To the extent this Public License may be 64 | interpreted as a contract, You are granted the Licensed Rights in 65 | consideration of Your acceptance of these terms and conditions, and the 66 | Licensor grants You such rights in consideration of benefits the 67 | Licensor receives from making the Licensed Material available under 68 | these terms and conditions. 69 | 70 | 71 | Section 1 -- Definitions. 72 | 73 | a. Adapted Material means material subject to Copyright and Similar 74 | Rights that is derived from or based upon the Licensed Material 75 | and in which the Licensed Material is translated, altered, 76 | arranged, transformed, or otherwise modified in a manner requiring 77 | permission under the Copyright and Similar Rights held by the 78 | Licensor. 
For purposes of this Public License, where the Licensed 79 | Material is a musical work, performance, or sound recording, 80 | Adapted Material is always produced where the Licensed Material is 81 | synched in timed relation with a moving image. 82 | 83 | b. Adapter's License means the license You apply to Your Copyright 84 | and Similar Rights in Your contributions to Adapted Material in 85 | accordance with the terms and conditions of this Public License. 86 | 87 | c. BY-NC-SA Compatible License means a license listed at 88 | creativecommons.org/compatiblelicenses, approved by Creative 89 | Commons as essentially the equivalent of this Public License. 90 | 91 | d. Copyright and Similar Rights means copyright and/or similar rights 92 | closely related to copyright including, without limitation, 93 | performance, broadcast, sound recording, and Sui Generis Database 94 | Rights, without regard to how the rights are labeled or 95 | categorized. For purposes of this Public License, the rights 96 | specified in Section 2(b)(1)-(2) are not Copyright and Similar 97 | Rights. 98 | 99 | e. Effective Technological Measures means those measures that, in the 100 | absence of proper authority, may not be circumvented under laws 101 | fulfilling obligations under Article 11 of the WIPO Copyright 102 | Treaty adopted on December 20, 1996, and/or similar international 103 | agreements. 104 | 105 | f. Exceptions and Limitations means fair use, fair dealing, and/or 106 | any other exception or limitation to Copyright and Similar Rights 107 | that applies to Your use of the Licensed Material. 108 | 109 | g. License Elements means the license attributes listed in the name 110 | of a Creative Commons Public License. The License Elements of this 111 | Public License are Attribution, NonCommercial, and ShareAlike. 112 | 113 | h. Licensed Material means the artistic or literary work, database, 114 | or other material to which the Licensor applied this Public 115 | License. 116 | 117 | i. 
Licensed Rights means the rights granted to You subject to the 118 | terms and conditions of this Public License, which are limited to 119 | all Copyright and Similar Rights that apply to Your use of the 120 | Licensed Material and that the Licensor has authority to license. 121 | 122 | j. Licensor means the individual(s) or entity(ies) granting rights 123 | under this Public License. 124 | 125 | k. NonCommercial means not primarily intended for or directed towards 126 | commercial advantage or monetary compensation. For purposes of 127 | this Public License, the exchange of the Licensed Material for 128 | other material subject to Copyright and Similar Rights by digital 129 | file-sharing or similar means is NonCommercial provided there is 130 | no payment of monetary compensation in connection with the 131 | exchange. 132 | 133 | l. Share means to provide material to the public by any means or 134 | process that requires permission under the Licensed Rights, such 135 | as reproduction, public display, public performance, distribution, 136 | dissemination, communication, or importation, and to make material 137 | available to the public including in ways that members of the 138 | public may access the material from a place and at a time 139 | individually chosen by them. 140 | 141 | m. Sui Generis Database Rights means rights other than copyright 142 | resulting from Directive 96/9/EC of the European Parliament and of 143 | the Council of 11 March 1996 on the legal protection of databases, 144 | as amended and/or succeeded, as well as other essentially 145 | equivalent rights anywhere in the world. 146 | 147 | n. You means the individual or entity exercising the Licensed Rights 148 | under this Public License. Your has a corresponding meaning. 149 | 150 | 151 | Section 2 -- Scope. 152 | 153 | a. License grant. 154 | 155 | 1. 
Subject to the terms and conditions of this Public License, 156 | the Licensor hereby grants You a worldwide, royalty-free, 157 | non-sublicensable, non-exclusive, irrevocable license to 158 | exercise the Licensed Rights in the Licensed Material to: 159 | 160 | a. reproduce and Share the Licensed Material, in whole or 161 | in part, for NonCommercial purposes only; and 162 | 163 | b. produce, reproduce, and Share Adapted Material for 164 | NonCommercial purposes only. 165 | 166 | 2. Exceptions and Limitations. For the avoidance of doubt, where 167 | Exceptions and Limitations apply to Your use, this Public 168 | License does not apply, and You do not need to comply with 169 | its terms and conditions. 170 | 171 | 3. Term. The term of this Public License is specified in Section 172 | 6(a). 173 | 174 | 4. Media and formats; technical modifications allowed. The 175 | Licensor authorizes You to exercise the Licensed Rights in 176 | all media and formats whether now known or hereafter created, 177 | and to make technical modifications necessary to do so. The 178 | Licensor waives and/or agrees not to assert any right or 179 | authority to forbid You from making technical modifications 180 | necessary to exercise the Licensed Rights, including 181 | technical modifications necessary to circumvent Effective 182 | Technological Measures. For purposes of this Public License, 183 | simply making modifications authorized by this Section 2(a) 184 | (4) never produces Adapted Material. 185 | 186 | 5. Downstream recipients. 187 | 188 | a. Offer from the Licensor -- Licensed Material. Every 189 | recipient of the Licensed Material automatically 190 | receives an offer from the Licensor to exercise the 191 | Licensed Rights under the terms and conditions of this 192 | Public License. 193 | 194 | b. Additional offer from the Licensor -- Adapted Material. 
195 | Every recipient of Adapted Material from You 196 | automatically receives an offer from the Licensor to 197 | exercise the Licensed Rights in the Adapted Material 198 | under the conditions of the Adapter's License You apply. 199 | 200 | c. No downstream restrictions. You may not offer or impose 201 | any additional or different terms or conditions on, or 202 | apply any Effective Technological Measures to, the 203 | Licensed Material if doing so restricts exercise of the 204 | Licensed Rights by any recipient of the Licensed 205 | Material. 206 | 207 | 6. No endorsement. Nothing in this Public License constitutes or 208 | may be construed as permission to assert or imply that You 209 | are, or that Your use of the Licensed Material is, connected 210 | with, or sponsored, endorsed, or granted official status by, 211 | the Licensor or others designated to receive attribution as 212 | provided in Section 3(a)(1)(A)(i). 213 | 214 | b. Other rights. 215 | 216 | 1. Moral rights, such as the right of integrity, are not 217 | licensed under this Public License, nor are publicity, 218 | privacy, and/or other similar personality rights; however, to 219 | the extent possible, the Licensor waives and/or agrees not to 220 | assert any such rights held by the Licensor to the limited 221 | extent necessary to allow You to exercise the Licensed 222 | Rights, but not otherwise. 223 | 224 | 2. Patent and trademark rights are not licensed under this 225 | Public License. 226 | 227 | 3. To the extent possible, the Licensor waives any right to 228 | collect royalties from You for the exercise of the Licensed 229 | Rights, whether directly or through a collecting society 230 | under any voluntary or waivable statutory or compulsory 231 | licensing scheme. In all other cases the Licensor expressly 232 | reserves any right to collect such royalties, including when 233 | the Licensed Material is used other than for NonCommercial 234 | purposes. 
235 | 236 | 237 | Section 3 -- License Conditions. 238 | 239 | Your exercise of the Licensed Rights is expressly made subject to the 240 | following conditions. 241 | 242 | a. Attribution. 243 | 244 | 1. If You Share the Licensed Material (including in modified 245 | form), You must: 246 | 247 | a. retain the following if it is supplied by the Licensor 248 | with the Licensed Material: 249 | 250 | i. identification of the creator(s) of the Licensed 251 | Material and any others designated to receive 252 | attribution, in any reasonable manner requested by 253 | the Licensor (including by pseudonym if 254 | designated); 255 | 256 | ii. a copyright notice; 257 | 258 | iii. a notice that refers to this Public License; 259 | 260 | iv. a notice that refers to the disclaimer of 261 | warranties; 262 | 263 | v. a URI or hyperlink to the Licensed Material to the 264 | extent reasonably practicable; 265 | 266 | b. indicate if You modified the Licensed Material and 267 | retain an indication of any previous modifications; and 268 | 269 | c. indicate the Licensed Material is licensed under this 270 | Public License, and include the text of, or the URI or 271 | hyperlink to, this Public License. 272 | 273 | 2. You may satisfy the conditions in Section 3(a)(1) in any 274 | reasonable manner based on the medium, means, and context in 275 | which You Share the Licensed Material. For example, it may be 276 | reasonable to satisfy the conditions by providing a URI or 277 | hyperlink to a resource that includes the required 278 | information. 279 | 3. If requested by the Licensor, You must remove any of the 280 | information required by Section 3(a)(1)(A) to the extent 281 | reasonably practicable. 282 | 283 | b. ShareAlike. 284 | 285 | In addition to the conditions in Section 3(a), if You Share 286 | Adapted Material You produce, the following conditions also apply. 287 | 288 | 1. 
The Adapter's License You apply must be a Creative Commons 289 | license with the same License Elements, this version or 290 | later, or a BY-NC-SA Compatible License. 291 | 292 | 2. You must include the text of, or the URI or hyperlink to, the 293 | Adapter's License You apply. You may satisfy this condition 294 | in any reasonable manner based on the medium, means, and 295 | context in which You Share Adapted Material. 296 | 297 | 3. You may not offer or impose any additional or different terms 298 | or conditions on, or apply any Effective Technological 299 | Measures to, Adapted Material that restrict exercise of the 300 | rights granted under the Adapter's License You apply. 301 | 302 | 303 | Section 4 -- Sui Generis Database Rights. 304 | 305 | Where the Licensed Rights include Sui Generis Database Rights that 306 | apply to Your use of the Licensed Material: 307 | 308 | a. for the avoidance of doubt, Section 2(a)(1) grants You the right 309 | to extract, reuse, reproduce, and Share all or a substantial 310 | portion of the contents of the database for NonCommercial purposes 311 | only; 312 | 313 | b. if You include all or a substantial portion of the database 314 | contents in a database in which You have Sui Generis Database 315 | Rights, then the database in which You have Sui Generis Database 316 | Rights (but not its individual contents) is Adapted Material, 317 | including for purposes of Section 3(b); and 318 | 319 | c. You must comply with the conditions in Section 3(a) if You Share 320 | all or a substantial portion of the contents of the database. 321 | 322 | For the avoidance of doubt, this Section 4 supplements and does not 323 | replace Your obligations under this Public License where the Licensed 324 | Rights include other Copyright and Similar Rights. 325 | 326 | 327 | Section 5 -- Disclaimer of Warranties and Limitation of Liability. 328 | 329 | a. 
UNLESS OTHERWISE SEPARATELY UNDERTAKEN BY THE LICENSOR, TO THE 330 | EXTENT POSSIBLE, THE LICENSOR OFFERS THE LICENSED MATERIAL AS-IS 331 | AND AS-AVAILABLE, AND MAKES NO REPRESENTATIONS OR WARRANTIES OF 332 | ANY KIND CONCERNING THE LICENSED MATERIAL, WHETHER EXPRESS, 333 | IMPLIED, STATUTORY, OR OTHER. THIS INCLUDES, WITHOUT LIMITATION, 334 | WARRANTIES OF TITLE, MERCHANTABILITY, FITNESS FOR A PARTICULAR 335 | PURPOSE, NON-INFRINGEMENT, ABSENCE OF LATENT OR OTHER DEFECTS, 336 | ACCURACY, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 337 | KNOWN OR DISCOVERABLE. WHERE DISCLAIMERS OF WARRANTIES ARE NOT 338 | ALLOWED IN FULL OR IN PART, THIS DISCLAIMER MAY NOT APPLY TO YOU. 339 | 340 | b. TO THE EXTENT POSSIBLE, IN NO EVENT WILL THE LICENSOR BE LIABLE 341 | TO YOU ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, 342 | NEGLIGENCE) OR OTHERWISE FOR ANY DIRECT, SPECIAL, INDIRECT, 343 | INCIDENTAL, CONSEQUENTIAL, PUNITIVE, EXEMPLARY, OR OTHER LOSSES, 344 | COSTS, EXPENSES, OR DAMAGES ARISING OUT OF THIS PUBLIC LICENSE OR 345 | USE OF THE LICENSED MATERIAL, EVEN IF THE LICENSOR HAS BEEN 346 | ADVISED OF THE POSSIBILITY OF SUCH LOSSES, COSTS, EXPENSES, OR 347 | DAMAGES. WHERE A LIMITATION OF LIABILITY IS NOT ALLOWED IN FULL OR 348 | IN PART, THIS LIMITATION MAY NOT APPLY TO YOU. 349 | 350 | c. The disclaimer of warranties and limitation of liability provided 351 | above shall be interpreted in a manner that, to the extent 352 | possible, most closely approximates an absolute disclaimer and 353 | waiver of all liability. 354 | 355 | 356 | Section 6 -- Term and Termination. 357 | 358 | a. This Public License applies for the term of the Copyright and 359 | Similar Rights licensed here. However, if You fail to comply with 360 | this Public License, then Your rights under this Public License 361 | terminate automatically. 362 | 363 | b. Where Your right to use the Licensed Material has terminated under 364 | Section 6(a), it reinstates: 365 | 366 | 1. 
automatically as of the date the violation is cured, provided 367 | it is cured within 30 days of Your discovery of the 368 | violation; or 369 | 370 | 2. upon express reinstatement by the Licensor. 371 | 372 | For the avoidance of doubt, this Section 6(b) does not affect any 373 | right the Licensor may have to seek remedies for Your violations 374 | of this Public License. 375 | 376 | c. For the avoidance of doubt, the Licensor may also offer the 377 | Licensed Material under separate terms or conditions or stop 378 | distributing the Licensed Material at any time; however, doing so 379 | will not terminate this Public License. 380 | 381 | d. Sections 1, 5, 6, 7, and 8 survive termination of this Public 382 | License. 383 | 384 | 385 | Section 7 -- Other Terms and Conditions. 386 | 387 | a. The Licensor shall not be bound by any additional or different 388 | terms or conditions communicated by You unless expressly agreed. 389 | 390 | b. Any arrangements, understandings, or agreements regarding the 391 | Licensed Material not stated herein are separate from and 392 | independent of the terms and conditions of this Public License. 393 | 394 | 395 | Section 8 -- Interpretation. 396 | 397 | a. For the avoidance of doubt, this Public License does not, and 398 | shall not be interpreted to, reduce, limit, restrict, or impose 399 | conditions on any use of the Licensed Material that could lawfully 400 | be made without permission under this Public License. 401 | 402 | b. To the extent possible, if any provision of this Public License is 403 | deemed unenforceable, it shall be automatically reformed to the 404 | minimum extent necessary to make it enforceable. If the provision 405 | cannot be reformed, it shall be severed from this Public License 406 | without affecting the enforceability of the remaining terms and 407 | conditions. 408 | 409 | c. 
No term or condition of this Public License will be waived and no 410 | failure to comply consented to unless expressly agreed to by the 411 | Licensor. 412 | 413 | d. Nothing in this Public License constitutes or may be interpreted 414 | as a limitation upon, or waiver of, any privileges and immunities 415 | that apply to the Licensor or You, including from the legal 416 | processes of any jurisdiction or authority. 417 | 418 | ======================================================================= 419 | 420 | Creative Commons is not a party to its public 421 | licenses. Notwithstanding, Creative Commons may elect to apply one of 422 | its public licenses to material it publishes and in those instances 423 | will be considered the “Licensor.” The text of the Creative Commons 424 | public licenses is dedicated to the public domain under the CC0 Public 425 | Domain Dedication. Except for the limited purpose of indicating that 426 | material is shared under a Creative Commons public license or as 427 | otherwise permitted by the Creative Commons policies published at 428 | creativecommons.org/policies, Creative Commons does not authorize the 429 | use of the trademark "Creative Commons" or any other trademark or logo 430 | of Creative Commons without its prior written consent including, 431 | without limitation, in connection with any unauthorized modifications 432 | to any of its public licenses or any other arrangements, 433 | understandings, or agreements concerning use of licensed material. For 434 | the avoidance of doubt, this paragraph does not form part of the 435 | public licenses. 436 | 437 | Creative Commons may be contacted at creativecommons.org. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
4 |
7 |
8 |
9 |
10 |
11 |
12 | 📚 Dataset | 📚 Benchmark | 🤗 Models | 📑 Paper
13 |
14 |
15 | KwaiAgents is a series of Agent-related works open-sourced by the [KwaiKEG](https://github.com/KwaiKEG) from [Kuaishou Technology](https://www.kuaishou.com/en). The open-sourced content includes:
16 |
17 | 1. **KAgentSys-Lite**: a lite version of the KAgentSys in the paper. While retaining some of the original system's functionality, KAgentSys-Lite has certain differences and limitations when compared to its full-featured counterpart, such as: (1) a more limited set of tools; (2) a lack of memory mechanisms; (3) slightly reduced performance capabilities; and (4) a different codebase, as it evolves from open-source projects like BabyAGI and Auto-GPT. Despite these modifications, KAgentSys-Lite still delivers performance comparable to the numerous open-source Agent systems available.
18 | 2. **KAgentLMs**: a series of large language models with agent capabilities such as planning, reflection, and tool-use, acquired through the Meta-agent tuning proposed in the paper.
19 | 3. **KAgentInstruct**: over 200k Agent-related instruction-tuning data entries (partially human-edited) proposed in the paper.
20 | 4. **KAgentBench**: over 3,000 human-edited, automated evaluation data entries for testing Agent capabilities, with evaluation dimensions including planning, tool-use, reflection, concluding, and profiling.
21 |
22 |
23 |
Type | 26 |Models | 27 |Training Data | 28 |Benchmark Data | 29 |
Qwen | 32 |
33 | Qwen-7B-MAT 34 | Qwen-14B-MAT 35 | Qwen-7B-MAT-cpp 36 | Qwen1.5-14B-MAT 37 | |
38 | KAgentInstruct | 39 |KAgentBench | 40 |
Baichuan | 43 |Baichuan2-13B-MAT | 44 |
52 |
53 |
54 |
55 |
56 |
57 |
58 |
59 |
60 | 61 | ## News 62 | * 2024.4.19 - Qwen1.5-14B-MAT model [[link]](https://huggingface.co/kwaikeg/kagentlms_qwen1.5_14b_mat) released. 63 | * 2024.4.9 - Benchmark results have been refreshed. 64 | * 2024.1.29 - Qwen-14B-MAT model [[link]](https://huggingface.co/kwaikeg/kagentlms_qwen_14b_mat) released. 65 | * 2024.1.5 - Training data [[link]](https://huggingface.co/datasets/kwaikeg/KAgentInstruct) released. 66 | * 2023.12.27 - 🔥🔥🔥 KwaiAgents have been reported on many sites. [[机器之心]](https://mp.weixin.qq.com/s/QhZIFL1GHH90z98gnk194g) [[Medium]](https://medium.com/@myscarletpan/can-7b-models-now-master-ai-agents-a-look-at-kwais-recent-llm-open-source-release-8b9e84647412) [[InfoQ]](https://www.infoq.cn/article/xHGJwG3b8hXSdaP4m6r0), etc. 67 | * 2023.12.13 - The benchmark and evaluation code [[link]](https://huggingface.co/datasets/kwaikeg/KAgentBench) released. 68 | * 2023.12.08 - Technical report [[link]](https://arxiv.org/abs/2312.04889) released. 69 | * 2023.11.17 - Initial release. 70 | 71 | ## Evaluation 72 | 1. 
Benchmark Results 73 | 74 | | | Scale | Planning | Tool-use | Reflection | Concluding | Profile | Overall Score | 75 | |----------------|-------|----------|----------|------------|------------|---------|---------------| 76 | | GPT-3.5-turbo | - | 18.55 | 26.26 | 8.06 | 37.26 | 35.42 | 25.63 | 77 | | Llama2 | 13B | 0.15 | 0.44 | 0.14 | 16.60 | 17.73 | 5.30 | 78 | | ChatGLM3 | 6B | 7.87 | 11.84 | 7.52 | 30.01 | 30.14 | 15.88 | 79 | | Qwen | 7B | 13.34 | 18.00 | 7.91 | 36.24 | 34.99 | 21.17 | 80 | | Baichuan2 | 13B | 6.70 | 16.10 | 6.76 | 24.97 | 19.08 | 14.89 | 81 | | ToolLlama | 7B | 0.20 | 4.83 | 1.06 | 15.62 | 10.66 | 6.04 | 82 | | AgentLM | 13B | 0.17 | 0.15 | 0.05 | 16.30 | 15.22 | 4.88 | 83 | | Qwen-MAT | 7B | 31.64 | 43.30 | 33.34 | 44.85 | 44.78 | 39.85 | 84 | | Baichuan2-MAT | 13B | 37.27 | 52.97 | 37.00 | 48.01 | 41.83 | 45.34 | 85 | | Qwen-MAT | 14B | 43.17 | 63.78 | 32.14 | 45.47 | 45.22 | 49.94 | 86 | | Qwen1.5-MAT | 14B | 42.42 | 64.62 | 30.58 | 46.51 | 45.95 | 50.18 | 87 | 88 | 89 | 2. Human evaluation. Each result cell shows the pass rate (\%) and the average score (in parentheses) 90 | 91 | | | Scale | NoAgent | ReACT | Auto-GPT | KAgentSys | 92 | |-----------------|---------|-----------------|----------------|-----------------|-----------------| 93 | | GPT-4 | - | 57.21% (3.42) | 68.66% (3.88) | 79.60% (4.27) | 83.58% (4.47) | 94 | | GPT-3.5-turbo | - | 47.26% (3.08) | 54.23% (3.33) | 61.74% (3.53) | 64.18% (3.69) | 95 | | Qwen | 7B | 52.74% (3.23) | 51.74% (3.20) | 50.25% (3.11) | 54.23% (3.27) | 96 | | Baichuan2 | 13B | 54.23% (3.31) | 55.72% (3.36) | 57.21% (3.37) | 58.71% (3.54) | 97 | | Qwen-MAT | 7B | - | 58.71% (3.53) | 65.67% (3.77) | 67.66% (3.87) | 98 | | Baichuan2-MAT | 13B | - | 61.19% (3.60) | 66.67% (3.86) | 74.13% (4.11) | 99 | 100 | 101 | ## User Guide 102 | 103 | ### Prebuild environment 104 | Install miniconda for build environment first. 
Then create build env first: 105 | ```bash 106 | conda create -n kagent python=3.10 107 | conda activate kagent 108 | pip install -r requirements.txt 109 | ``` 110 | 111 | ### Using AgentLMs 112 | #### Serving by [vLLM](https://github.com/vllm-project/vllm) (GPU) 113 | We recommend using [vLLM](https://github.com/vllm-project/vllm) and [FastChat](https://github.com/lm-sys/FastChat) to deploy the model inference service. First, you need to install the corresponding packages (for detailed usage, please refer to the documentation of the two projects): 114 | 1. For Qwen-7B-MAT, install the corresponding packages with the following commands 115 | ```bash 116 | pip install vllm 117 | pip install "fschat[model_worker,webui]" 118 | ``` 119 | 2. For Baichuan-13B-MAT, install the corresponding packages with the following commands 120 | ```bash 121 | pip install "fschat[model_worker,webui]" 122 | pip install vllm==0.2.0 123 | pip install transformers==4.33.2 124 | ``` 125 | 126 | To deploy KAgentLMs, you first need to start the controller in one terminal. 127 | ```bash 128 | python -m fastchat.serve.controller 129 | ``` 130 | Secondly, you should use the following command in another terminal for single-gpu inference service deployment: 131 | ```bash 132 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code 133 | ``` 134 | Where `$model_path` is the local path of the model downloaded. If the GPU does not support Bfloat16, you can add `--dtype half` to the command line. 135 | 136 | Thirdly, start the REST API server in the third terminal. 137 | ```bash 138 | python -m fastchat.serve.openai_api_server --host localhost --port 8888 139 | ``` 140 | 141 | Finally, you can use the curl command to invoke the model same as the OpenAI calling format. 
Here's an example: 142 | ```bash 143 | curl http://localhost:8888/v1/chat/completions \ 144 | -H "Content-Type: application/json" \ 145 | -d '{"model": "kagentlms_qwen_7b_mat", "messages": [{"role": "user", "content": "Who is Andy Lau"}]}' 146 | ``` 147 | Here, change `kagentlms_qwen_7b_mat` to the model you deployed. 148 | 149 | #### Serving by [Lamma.cpp](https://github.com/ggerganov/llama.cpp) (CPU) 150 | llama-cpp-python offers a web server which aims to act as a drop-in replacement for the OpenAI API. This allows you to use llama.cpp compatible models with any OpenAI compatible client (language libraries, services, etc). The converted model can be found in [kwaikeg/kagentlms_qwen_7b_mat_gguf](https://huggingface.co/kwaikeg/kagentlms_qwen_7b_mat_gguf). 151 | 152 | To install the server package and get started: 153 | ```bash 154 | pip install "llama-cpp-python[server]" 155 | python3 -m llama_cpp.server --model kagentlms_qwen_7b_mat_gguf/ggml-model-q4_0.gguf --chat_format chatml --port 8888 156 | ``` 157 | 158 | Finally, you can use the curl command to invoke the model same as the OpenAI calling format. Here's an example: 159 | ```bash 160 | curl http://localhost:8888/v1/chat/completions \ 161 | -H "Content-Type: application/json" \ 162 | -d '{"messages": [{"role": "user", "content": "Who is Andy Lau"}]}' 163 | ``` 164 | 165 | ### Using KAgentSys-Lite 166 | Download and install the KwaiAgents, recommended Python>=3.10 167 | ```bash 168 | git clone git@github.com:KwaiKEG/KwaiAgents.git 169 | cd KwaiAgents 170 | python setup.py develop 171 | ``` 172 | 173 | 1. **ChatGPT usage** 174 | Declare some environment variables 175 | ``` 176 | export OPENAI_API_KEY=sk-xxxxx 177 | export WEATHER_API_KEY=xxxxxx 178 | ``` 179 | 180 | The WEATHER_API_KEY is not mandatory, but you need to configure it when asking weather-related questions. You can obtain the API key from [this website](https://www.weatherapi.com/) (Same for local model usage). 
181 | 182 | ```bash 183 | kagentsys --query="Who is Andy Lau's wife?" --llm_name="gpt-3.5-turbo" --lang="en" 184 | ``` 185 | 186 | 2. **Local model usage** 187 | > To use a local model, you need to deploy the corresponding model service as described in the previous chapter 188 | ```bash 189 | kagentsys --query="Who is Andy Lau's wife?" --llm_name="kagentlms_qwen_7b_mat" \ 190 | --use_local_llm --local_llm_host="localhost" --local_llm_port=8888 --lang="en" 191 | ``` 192 | 193 | 194 | Full command arguments: 195 | 196 | ``` 197 | options: 198 | -h, --help show this help message and exit 199 | --id ID ID of this conversation 200 | --query QUERY User query 201 | --history HISTORY History of conversation 202 | --llm_name LLM_NAME the name of llm 203 | --use_local_llm Whether to use local llm 204 | --local_llm_host LOCAL_LLM_HOST 205 | The host of local llm service 206 | --local_llm_port LOCAL_LLM_PORT 207 | The port of local llm service 208 | --tool_names TOOL_NAMES 209 | the name of llm 210 | --max_iter_num MAX_ITER_NUM 211 | the number of iteration of agents 212 | --agent_name AGENT_NAME 213 | The agent name 214 | --agent_bio AGENT_BIO 215 | The agent bio, a short description 216 | --agent_instructions AGENT_INSTRUCTIONS 217 | The instructions of how agent thinking, acting, or talking 218 | --external_knowledge EXTERNAL_KNOWLEDGE 219 | The link of external knowledge 220 | --lang {en,zh} The language of the overall system 221 | --max_tokens_num Maximum length of model input 222 | ``` 223 | 224 | **Note**: 225 | 1. If you need to use the `browse_website` tool, you need to configure the [chromedriver](https://chromedriver.chromium.org/getting-started) on your server. 226 | 2. If the search fails multiple times, it may be because the network cannot access duckduckgo_search. You can solve this by setting the `http_proxy`. 
227 | 228 | #### Using Custom tools 229 | Custom tools usage can be found in examples/custom_tool_example.py 230 | 231 | ### Using KAgentBench Evaluation 232 | We only need two lines to evaluate the agent capabilities like: 233 | ```bash 234 | cd benchmark 235 | python infer_qwen.py qwen_benchmark_res.jsonl 236 | python benchmark_eval.py ./benchmark_eval.jsonl ./qwen_benchmark_res.jsonl 237 | ``` 238 | The above command will give the results like 239 | ``` 240 | plan : 31.64, tooluse : 43.30, reflextion : 33.34, conclusion : 44.85, profile : 44.78, overall : 39.85 241 | ``` 242 | 243 | Please refer to benchmark/ for more details. 244 | 245 | ## Citation 246 | ``` 247 | @article{pan2023kwaiagents, 248 | author = {Haojie Pan and 249 | Zepeng Zhai and 250 | Hao Yuan and 251 | Yaojia Lv and 252 | Ruiji Fu and 253 | Ming Liu and 254 | Zhongyuan Wang and 255 | Bing Qin 256 | }, 257 | title = {KwaiAgents: Generalized Information-seeking Agent System with Large Language Models}, 258 | journal = {CoRR}, 259 | volume = {abs/2312.04889}, 260 | year = {2023} 261 | } 262 | ``` 263 | -------------------------------------------------------------------------------- /README_JA.md: -------------------------------------------------------------------------------- 1 |
4 |
7 |
8 |
9 |
10 |
11 |
12 | 📚 データセット | 📚 ベンチマーク | 🤗 モデル | 📑 論文
13 |
14 |
15 | KwaiAgents は、[Kuaishou Technology](https://www.kuaishou.com/en) の [KwaiKEG](https://github.com/KwaiKEG) によってオープンソース化されたエージェント関連の作品シリーズです。オープンソース化されたコンテンツには以下が含まれます:
16 |
17 | 1. **KAgentSys-Lite**: KAgentSys-Lite は、KAgentSys のライトバージョンである。KAgentSys-Lite は、元のシステムの機能の一部を保持していますが、フル機能の対応物と比較すると、以下のような特定の相違点と制限があります: (1)より限定されたツールセット、(2)メモリメカニズムの欠如、(3)若干低下したパフォーマンス能力、(4)BabyAGI や Auto-GPT のようなオープンソースプロジェクトから進化した異なるコードベース。これらの変更にもかかわらず、KAgentSys-Lite は、利用可能な数多くのオープンソースエージェントシステムの中で同等のパフォーマンスを提供します。
18 | 2. **KAgentLMs**: この論文で提案されているメタエージェント・チューニングによって獲得された、プランニング、リフレクション、ツール使用などのエージェント能力を持つ一連の大規模言語モデル。
19 | 3. **KAgentInstruct**: 論文で提案された200k以上のエージェント関連命令の微調整データ(部分的に人間が編集したもの)。
20 | 4. **KAgentBench**: Agent の能力をテストするための、3,000を超える人間による自動化された評価データ。評価項目には、計画、ツールの使用、考察、結論、プロファイリングが含まれる。
21 |
22 |
タイプ | 25 |モデル | 26 |訓練データ | 27 |ベンチマークデータ | 28 |
Qwen | 31 |
32 | Qwen-7B-MAT 33 | Qwen-14B-MAT 34 | Qwen-7B-MAT-cpp 35 | Qwen1.5-14B-MAT 36 | |
37 | KAgentInstruct | 38 |KAgentBench | 39 |
Baichuan | 42 |Baichuan2-13B-MAT | 43 |
50 |
51 |
52 |
53 |
54 |
55 |
56 |
57 |
58 | 59 | ## ニュース 60 | * 2024年4月19日 - Qwen1.5-14B-MATモデル [[リンク]](https://huggingface.co/kwaikeg/kagentlms_qwen1.5_14b_mat) がリリースされました。 61 | * 2024年4月9日 - ベンチマーク結果が更新されました。 62 | * 2024.1.29 - Qwen-14B-MAT [[リンク]](https://huggingface.co/kwaikeg/kagentlms_qwen_14b_mat) をリリース. 63 | * 2023.1.5 - トレーニングデータ[[リンク]](https://huggingface.co/datasets/kwaikeg/KAgentInstruct)をリリース。 64 | * 2023.12.27 - 🔥🔥🔥 KwaiAgents は多くのサイトで報告されている。[[机器之心]](https://mp.weixin.qq.com/s/QhZIFL1GHH90z98gnk194g) [[Medium]](https://medium.com/@myscarletpan/can-7b-models-now-master-ai-agents-a-look-at-kwais-recent-llm-open-source-release-8b9e84647412) [[InfoQ]](https://www.infoq.cn/article/xHGJwG3b8hXSdaP4m6r0) など。 65 | * 2023.12.13 - 公開されたベンチマークと評価コード[[リンク]](https://huggingface.co/datasets/kwaikeg/KAgentBench)をリリース 66 | * 2023.12.08 - テクニカルレポート[[リンク]](https://arxiv.org/abs/2312.04889)をリリース 67 | * 2023.11.17 - 初回リリース 68 | 69 | ## 評価 70 | 1. ベンチマーク結果 71 | 72 | | | Scale | Planning | Tool-use | Reflection | Concluding | Profile | Overall Score | 73 | |----------------|-------|----------|----------|------------|------------|---------|---------------| 74 | | GPT-3.5-turbo | - | 18.55 | 26.26 | 8.06 | 37.26 | 35.42 | 25.63 | 75 | | Llama2 | 13B | 0.15 | 0.44 | 0.14 | 16.60 | 17.73 | 5.30 | 76 | | ChatGLM3 | 6B | 7.87 | 11.84 | 7.52 | 30.01 | 30.14 | 15.88 | 77 | | Qwen | 7B | 13.34 | 18.00 | 7.91 | 36.24 | 34.99 | 21.17 | 78 | | Baichuan2 | 13B | 6.70 | 16.10 | 6.76 | 24.97 | 19.08 | 14.89 | 79 | | ToolLlama | 7B | 0.20 | 4.83 | 1.06 | 15.62 | 10.66 | 6.04 | 80 | | AgentLM | 13B | 0.17 | 0.15 | 0.05 | 16.30 | 15.22 | 4.88 | 81 | | Qwen-MAT | 7B | 31.64 | 43.30 | 33.34 | 44.85 | 44.78 | 39.85 | 82 | | Baichuan2-MAT | 13B | 37.27 | 52.97 | 37.00 | 48.01 | 41.83 | 45.34 | 83 | | Qwen-MAT | 14B | 43.17 | 63.78 | 32.14 | 45.47 | 45.22 | 49.94 | 84 | | Qwen1.5-MAT | 14B | 42.42 | 64.62 | 30.58 | 46.51 | 45.95 | 50.18 | 85 | 86 | 87 | 2. 
人間による評価。各結果セルには、合格率(%)と平均点(括弧内)を示す。 88 | 89 | | | Scale | NoAgent | ReACT | Auto-GPT | KAgentSys | 90 | |-----------------|---------|-----------------|----------------|-----------------|-----------------| 91 | | GPT-4 | - | 57.21% (3.42) | 68.66% (3.88) | 79.60% (4.27) | 83.58% (4.47) | 92 | | GPT-3.5-turbo | - | 47.26% (3.08) | 54.23% (3.33) | 61.74% (3.53) | 64.18% (3.69) | 93 | | Qwen | 7B | 52.74% (3.23) | 51.74% (3.20) | 50.25% (3.11) | 54.23% (3.27) | 94 | | Baichuan2 | 13B | 54.23% (3.31) | 55.72% (3.36) | 57.21% (3.37) | 58.71% (3.54) | 95 | | Qwen-MAT | 7B | - | 58.71% (3.53) | 65.67% (3.77) | 67.66% (3.87) | 96 | | Baichuan2-MAT | 13B | - | 61.19% (3.60) | 66.67% (3.86) | 74.13% (4.11) | 97 | 98 | 99 | ## ユーザーガイド 100 | 101 | ### プリビルド環境 102 | まずビルド環境として miniconda をインストールします。次にビルド環境を作成します: 103 | ```bash 104 | conda create -n kagent python=3.10 105 | conda activate kagent 106 | pip install -r requirements.txt 107 | ``` 108 | 109 | ### AgentLMs の使用 110 | #### [vLLM](https://github.com/vllm-project/vllm) によるサービング(GPU) 111 | モデル推論サービスの導入には [vLLM](https://github.com/vllm-project/vllm) と [FastChat](https://github.com/lm-sys/FastChat) の利用を推奨します。まず、対応するパッケージをインストールする必要があります(詳細な使用方法については、2つのプロジェクトのドキュメントを参照してください): 112 | 1. Qwen-7B-MAT の場合は、以下のコマンドで対応するパッケージをインストールしてください 113 | ```bash 114 | pip install vllm 115 | pip install "fschat[model_worker,webui]" 116 | ``` 117 | 2. 
Baichuan-13B-MAT については、以下のコマンドで対応するパッケージをインストールしてください 118 | ```bash 119 | pip install "fschat[model_worker,webui]" 120 | pip install vllm==0.2.0 121 | pip install transformers==4.33.2 122 | ``` 123 | 124 | KAgentLM をデプロイするには、まず1つのターミナルでコントローラを起動する必要があります。 125 | ```bash 126 | python -m fastchat.serve.controller 127 | ``` 128 | 次に、シングル GPU 推論サービスを展開するには、別の端末で次のコマンドを使用します: 129 | ```bash 130 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code 131 | ``` 132 | ここで `$model_path` はダウンロードしたモデルのローカルパスである。GPU が Bfloat16 をサポートしていない場合は、コマンドラインに`--dtype half` を追加することができます。 133 | 134 | 第3に、3番目の端末で REST API サーバーを起動する。 135 | ```bash 136 | python -m fastchat.serve.openai_api_server --host localhost --port 8888 137 | ``` 138 | 139 | 最後に、OpenAI の呼び出し形式と同じように、curl コマンドを使ってモデルを呼び出すことができます。以下に例を示します: 140 | ```bash 141 | curl http://localhost:8888/v1/chat/completions \ 142 | -H "Content-Type: application/json" \ 143 | -d '{"model": "kagentlms_qwen_7b_mat", "messages": [{"role": "user", "content": "Who is Andy Lau"}]}' 144 | ``` 145 | ここで、`kagentlms_qwen_7b_mat` をデプロイしたモデルに変更する。 146 | 147 | #### [Lamma.cpp](https://github.com/ggerganov/llama.cpp)によるサービング (CPU) 148 | llama-cpp-python は、OpenAI API のドロップイン置き換えとして機能することを目的としたウェブサーバーを提供します。これにより、llama.cpp 互換のモデルを OpenAI 互換のクライアント(言語ライブラリやサービスなど)で使うことができます。変換されたモデルは [kwaikeg/kagentlms_qwen_7b_mat_gguf](https://huggingface.co/kwaikeg/kagentlms_qwen_7b_mat_gguf) にあります。 149 | 150 | サーバーパッケージをインストールして開始するには: 151 | ```bash 152 | pip install "llama-cpp-python[server]" 153 | python3 -m llama_cpp.server --model kagentlms_qwen_7b_mat_gguf/ggml-model-q4_0.gguf --chat_format chatml --port 8888 154 | ``` 155 | 156 | 最後に、OpenAI の呼び出し形式と同じように、curl コマンドを使ってモデルを呼び出すことができます。以下に例を示します: 157 | ```bash 158 | curl http://localhost:8888/v1/chat/completions \ 159 | -H "Content-Type: application/json" \ 160 | -d '{"messages": [{"role": "user", "content": "Who is Andy Lau"}]}' 161 | ``` 162 | 163 | ### KAgentSys-Lite の使用 164 | KwaiAgents 
をダウンロードしてインストールします。Python 3.10 以上を推奨します。 165 | ```bash 166 | git clone git@github.com:KwaiKEG/KwaiAgents.git 167 | cd KwaiAgents 168 | python setup.py develop 169 | ``` 170 | 171 | 1. **ChatGPT の使用** 172 | 環境変数を宣言する 173 | ``` 174 | export OPENAI_API_KEY=sk-xxxxx 175 | export WEATHER_API_KEY=xxxxxx 176 | ``` 177 | 178 | WEATHER_API_KEY は必須ではないが、気象関連の質問をする際には設定する必要がある。API キーは[このサイト](https://www.weatherapi.com/)から取得できます(ローカルモデル利用時も同じ)。 179 | 180 | ```bash 181 | kagentsys --query="Who is Andy Lau's wife?" --llm_name="gpt-3.5-turbo" --lang="en" 182 | ``` 183 | 184 | 2. **ローカルモデルの使用** 185 | > ローカルモデルを使うためには、前の章で説明したように、対応するモデルサービスをデプロイする必要があります 186 | ```bash 187 | kagentsys --query="Who is Andy Lau's wife?" --llm_name="kagentlms_qwen_7b_mat" \ 188 | --use_local_llm --local_llm_host="localhost" --local_llm_port=8888 --lang="en" 189 | ``` 190 | 191 | 192 | 全コマンド引数: 193 | 194 | ``` 195 | options: 196 | -h, --help このヘルプメッセージを表示して終了する 197 | --id ID この会話の ID 198 | --query QUERY ユーザーのクエリ 199 | --history HISTORY 会話の履歴 200 | --llm_name LLM_NAME llm の名前 201 | --use_local_llm ローカル llm を使うかどうか 202 | --local_llm_host LOCAL_LLM_HOST 203 | ローカル llm サービスのホスト 204 | --local_llm_port LOCAL_LLM_PORT 205 | ローカル llm サービスのポート 206 | --tool_names TOOL_NAMES 207 | llm の名前 208 | --max_iter_num MAX_ITER_NUM 209 | エージェントの繰り返し数 210 | --agent_name AGENT_NAME 211 | エージェント名 212 | --agent_bio AGENT_BIO 213 | エージェントの経歴、簡単な説明 214 | --agent_instructions AGENT_INSTRUCTIONS 215 | エージェントの思考、行動、会話方法の指示 216 | --external_knowledge EXTERNAL_KNOWLEDGE 217 | 外部のナレッジのリンク 218 | --lang {en,zh} システム全体の言語 219 | --max_tokens_num モデル入力の最大長 220 | ``` 221 | 222 | **注**: 223 | 1. `browse_website` ツールを使用する必要がある場合は、サーバーで [chromedriver](https://chromedriver.chromium.org/getting-started) を設定する必要があります。 224 | 2. 
検索に何度も失敗する場合は、ネットワークが duckduckgo_search にアクセスできないためかもしれません。`http_proxy` を設定することで解決できます。 225 | 226 | #### カスタムツールの使用 227 | カスタムツールの使用方法はexamples/custom_tool_example.pyで見つけることができます" 228 | 229 | ### KAgentBench 評価の使用 230 | エージェントの能力を評価するために必要なのは、以下の2行だけです: 231 | ```bash 232 | cd benchmark 233 | python infer_qwen.py qwen_benchmark_res.jsonl 234 | python benchmark_eval.py ./benchmark_eval.jsonl ./qwen_benchmark_res.jsonl 235 | ``` 236 | 上記のコマンドを実行すると、次のような結果が得られます 237 | ``` 238 | plan : 31.64, tooluse : 43.30, reflextion : 33.34, conclusion : 44.85, profile : 44.78, overall : 39.85 239 | ``` 240 | 241 | 詳しくはベンチマークをご覧ください。 242 | 243 | ## 引用 244 | ``` 245 | @article{pan2023kwaiagents, 246 | author = {Haojie Pan and 247 | Zepeng Zhai and 248 | Hao Yuan and 249 | Yaojia Lv and 250 | Ruiji Fu and 251 | Ming Liu and 252 | Zhongyuan Wang and 253 | Bing Qin 254 | }, 255 | title = {KwaiAgents: Generalized Information-seeking Agent System with Large Language Models}, 256 | journal = {CoRR}, 257 | volume = {abs/2312.04889}, 258 | year = {2023} 259 | } 260 | ``` 261 | -------------------------------------------------------------------------------- /README_ZH.md: -------------------------------------------------------------------------------- 1 |
4 |
7 |
8 |
9 |
10 |
11 |
12 | 📚 Dataset | 📚 Benchmark | 🤗 Models | 📑 Paper
13 |
14 |
15 |
16 | KwaiAgents 是[快手快知团队](https://github.com/KwaiKEG)开源的一整套Agent系列工作。开源的内容包括:
17 | 1. **KAgentSys-Lite**:论文中KAgentSys的轻量版系统,其保留了部分原系统的功能。与功能齐全的系统相比,KAgentSys-Lite(1)缺少部分工具;(2)缺乏记忆机制;(3)性能稍有降低;(4)不同的代码库,Lite版本基于开源项目如BabyAGI和Auto-GPT。尽管有这些变更,KAgentSys-Lite在众多开源Agent系统中仍具有较好的性能。
18 | 2. **KAgentLMs**:经过论文中提出的Meta-agent tuning过后,具有Agents的规划、反思、工具使用等能力的系列大模型。
19 | 3. **KAgentInstruct**:超过20w(部分人工编辑)的Agent相关的指令微调数据。
20 | 4. **KAgentBench**:超过3k条经人工编辑的自动化评测Agent能力数据,能力评测维度包含规划、工具使用、反思、总结、人设指令等。
21 |
22 |
23 |
类别 | 26 |模型 | 27 |训练数据 | 28 |评测数据 | 29 |
Qwen | 32 |
33 | Qwen-7B-MAT 34 | Qwen-14B-MAT 35 | Qwen-7B-MAT-cpp 36 | Qwen1.5-14B-MAT 37 | |
38 | KAgentInstruct | 39 |KAgentBench | 40 |
Baichuan | 43 |Baichuan2-13B-MAT | 44 |
51 |
52 |
57 |
58 |
59 |
60 | ## 动态
61 | * 2024.4.19 - Qwen1.5-14B-MAT模型[[link]](https://huggingface.co/kwaikeg/kagentlms_qwen1.5_14b_mat) 公开.
62 | * 2024.4.9 - Benchmark结果更新.
63 | * 2024.1.29 - Qwen-14B-MAT模型 [[link]](https://huggingface.co/kwaikeg/kagentlms_qwen_14b_mat) 公开.
64 | * 2023.1.5 - 训练数据 [[link]](https://huggingface.co/datasets/kwaikeg/KAgentInstruct) 公开.
65 | * 2023.12.27 - 🔥🔥🔥 KwaiAgents 被国内外多个媒体报道[[机器之心]](https://mp.weixin.qq.com/s/QhZIFL1GHH90z98gnk194g) [[Medium]](https://medium.com/@myscarletpan/can-7b-models-now-master-ai-agents-a-look-at-kwais-recent-llm-open-source-release-8b9e84647412) [[InfoQ]](https://www.infoq.cn/article/xHGJwG3b8hXSdaP4m6r0)等.
66 | * 2023.12.13 - Benchmark和评测脚本 [[link]](https://huggingface.co/datasets/kwaikeg/KAgentBench) 公开.
67 | * 2023.12.08 - 技术报告 [[link]](https://arxiv.org/abs/2312.04889) 公开.
68 | * 2023.11.17 - 项目公开.
69 |
70 | ## 评测表现
71 | 1. KAgentLMs在Benchmark上的表现
72 |
73 | | | Scale | Planning | Tool-use | Reflection | Concluding | Profile | Overall Score |
74 | |----------------|-------|----------|----------|------------|------------|---------|---------------|
75 | | GPT-3.5-turbo | - | 18.55 | 26.26 | 8.06 | 37.26 | 35.42 | 25.63 |
76 | | Llama2 | 13B | 0.15 | 0.44 | 0.14 | 16.60 | 17.73 | 5.30 |
77 | | ChatGLM3 | 6B | 7.87 | 11.84 | 7.52 | 30.01 | 30.14 | 15.88 |
78 | | Qwen | 7B | 13.34 | 18.00 | 7.91 | 36.24 | 34.99 | 21.17 |
79 | | Baichuan2 | 13B | 6.70 | 16.10 | 6.76 | 24.97 | 19.08 | 14.89 |
80 | | ToolLlama | 7B | 0.20 | 4.83 | 1.06 | 15.62 | 10.66 | 6.04 |
81 | | AgentLM | 13B | 0.17 | 0.15 | 0.05 | 16.30 | 15.22 | 4.88 |
82 | | Qwen-MAT | 7B | 31.64 | 43.30 | 33.34 | 44.85 | 44.78 | 39.85 |
83 | | Baichuan2-MAT | 13B | 37.27 | 52.97 | 37.00 | 48.01 | 41.83 | 45.34 |
84 | | Qwen-MAT | 14B | 43.17 | 63.78 | 32.14 | 45.47 | 45.22 | 49.94 |
85 | | Qwen1.5-MAT | 14B | 42.42 | 64.62 | 30.58 | 46.51 | 45.95 | 50.18 |
86 |
87 |
88 | 2. KAgentSys在人工评测上的表现 (百分号前表示通过率,括号内表示平均分)
89 |
90 | | | Scale | NoAgent | ReACT | Auto-GPT | KAgentSys |
91 | |-----------------|---------|-----------------|----------------|-----------------|-----------------|
92 | | GPT-4 | - | 57.21% (3.42) | 68.66% (3.88) | 79.60% (4.27) | 83.58% (4.47) |
93 | | GPT-3.5-turbo | - | 47.26% (3.08) | 54.23% (3.33) | 61.74% (3.53) | 64.18% (3.69) |
94 | | Qwen | 7B | 52.74% (3.23) | 51.74% (3.20) | 50.25% (3.11) | 54.23% (3.27) |
95 | | Baichuan2 | 13B | 54.23% (3.31) | 55.72% (3.36) | 57.21% (3.37) | 58.71% (3.54) |
96 | | Qwen-MAT | 7B | - | 58.71% (3.53) | 65.67% (3.77) | 67.66% (3.87) |
97 | | Baichuan2-MAT | 13B | - | 61.19% (3.60) | 66.67% (3.86) | 74.13% (4.11) |
98 |
99 | ## 使用指南
100 |
101 | ### AgentLMs 系列模型使用
102 | #### 在GPU上用vLLM部署
103 | 我们建议用[vLLM](https://github.com/vllm-project/vllm)和[FastChat](https://github.com/lm-sys/FastChat)来部署模型推理服务,首先需要安装对应的包(详细使用请参考两个项目对应文档):
104 | 1. 对于 Qwen-7B-MAT,按如下方法安装
105 | ```bash
106 | pip install vllm
107 | pip install "fschat[model_worker,webui]"
108 | ```
109 | 2. 对于 Baichuan-13B-MAT,按如下方法安装
110 | ```bash
111 | pip install "fschat[model_worker,webui]"
112 | pip install vllm==0.2.0
113 | pip install transformers==4.33.2
114 | ```
115 |
116 | 为了能够部署KAgentLMs系列模型,首先需要在一个终端开启controler
117 | ```bash
118 | python -m fastchat.serve.controller
119 | ```
120 | 然后,再在另一个终端开启单卡模型推理服务部署
121 | ```bash
122 | python -m fastchat.serve.vllm_worker --model-path $model_path --trust-remote-code
123 | ```
124 | 其中`$model_path`为从huggingface中下载的模型本地路径,如果显示GPU不支持Bfloat16,可以再命令行后再加个`--dtype half`。
125 | 然后,在第三个终端上开启REST API服务器
126 | ```bash
127 | python -m fastchat.serve.openai_api_server --host localhost --port 8888
128 | ```
129 |
130 | 最后你就可以用curl命令对应OpenAI调用格式进行模型调用啦,参考示例:
131 | ```bash
132 | curl http://localhost:8888/v1/chat/completions \
133 | -H "Content-Type: application/json" \
134 | -d '{"model": "kagentlms_qwen_7b_mat", "messages": [{"role": "user", "content": "刘德华是谁"}]}'
135 | ```
136 | 这里 `kagentlms_qwen_7b_mat` 要改成你部署的模型。
137 |
138 | #### 在CPU上用[Llama.cpp](https://github.com/ggerganov/llama.cpp)部署
139 | llama-cpp-python 提供了类似OpenAI的API Web接口,我们可以按如下方法安装和部署。转换后的模型可以在[kwaikeg/kagentlms_qwen_7b_mat_gguf](https://huggingface.co/kwaikeg/kagentlms_qwen_7b_mat_gguf)上找到。
140 | ```bash
141 | pip install "llama-cpp-python[server]"
142 | python3 -m llama_cpp.server --model kagentlms_qwen_7b_mat_gguf/ggml-model-q4_0.gguf --chat_format chatml --port 8888
143 | ```
144 |
145 | 最后你就可以用curl命令对应OpenAI调用格式进行模型调用啦,参考示例:
146 | ```bash
147 | curl http://localhost:8888/v1/chat/completions \
148 | -H "Content-Type: application/json" \
149 | -d '{"messages": [{"role": "user", "content": "刘德华是谁"}]}'
150 | ```
151 |
152 | ### KAgentSys-Lite 快速使用
153 | 下载并安装环境包,建议Python>=3.10
154 | ```bash
155 | git clone git@github.com:KwaiKEG/KwaiAgents.git
156 | cd KwaiAgents
157 | python setup.py develop
158 | ```
159 |
160 | 1. **ChatGPT调用**
161 | 声明一些环境变量
162 | ```
163 | export OPENAI_API_KEY=sk-xxxxx
164 | export WEATHER_API_KEY=xxxxxx
165 | ```
166 |
167 | 其中WEATHER_API_KEY不是必须,但问到天气相关的问题时需要进行配置,APIKEY可以从[这个网站](https://www.weatherapi.com/)中获取(本地模型调用同)。
168 |
169 | ```bash
170 | kagentsys --query="刘德华老婆是谁?" --llm_name="gpt-3.5-turbo" --lang="zh"
171 | ```
172 |
173 | 2. **本地模型调用**
174 | > 调用本地模型需要参考上一章部署对应模型服务
175 | ```bash
176 | kagentsys --query="刘德华老婆是谁?" --llm_name="kagentlms_qwen_7b_mat" \
177 | --use_local_llm --local_llm_host="localhost" --local_llm_port=8888 --lang="zh"
178 | ```
179 |
180 | 下面是完整的命令行参数
181 |
182 | | 参数名 | 类型 | 默认值 | 描述 |
183 | | ----- | ---- | ------ | --- |
184 | | --id | str | test | 对话的ID |
185 | | --query | str | | 用户查询问题 |
186 | | --history | str | [] | 对话历史 |
187 | | --llm_name | str | gpt-3.5-turbo | LLM的名称 |
188 | | --use_local_llm | str | False | 是否使用本地LLM |
189 | | --local_llm_host | str | localhost | 本地LLM部署的IP |
190 | | --local_llm_port | int | 8888 | 本地LLM部署的端口 |
191 | | --tool_names | str | ["auto"] | 使用工具的列表,可选有 web_search,browse_website,get_weather_info,get_calendar_info,time_delta,get_solar_terms_info |
192 | | --max_iter_num | int | 1 | agent迭代次数 |
193 | | --agent_name | str | | agent名称 |
194 | | --agent_bio | str | | agent简介,简短的描述 |
195 | | --agent_instructions | str | | agent的指导原则,描述agent如何思考、行动、或交流 |
196 | | --external_knowledge | str | | 外部知识链接 |
197 | | --lang | str | en | 系统的语言,可选(英语/中文) |
198 | | --max_tokens_num | int | 4096 | Prompt截断最大长度 |
199 |
200 | **提示**:
201 | 1. 如果需要用到 browse_website 工具,需要在服务器上配置[chromedriver](https://chromedriver.chromium.org/getting-started)
202 | 2. 如果多次显示搜索失败,可能是网络无法访问duckduckgo_search,可以通过设置`http_proxy`解决
203 |
204 | #### 使用自定义工具
205 | 自定义工具使用可参考这个例子 examples/custom_tool_example.py
206 |
207 | ### KAgentBench效果评估
208 | 仅需两行代码就可以:
209 | ```bash
210 | cd benchmark
211 | python infer_qwen.py qwen_benchmark_res.jsonl
212 | python benchmark_eval.py ./benchmark_eval.jsonl ./qwen_benchmark_res.jsonl
213 | ```
214 | 上面的命令会输出以下结果
215 | ```
216 | plan : 31.64, tooluse : 43.30, reflextion : 33.34, conclusion : 44.85, profile : 44.78, overall : 39.85
217 | ```
218 | 更多细节请参考 benchmark/
219 |
220 |
221 | ## Citation
222 | ```
223 | @article{pan2023kwaiagents,
224 | author = {Haojie Pan and
225 | Zepeng Zhai and
226 | Hao Yuan and
227 | Yaojia Lv and
228 | Ruiji Fu and
229 | Ming Liu and
230 | Zhongyuan Wang and
231 | Bing Qin
232 | },
233 | title = {KwaiAgents: Generalized Information-seeking Agent System with Large Language Models},
234 | journal = {CoRR},
235 | volume = {abs/2312.04889},
236 | year = {2023}
237 | }
238 | ```
239 |
--------------------------------------------------------------------------------
/benchmark/README.md:
--------------------------------------------------------------------------------
1 |
2 | KAgentBench is the benchmark proposed in KwaiAgents ([Github](https://github.com/KwaiKEG/KwaiAgents)), which is a series of Agent-related works open-sourced by the [KwaiKEG](https://github.com/KwaiKEG) from [Kuaishou Technology](https://www.kuaishou.com/en). It contains over 3,000 human-edited, automated evaluation data for testing Agent capabilities, with evaluation dimensions including planning, tool-use, reflection, concluding, and profiling.
3 |
4 |
5 | ## Overall statistics of KAgentBench
6 | ---
7 |
8 | | type| #Queries | #Inst | Avg. #Ground | Avg. #Tools | Avg. #Turns | Avg. #Tasks | Avg. Len-Know | Metric |
9 | | :-------: | :-------:| :-------: | :-------: | :-------: | :-------: | :-------: | :-------: | :-------: |
10 | | Planning & Tool-use | 320 | 1,317 | 4.12 | 8.68 | 1.51 | 2.21 | 245.31 | ROUGE-L, EM |
11 | | Reflection | 68 | 272 | 4 | 12 | 1 | 3.97 | 1369.04 | ROUGE-L, EM |
12 | | Concluding | 245 | 1,225 | 5 | - | 1.52 | 2.14 | 923.96 | ROUGE-L |
13 | | Profile | 433 | 433 | 5 | - | 1.99 | - | - | ROUGE-L |
14 |
15 |
16 |
17 |
18 | ## Experimental results of different LLMs on KAgentBench
19 | ---
20 | The specific performance of different models on benchmarks can be seen in more detail in our [paper](https://arxiv.org/abs/2312.04889).
21 |
22 | | | Scale | Planning | Tool-use | Reflection | Concluding | Profile | Overall Score |
23 | |----------------|-------|----------|----------|------------|------------|---------|---------------|
24 | | GPT-3.5-turbo | - | 18.55 | 26.26 | 8.06 | 37.26 | 35.42 | 25.63 |
25 | | Llama2 | 13B | 0.15 | 0.44 | 0.14 | 16.60 | 17.73 | 5.30 |
26 | | ChatGLM3 | 6B | 7.87 | 11.84 | 7.52 | 30.01 | 30.14 | 15.88 |
27 | | Qwen | 7B | 13.34 | 18.00 | 7.91 | 36.24 | 34.99 | 21.17 |
28 | | Baichuan2 | 13B | 6.70 | 16.10 | 6.76 | 24.97 | 19.08 | 14.89 |
29 | | ToolLlama | 7B | 0.20 | 4.83 | 1.06 | 15.62 | 10.66 | 6.04 |
30 | | AgentLM | 13B | 0.17 | 0.15 | 0.05 | 16.30 | 15.22 | 4.88 |
31 | | Qwen-MAT | 7B | 31.64 | 43.30 | 33.34 | 44.85 | 44.78 | 39.85 |
32 | | Baichuan2-MAT | 13B | 37.27 | 52.97 | 37.00 | 48.01 | 41.83 | 45.34 |
33 |
34 |
35 |
36 | ## JSON Format
37 | ---
38 |
39 | Each data point is
40 | a dict with the following keys:
41 | - `id`: a unique id for this data point. This is useful for evaluation.
42 | - `query`: a string.
43 | - `type`: a string, the type of this data (plantooluse, reflextion, conclusion, profile).
44 | - `golden_result_list`: a list. The reference response.
45 | - `funcs`: a list of functions that may be used in the current query
46 | - `prompt_input`: a dict, input composed of different prompt templates
47 | - `memory`: a string
48 | - `memory_type`: a string, the type of memory: task, knowledge, or conversation
49 | - `memory_last_task`: a list, in the case where memory is task, the last task information in the previous round
50 |
51 | The overall data format is as follows
52 | ```json
53 | {
54 | "id": "",
55 | "query": "",
56 | "type": "",
57 | "golden_result_list": [],
58 | "funcs": [],
59 | "prompt_input": {},
60 | "memory": "",
61 | "memory_type": "",
62 | "memory_last_task": {}
63 | }
64 | ```
65 |
66 | ## How to download benchmark
67 | ---
68 |
69 | You can download the benchmark evaluation set through [kwaikeg/KAgentBench](https://huggingface.co/datasets/kwaikeg/KAgentBench/tree/main), or you can also download the benchmark evaluation set on [KwaiAgents](https://github.com/KwaiKEG/KwaiAgents).
70 | The filename of the evaluation set is 'benchmark_eval.jsonl'. Download the file to your local system.
71 |
72 | ## Environment Setup
73 | ---
74 |
75 | Please make sure you have setup the environment and installed the required packages. Make sure you meet the above requirements, and then install the dependent libraries.
76 | ```bash
77 | pip install -r requirements.txt
78 | ```
79 |
80 | ## Benchmark Inference
81 | To run benchmark evaluations using different models, it is necessary to appropriately load and predict according to the model in the inference script. Different models may have variations in their initialization and loading methods. We have provided inference scripts for both the Qwen and Baichuan models. Inference for other models can be adapted based on the scripts we have provided.
82 | Taking the provided open-source model kagentlms_qwen_7b_mat as an example.(The kagentlms_qwen_7b_mat model has been uploaded to [huggingface](https://huggingface.co/kwaikeg/kagentlms_qwen_7b_mat))
83 | Run the script to perform benchmark predictions. The parameter is the prediction results file.
84 | ```bash
85 | python infer_qwen.py qwen_benchmark_res.jsonl
86 | ```
87 | Similarly, to perform evaluations using the 'kagentlms_baichuan2_13b_mat' model, execute the following command.
88 | ```bash
89 | python infer_baichuan.py baichuan_benchmark_res.jsonl
90 | ```
91 | The prediction results will be stored in the 'model_predict' field, which is a dictionary (dict) structure. The key is the type of prompt, and the corresponding value is the output of the model. In the 'profile', as it contains only one prompt, 'model_predict' is a string, and the corresponding value is the model output.
92 |
93 | Below is an example display of a prediction result for 'conclusion' data.
94 | ```json
95 | {
96 | "query": "在过去的一年中,哪个月份的全球气温最高?",
97 | "id": "0ae75cf4-8897-5998-b74c-c7ae7202621c",
98 | "type": "conclusion",
99 | "memory_type": "task",
100 | "memory": "...",
101 | "funcs": [],
102 | "memory_last_task": {},
103 | "golden_result_list": [
104 | "2022年中,6月是全球最高气温。",
105 | "2022年中,全球最高气温是6月。",
106 | "2022年中,全球最高气温是6月。",
107 | "2022年中,6月是全球最高气温。",
108 | "2022年中,全球最高气温是6月。"
109 | ],
110 | "prompt_input": {...},
111 | "model_predict": {
112 | "ReACT": "2022年7月的全球气温最高,达到了17.23℃。",
113 | "AutoGPT": "在过去的一年中,全球气温最高的月份是2023年4月,全球表面平均气温达到了17.23°C,创下了历史新高。这一数据来自于美国国家海洋和大气管理局与美国缅因大学汇编的初步数据。",
114 | "ModelScope": "在过去一年中,全球气温最高的月份是2023年7月。根据美国国家航空航天局(NASA)和美国国家海洋和大气管理局(NOAA)的最新研究报告,2022年是史上排名第6的最热年份,尽管拉尼娜现象改变了天气模式,抑制了全球气温。然而,2023年7月的全球平均气温达到了17.23℃,超过了7月4日和5日创下的17.18℃的联合纪录,而后者之前曾打破了7月3日创下的17.01℃的纪录。",
115 | "ToolLlama": "Thought: I have found the information about the warmest month in 2022 in the memory. I will use this information to answer the question and call the Finish function to provide the final answer.\nAction: Finish\nAction Input: {\"return_type\": \"give_answer\", \"final_answer\": \"在2022年,全球最暖的月份是5月,比20世纪平均值高出1.39°F(0.77°C)。\"}",
116 | "Kuaiagent": "在过去的一年中,全球气温最高的月份是2023年7月。根据美国国家航空航天局(NASA)和美国国家海洋和大气管理局(NOAA)的最新研究报告,2023年7月的全球平均气温达到了17.23℃,超过了19世纪50年代有仪器记录以来的任何一个月。"
117 | }
118 | }
119 | ```
120 | '...' indicates that the content is too extensive. For the sake of convenient display format, the content has been abbreviated.
121 |
122 |
123 |
124 |
125 |
126 |
127 | ## Evaluation
128 | Taking the inference results of the 'kagentlms_qwen_7b_mat' model as an example, calculate the metrics based on the benchmark inference results obtained from the 'infer'. Execute the following command to get the overall score.
129 |
130 | The first parameter is the benchmark evaluation set, which contains reference responses manually annotated, and the second parameter is the prediction results of the model being evaluated.
131 | ```bash
132 | python benchmark_eval.py ./benchmark_eval.jsonl ./qwen_benchmark_res.jsonl
133 | ```
134 | The final model score is as follows:
135 | ```bash
136 | plan : 31.64, tooluse : 43.30, reflextion : 33.34, conclusion : 44.85, profile : 44.78, overall : 39.85
137 | ```
138 |
139 | ## Citation
140 | ```
141 | @article{pan2023kwaiagents,
142 | author = {Haojie Pan and
143 | Zepeng Zhai and
144 | Hao Yuan and
145 | Yaojia Lv and
146 | Ruiji Fu and
147 | Ming Liu and
148 | Zhongyuan Wang and
149 | Bing Qin
150 | },
151 | title = {KwaiAgents: Generalized Information-seeking Agent System with Large Language Models},
152 | journal = {CoRR},
153 | volume = {abs/2312.04889},
154 | year = {2023}
155 | }
156 | ```
--------------------------------------------------------------------------------
/benchmark/benchmark_eval.py:
--------------------------------------------------------------------------------
1 | """
2 | 进行benchmark的评估,包括:plan、tooluse、reflextion、conclusion、profile和最终score
3 | """
4 | import json, re, math, sys, logging, nltk, os, unicodedata, pandas as pd, time
5 | import contextlib
6 | import jsonlines
7 | from tqdm import tqdm
8 | from typing import Optional
9 | from rouge import Rouge
10 | from rouge_chinese import Rouge as RougeCh
11 | from nltk.translate.bleu_score import sentence_bleu
12 | from collections import defaultdict
# Log the interpreter's current recursion limit (debug aid).
print(sys.getrecursionlimit())

# Rouge's LCS computation is recursive; raise the recursion limit so that
# long inputs do not trigger RecursionError.
sys.setrecursionlimit(4096 * 4096)

# Fetch the NLTK 'punkt' tokenizer models required by nltk.word_tokenize below.
nltk.download('punkt')
19 |
def mixed_tokenize(sentence):
    """Tokenize *sentence* with NLTK, splitting CJK tokens into characters.

    Any token containing a character of Unicode category 'Lo' (the category
    of non-punctuation/symbol CJK characters) is exploded into individual
    characters; every other token is kept whole.
    """
    mixed = []
    for word in nltk.word_tokenize(sentence):
        contains_cjk = any(unicodedata.category(ch) == 'Lo' for ch in word)
        if contains_cjk:
            mixed.extend(word)
        else:
            mixed.append(word)
    return mixed
30 |
31 |
def rouge_score(label,predict):
    """Return the ROUGE-L F1 score between a reference and a prediction.

    Both inputs are coerced to str; an empty reference or prediction scores
    0. Texts are tokenized with mixed_tokenize so CJK text is compared
    character-by-character.
    """
    label, predict = str(label), str(predict)
    if not label or not predict:
        return 0
    tokenized_pred = " ".join(mixed_tokenize(predict))
    tokenized_ref = " ".join(mixed_tokenize(label))
    result = RougeCh().get_scores(tokenized_pred, tokenized_ref)
    return result[0]["rouge-l"]["f"]
44 |
45 |
def autogpt_response_process(gpt_out):
    """Extract the final response text from an AutoGPT-style output string.

    Outputs still containing a raw ``web_search(`` call are treated as having
    no answer. Otherwise the text after ``response="`` is returned with the
    closing ``")`` removed.
    """
    text = gpt_out
    if "web_search(" in text:
        text = ""
    marker = "response=\""
    if marker in text:
        text = text.split(marker)[1].replace("\")", "")
    return text
52 |
def toolllama_response_process(gpt_out):
    """Extract the final answer from a ToolLlama-style JSON output string.

    Returns the text following the '"final_answer": "' marker with all double
    quotes and closing braces stripped; a bare newline result is normalized
    to the empty string.
    """
    marker = "\"final_answer\": \""
    answer = gpt_out
    if marker in answer:
        answer = answer.split(marker)[1].replace("\"", "").replace("}", "")
    if answer == "\n":
        answer = ""
    return answer
59 |
def find_json_dict(input_str, cnt=0):
    """Best-effort extraction of a top-level JSON object from raw model output.

    While the string has more "{" than "}", append a closing brace and retry
    (bounded by cnt); then slice from the first "{" to the last '}\n}'
    sequence. Falls back to returning the input unchanged.
    """
    if input_str.count("{") > input_str.count("}"):
        # Unbalanced braces: close one and retry (recursion bounded by cnt).
        return find_json_dict(input_str.rstrip("\n") + "\n}", cnt + 1)
    if cnt >= 5:
        return input_str
    try:
        st = input_str.index("{")
        end_str = '}\n}'
        end = input_str.rindex(end_str)
        return input_str[st:end + len(end_str)].strip()
    except json.decoder.JSONDecodeError:
        # NOTE(review): index()/rindex() raise ValueError, not JSONDecodeError,
        # so this branch appears unreachable; failures fall through to the
        # bare except below and return the input unchanged.
        return find_json_dict(input_str.rstrip("\n") + "\n}", cnt + 1)
    except:
        return input_str
74 |
def add_quotes_to_property_names(json_string: str) -> str:
    """
    Add quotes to property names in a JSON string.

    Args:
        json_string (str): The JSON string.

    Returns:
        str: The JSON string with quotes added to property names.

    Raises:
        json.JSONDecodeError: If the corrected string is still not valid JSON.
    """

    def replace_func(match: re.Match) -> str:
        # Wrap the bare property name in double quotes.
        return f'"{match[1]}":'

    property_name_pattern = re.compile(r"(\w+):")
    corrected_json_string = property_name_pattern.sub(replace_func, json_string)

    # Validate the result; json.loads raises JSONDecodeError on failure, which
    # propagates to the caller (the previous try/except merely re-raised the
    # same exception and was redundant).
    json.loads(corrected_json_string)
    return corrected_json_string
97 |
98 |
def balance_braces(json_string: str) -> Optional[str]:
    """
    Balance the braces in a JSON string.

    Appends missing closing braces, or removes surplus *trailing* closing
    braces one at a time. (The previous implementation stripped ALL trailing
    braces with rstrip("}") while decrementing its counter by only one,
    which could remove too many braces and desynchronize the counts.)

    Args:
        json_string (str): The JSON string.

    Returns:
        str: The JSON string with braces balanced, or None if the balanced
        string still fails to parse.
    """

    open_braces_count = json_string.count("{")
    close_braces_count = json_string.count("}")

    while open_braces_count > close_braces_count:
        json_string += "}"
        close_braces_count += 1

    while close_braces_count > open_braces_count:
        if not json_string.endswith("}"):
            break  # surplus "}" is not trailing; nothing safe to strip
        json_string = json_string[:-1]
        close_braces_count -= 1

    with contextlib.suppress(json.JSONDecodeError):
        json.loads(json_string)
        return json_string
124 |
125 |
def correct_json(json_to_load: str) -> str:
    """
    Correct common JSON errors (invalid escape sequences, unquoted property
    names, unbalanced braces) in a best-effort, layered fashion.

    Args:
        json_to_load (str): The JSON string.
    """

    try:
        json.loads(json_to_load)
    except json.JSONDecodeError as first_err:
        message = str(first_err)
        if message.startswith("Invalid \\escape"):
            json_to_load = fix_invalid_escape(json_to_load, message)
        if message.startswith(
            "Expecting property name enclosed in double quotes"
        ):
            json_to_load = add_quotes_to_property_names(json_to_load)
            try:
                json.loads(json_to_load)
            except json.JSONDecodeError:
                pass
            else:
                return json_to_load
        # Last resort: brace balancing.
        balanced_str = balance_braces(json_to_load)
        if balanced_str:
            return balanced_str
        return json_to_load
    else:
        return json_to_load
153 |
def fix_invalid_escape(json_to_load: str, error_message: str) -> str:
    """Fix invalid escape sequences in JSON strings.

    Repeatedly deletes the offending backslash reported by the decoder until
    the string parses or the failure is no longer an escape error.

    Args:
        json_to_load (str): The JSON string.
        error_message (str): The error message from the JSONDecodeError
        exception.

    Returns:
        str: The JSON string with invalid escape sequences fixed.
    """
    while error_message.startswith("Invalid \\escape"):
        bad_pos = extract_char_position(error_message)
        # Drop the offending backslash and re-validate.
        json_to_load = json_to_load[:bad_pos] + json_to_load[bad_pos + 1:]
        try:
            json.loads(json_to_load)
        except json.JSONDecodeError as e:
            # print("json loads error - fix invalid escape", e)
            error_message = str(e)
        else:
            return json_to_load
    return json_to_load
177 |
def extract_char_position(error_message: str) -> int:
    """Extract the character position from the JSONDecodeError message.

    Args:
        error_message (str): The error message from the JSONDecodeError
        exception.

    Returns:
        int: The character position.

    Raises:
        ValueError: If no "(char N)" marker is present in the message.
    """
    found = re.search(r"\(char (\d+)\)", error_message)
    if found is None:
        raise ValueError("Character position not found in the error message.")
    return int(found.group(1))
195 |
196 |
def get_ReACT_plan_and_tool(response, funcs):
    """Parse a ReACT-format model response into (thought, tool_name, tool_args).

    Args:
        response (str): Raw model output in ReACT format
            ("...Action: <tool>Action Input: <args>...").
        funcs (list[dict]): Tool schemas; the first property name of the
            matched tool is used as the single argument key.

    Returns:
        tuple[str, str, dict]: thought, tool name and argument mapping;
        missing pieces default to ('None', 'None', {}).
    """
    thought, tool_name, tool_args_kv = 'None', 'None', {}

    thought_match = re.findall(r"(.+?)(?=(Final Answer|\Z|Action))", response, re.DOTALL)
    # Guard: an empty response previously raised IndexError on [0][0].
    if thought_match:
        thought = thought_match[0][0].strip()

    def get_react_func_key(func_name, funcs):
        # First declared property name of the named tool, or 'None'.
        key = 'None'
        for func in funcs:
            if func['name'] == func_name:
                try:
                    key = list(func['parameters']['properties'].keys())[0]
                except:
                    key = 'None'
        return key

    tool_name_re = re.findall(r"Action:(.+?)Action Input:", response, re.DOTALL)
    if len(tool_name_re) > 0:
        tool_name = tool_name_re[0].strip()
        key = get_react_func_key(tool_name, funcs)
        if key != 'None':
            value = re.findall(r"Action Input:(.+?)(?=(Observation|\Z))", response, re.DOTALL)
            if len(value) > 0:
                tool_args_kv = {
                    key: value[0][0].strip()
                }

    # Normalize empty results. (The original used `thought == 'None'`, a
    # no-op comparison, and compared the dict tool_args_kv with ''.)
    if thought == '':
        thought = 'None'
    if tool_name == '':
        tool_name = 'None'
    if not tool_args_kv:
        tool_args_kv = {}

    return thought, tool_name, tool_args_kv
232 |
233 |
def get_AutoGPT_plan_and_tool(response):
    """Parse an AutoGPT-style JSON response into (thought, tool_name, tool_args).

    The thought is taken from "thoughts" -> "text" and the tool call from
    "command" -> {"name", "args"}.

    Returns:
        tuple[str, str, dict]: defaults to ('None', 'None', {}) when the
        response is not a parseable JSON object.
    """
    thought, tool_name, tool_args_kv = 'None', 'None', {}
    try:
        response = correct_json(find_json_dict(response))
        res_json = json.loads(response)
        assert isinstance(res_json, dict)
    except:
        # Not a parseable JSON object at all.
        return thought, tool_name, tool_args_kv

    if 'thoughts' in res_json:
        if res_json['thoughts'] and 'text' in res_json['thoughts']:
            thought = res_json['thoughts']['text']

    if 'command' in res_json:
        if res_json['command'] and 'name' in res_json['command']:
            tool_name = res_json['command']['name']
        if res_json['command'] and 'args' in res_json['command']:
            # Non-dict args are ignored.
            if isinstance(res_json['command']['args'], dict):
                tool_args_kv = res_json['command']['args']

    # Normalize empty results. (The original used `thought == 'None'`, a
    # no-op comparison, and compared the dict tool_args_kv with ''.)
    if thought == '':
        thought = 'None'
    if tool_name == '':
        tool_name = 'None'
    if not tool_args_kv:
        tool_args_kv = {}

    return thought, tool_name, tool_args_kv
265 |
266 |
def get_ToolLlama_plan_and_tool(response):
    """Parse a ToolLlama-style response into (thought, tool_name, tool_args).

    Expected layout: "Thought: ... Action: ... Action Input: {json}".

    Returns:
        tuple[str, str, dict]: defaults to ('None', 'None', {}) for any piece
        that cannot be parsed.
    """
    thought, tool_name, tool_args_kv = 'None', 'None', {}

    try:
        thought_re = re.findall(r"Thought:(.+?)(?=(\Z|Action))", response, re.DOTALL)
        # The original assigned the findall LIST to `thought`, so a missing
        # "Thought:" section leaked an empty list instead of 'None'.
        if len(thought_re) > 0:
            thought = thought_re[0][0].strip()
        tool_name_re = re.findall(r"Action:(.+?)(?=(Action Input:|\Z))", response, re.DOTALL)
        if len(tool_name_re) > 0:
            tool_name = tool_name_re[0][0].strip()
        tool_re = re.findall(r"Action Input:(.+?)(?=(Thought|\Z))", response, re.DOTALL)
        if len(tool_re) > 0:
            candidate = tool_re[0][0].strip()
            try:
                candidate = correct_json(find_json_dict(candidate))
                tool_json = json.loads(candidate)
                assert isinstance(tool_json, dict)
                tool_args_kv = tool_json
            except:
                # Arguments that are not a JSON dict are ignored.
                pass
    except:
        pass

    # Normalize empty results. (The original used `thought == 'None'`, a
    # no-op comparison, and compared the dict tool_args_kv with ''.)
    if thought == '':
        thought = 'None'
    if tool_name == '':
        tool_name = 'None'
    if not tool_args_kv:
        tool_args_kv = {}

    return thought, tool_name, tool_args_kv
299 |
def get_KuaiAgent_plan_and_tool(response):
    """Parse a KwaiAgents-style JSON response into (thought, tool_name, tool_args).

    The thought is taken from "task_name" and the tool call from
    "command" -> {"name", "args"}.

    Returns:
        tuple[str, str, dict]: defaults to ('None', 'None', {}) when the
        response is not a parseable JSON object.
    """
    thought, tool_name, tool_args_kv = 'None', 'None', {}

    try:
        response = correct_json(find_json_dict(response))
        res_json = json.loads(response)
        assert isinstance(res_json, dict)
    except:
        # Not a parseable JSON object at all.
        return thought, tool_name, tool_args_kv

    if 'task_name' in res_json:
        thought = res_json['task_name']

    if res_json and 'command' in res_json:
        if 'name' in res_json['command']:
            tool_name = res_json['command']['name']
        if 'args' in res_json['command']:
            # Non-dict args are ignored.
            if isinstance(res_json['command']['args'], dict):
                tool_args_kv = res_json['command']['args']

    # Normalize empty results. (The original used `thought == 'None'`, a
    # no-op comparison, and compared the dict tool_args_kv with ''.)
    if thought == '':
        thought = 'None'
    if tool_name == '':
        tool_name = 'None'
    if not tool_args_kv:
        tool_args_kv = {}

    return thought, tool_name, tool_args_kv
333 |
def get_ModelScope_plan_and_tool(response):
    """Parse a ModelScope-style response into (thought, tool_name, tool_args).

    The tool call is the JSON object wrapped in <|startofthink|> ...
    <|endofthink|> markers; this format carries no separate thought, so the
    thought stays 'None'.

    Returns:
        tuple[str, str, dict]: defaults to ('None', 'None', {}).
    """
    thought, tool_name, tool_args_kv = 'None', 'None', {}

    task = re.findall(r"\<\|startofthink\|\>(.+?)\<\|endofthink\|\>", response, re.DOTALL)
    if len(task) > 0:
        task = task[0].strip()
        try:
            task = correct_json(find_json_dict(task))
            task = json.loads(task)
            assert isinstance(task, dict)
        except:
            # Not a parseable JSON object between the markers.
            return thought, tool_name, tool_args_kv

    # When no markers matched, `task` is still the empty findall list and
    # both checks below are skipped.
    if task and 'api_name' in task:
        tool_name = task['api_name']
    if task and 'parameters' in task:
        # Non-dict parameters are ignored.
        if isinstance(task['parameters'], dict):
            tool_args_kv = task['parameters']

    # Normalize empty results. (The original used `thought == 'None'`, a
    # no-op comparison, and compared the dict tool_args_kv with ''.)
    if thought == '':
        thought = 'None'
    if tool_name == '':
        tool_name = 'None'
    if not tool_args_kv:
        tool_args_kv = {}

    return thought, tool_name, tool_args_kv
366 |
367 |
368 |
def get_plan_metric(golden_thoughts, golden_toolnames, thought, tool_name):
    """Score a predicted plan against every golden reference; keep the best.

    A reference contributes ROUGE-L(thought) when the tool name matches
    exactly, otherwise 0; references with a 'None' thought or tool are
    skipped. Returns 0. when no usable reference exists.
    """
    scores = [
        rouge_score(gold_thought, thought) * (1 if tool_name == gold_tool else 0)
        for gold_thought, gold_tool in zip(golden_thoughts, golden_toolnames)
        if gold_thought != 'None' and gold_tool != 'None'
    ]
    return max(scores) if scores else 0.
380 |
381 |
382 |
383 |
def get_tool_metric(golden_toolnames, golden_tool_args, tool_name, tool_args):
    """Score a predicted tool call against every golden reference; keep the best.

    Per reference: an exact match on the tool name gates the score; argument
    quality is the mean ROUGE-L over the golden argument keys (keys missing
    from the prediction score 0). Two empty argument dicts count as a
    perfect argument match. 'None' references are skipped.
    """
    tool_metrics = []
    for gold_name, gold_args in zip(golden_toolnames, golden_tool_args):
        if gold_name == 'None':
            continue
        name_match = 1 if tool_name == gold_name else 0
        if gold_args == {} and tool_args == {}:
            per_key = [1.]
        elif tool_args != {}:
            per_key = [
                rouge_score(v, tool_args[k]) if k in tool_args else 0.
                for k, v in gold_args.items()
            ]
        else:
            per_key = [0.]
        mean_arg = sum(per_key) / len(per_key) if per_key else 0
        tool_metrics.append(mean_arg * name_match)

    return max(tool_metrics) if tool_metrics else 0.
411 |
412 |
def get_reflextion_metric(golden_thoughts, golden_toolnames, golden_tool_args, last_task_info, thought, tool_name, tool_args):
    """Score a reflection-step prediction against golden references; keep the best.

    Per reference: 0.3 * thought ROUGE-L + 0.7 * mean argument ROUGE-L, both
    gated by an exact tool-name match, and zeroed entirely when the golden
    call merely repeats the previous task (no real reflection happened).

    Args:
        last_task_info (dict): {"tool_name", "tool_args"} of the previous turn.

    Returns:
        float: best score over references; 0. when no usable reference
        exists. (The original called max() on an empty list here and raised
        ValueError, unlike get_plan_metric/get_tool_metric which guard it.)
    """
    reflextion_metrics = []
    for golden_thought, golden_toolname, golden_tool_arg in zip(golden_thoughts, golden_toolnames, golden_tool_args):
        if golden_thought == 'None' or golden_toolname == 'None':
            continue
        thought_rouge = rouge_score(golden_thought, thought)
        tool_em = 1 if tool_name == golden_toolname else 0
        avg_arg_rouges = []
        if golden_tool_arg == {} and tool_args == {}:
            avg_arg_rouges = [1.]
        elif tool_args != {}:
            for k, v in golden_tool_arg.items():
                if k in tool_args:
                    avg_arg_rouges.append(rouge_score(v, tool_args[k]))
                else:
                    avg_arg_rouges.append(0.)
        else:
            avg_arg_rouges = [0.]
        arg_rouge = sum(avg_arg_rouges) / len(avg_arg_rouges) if len(avg_arg_rouges) > 0 else 0
        # Penalty factor: 1 when the golden call repeats the previous task.
        if last_task_info["tool_name"] == golden_toolname and last_task_info["tool_args"] == golden_tool_arg:
            penalty_weight = 1
        else:
            penalty_weight = 0
        reflextion_score = (1 - penalty_weight) * (0.3 * tool_em * thought_rouge + 0.7 * tool_em * arg_rouge)
        reflextion_metrics.append(reflextion_score)

    # Guard against no usable references (consistent with the other metrics).
    if len(reflextion_metrics) == 0:
        reflextion_metrics = [0.]
    return max(reflextion_metrics)
445 |
def plan_tooluse_reflextion_predict(model_predict, funcs):
    """Parse each prompt template's raw prediction into a structured dict.

    Args:
        model_predict (dict): prompt template name -> raw model output string.
        funcs (list[dict]): tool schemas (used by the ReACT parser).

    Returns:
        list[dict]: one {"thought", "tool_name", "tool_args"} per template.
    """
    predict_parsed_list = []
    for prompt, predict in model_predict.items():
        # Reset per template: the original left these names unbound for an
        # unknown template (or an empty ReACT prediction), raising
        # UnboundLocalError on the first iteration or silently reusing the
        # previous template's values afterwards.
        thought, tool_name, tool_args_kv = 'None', 'None', {}
        if prompt == 'ReACT' and predict != "":
            thought, tool_name, tool_args_kv = get_ReACT_plan_and_tool(predict, funcs)
        elif prompt == 'AutoGPT':
            thought, tool_name, tool_args_kv = get_AutoGPT_plan_and_tool(predict)
        elif prompt == 'ToolLlama':
            thought, tool_name, tool_args_kv = get_ToolLlama_plan_and_tool(predict)
        elif prompt == 'ModelScope':
            thought, tool_name, tool_args_kv = get_ModelScope_plan_and_tool(predict)
        elif prompt == 'KuaiAgent':
            thought, tool_name, tool_args_kv = get_KuaiAgent_plan_and_tool(predict)
        predict_parsed_list.append({
            'thought': thought,
            'tool_name': tool_name,
            'tool_args': tool_args_kv,
        })

    return predict_parsed_list
468 |
def conclusion_metrics(label_dict, predict_dict):
    """Compute the conclusion score.

    For every sample, each prompt template's prediction is scored by its
    best ROUGE-L against the golden results; template scores are averaged
    per sample and the final score is the mean over samples.

    Args:
        label_dict (dict): id -> annotation with "golden_result_list".
        predict_dict (dict): id -> {"model_predict": {template: output}}.

    Returns:
        float: mean conclusion ROUGE-L over all samples.
    """
    all_rouge = []
    for id, obj in tqdm(predict_dict.items()):
        label_response_dict_list = label_dict[id]["golden_result_list"]
        label_response_list = []
        for i in label_response_dict_list:
            label_response_list.append(i["golden_result"])
        predict_parsed_list = obj["model_predict"]

        predict_pre_template_score = []

        for key, predict in predict_parsed_list.items():
            # Template-specific extraction of the final answer text.
            if key == "AutoGPT":
                predict = autogpt_response_process(predict)
            if key == "ToolLlama":
                predict = toolllama_response_process(predict)

            predict_pre_label_score = []
            if predict == "":
                predict_pre_label_score.append(0)
            else:
                if type(predict) == dict:
                    predict = json.dumps(predict, ensure_ascii=False)
                for label in label_response_list:
                    rouge_res = rouge_score(label, predict)
                    predict_pre_label_score.append(rouge_res)
            predict_pre_template_score.append(max(predict_pre_label_score))

        all_rouge.append(sum(predict_pre_template_score) / len(predict_pre_template_score))

    # (Removed the unused local `rouge_list` from the original.)
    conclusion_avg_rouge = sum(all_rouge) / len(all_rouge)

    return conclusion_avg_rouge
507 |
def profile_metrics(label_dict, predict_dict):
    """Compute the profile score.

    Per sample: the best ROUGE-L between the prediction and any golden
    result (0 for an empty prediction), averaged over all samples.

    Args:
        label_dict (dict): id -> annotation with "golden_result_list".
        predict_dict (dict): id -> {"model_predict": output string}.

    Returns:
        float: mean best ROUGE-L over all samples.
    """
    per_sample_best = []
    for sample_id, obj in tqdm(predict_dict.items()):
        golden = [item["golden_result"] for item in label_dict[sample_id]["golden_result_list"]]
        prediction = obj["model_predict"]

        if prediction == "":
            per_sample_best.append(0)
        else:
            per_sample_best.append(max(rouge_score(ref, prediction) for ref in golden))
    return sum(per_sample_best) / len(per_sample_best)
530 |
531 |
532 |
def plantooluse_metrics(label_dict, predict_dict):
    """Compute the plan and tool-use scores.

    For every sample, each template's parsed prediction is scored against
    the golden references (best-of inside the metric, mean over templates),
    then averaged over samples.

    Args:
        label_dict (dict): id -> annotation with "golden_result_list" and "funcs".
        predict_dict (dict): id -> {"model_predict": {template: output}}.

    Returns:
        tuple[float, float]: (plan score, tool-use score).
    """
    all_plan_rouge = []
    all_tooluse_rouge = []
    for id, obj in tqdm(predict_dict.items()):
        label_response_list = list(label_dict[id]["golden_result_list"])
        funcs = label_dict[id]["funcs"]
        predict_parsed_list = plan_tooluse_reflextion_predict(obj["model_predict"], funcs)
        plan_rouge_list = []
        tooluse_rouge_list = []
        label_thoughts = [label["thought"] for label in label_response_list]
        label_tool_names = [label["tool_name"] for label in label_response_list]
        label_tool_args = [label['tool_args'] for label in label_response_list]
        # (Removed the unused local `query` from the original.)
        for predict in predict_parsed_list:
            plan_metric = get_plan_metric(label_thoughts, label_tool_names, predict['thought'], predict['tool_name'])
            tool_metric = get_tool_metric(label_tool_names, label_tool_args, predict['tool_name'], predict['tool_args'])
            plan_rouge_list.append(plan_metric)
            tooluse_rouge_list.append(tool_metric)

        # The per-reference max happens inside the metric; average over templates here.
        all_plan_rouge.append(sum(plan_rouge_list) / len(plan_rouge_list))
        all_tooluse_rouge.append(sum(tooluse_rouge_list) / len(tooluse_rouge_list))

    plan_avg_score = sum(all_plan_rouge) / len(all_plan_rouge)
    tooluse_avg_score = sum(all_tooluse_rouge) / len(all_tooluse_rouge)
    return plan_avg_score, tooluse_avg_score
563 |
564 |
565 |
566 |
def reflextion_metrics(label_dict, predict_dict):
    """Compute the reflection score.

    Each template's parsed prediction is scored against the golden
    references (with a penalty for repeating the previous task), averaged
    over templates per sample and then over samples.

    Args:
        label_dict (dict): id -> annotation with "golden_result_list",
            "funcs" and "memory_last_task".
        predict_dict (dict): id -> {"model_predict": {template: output}}.

    Returns:
        float: mean reflection score over all samples.
    """
    all_reflextion_score = []
    for id, obj in predict_dict.items():
        label_response_list = list(label_dict[id]["golden_result_list"])
        funcs = label_dict[id]["funcs"]
        predict_parsed_list = plan_tooluse_reflextion_predict(obj["model_predict"], funcs)
        last_task_info = label_dict[id]["memory_last_task"]
        reflextion_score_list = []
        label_thoughts = [label["thought"] for label in label_response_list]
        label_tool_names = [label["tool_name"] for label in label_response_list]
        label_tool_args = [label['tool_args'] for label in label_response_list]
        # (Removed the unused locals `query_score`/`query` and the redundant
        # pre-initialization of predict_parsed_list from the original.)

        for predict in predict_parsed_list:
            reflextion_metric = get_reflextion_metric(label_thoughts, label_tool_names, label_tool_args, last_task_info, predict['thought'], predict['tool_name'], predict['tool_args'])
            reflextion_score_list.append(reflextion_metric)
        all_reflextion_score.append(sum(reflextion_score_list) / len(reflextion_score_list))

    reflextion_avg_score = sum(all_reflextion_score) / len(all_reflextion_score)
    return reflextion_avg_score
594 |
595 |
596 |
597 |
def _load_by_type(path):
    """Load a jsonl file and bucket its records by their "type" field.

    Returns:
        dict: type -> {id: record} for the four known benchmark task types.
    """
    buckets = {"plantooluse": {}, "reflextion": {}, "conclusion": {}, "profile": {}}
    with jsonlines.open(path, "r") as reader:
        for record in reader:
            # Avoid shadowing the builtins `type` and `id` as the original did.
            record_type = record["type"]
            if record_type in buckets:
                buckets[record_type][record["id"]] = record
    return buckets


def eval(eval_file, predict_file):
    """
    Run the full benchmark evaluation and print the per-task scores.

    Args:
        eval_file (str): jsonl file with the annotated golden references.
        predict_file (str): jsonl file with the model predictions.
    """
    print(f"load eval file from {eval_file}")
    print(f"load predict file from {predict_file}")

    # The original duplicated this loading logic for both files.
    labels = _load_by_type(eval_file)
    predicts = _load_by_type(predict_file)

    # Every annotated sample must have a matching prediction.
    for task in ("plantooluse", "reflextion", "conclusion", "profile"):
        assert len(labels[task]) == len(predicts[task])

    plan_score, tooluse_score = plantooluse_metrics(labels["plantooluse"], predicts["plantooluse"])
    reflextion_score = reflextion_metrics(labels["reflextion"], predicts["reflextion"])
    conclusion_score = conclusion_metrics(labels["conclusion"], predicts["conclusion"])
    profile_score = profile_metrics(labels["profile"], predicts["profile"])
    overall_score = (
        0.25 * plan_score +       # Weight for 'plantooluse' score
        0.35 * tooluse_score +    # Weight for 'tooluse' score
        0.1 * reflextion_score +  # Weight for 'reflection' score
        0.2 * conclusion_score +  # Weight for 'conclusion' score
        0.1 * profile_score       # Weight for 'profile' score
    )
    print(f"plan : {plan_score*100:.2f}, tooluse : {tooluse_score*100:.2f}, reflextion : {reflextion_score*100:.2f}, conclusion : {conclusion_score*100:.2f}, profile : {profile_score*100:.2f}, overall : {overall_score*100:.2f}")
659 |
660 |
661 |
662 |
663 |
664 |
if __name__ == "__main__":
    # Usage: python benchmark_eval.py <golden_eval.jsonl> <model_predictions.jsonl>
    eval(sys.argv[1], sys.argv[2])
667 |
668 |
669 |
670 |
671 |
672 |
673 |
--------------------------------------------------------------------------------
/benchmark/infer_baichuan.py:
--------------------------------------------------------------------------------
1 | """
2 | benchmark形式评估集推理
3 | """
4 | import os
5 | os.environ["CUDA_VISIBLE_DEVICES"] = "7"
6 | import sys
7 | import time
8 | import copy
9 | import jsonlines
10 | from tqdm import tqdm
11 | from vllm import LLM, SamplingParams
12 | from transformers import AutoModelForCausalLM, AutoTokenizer
13 | import transformers, datetime, json
14 |
15 | class ChatBaichuan:
    def __init__(self,
        model_name_or_path: str = "kwaikeg/kagentlms_baichuan2_13b_mat",
        template: str = 'baichuan2',
        input_max_length = 4096,
        ) -> None:
        """Load the Baichuan tokenizer and a vLLM engine for batched inference.

        Args:
            model_name_or_path: HuggingFace model id or local checkpoint path.
            template: prompt template family; must be 'baichuan' or 'baichuan2'.
            input_max_length: maximum number of batched tokens for the vLLM
                engine (passed as max_num_batched_tokens).
        """
        assert template in ['baichuan', 'baichuan2']
        self.template = template

        print('loading tokenizer')
        # use_fast=False / trust_remote_code=True: presumably required by the
        # custom Baichuan tokenizer implementation — TODO confirm.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name_or_path,
            use_fast=False,
            padding_side='right',
            trust_remote_code=True
        )

        print('loading model')
        self.model = LLM(
            model = model_name_or_path,
            trust_remote_code=True,
            max_num_batched_tokens=input_max_length
        )
        print('loaded')
40 |
41 |
42 |
43 |
44 | def encode(self, tokenizer, query, history, system=''):
45 | prompt_ids = []
46 | history = history + [(query, None)]
47 | kwargs = dict(add_special_tokens=False)
48 | for turn_idx, (q, r) in enumerate(history):
49 | prefix_ids = tokenizer.encode(system, **kwargs) if turn_idx == 0 else []
50 | if self.template == 'baichuan':
51 | prompt = [' {data["description"].strip()} {data['fullname']} ({data["stars"]} stars, {data["forks"]} forks)
31 | "):
13 | header = [th.text.strip() for th in table.find_all('th')]
14 |
15 | # Extract rows
16 | rows = []
17 | for row in table.find_all('tr'):
18 | cells = row.find_all(['td', 'th'])
19 | rows.append([cell.text.strip() for cell in cells])
20 |
21 | if header:
22 | content_rows = rows
23 | else:
24 | header = rows[0]
25 | content_rows = rows[1:]
26 | return [header] + content_rows
27 |
28 |
def convert_bs_html_table_to_markdown(table):
    """Render a BeautifulSoup <table> element as a GitHub-style markdown table."""
    rows = convert_bs_html_table_to_list(table)
    header, body = rows[0], rows[1:]

    def render_row(cells):
        return '| ' + ' | '.join(cells) + ' |\n'

    lines = [render_row(header), render_row(['---'] * len(header))]
    lines.extend(render_row(row) for row in body)
    return ''.join(lines)
40 |
41 |
def convert_html_table_to_markdown(table_html):
    """Parse an HTML snippet and render its first <table> as markdown."""
    parsed = BeautifulSoup(table_html, 'html.parser')
    first_table = parsed.find('table')
    return convert_bs_html_table_to_markdown(first_table)
47 |
48 |
49 |
50 |
def extract_hyperlinks(soup: BeautifulSoup, base_url: str) -> list[tuple[str, str]]:
    """Extract hyperlinks from a BeautifulSoup object

    Args:
        soup (BeautifulSoup): The BeautifulSoup object
        base_url (str): The base URL

    Returns:
        List[Tuple[str, str]]: The extracted hyperlinks
    """
    links = []
    for anchor in soup.find_all("a", href=True):
        # Resolve relative hrefs against the page's base URL.
        links.append((anchor.text, urljoin(base_url, anchor["href"])))
    return links
65 |
66 |
def format_hyperlinks(hyperlinks: list[tuple[str, str]]) -> list[str]:
    """Format hyperlinks to be displayed to the user

    Args:
        hyperlinks (List[Tuple[str, str]]): The hyperlinks to format

    Returns:
        List[str]: The formatted hyperlinks
    """
    formatted = []
    for link_text, link_url in hyperlinks:
        formatted.append(f"{link_text} ({link_url})")
    return formatted
77 |
--------------------------------------------------------------------------------
/kwaiagents/utils/json_fix_general.py:
--------------------------------------------------------------------------------
1 | """This module contains functions to fix JSON strings using general programmatic approaches, suitable for addressing
2 | common JSON formatting issues."""
3 | from __future__ import annotations
4 |
5 | import contextlib
6 | import json
7 | import re
8 | from typing import Optional
9 |
10 |
def find_json_list(input_str):
    """Return the substring spanning the outermost JSON list brackets.

    Falls back to returning the input unchanged when no '[' or ']' is
    present. (index/rindex signal a miss with ValueError; the original's
    bare except also masked unrelated errors.)
    """
    try:
        st = input_str.index("[")
        end = input_str.rindex("]")
    except ValueError:
        return input_str
    return input_str[st:end + 1]
18 |
19 |
def find_json_dict(input_str, cnt=0):
    """Best-effort extraction of a top-level JSON object from raw model output.

    While the string has more "{" than "}", append a closing brace and retry
    (bounded by cnt); then slice from the first "{" to the last '}\n}'
    sequence. Falls back to returning the input unchanged.
    """
    if input_str.count("{") > input_str.count("}"):
        # Unbalanced braces: close one and retry (recursion bounded by cnt).
        return find_json_dict(input_str.rstrip("\n") + "\n}", cnt + 1)
    if cnt >= 5:
        return input_str
    try:
        st = input_str.index("{")
        end_str = '}\n}'
        end = input_str.rindex(end_str)
        return input_str[st:end + len(end_str)].strip()
    except json.decoder.JSONDecodeError:
        # NOTE(review): index()/rindex() raise ValueError, not JSONDecodeError,
        # so this branch appears unreachable; failures fall through to the
        # bare except below and return the input unchanged.
        return find_json_dict(input_str.rstrip("\n") + "\n}", cnt + 1)
    except:
        return input_str
34 |
35 |
def extract_char_position(error_message: str) -> int:
    """Extract the character position from the JSONDecodeError message.

    Args:
        error_message (str): The error message from the JSONDecodeError
        exception.

    Returns:
        int: The character position.

    Raises:
        ValueError: If no "(char N)" marker is present in the message.
    """
    match = re.search(r"\(char (\d+)\)", error_message)
    if match is None:
        raise ValueError("Character position not found in the error message.")
    return int(match.group(1))
52 |
53 |
def fix_invalid_escape(json_to_load: str, error_message: str) -> str:
    """Fix invalid escape sequences in JSON strings.

    Repeatedly deletes the offending backslash reported by the decoder until
    the string parses or the failure is no longer an escape error.

    Args:
        json_to_load (str): The JSON string.
        error_message (str): The error message from the JSONDecodeError
        exception.

    Returns:
        str: The JSON string with invalid escape sequences fixed.
    """
    while error_message.startswith("Invalid \\escape"):
        bad_pos = extract_char_position(error_message)
        # Drop the offending backslash and re-validate.
        json_to_load = json_to_load[:bad_pos] + json_to_load[bad_pos + 1:]
        try:
            json.loads(json_to_load)
        except json.JSONDecodeError as e:
            print("json loads error - fix invalid escape", e)
            error_message = str(e)
        else:
            return json_to_load
    return json_to_load
77 |
78 |
def balance_braces(json_string: str) -> Optional[str]:
    """
    Balance the braces in a JSON string.

    Appends missing closing braces, or removes surplus *trailing* closing
    braces one at a time. (The previous implementation stripped ALL trailing
    braces with rstrip("}") while decrementing its counter by only one,
    which could remove too many braces and desynchronize the counts.)

    Args:
        json_string (str): The JSON string.

    Returns:
        str: The JSON string with braces balanced, or None if the balanced
        string still fails to parse.
    """

    open_braces_count = json_string.count("{")
    close_braces_count = json_string.count("}")

    while open_braces_count > close_braces_count:
        json_string += "}"
        close_braces_count += 1

    while close_braces_count > open_braces_count:
        if not json_string.endswith("}"):
            break  # surplus "}" is not trailing; nothing safe to strip
        json_string = json_string[:-1]
        close_braces_count -= 1

    with contextlib.suppress(json.JSONDecodeError):
        json.loads(json_string)
        return json_string
104 |
105 |
def add_quotes_to_property_names(json_string: str) -> str:
    """
    Add quotes to property names in a JSON string.

    Args:
        json_string (str): The JSON string.

    Returns:
        str: The JSON string with quotes added to property names.

    Raises:
        json.JSONDecodeError: If the corrected string is still not valid JSON.
    """

    def replace_func(match: re.Match) -> str:
        # Wrap the bare property name in double quotes.
        return f'"{match[1]}":'

    property_name_pattern = re.compile(r"(\w+):")
    corrected_json_string = property_name_pattern.sub(replace_func, json_string)

    # Validate the result; json.loads raises JSONDecodeError on failure, which
    # propagates to the caller (the previous try/except merely re-raised the
    # same exception and was redundant).
    json.loads(corrected_json_string)
    return corrected_json_string
128 |
129 |
def correct_json(json_to_load: str) -> str:
    """
    Correct common JSON errors.
    Args:
        json_to_load (str): The JSON string.
    """

    try:
        json.loads(json_to_load)
    except json.JSONDecodeError as e:
        print("json loads error", e)
        error_message = str(e)
        if error_message.startswith("Invalid \\escape"):
            json_to_load = fix_invalid_escape(json_to_load, error_message)
        if error_message.startswith(
            "Expecting property name enclosed in double quotes"
        ):
            json_to_load = add_quotes_to_property_names(json_to_load)
            try:
                json.loads(json_to_load)
            except json.JSONDecodeError as e:
                print("json loads error - add quotes", e)
                error_message = str(e)
            else:
                return json_to_load
        # Last resort: brace balancing.
        balanced_str = balance_braces(json_to_load)
        if balanced_str:
            return balanced_str
        return json_to_load
    else:
        return json_to_load
--------------------------------------------------------------------------------
/kwaiagents/utils/nlp_utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import re
3 | """Text processing functions"""
4 | from typing import Generator, Optional, Dict
5 | from selenium.webdriver.remote.webdriver import WebDriver
6 | from kwaiagents.config import Config
7 | from kwaiagents.llms import create_chat_completion
8 |
9 |
10 | def split_sentences(text, lang='en'):
11 | if not text:
12 | return []
13 | if lang == 'en':
14 | # Split English sentences using regular expression
15 | sentences = re.split(r'(? Generator[str, None, None]:
29 | """Split text into chunks of a maximum length
30 |
31 | Args:
32 | text (str): The text to split
33 | max_length (int, optional): The maximum length of each chunk. Defaults to 8192.
34 |
35 | Yields:
36 | str: The next chunk of text
37 |
38 | Raises:
39 | ValueError: If the text is longer than the maximum length
40 | """
41 | paragraphs = text.split("\n")
42 | current_length = 0
43 | current_chunk = []
44 |
45 | for paragraph in paragraphs:
46 | if current_length + len(paragraph) + 1 <= max_length:
47 | current_chunk.append(paragraph)
48 | current_length += len(paragraph) + 1
49 | else:
50 | yield "\n".join(current_chunk)
51 | current_chunk = [paragraph]
52 | current_length = len(paragraph) + 1
53 |
54 | if current_chunk:
55 | yield "\n".join(current_chunk)
56 |
57 |
def summarize_text(
    url: str, text: str, question: str, driver: Optional[WebDriver] = None, cfg: Config = None
) -> tuple:
    """Summarize text chunk-by-chunk with the configured fast LLM.

    Args:
        url (str): The url the text came from (kept for interface
            compatibility; not used in the current implementation).
        text (str): The text to summarize.
        question (str): The question the summary should try to answer.
        driver (WebDriver): Optional webdriver used to scroll the page
            in sync with reading progress.
        cfg (Config): Global agent config supplying the chunk size, the
            model name, token limits and the chain logger.

    Returns:
        tuple: ``(summary, prompt_responses)`` — the final summary string
        and the list of (prompt, response) pairs produced along the way.
        (The original ``-> str`` annotation was wrong: every return path
        yields a 2-tuple.)
    """
    if not text:
        return "Error: No text to summarize", []

    text_length = len(text)
    cfg.chain_logger.put("reading", f"共 {text_length} 字需要阅读")

    summaries = []
    chunks = list(split_text(text, cfg.browse_chunk_max_length))
    scroll_ratio = 1 / len(chunks)

    prompt_responses = list()

    if cfg.fast_llm_model in ["llama"] and len(chunks) > 1:
        # Batched path: this backend accepts a list of prompts per call.
        batch_size = 3
        cnt = 0
        for i in range(len(chunks) // batch_size + 1):
            if driver:
                scroll_to_percentage(driver, scroll_ratio * i)
            batch_chunk = chunks[i * batch_size: (i + 1) * batch_size]
            if not batch_chunk:
                break
            batch = [
                create_message(chunk, question)
                for chunk in batch_chunk
            ]
            batch_summaries, _ = create_chat_completion(
                query=batch,
                llm_model_name=cfg.fast_llm_model,
                max_tokens=cfg.browse_summary_max_token,
            )
            # Some backends return the batch as a JSON-encoded string.
            if isinstance(batch_summaries, str):
                batch_summaries = json.loads(batch_summaries)
            summaries.extend(batch_summaries)

            cnt += len(batch)
            cfg.chain_logger.put("reading", f"{cnt} / {len(chunks)} 个段落")
    else:
        # Sequential path: one LLM call per chunk.
        for i, chunk in enumerate(chunks):
            if driver:
                scroll_to_percentage(driver, scroll_ratio * i)

            cfg.chain_logger.put("reading", f"第 {i + 1} / {len(chunks)} 个段落")
            message = create_message(chunk, question)

            try:
                summary, _ = create_chat_completion(
                    query=message,
                    llm_model_name=cfg.fast_llm_model,
                    max_tokens=cfg.browse_summary_max_token,
                )
            except Exception:
                # Best-effort: a failed chunk contributes an empty summary
                # instead of aborting the whole read. (Was a bare `except:`,
                # which also swallowed KeyboardInterrupt/SystemExit.)
                summary = ""
            summaries.append(summary)

            prompt_responses.append((message, summary))

    if len(summaries) == 1:
        # `summaries[0]` rather than the loop variable `summary`, which is
        # unbound when the batched branch ran.
        return summaries[0], prompt_responses
    if len(summaries) == 0:
        return "", prompt_responses
    cfg.chain_logger.put("reading", f"总结这 {len(chunks)} 个段落")

    # Merge the per-chunk summaries with one final LLM pass.
    combined_summary = "\n".join(summaries)
    message = create_message(combined_summary, question)

    summary, _ = create_chat_completion(
        query=message,
        llm_model_name=cfg.fast_llm_model,
        max_tokens=cfg.browse_summary_max_token,
    )
    prompt_responses.append((message, summary))

    return summary, prompt_responses
158 |
159 |
def scroll_to_percentage(driver: WebDriver, ratio: float) -> None:
    """Scroll the page to a fraction of its total height.

    Args:
        driver (WebDriver): The webdriver controlling the page.
        ratio (float): Target position as a fraction of the full page
            height, in [0, 1].

    Raises:
        ValueError: If the ratio is not between 0 and 1.
    """
    if ratio < 0 or ratio > 1:
        raise ValueError("Percentage should be between 0 and 1")
    script = f"window.scrollTo(0, document.body.scrollHeight * {ratio});"
    driver.execute_script(script)
173 |
174 |
def create_message(chunk: str, question: str) -> str:
    """Build the summarization prompt for one chunk of text.

    Args:
        chunk (str): The chunk of text to summarize.
        question (str): The question to answer.

    Returns:
        str: A prompt asking the model to answer the question from the
        chunk, or to summarize the chunk if the question is unanswerable.
        (The original ``Dict[str, str]`` annotation was wrong — the
        function has always returned a plain string.)
    """
    return f'"""{chunk}""" 基于上述文本回答下面的问题 ' +\
        f'问题: "{question}" -- 假如无法回答这个问题,则总结上述文本:'
187 |
188 |
if __name__ == "__main__":
    # Demo: sentence splitting for English and Chinese sample texts.
    samples = {
        'en': "This is an example sentence. Here's another one! And a third?",
        'zh': "这是一个示例句子。这是另一个!还有第三个?",
    }
    for sample_lang, sample_text in samples.items():
        print(split_sentences(sample_text, lang=sample_lang))
--------------------------------------------------------------------------------
/kwaiagents/utils/selenium_utils.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from selenium import webdriver
4 | from selenium.webdriver.remote.webdriver import WebDriver
5 | from selenium.webdriver.common.by import By
6 | from selenium.webdriver.support.wait import WebDriverWait
7 | from selenium.webdriver.support import expected_conditions as EC
8 | from selenium.webdriver.chrome.options import Options as ChromeOptions
9 | from selenium.webdriver.firefox.options import Options as FirefoxOptions
10 | from selenium.webdriver.safari.options import Options as SafariOptions
11 | from webdriver_manager.chrome import ChromeDriverManager
12 | from webdriver_manager.firefox import GeckoDriverManager
13 | import logging
14 |
15 | import time
16 |
17 |
def get_web_driver(selenium_web_browser):
    """Create a selenium WebDriver for the requested browser.

    Args:
        selenium_web_browser (str): One of "chrome", "safari", "firefox";
            any other value raises KeyError.

    Returns:
        WebDriver: A ready-to-use driver instance.
    """
    browser_options = {
        "chrome": ChromeOptions,
        "safari": SafariOptions,
        "firefox": FirefoxOptions,
    }[selenium_web_browser]()

    # Present a desktop Chrome UA regardless of the actual browser.
    browser_options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
    )

    if selenium_web_browser == "firefox":
        return webdriver.Firefox(
            executable_path=GeckoDriverManager().install(), options=browser_options
        )

    if selenium_web_browser == "safari":
        # Requires a bit more setup on the users end
        # See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari
        return webdriver.Safari(options=browser_options)

    # Default: headless Chrome, hardened for containerized environments.
    browser_options.add_argument('--no-sandbox')
    browser_options.add_argument('--disable-dev-shm-usage')
    browser_options.add_argument('--headless')
    proxy = os.getenv("http_proxy")
    if proxy:
        browser_options.add_argument(f'--proxy-server={proxy}')
    return webdriver.Chrome(options=browser_options)
47 |
48 |
def get_pagesource_with_selenium(url: str, selenium_web_browser: str, driver: WebDriver = None) -> tuple:
    """Load *url* in a browser and return its body HTML.

    Args:
        url (str): The page to load.
        selenium_web_browser (str): Browser name passed to get_web_driver.
        driver (WebDriver): Optional existing driver to reuse; a fresh one
            is created when None.

    Returns:
        tuple: ``(driver, page_source)`` — the driver actually used (so
        callers can reuse it, e.g. for scrolling) and the body's outer
        HTML. (The original ``-> str`` annotation was wrong.)
    """
    logging.getLogger("selenium").setLevel(logging.CRITICAL)
    # Bug fix: the old code created a new driver unconditionally, which made
    # the None-check dead code and silently discarded any driver passed in.
    if driver is None:
        driver = get_web_driver(selenium_web_browser)

    driver.get(url)

    # Wait until the <body> element exists so the DOM is ready to read.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Get the HTML content directly from the browser's DOM
    page_source = driver.execute_script("return document.body.outerHTML;")
    return driver, page_source
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | bs4
2 | pandas
3 | docstring_parser
4 | lunar_python==1.3.2
5 | duckduckgo-search>=4.2
6 | selenium==4.1.4
7 | webdriver-manager==3.8.6
8 | openai==0.27.8
9 | translate==3.6.1
10 | ephem==4.1.4
11 | transformers>=4.33.2
12 | tiktoken
13 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
"""Packaging script for kwaiagents."""
from setuptools import setup, find_packages


# Long description and dependency list are read from the repo's own files
# so they stay in sync with the README and requirements.txt.
with open('README.md', encoding='utf-8') as f:
    readme = f.read()

with open('requirements.txt', encoding='utf-8') as f:
    requirements = f.read()

setup(
    # Metadata
    name='kwaiagents',
    version='0.0.1',
    # Bug fix: the previous value '>=2.7,>=3.6' is an AND of both
    # specifiers (comma-separated clauses all apply), so the '>=2.7'
    # part was redundant and misleadingly suggested Python 2 support.
    # The codebase uses f-strings, so 3.6 is the real floor.
    python_requires='>=3.6',
    author='Haojie Pan',
    author_email='panhaojie@kuaishou.com',
    description='Kwaiagents',
    long_description=readme,
    long_description_content_type='text/markdown',
    entry_points={
        'console_scripts': [
            'kagentsys=kwaiagents.agent_start:main']
    },
    packages=find_packages(),
    license='Attribution-NonCommercial-ShareAlike 4.0',

    # Package info
    install_requires=requirements,
)
--------------------------------------------------------------------------------