├── LICENSE ├── README.md ├── benchmark_gaia.ipynb ├── benchmark_transformers_agents.ipynb ├── data └── gaia │ ├── .DS_Store │ ├── test │ ├── .DS_Store │ ├── 021a5339-744f-42b7-bd9b-9368b3efda7a.pdf │ ├── 021a5339-744f-42b7-bd9b-9368b3efda7a.png │ ├── 03c577c9-4227-48a9-9b75-f8f598de14c1.mp3 │ ├── 063800f6-8832-4856-972b-17b877612533.png │ ├── 07c3029f-7095-455d-a9e9-cd5e34001b38.json │ ├── 0c393561-dd13-4b7c-ac49-20ac469aa276.MOV │ ├── 171dd6d2-d1d4-439b-8d4e-7507018a816b.png │ ├── 198ffd8f-6041-458d-bacc-fe49872cfa43.txt │ ├── 23bcfab0-f47b-4dcb-8599-459c329ac153.mp3 │ ├── 2bb16c35-403a-4d4c-859e-a88ccd55f876.xml │ ├── 32f386b9-73ee-4455-b412-ddad508aa979.pdf │ ├── 32f386b9-73ee-4455-b412-ddad508aa979.png │ ├── 355b827f-fff0-4e0c-9ff0-65dea0609838.png │ ├── 355b827f-fff0-4e0c-9ff0-65dea0609838.xlsx │ ├── 3cc53dbf-1ab9-4d21-a56a-fc0151c10f89.xlsx │ ├── 4033181f-1988-476b-bc33-6da0f96d7bd0.png │ ├── 4033181f-1988-476b-bc33-6da0f96d7bd0.xlsx │ ├── 4044eab7-1282-42bd-a559-3bf3a4d5858e.pdf │ ├── 4044eab7-1282-42bd-a559-3bf3a4d5858e.png │ ├── 4cf4a5c1-7c9c-4cce-94cb-57b8be196244.png │ ├── 52e8ce1c-09bd-4537-8e2d-67d1648779b9.csv │ ├── 52e8ce1c-09bd-4537-8e2d-67d1648779b9.png │ ├── 56376d48-f456-4c24-a917-834be04c7608.png │ ├── 56376d48-f456-4c24-a917-834be04c7608.xlsx │ ├── 59b3cfb0-a06a-4ac5-b54e-81c9db8b0957.png │ ├── 59b3cfb0-a06a-4ac5-b54e-81c9db8b0957.xlsx │ ├── 5b89b147-cdab-40e1-be5b-819bc076c270.mp3 │ ├── 5bbf523f-b902-4d7d-8e8d-212d00018733.mp3 │ ├── 5f2b2e54-5047-4394-81be-198230c3b508.png │ ├── 5f2b2e54-5047-4394-81be-198230c3b508.xlsx │ ├── 634fca59-03b2-4cdf-9ce4-0205df22f256.pdf │ ├── 680d7d77-c0c7-49c8-88fd-f8ec623645e9.pdf │ ├── 680d7d77-c0c7-49c8-88fd-f8ec623645e9.png │ ├── 68ccf11a-bcd3-41e5-a5ee-3e29253449e9.docx │ ├── 70510d87-5004-4e4a-b078-21abf699dc12.txt │ ├── 7245af7c-404e-4d60-9ef4-94ed301e5315.jpg │ ├── 7674ee67-d671-462f-9e51-129944749a0a.png │ ├── 7707f3dd-1aa6-42f5-847a-b66f3eaf2ee4.png │ ├── 7805912b-c8da-4134-9b54-b590f884352a.csv │ ├── 
7805912b-c8da-4134-9b54-b590f884352a.png │ ├── 7a770333-8c1b-4008-b630-9d3cb4f0c171.txt │ ├── 7c215d46-91c7-424e-9f22-37d43ab73ea6.pdf │ ├── 86ca62df-b518-48e7-9115-1b0b800e5453.csv │ ├── 86ca62df-b518-48e7-9115-1b0b800e5453.png │ ├── 8b553092-3d44-4ab3-8d1e-932aabc1e143.png │ ├── 8f697523-6988-4c4f-8d72-760a45681f68.pdf │ ├── 91f2bf12-5280-4efc-b9a7-26e67ca850b4.jpg │ ├── 943255a6-8c56-4cf8-9faf-c74743960097.csv │ ├── 943255a6-8c56-4cf8-9faf-c74743960097.png │ ├── 985ec22e-546b-49fc-ab3c-af490fbefdf3.txt │ ├── 98efafc6-c376-4b53-be91-a130e1d90e02.jpg │ ├── 99b5ea36-0310-49c4-85d8-9ae83a96029a.m4a │ ├── 9ca9bc20-1a0b-4076-9937-7724e3491cf8.png │ ├── Capture d’écran 2024-04-29 à 15.19.50.zip │ ├── a0dcc222-691e-4b03-ac75-c4493991ab80.txt │ ├── aac4df0d-407a-45f2-add5-d9b31ebe1ddc.png │ ├── aac4df0d-407a-45f2-add5-d9b31ebe1ddc.xlsx │ ├── aea1ea38-dfd0-41ab-ad79-badc3c69c784.txt │ ├── afd1efe6-03dd-478c-9eb1-e562355ee94e.txt │ ├── b3654e47-4307-442c-a09c-945b33b913c6.pdf │ ├── b3654e47-4307-442c-a09c-945b33b913c6.png │ ├── b74b4ce7-4f03-42b5-b60e-62da7ffa282e.png │ ├── b74b4ce7-4f03-42b5-b60e-62da7ffa282e.xlsx │ ├── b7767ed5-20c7-4243-86b1-e8bd9a3d2a64.png │ ├── be353748-74eb-4904-8f17-f180ce087f1a.pdf │ ├── c4456885-2f03-436f-8fe9-0b4ca6822cdb.pdf │ ├── c4456885-2f03-436f-8fe9-0b4ca6822cdb.png │ ├── c68c0db6-1929-4194-8602-56dce5ddbd29.xml │ ├── c7003252-fc58-44bf-92f5-ec3991a49d00.png │ ├── c7003252-fc58-44bf-92f5-ec3991a49d00.xlsx │ ├── ca0a4c14-4b97-43e7-8923-539d61050ae3.pdf │ ├── ca0a4c14-4b97-43e7-8923-539d61050ae3.png │ ├── cbdb17dc-62d9-4463-b648-2eaacfeba4e5.png │ ├── cbdb17dc-62d9-4463-b648-2eaacfeba4e5.xlsx │ ├── cd886ddd-2d12-4347-9c7a-64774f66a3d3.txt │ ├── cfd773c8-371f-425c-b081-f254f96c0530.png │ ├── cfd773c8-371f-425c-b081-f254f96c0530.xlsx │ ├── d366cc70-86f1-4dca-bf12-440479c825fe.pptx │ ├── d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.pdf │ ├── d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.png │ ├── d6059b3e-e1da-43b4-ac26-ecad2984909b.csv │ ├── 
d62cbee6-47c7-4918-825d-3b73b1af7e85.png │ ├── d8434132-f196-4048-82c3-c06facff53c0.png │ ├── d8434132-f196-4048-82c3-c06facff53c0.xlsx │ ├── d89733a3-7d86-4ed8-b5a3-bf4831b06e3c.jpg │ ├── dbb02ff7-f947-491b-9ce2-41e3df16dbb8.txt │ ├── dd024dd9-8da6-4d4e-aee1-ed0d999035a9.txt │ ├── dfa03d6c-402b-43fc-9222-5738f8bdfd0c.txt │ ├── e14448e9-5243-4b07-86e1-22e657f96bcf.jpg │ ├── e51753c7-3ef3-4404-a352-11a18e5760c9.png │ ├── f1ba834a-3bcb-4e55-836c-06cc1e2ccb9f.txt │ ├── f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf.png │ ├── f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf.xlsx │ ├── f6d29ef1-0e4d-41cb-ac25-e60023b3bd96.png │ ├── f6d29ef1-0e4d-41cb-ac25-e60023b3bd96.xlsx │ ├── fcd80879-4f1d-49d8-b6d6-2993607432c2.png │ ├── fcd80879-4f1d-49d8-b6d6-2993607432c2.xlsx │ ├── fe8f4748-5d00-4a27-9070-090a0cfdeac4.png │ ├── fe8f4748-5d00-4a27-9070-090a0cfdeac4.xlsx │ └── metadata.jsonl │ └── validation │ ├── .DS_Store │ ├── 076c8171-9b3b-49b9-a477-244d2a532826.png │ ├── 076c8171-9b3b-49b9-a477-244d2a532826.xlsx │ ├── 1f975693-876d-457b-a649-393859e79bf3.mp3 │ ├── 2b3ef98c-cc05-450b-a719-711aee40ac65.mp3 │ ├── 32102e3e-d12a-4209-9163-7b3a104efe5d.png │ ├── 32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx │ ├── 366e2f2b-8632-4ef2-81eb-bc3877489217.pdf │ ├── 366e2f2b-8632-4ef2-81eb-bc3877489217.png │ ├── 389793a7-ca17-4e82-81cb-2b3a2391b4b9.png │ ├── 389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt │ ├── 3da89939-209c-4086-8520-7eb734e6b4ef.png │ ├── 3da89939-209c-4086-8520-7eb734e6b4ef.xlsx │ ├── 4d0aa727-86b1-406b-9b33-f870dd14a4a5.png │ ├── 4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx │ ├── 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.png │ ├── 4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx │ ├── 54612da3-fd56-4941-80f4-5eb82330de25.png │ ├── 54612da3-fd56-4941-80f4-5eb82330de25.xlsx │ ├── 5b2a14e8-6e59-479c-80e3-4696e8980152.jpg │ ├── 5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.png │ ├── 5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx │ ├── 6359a0b1-8f7b-499b-9336-840f9ab90688.png │ ├── 65afbc8a-89ca-4ad5-8d62-355bb401f61d.png │ ├── 
65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx │ ├── 67e8878b-5cef-4375-804e-e6291fdbe78a.pdf │ ├── 67e8878b-5cef-4375-804e-e6291fdbe78a.png │ ├── 7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx │ ├── 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.png │ ├── 7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx │ ├── 7dd30055-0198-452e-8c25-f73dbe27dcb8.pdb │ ├── 8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv │ ├── 8f80e01c-1296-4371-9486-bb3d68651a60.png │ ├── 9318445f-fe6a-4e1b-acbf-c68228c9906a.png │ ├── 99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 │ ├── 9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip │ ├── 9b54f9d9-35ee-4a14-b62f-d130ea00317f │ ├── CATEGORIES.xml │ └── food_duplicates.xls │ ├── a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx │ ├── b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg │ ├── b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png │ ├── bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld │ ├── bfcd99e1-0690-4b53-a85c-0174a8629083.zip │ ├── bfcd99e1-0690-4b53-a85c-0174a8629083 │ ├── Applicants.xlsx │ └── Job Listing.pdf │ ├── c526d8d6-5987-4da9-b24c-83466fa172f3.png │ ├── c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx │ ├── cca530fc-4052-43b2-b130-b30968d8aa44.png │ ├── cca70ce6-1952-45d2-acd4-80c903b0bc49.png │ ├── cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx │ ├── d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png │ ├── da52d699-e8d2-4dc5-9191-a2199e0b6a9b.png │ ├── da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx │ ├── df6561b2-7ee5-4540-baab-5095f742716a.png │ ├── e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf │ ├── e9a2c537-8232-4c3f-85b0-b52de6bcba99.png │ ├── edd4d4f2-1a58-45c4-b038-67337af4e029.png │ ├── edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx │ ├── f918266a-b3e0-4914-865d-4faa564f1aef.py │ └── metadata.jsonl ├── figures ├── .DS_Store ├── aggregate_errors.png ├── aggregate_score.png ├── aggregate_score_GPT-4-Turbo.png ├── aggregate_score_Llama3-70B-Instruct.png └── aggregate_score_vs_langchain.png ├── gaia.py └── scripts ├── .DS_Store ├── __pycache__ ├── agents.cpython-310.pyc ├── agents.cpython-311.pyc ├── browser.cpython-310.pyc ├── 
create_agents.cpython-310.pyc ├── evaluation.cpython-310.pyc ├── evaluation.cpython-311.pyc ├── gaia_scorer.cpython-310.pyc ├── gaia_scorer.cpython-311.pyc ├── gaia_scorer.cpython-39.pyc ├── mdconvert.cpython-310.pyc ├── modified_calculator.cpython-311.pyc ├── new_browser.cpython-310.pyc ├── optimize_prompt.cpython-311.pyc ├── prompts.cpython-310.pyc ├── prompts.cpython-311.pyc ├── python_evaluator.cpython-311.pyc ├── reformulator.cpython-310.pyc ├── run_agents.cpython-310.pyc ├── run_agents.cpython-311.pyc ├── run_agents.cpython-39.pyc ├── serpapi_browser.cpython-310.pyc ├── visual_qa.cpython-310.pyc ├── visual_qa.cpython-311.pyc ├── web_surfer.cpython-310.pyc ├── web_surfer.cpython-311.pyc └── web_surfer.cpython-39.pyc ├── create_agents.py ├── evaluation ├── __pycache__ │ ├── gaia_scorer.cpython-310.pyc │ └── unsolved_questions.cpython-310.pyc ├── evaluation.py ├── gaia_scorer.py ├── optimize_prompt.py └── unsolved_questions.py ├── experiments ├── calculator_tool.py ├── retriever_tool.py └── scene.py ├── reformulator.py ├── run_agents.py └── tools ├── __pycache__ ├── browser.cpython-310.pyc ├── cookies.cpython-310.pyc ├── mdconvert.cpython-310.pyc ├── serpapi_browser.cpython-310.pyc ├── visual_qa.cpython-310.pyc └── web_surfer.cpython-310.pyc ├── browser.py ├── cookies.py ├── mdconvert.py ├── visual_qa.py └── web_surfer.py /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 
14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Benchmark agent workflows: try the models of your choice on the framework that you want 2 | 3 | This repo is the engine for the evaluations displayed in our [Agents v2.0](https://huggingface.co/blog/agents) announcement post. 4 | 5 | You can use it to test agents on different frameworks: 6 | - [LangChain](https://github.com/langchain-ai/langchain) 7 | - [Transformers agents](https://huggingface.co/docs/transformers/en/transformers_agents) 8 | 9 | 10 | On different benchmarks: 11 | - [GAIA](https://huggingface.co/papers/2311.12983) 12 | - our [custom agent reasoning benchmark](https://huggingface.co/datasets/m-ric/agents_small_benchmark) that includes tasks from GSM8K, HotpotQA and GAIA 13 | 14 | And with different models (see the benchmark below). 15 | 16 | We also implement LLM-judge evaluation, with parallel processing for faster results. 
17 | 18 | ![benchmark](figures/aggregate_score_vs_langchain.png) 19 | 20 | -------------------------------------------------------------------------------- /data/gaia/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/.DS_Store -------------------------------------------------------------------------------- /data/gaia/test/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/.DS_Store -------------------------------------------------------------------------------- /data/gaia/test/021a5339-744f-42b7-bd9b-9368b3efda7a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/021a5339-744f-42b7-bd9b-9368b3efda7a.pdf -------------------------------------------------------------------------------- /data/gaia/test/021a5339-744f-42b7-bd9b-9368b3efda7a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/021a5339-744f-42b7-bd9b-9368b3efda7a.png -------------------------------------------------------------------------------- /data/gaia/test/03c577c9-4227-48a9-9b75-f8f598de14c1.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/03c577c9-4227-48a9-9b75-f8f598de14c1.mp3 -------------------------------------------------------------------------------- 
/data/gaia/test/063800f6-8832-4856-972b-17b877612533.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/063800f6-8832-4856-972b-17b877612533.png -------------------------------------------------------------------------------- /data/gaia/test/0c393561-dd13-4b7c-ac49-20ac469aa276.MOV: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/0c393561-dd13-4b7c-ac49-20ac469aa276.MOV -------------------------------------------------------------------------------- /data/gaia/test/171dd6d2-d1d4-439b-8d4e-7507018a816b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/171dd6d2-d1d4-439b-8d4e-7507018a816b.png -------------------------------------------------------------------------------- /data/gaia/test/23bcfab0-f47b-4dcb-8599-459c329ac153.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/23bcfab0-f47b-4dcb-8599-459c329ac153.mp3 -------------------------------------------------------------------------------- /data/gaia/test/32f386b9-73ee-4455-b412-ddad508aa979.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/32f386b9-73ee-4455-b412-ddad508aa979.pdf -------------------------------------------------------------------------------- 
/data/gaia/test/32f386b9-73ee-4455-b412-ddad508aa979.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/32f386b9-73ee-4455-b412-ddad508aa979.png -------------------------------------------------------------------------------- /data/gaia/test/355b827f-fff0-4e0c-9ff0-65dea0609838.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/355b827f-fff0-4e0c-9ff0-65dea0609838.png -------------------------------------------------------------------------------- /data/gaia/test/355b827f-fff0-4e0c-9ff0-65dea0609838.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/355b827f-fff0-4e0c-9ff0-65dea0609838.xlsx -------------------------------------------------------------------------------- /data/gaia/test/3cc53dbf-1ab9-4d21-a56a-fc0151c10f89.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/3cc53dbf-1ab9-4d21-a56a-fc0151c10f89.xlsx -------------------------------------------------------------------------------- /data/gaia/test/4033181f-1988-476b-bc33-6da0f96d7bd0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/4033181f-1988-476b-bc33-6da0f96d7bd0.png -------------------------------------------------------------------------------- 
/data/gaia/test/4033181f-1988-476b-bc33-6da0f96d7bd0.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/4033181f-1988-476b-bc33-6da0f96d7bd0.xlsx -------------------------------------------------------------------------------- /data/gaia/test/4044eab7-1282-42bd-a559-3bf3a4d5858e.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/4044eab7-1282-42bd-a559-3bf3a4d5858e.pdf -------------------------------------------------------------------------------- /data/gaia/test/4044eab7-1282-42bd-a559-3bf3a4d5858e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/4044eab7-1282-42bd-a559-3bf3a4d5858e.png -------------------------------------------------------------------------------- /data/gaia/test/4cf4a5c1-7c9c-4cce-94cb-57b8be196244.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/4cf4a5c1-7c9c-4cce-94cb-57b8be196244.png -------------------------------------------------------------------------------- /data/gaia/test/52e8ce1c-09bd-4537-8e2d-67d1648779b9.csv: -------------------------------------------------------------------------------- 1 | ,Brooklyn,Bronx,Queens,Staten Island,Manhattan 2 | January,66,22,31,23,10 3 | February,12,30,12,45,10 4 | March,7,24,34,54,1 5 | April,50,41,65,35,43 6 | May,19,55,76,96,85 7 | June,4,42,32,12,22 8 | July,42,40,11,4,20 9 | August,8,12,41,12,45 10 | September,12,8,34,54,12 11 | 
October,50,1,22,13,12 12 | November,41,34,64,7,51 13 | December,51,23,31,24,61 -------------------------------------------------------------------------------- /data/gaia/test/52e8ce1c-09bd-4537-8e2d-67d1648779b9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/52e8ce1c-09bd-4537-8e2d-67d1648779b9.png -------------------------------------------------------------------------------- /data/gaia/test/56376d48-f456-4c24-a917-834be04c7608.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/56376d48-f456-4c24-a917-834be04c7608.png -------------------------------------------------------------------------------- /data/gaia/test/56376d48-f456-4c24-a917-834be04c7608.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/56376d48-f456-4c24-a917-834be04c7608.xlsx -------------------------------------------------------------------------------- /data/gaia/test/59b3cfb0-a06a-4ac5-b54e-81c9db8b0957.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/59b3cfb0-a06a-4ac5-b54e-81c9db8b0957.png -------------------------------------------------------------------------------- /data/gaia/test/59b3cfb0-a06a-4ac5-b54e-81c9db8b0957.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/59b3cfb0-a06a-4ac5-b54e-81c9db8b0957.xlsx -------------------------------------------------------------------------------- /data/gaia/test/5b89b147-cdab-40e1-be5b-819bc076c270.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/5b89b147-cdab-40e1-be5b-819bc076c270.mp3 -------------------------------------------------------------------------------- /data/gaia/test/5bbf523f-b902-4d7d-8e8d-212d00018733.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/5bbf523f-b902-4d7d-8e8d-212d00018733.mp3 -------------------------------------------------------------------------------- /data/gaia/test/5f2b2e54-5047-4394-81be-198230c3b508.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/5f2b2e54-5047-4394-81be-198230c3b508.png -------------------------------------------------------------------------------- /data/gaia/test/5f2b2e54-5047-4394-81be-198230c3b508.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/5f2b2e54-5047-4394-81be-198230c3b508.xlsx -------------------------------------------------------------------------------- /data/gaia/test/634fca59-03b2-4cdf-9ce4-0205df22f256.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/634fca59-03b2-4cdf-9ce4-0205df22f256.pdf -------------------------------------------------------------------------------- /data/gaia/test/680d7d77-c0c7-49c8-88fd-f8ec623645e9.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/680d7d77-c0c7-49c8-88fd-f8ec623645e9.pdf -------------------------------------------------------------------------------- /data/gaia/test/680d7d77-c0c7-49c8-88fd-f8ec623645e9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/680d7d77-c0c7-49c8-88fd-f8ec623645e9.png -------------------------------------------------------------------------------- /data/gaia/test/68ccf11a-bcd3-41e5-a5ee-3e29253449e9.docx: -------------------------------------------------------------------------------- 1 | 1. Place the first letter of the third word on a line to begin a new word. 2 | 2. Add the first and last letters of the last word before and after the new word, respectively. 3 | 3. Place the last letter of the fourth word at the beginning of the new word. 4 | 4. Add the new word’s reverse to its end to make a palindrome. 5 | 5. Remove letters 3, 4, 7, and 8 from the new word. 6 | 6. Add the fourth letter from the end of the line, with the letter turned upside-down, to the beginning of the new word. 7 | 7. Reverse the new word. 8 | 8. Add the last two letters of the third word after the comma to the new word’s end. 9 | 9. Remove the repeated consonant that’s not on the end of the new word. 10 | 10. Put the new word in upper case. 
-------------------------------------------------------------------------------- /data/gaia/test/7245af7c-404e-4d60-9ef4-94ed301e5315.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/7245af7c-404e-4d60-9ef4-94ed301e5315.jpg -------------------------------------------------------------------------------- /data/gaia/test/7674ee67-d671-462f-9e51-129944749a0a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/7674ee67-d671-462f-9e51-129944749a0a.png -------------------------------------------------------------------------------- /data/gaia/test/7707f3dd-1aa6-42f5-847a-b66f3eaf2ee4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/7707f3dd-1aa6-42f5-847a-b66f3eaf2ee4.png -------------------------------------------------------------------------------- /data/gaia/test/7805912b-c8da-4134-9b54-b590f884352a.csv: -------------------------------------------------------------------------------- 1 | ,Brooklyn,Bronx,Queens,Staten Island,Manhattan 2 | January,66,22,31,23,10 3 | February,12,30,12,45,10 4 | March,7,24,34,54,1 5 | April,50,41,65,35,43 6 | May,19,55,76,96,85 7 | June,4,42,32,12,22 8 | July,42,40,11,4,20 9 | August,8,12,41,12,45 10 | September,12,8,34,54,12 11 | October,50,1,22,13,12 12 | November,41,34,64,7,51 13 | December,51,23,31,24,61 -------------------------------------------------------------------------------- /data/gaia/test/7805912b-c8da-4134-9b54-b590f884352a.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/7805912b-c8da-4134-9b54-b590f884352a.png -------------------------------------------------------------------------------- /data/gaia/test/7c215d46-91c7-424e-9f22-37d43ab73ea6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/7c215d46-91c7-424e-9f22-37d43ab73ea6.pdf -------------------------------------------------------------------------------- /data/gaia/test/86ca62df-b518-48e7-9115-1b0b800e5453.csv: -------------------------------------------------------------------------------- 1 | ,Brooklyn,Bronx,Queens,Staten Island,Manhattan 2 | January,66,22,31,23,10 3 | February,12,30,12,45,10 4 | March,7,24,34,54,1 5 | April,50,41,65,35,43 6 | May,19,55,76,96,85 7 | June,4,42,32,12,22 8 | July,42,40,11,4,20 9 | August,8,12,41,12,45 10 | September,12,8,34,54,12 11 | October,50,1,22,13,12 12 | November,41,34,64,7,51 13 | December,51,23,31,24,61 -------------------------------------------------------------------------------- /data/gaia/test/86ca62df-b518-48e7-9115-1b0b800e5453.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/86ca62df-b518-48e7-9115-1b0b800e5453.png -------------------------------------------------------------------------------- /data/gaia/test/8b553092-3d44-4ab3-8d1e-932aabc1e143.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/8b553092-3d44-4ab3-8d1e-932aabc1e143.png 
-------------------------------------------------------------------------------- /data/gaia/test/8f697523-6988-4c4f-8d72-760a45681f68.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/8f697523-6988-4c4f-8d72-760a45681f68.pdf -------------------------------------------------------------------------------- /data/gaia/test/91f2bf12-5280-4efc-b9a7-26e67ca850b4.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/91f2bf12-5280-4efc-b9a7-26e67ca850b4.jpg -------------------------------------------------------------------------------- /data/gaia/test/943255a6-8c56-4cf8-9faf-c74743960097.csv: -------------------------------------------------------------------------------- 1 | AtomicNumber,Element,Symbol,AtomicMass,NumberofNeutrons,NumberofProtons,NumberofElectrons,Period,Group,Phase,Radioactive,Natural,Nonmetal,Metalloid,Type,AtomicRadius,Electronegativity,FirstIonization,Density,MeltingPoint,BoilingPoint,NumberOfIsotopes,Discoverer,Year,SpecificHeat,NumberofShells,NumberofValence, 2 | 40,Zirconium,Zr,91.224,51,40,40,5,4,solid,,yes,yes,,,Transition Metal,2.2,1.33,6.6339,6.51E+00,2125.15,4682,20,Klaproth,1789,0.278,5, 3 | 30,Zinc,Zn,65.38,35,30,30,4,12,solid,,yes,yes,,,Transition Metal,1.5,1.65,9.3942,7.13E+00,692.88,1180,15,Prehistoric,,0.388,4, 4 | 39,Yttrium,Y,88.906,50,39,39,5,3,solid,,yes,yes,,,Transition Metal,2.3,1.22,6.2173,4.47E+00,1799.15,3609,21,Gadolin,1794,0.298,5, 5 | 70,Ytterbium,Yb,173.054,103,70,70,6,,solid,,yes,yes,,,Lanthanide,2.4,1.1,6.2542,6.97E+00,1097.15,1469,16,Marignac,1878,0.155,6, 6 | 54,Xenon,Xe,131.293,77,54,54,5,18,gas,,yes,,yes,,Noble Gas,1.2,,12.1298,5.89E-03,161.45,165.03,31,Ramsay and Travers,1898,0.158,5,8 7 | 
74,Wolfram,W,183.84,110,74,74,6,6,solid,,yes,yes,,,Transition Metal,2,2.36,7.864,1.93E+01,3680.15,5828,22,J. and F. d'Elhuyar,1783,0.132,6, 8 | 23,Vanadium,V,50.942,28,23,23,4,5,solid,,yes,yes,,,Transition Metal,1.9,1.63,6.7462,6.11E+00,2175.15,3680,9,del Rio,1801,0.489,4, 9 | 92,Uranium,U,238.029,146,92,92,7,,solid,yes,yes,yes,,,Actinide,,1.38,6.1941,1.90E+01,1405.15,4404,15,Peligot,1841,0.116,7, 10 | 22,Titanium,Ti,47.867,26,22,22,4,4,solid,,yes,yes,,,Transition Metal,2,1.54,6.8281,4.54E+00,1933.15,3560,9,Gregor,1791,0.523,4, 11 | 50,Tin,Sn,118.71,69,50,50,5,14,solid,,yes,yes,,,Metal,1.7,1.96,7.3439,7.29E+00,505.21,2875,28,Prehistoric,,0.228,5,4 12 | 69,Thulium,Tm,168.934,100,69,69,6,,solid,,yes,yes,,,Lanthanide,2.4,1.25,6.1843,9.32E+00,1818.15,2223,18,Cleve,1879,0.16,6, 13 | 90,Thorium,Th,232.038,142,90,90,7,,solid,yes,yes,yes,,,Actinide,,1.3,6.3067,1.17E+01,2028.15,5061,12,Berzelius,1828,0.113,7, 14 | 81,Thallium,Tl,204.383,123,81,81,6,13,solid,,yes,yes,,,Metal,2.1,2.04,6.1082,1.19E+01,577.15,1746,28,Crookes,1861,0.129,6,3 15 | 65,Terbium,Tb,158.925,94,65,65,6,,solid,,yes,yes,,,Lanthanide,2.5,1.2,5.8638,8.23E+00,1630.15,3503,24,Mosander,1843,0.182,6, 16 | 117,Tennessine,Ts,295,178,117,117,7,17,artificial,yes,,,yes,,,,,,,,,,,2010,,7,7 17 | 52,Tellurium,Te,127.6,76,52,52,5,16,solid,,yes,,,yes,Metalloid,1.4,2.1,9.0096,6.23E+00,722.8,1261,29,von Reichenstein,1782,0.202,5,6 18 | 43,Technetium,Tc,98,55,43,43,5,7,artificial,yes,,yes,,,Transition Metal,2,1.9,7.28,1.15E+01,2473.15,5150,23,Perrier and Segr�,1937,,5, 19 | 73,Tantalum,Ta,180.948,108,73,73,6,5,solid,,yes,yes,,,Transition Metal,2.1,1.5,7.5496,1.67E+01,3269.15,5731,19,Ekeberg,1801,0.14,6, 20 | 16,Sulfur,S,32.065,16,16,16,3,16,solid,,yes,,yes,,Nonmetal,1.1,2.58,10.36,2.07E+00,388.51,717.8,10,Prehistoric,,0.71,3,6 21 | 38,Strontium,Sr,87.62,50,38,38,5,2,solid,,yes,yes,,,Alkaline Earth Metal,2.5,0.95,5.6949,2.64E+00,1042.15,1655,18,Davy,1808,0.301,5,2 22 | 11,Sodium,Na,22.99,12,11,11,3,1,solid,,yes,yes,,,Alkali 
Metal,2.2,0.93,5.1391,9.71E-01,371.15,1156,7,Davy,1807,1.228,3,1 23 | 47,Silver,Ag,107.868,61,47,47,5,11,solid,,yes,yes,,,Transition Metal,1.8,1.93,7.5762,1.05E+01,1234.15,2435,27,Prehistoric,,0.235,5, 24 | 14,Silicon,Si,28.086,14,14,14,3,14,solid,,yes,,,yes,Metalloid,1.5,1.9,8.1517,2.33E+00,1683.15,3538,8,Berzelius,1824,0.705,3,4 25 | 34,Selenium,Se,78.96,45,34,34,4,16,solid,,yes,,yes,,Nonmetal,1.2,2.55,9.7524,4.81E+00,494.15,958,20,Berzelius,1817,0.321,4,6 26 | 106,Seaborgium,Sg,266,160,106,106,7,6,artificial,yes,,yes,,,Transactinide,,,,3.50E+01,,,,Ghiorso et al.,1974,,7, 27 | 21,Scandium,Sc,44.956,24,21,21,4,3,solid,,yes,yes,,,Transition Metal,2.1,1.36,6.5615,2.99E+00,1812.15,3109,15,Nilson,1878,0.568,4, 28 | 62,Samarium,Sm,150.36,88,62,62,6,,solid,,yes,yes,,,Lanthanide,2.6,1.17,5.6437,7.52E+00,1345.15,2067,17,Boisbaudran,1879,0.197,6, 29 | 104,Rutherfordium,Rf,261,157,104,104,7,4,artificial,yes,,yes,,,Transactinide,,,,1.81E+01,,,,Ghiorso et al.,1969,,7, 30 | 44,Ruthenium,Ru,101.07,57,44,44,5,8,solid,,yes,yes,,,Transition Metal,1.9,2.2,7.3605,1.24E+01,2523.15,4423,16,Klaus,1844,0.238,5, 31 | 37,Rubidium,Rb,85.468,48,37,37,5,1,solid,,yes,yes,,,Alkali Metal,3,0.82,4.1771,1.53E+00,312.79,961,20,Bunsen and Kirchoff,1861,0.363,5,1 32 | 111,Roentgenium,Rg,272,161,111,111,7,11,artificial,yes,,yes,,,Transactinide,,,,,,,,,1994,,7, 33 | 45,Rhodium,Rh,102.906,58,45,45,5,9,solid,,yes,yes,,,Transition Metal,1.8,2.28,7.4589,1.24E+01,2239.15,3968,20,Wollaston,1803,0.243,5, 34 | 75,Rhenium,Re,186.207,111,75,75,6,7,solid,,yes,yes,,,Transition Metal,2,1.9,7.8335,2.10E+01,3453.15,5869,21,"Noddack, Berg, and Tacke",1925,0.137,6, 35 | 86,Radon,Rn,222,136,86,86,6,18,gas,yes,yes,yes,,,Alkali Metal,1.3,,10.7485,9.73E-03,202.15,211.3,20,Dorn,1900,0.094,6,8 36 | 88,Radium,Ra,226,138,88,88,7,2,solid,yes,yes,yes,,,Actinide,,0.9,5.2784,5.50E+00,973.15,2010,15,Pierre and Marie Curie,1898,,7,2 37 | 
91,Protactinium,Pa,231.036,140,91,91,7,,solid,yes,yes,yes,,,Actinide,,1.5,5.89,1.54E+01,1873.15,4300,14,Hahn and Meitner,1917,,7, 38 | 61,Promethium,Pm,145,84,61,61,6,,artificial,yes,,yes,,,Lanthanide,2.6,1.13,5.582,7.26E+00,1204.15,3273,14,Marinsky et al.,1945,,6, 39 | 59,Praseodymium,Pr,140.908,82,59,59,6,,solid,,yes,yes,,,Lanthanide,2.7,1.13,5.473,6.77E+00,1204.15,3793,15,von Welsbach,1885,0.193,6, 40 | 19,Potassium,K,39.098,20,19,19,4,1,solid,,yes,yes,,,Alkali Metal,2.8,0.82,4.3407,8.62E-01,336.5,1032,10,Davy,1807,0.757,4,1 41 | 84,Polonium,Po,210,126,84,84,6,16,solid,yes,yes,,,yes,Metalloid,1.5,2,8.417,9.32E+00,527.15,1235,34,Curie,1898,,6,6 42 | 94,Plutonium,Pu,244,150,94,94,7,,artificial,yes,,yes,,,Actinide,,1.28,6.0262,1.98E+01,913.15,3501,163,Seaborg et al.,1940,,7, 43 | 78,Platinum,Pt,195.084,117,78,78,6,10,solid,,yes,yes,,,Transition Metal,1.8,2.28,8.9587,2.15E+01,2045.15,4098,32,Ulloa/Wood,1735,0.133,6, 44 | 15,Phosphorus,P,30.974,16,15,15,3,15,solid,,yes,,yes,,Nonmetal,1.2,2.19,10.4867,1.82E+00,317.25,553,7,BranBrand,1669,0.769,3,5 45 | 46,Palladium,Pd,106.42,60,46,46,5,10,solid,,yes,yes,,,Transition Metal,1.8,2.2,8.3369,1.20E+01,1825.15,3236,21,Wollaston,1803,0.244,5, 46 | 8,Oxygen,O,15.999,8,8,8,2,16,gas,,yes,,yes,,Nonmetal,0.65,3.44,13.6181,1.43E-03,50.5,90.2,8,Priestley/Scheele,1774,0.918,2,6 47 | 76,Osmium,Os,190.23,114,76,76,6,8,solid,,yes,yes,,,Transition Metal,1.9,2.2,8.4382,2.26E+01,3300.15,5285,19,Tennant,1803,0.13,6, 48 | 118,Oganesson,Og,294,176,118,118,7,18,artificial,yes,,,yes,,Noble Gas,,,,,,,,,2006,,, 49 | 102,Nobelium,No,259,157,102,102,7,,artificial,yes,,yes,,,Actinide,,1.3,6.65,,,,73,Ghiorso et al.,1958,,7, 50 | 7,Nitrogen,N,14.007,7,7,7,2,15,gas,,yes,,yes,,Nonmetal,0.75,3.04,14.5341,1.25E-03,63.29,77.36,8,Rutherford,1772,1.04,2,5 51 | 41,Niobium,Nb,92.906,52,41,41,5,5,solid,,yes,yes,,,Transition Metal,2.1,1.6,6.7589,8.57E+00,2741.15,5017,24,Hatchett,1801,0.265,5, 52 | 
113,Nihonium,Nh,284,171,113,113,7,13,artificial,yes,,yes,,,,,,,,,,,,2004,,7,3 53 | 28,Nickel,Ni,58.693,31,28,28,4,10,solid,,yes,yes,,,Transition Metal,1.6,1.91,7.6398,8.91E+00,1726.15,3186,11,Cronstedt,1751,0.444,4, 54 | 93,Neptunium,Np,237,144,93,93,7,,artificial,yes,,yes,,,Actinide,,1.36,6.2657,2.05E+01,913.15,4273,153,McMillan and Abelson,1940,,7, 55 | 10,Neon,Ne,20.18,10,10,10,2,18,gas,,yes,,yes,,Noble Gas,0.51,,21.5645,9.00E-04,24.703,27.07,8,Ramsay and Travers,1898,1.03,2,8 56 | 60,Neodymium,Nd,144.242,84,60,60,6,,solid,,yes,yes,,,Lanthanide,2.6,1.14,5.525,7.01E+00,1289.15,3347,16,von Welsbach,1885,0.19,6, 57 | 115,Moscovium,Mc,288,173,115,115,7,15,artificial,yes,,yes,,,,,,,,,,,,2010,,7,5 58 | 42,Molybdenum,Mo,95.96,54,42,42,5,6,solid,,yes,yes,,,Transition Metal,2,2.16,7.0924,1.02E+01,2890.15,4912,20,Scheele,1778,0.251,5, 59 | 80,Mercury,Hg,200.59,121,80,80,6,12,liq,,yes,yes,,,Transition Metal,1.8,2,10.4375,1.35E+01,234.43,630,26,Prehistoric,,0.14,6, 60 | 101,Mendelevium,Md,258,157,101,101,7,,artificial,yes,,yes,,,Actinide,,1.3,6.58,,,,33,Ghiorso et al.,1955,,7, 61 | 109,Meitnerium,Mt,268,159,109,109,7,9,artificial,yes,,yes,,,Transactinide,,,,3.50E+01,,,,"GSI, Darmstadt, West Germany",1982,,7, 62 | 25,Manganese,Mn,54.938,30,25,25,4,7,solid,,yes,yes,,,Transition Metal,1.8,1.55,7.434,7.44E+00,1519.15,2334,11,"Gahn, Scheele",1774,0.479,4, 63 | 12,Magnesium,Mg,24.305,12,12,12,3,2,solid,,yes,yes,,,Alkaline Earth Metal,1.7,1.31,7.6462,1.74E+00,923.15,1363,8,Black,1755,1.023,3,2 64 | 71,Lutetium,Lu,174.967,104,71,71,6,,solid,,yes,yes,,,Lanthanide,2.3,1.27,5.4259,9.84E+00,1936.15,3675,22,Urbain/ von Welsbach,1907,0.154,6, 65 | 116,Livermorium,Lv,292,176,116,116,7,16,artificial,yes,,yes,,,Transactinide,,,,,,,,,2000,,7,6 66 | 3,Lithium,Li,6.941,4,3,3,2,1,solid,,yes,yes,,,Alkali Metal,2.1,0.98,5.3917,5.34E-01,453.85,1615,5,Arfvedson,1817,3.582,2,1 67 | 82,Lead,Pb,207.2,125,82,82,6,14,solid,,yes,yes,,,Metal,1.8,2.33,7.4167,1.13E+01,600.75,2022,29,Prehistoric,,0.129,6,4 
68 | 103,Lawrencium,Lr,262,159,103,103,7,,artificial,yes,,yes,,,Actinide,,,,,,,203,Ghiorso et al.,1961,,7, 69 | 57,Lanthanum,La,138.905,82,57,57,6,3,solid,,yes,yes,,,Lanthanide,2.7,1.1,5.5769,6.15E+00,1193.15,3737,19,Mosander,1839,0.195,6, 70 | 36,Krypton,Kr,83.798,48,36,36,4,18,gas,,yes,,yes,,Noble Gas,1,,13.9996,3.73E-03,115.93,119.93,23,Ramsay and Travers,1898,0.248,4,8 71 | 26,Iron,Fe,55.845,30,26,26,4,8,solid,,yes,yes,,,Transition Metal,1.7,1.83,7.9024,7.87E+00,1808.15,3134,10,Prehistoric,,0.449,4, 72 | 77,Iridium,Ir,192.217,115,77,77,6,9,solid,,yes,yes,,,Transition Metal,1.9,2.2,8.967,2.26E+01,2716.15,4701,25,Tennant,1804,0.131,6, 73 | 53,Iodine,I,126.904,74,53,53,5,17,solid,,yes,,yes,,Halogen,1.3,2.66,10.4513,4.93E+00,386.65,457.4,24,Courtois,1811,0.214,5,7 74 | 49,Indium,In,114.818,66,49,49,5,13,solid,,yes,yes,,,Metal,2,1.78,5.7864,7.31E+00,429.91,2345,34,Reich and Richter,1863,0.233,5,3 75 | 1,Hydrogen,H,1.007,0,1,1,1,1,gas,,yes,,yes,,Nonmetal,0.79,2.2,13.5984,8.99E-05,14.175,20.28,3,Cavendish,1766,14.304,1,1 76 | 67,Holmium,Ho,164.93,98,67,67,6,,solid,,yes,yes,,,Lanthanide,2.5,1.23,6.0215,8.80E+00,1743.15,2993,29,Delafontaine and Soret,1878,0.165,6, 77 | 2,Helium,He,4.002,2,2,2,1,18,gas,,yes,,yes,,Noble Gas,0.49,,24.5874,1.79E-04,,4.22,5,Janssen,1868,5.193,1, 78 | 108,Hassium,Hs,267,159,108,108,7,8,artificial,yes,,yes,,,Transactinide,,,,4.10E+01,,,,Armbruster and M�nzenberg,1983,,7, 79 | 72,Hafnium,Hf,178.49,106,72,72,6,4,solid,,yes,yes,,,Transition Metal,2.2,1.3,6.8251,1.33E+01,2500.15,4876,17,Coster and von Hevesy,1923,0.144,6, 80 | 79,Gold,Au,196.967,118,79,79,6,11,solid,,yes,yes,,,Transition Metal,1.8,2.54,9.2255,1.93E+01,1337.73,3129,21,Prehistoric,,0.129,6, 81 | 32,Germanium,Ge,72.64,41,32,32,4,14,solid,,yes,,,yes,Metalloid,1.5,2.01,7.8994,5.32E+00,1211.45,3106,17,Winkler,1886,0.32,4,4 82 | 31,Gallium,Ga,69.723,39,31,31,4,13,solid,,yes,yes,,,Metal,1.8,1.81,5.9993,5.91E+00,302.91,2477,14,de Boisbaudran,1875,0.371,4,3 83 | 
64,Gadolinium,Gd,157.25,93,64,64,6,,solid,,yes,yes,,,Lanthanide,2.5,1.2,6.1501,7.90E+00,1585.15,3546,17,de Marignac,1880,0.236,6, 84 | 87,Francium,Fr,223,136,87,87,7,1,solid,yes,yes,yes,,,Alkaline Earth Metal,,0.7,4.0727,1.87E+00,300.15,950,21,Perey,1939,,7,1 85 | 9,Fluorine,F,18.998,10,9,9,2,17,gas,,yes,,yes,,Halogen,0.57,3.98,17.4228,1.70E-03,53.63,85.03,6,Moissan,1886,0.824,2,7 86 | 114,Flerovium,Fl,289,175,114,114,7,14,artificial,yes,,yes,,,Transactinide,,,,,,,,,1999,,7,4 87 | 100,Fermium,Fm,257,157,100,100,7,,artificial,yes,,yes,,,Actinide,,1.3,6.5,,,,103,Ghiorso et al.,1953,,7, 88 | 63,Europium,Eu,151.964,89,63,63,6,,solid,,yes,yes,,,Lanthanide,2.6,1.2,5.6704,5.24E+00,1095.15,1802,21,Demarcay,1901,0.182,6, 89 | 68,Erbium,Er,167.259,99,68,68,6,,solid,,yes,yes,,,Lanthanide,2.5,1.24,6.1077,9.07E+00,1795.15,3503,16,Mosander,1843,0.168,6, 90 | 99,Einsteinium,Es,252,153,99,99,7,,artificial,yes,,yes,,,Actinide,,1.3,6.42,1.35E+01,1133.15,,123,Ghiorso et al.,1952,,7, 91 | 66,Dysprosium,Dy,162.5,97,66,66,6,,solid,,yes,yes,,,Lanthanide,2.5,1.22,5.9389,8.55E+00,1680.15,2840,21,de Boisbaudran,1886,0.17,6, 92 | 105,Dubnium,Db,262,157,105,105,7,5,artificial,yes,,yes,,,Transactinide,,,,3.90E+01,,,,Ghiorso et al.,1970,,7, 93 | 110,Darmstadtium,Ds,271,161,110,110,7,10,artificial,yes,,yes,,,Transactinide,,,,,,,,,1994,,7, 94 | 96,Curium,Cm,247,151,96,96,7,,artificial,yes,,yes,,,Actinide,,1.3,5.9915,1.35E+01,1340.15,3383,133,Seaborg et al.,1944,,7, 95 | 29,Copper,Cu,63.546,35,29,29,4,11,solid,,yes,yes,,,Transition Metal,1.6,1.9,7.7264,8.96E+00,1357.75,2835,11,Prehistoric,,0.385,4, 96 | 112,Copernicium,Cn,285,173,112,112,7,12,artificial,yes,,yes,,,Transactinide,,,,,,,,,1996,,7, 97 | 27,Cobalt,Co,58.933,32,27,27,4,9,solid,,yes,yes,,,Transition Metal,1.7,1.88,7.881,8.86E+00,1768.15,3200,14,Brandt,1735,0.421,4, 98 | 24,Chromium,Cr,51.996,28,24,24,4,6,solid,,yes,yes,,,Transition Metal,1.9,1.66,6.7665,7.15E+00,2130.15,2944,9,Vauquelin,1797,0.449,4, 99 | 
17,Chlorine,Cl,35.453,18,17,17,3,17,gas,,yes,,yes,,Halogen,0.97,3.16,12.9676,3.21E-03,172.31,239.11,11,Scheele,1774,0.479,3,7 100 | 55,Cesium,Cs,132.905,78,55,55,6,1,solid,,yes,yes,,,Alkali Metal,3.3,0.79,3.8939,1.87E+00,301.7,944,22,Bunsen and Kirchoff,1860,0.242,6,1 101 | 58,Cerium,Ce,140.116,82,58,58,6,,solid,,yes,yes,,,Lanthanide,2.7,1.12,5.5387,6.77E+00,1071.15,3716,19,Berzelius,1803,0.192,6, 102 | 6,Carbon,C,12.011,6,6,6,2,14,solid,,yes,,yes,,Nonmetal,0.91,2.55,11.2603,2.27E+00,3948.15,4300,7,Prehistoric,,0.709,2,4 103 | 98,Californium,Cf,251,153,98,98,7,,artificial,yes,,yes,,,Actinide,,1.3,6.2817,1.51E+01,1925.15,1173,123,Seaborg et al.,1950,,7, 104 | 20,Calcium,Ca,40.078,20,20,20,4,2,solid,,yes,yes,,,Alkaline Earth Metal,2.2,1,6.1132,1.54E+00,1112.15,1757,14,Davy,1808,0.647,4,2 105 | 48,Cadmium,Cd,112.411,64,48,48,5,12,solid,,yes,yes,,,Transition Metal,1.7,1.69,8.9938,8.69E+00,594.33,1040,22,Stromeyer,1817,0.232,5, 106 | 35,Bromine,Br,79.904,45,35,35,4,17,liq,,yes,,yes,,Halogen,1.1,2.96,11.8138,3.12E+00,266.05,332,19,Balard,1826,0.474,4,7 107 | 5,Boron,B,10.811,6,5,5,2,13,solid,,yes,,,yes,Metalloid,1.2,2.04,8.298,2.34E+00,2573.15,4200,6,Gay-Lussac,1808,1.026,2,3 108 | 107,Bohrium,Bh,264,157,107,107,7,7,artificial,yes,,yes,,,Transactinide,,,,3.70E+01,,,,Armbruster and M�nzenberg,1981,,7, 109 | 83,Bismuth,Bi,208.98,126,83,83,6,15,solid,,yes,yes,,,Metal,1.6,2.02,7.2856,9.81E+00,544.67,1837,19,Geoffroy the Younger,1753,0.122,6,5 110 | 4,Beryllium,Be,9.012,5,4,4,2,2,solid,,yes,yes,,,Alkaline Earth Metal,1.4,1.57,9.3227,1.85E+00,1560.15,2742,6,Vaulquelin,1798,1.825,2,2 111 | 97,Berkelium,Bk,247,150,97,97,7,,artificial,yes,,yes,,,Actinide,,1.3,6.1979,1.48E+01,1259.15,983,83,Seaborg et al.,1949,,7, 112 | 56,Barium,Ba,137.327,81,56,56,6,2,solid,,yes,yes,,,Alkaline Earth Metal,2.8,0.89,5.2117,3.59E+00,1002.15,2170,25,Davy,1808,0.204,6,2 113 | 85,Astatine,At,210,125,85,85,6,17,solid,yes,yes,,yes,,Noble Gas,1.4,2.2,9.3,7.00E+00,575.15,610,21,Corson et al.,1940,,6,7 114 
| 33,Arsenic,As,74.922,42,33,33,4,15,solid,,yes,,,yes,Metalloid,1.3,2.18,9.7886,5.78E+00,1090.15,887,14,Albertus Magnus,1250,0.329,4,5 115 | 18,Argon,Ar,39.948,22,18,18,3,18,gas,,yes,,yes,,Noble Gas,0.88,,15.7596,1.78E-03,83.96,87.3,8,Rayleigh and Ramsay,1894,0.52,3,8 116 | 51,Antimony,Sb,121.76,71,51,51,5,15,solid,,yes,,,yes,Metalloid,1.5,2.05,8.6084,6.69E+00,904.05,1860,29,Early historic times,,0.207,5,5 117 | 95,Americium,Am,243,148,95,95,7,,artificial,yes,,yes,,,Actinide,,1.3,5.9738,1.37E+01,1267.15,2880,133,Seaborg et al.,1944,,7, 118 | 13,Aluminum,Al,26.982,14,13,13,3,13,solid,,yes,yes,,,Metal,1.8,1.61,5.9858,2.70E+00,933.4,2792,8,Wshler,1827,0.897,3,3 119 | 89,Actinium,Ac,227,138,89,89,7,3,solid,yes,yes,yes,,,Actinide,,1.1,5.17,1.01E+01,1323.15,3471,11,Debierne/Giesel,1899,0.12,7, 120 | ,,,,,,,,,,,,,,,,,,,,,,,,,,, 121 | -------------------------------------------------------------------------------- /data/gaia/test/943255a6-8c56-4cf8-9faf-c74743960097.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/943255a6-8c56-4cf8-9faf-c74743960097.png -------------------------------------------------------------------------------- /data/gaia/test/98efafc6-c376-4b53-be91-a130e1d90e02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/98efafc6-c376-4b53-be91-a130e1d90e02.jpg -------------------------------------------------------------------------------- /data/gaia/test/99b5ea36-0310-49c4-85d8-9ae83a96029a.m4a: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/99b5ea36-0310-49c4-85d8-9ae83a96029a.m4a -------------------------------------------------------------------------------- /data/gaia/test/9ca9bc20-1a0b-4076-9937-7724e3491cf8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/9ca9bc20-1a0b-4076-9937-7724e3491cf8.png -------------------------------------------------------------------------------- /data/gaia/test/Capture d’écran 2024-04-29 à 15.19.50.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/Capture d’écran 2024-04-29 à 15.19.50.zip -------------------------------------------------------------------------------- /data/gaia/test/a0dcc222-691e-4b03-ac75-c4493991ab80.txt: -------------------------------------------------------------------------------- 1 | Magmortar @ Choice Specs 2 | Ability: Flame Body 3 | EVs: 252 SpA / 4 SpD / 252 Spe 4 | Timid Nature 5 | IVs: 0 Atk 6 | - Fire Blast 7 | - Focus Blast 8 | - Psychic 9 | - Taunt 10 | 11 | Torkoal @ Focus Sash 12 | Ability: White Smoke 13 | EVs: 248 HP / 252 Def / 8 SpA 14 | Bold Nature 15 | IVs: 0 Atk 16 | - Lava Plume 17 | - Earth Power 18 | - Curse 19 | - Protect 20 | 21 | Rapidash @ Lum Berry 22 | Ability: Flash Fire 23 | EVs: 252 Atk / 4 SpD / 252 Spe 24 | Jolly Nature 25 | - Flare Blitz 26 | - Megahorn 27 | - Poison Jab 28 | - Quick Attack 29 | 30 | Flareon @ Salac Berry 31 | Ability: Flash Fire 32 | EVs: 252 Atk / 4 SpD / 252 Spe 33 | Jolly Nature 34 | - Fire Fang 35 | - Superpower 36 | - Last Resort 37 | - Wish 38 | 39 | Infernape @ Sitrus Berry 40 | Ability: Blaze 41 | EVs: 252 Atk / 4 SpD / 252 
Spe 42 | Jolly Nature 43 | - Blaze Kick 44 | - Brick Break 45 | - Earthquake 46 | - Gunk Shot 47 | 48 | Houndoom @ Apicot Berry 49 | Ability: Early Bird 50 | EVs: 252 SpA / 4 SpD / 252 Spe 51 | Timid Nature 52 | IVs: 0 Atk 53 | - Dark Pulse 54 | - Flamethrower 55 | - Shadow Ball 56 | - Sludge Bomb -------------------------------------------------------------------------------- /data/gaia/test/aac4df0d-407a-45f2-add5-d9b31ebe1ddc.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/aac4df0d-407a-45f2-add5-d9b31ebe1ddc.png -------------------------------------------------------------------------------- /data/gaia/test/aac4df0d-407a-45f2-add5-d9b31ebe1ddc.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/aac4df0d-407a-45f2-add5-d9b31ebe1ddc.xlsx -------------------------------------------------------------------------------- /data/gaia/test/aea1ea38-dfd0-41ab-ad79-badc3c69c784.txt: -------------------------------------------------------------------------------- 1 | (=<`#9]~6ZY32Vx/4Rs+0No-&Jk)"Fh} -------------------------------------------------------------------------------- /data/gaia/test/b3654e47-4307-442c-a09c-945b33b913c6.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/b3654e47-4307-442c-a09c-945b33b913c6.pdf -------------------------------------------------------------------------------- /data/gaia/test/b3654e47-4307-442c-a09c-945b33b913c6.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/b3654e47-4307-442c-a09c-945b33b913c6.png -------------------------------------------------------------------------------- /data/gaia/test/b74b4ce7-4f03-42b5-b60e-62da7ffa282e.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/b74b4ce7-4f03-42b5-b60e-62da7ffa282e.png -------------------------------------------------------------------------------- /data/gaia/test/b74b4ce7-4f03-42b5-b60e-62da7ffa282e.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/b74b4ce7-4f03-42b5-b60e-62da7ffa282e.xlsx -------------------------------------------------------------------------------- /data/gaia/test/b7767ed5-20c7-4243-86b1-e8bd9a3d2a64.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/b7767ed5-20c7-4243-86b1-e8bd9a3d2a64.png -------------------------------------------------------------------------------- /data/gaia/test/be353748-74eb-4904-8f17-f180ce087f1a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/be353748-74eb-4904-8f17-f180ce087f1a.pdf -------------------------------------------------------------------------------- /data/gaia/test/c4456885-2f03-436f-8fe9-0b4ca6822cdb.pdf: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/c4456885-2f03-436f-8fe9-0b4ca6822cdb.pdf -------------------------------------------------------------------------------- /data/gaia/test/c4456885-2f03-436f-8fe9-0b4ca6822cdb.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/c4456885-2f03-436f-8fe9-0b4ca6822cdb.png -------------------------------------------------------------------------------- /data/gaia/test/c7003252-fc58-44bf-92f5-ec3991a49d00.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/c7003252-fc58-44bf-92f5-ec3991a49d00.png -------------------------------------------------------------------------------- /data/gaia/test/c7003252-fc58-44bf-92f5-ec3991a49d00.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/c7003252-fc58-44bf-92f5-ec3991a49d00.xlsx -------------------------------------------------------------------------------- /data/gaia/test/ca0a4c14-4b97-43e7-8923-539d61050ae3.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/ca0a4c14-4b97-43e7-8923-539d61050ae3.pdf -------------------------------------------------------------------------------- /data/gaia/test/ca0a4c14-4b97-43e7-8923-539d61050ae3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/ca0a4c14-4b97-43e7-8923-539d61050ae3.png -------------------------------------------------------------------------------- /data/gaia/test/cbdb17dc-62d9-4463-b648-2eaacfeba4e5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/cbdb17dc-62d9-4463-b648-2eaacfeba4e5.png -------------------------------------------------------------------------------- /data/gaia/test/cbdb17dc-62d9-4463-b648-2eaacfeba4e5.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/cbdb17dc-62d9-4463-b648-2eaacfeba4e5.xlsx -------------------------------------------------------------------------------- /data/gaia/test/cd886ddd-2d12-4347-9c7a-64774f66a3d3.txt: -------------------------------------------------------------------------------- 1 | 1 a 2 | 2 b 3 | 3 c 4 | 4 d 5 | 5 e -------------------------------------------------------------------------------- /data/gaia/test/cfd773c8-371f-425c-b081-f254f96c0530.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/cfd773c8-371f-425c-b081-f254f96c0530.png -------------------------------------------------------------------------------- /data/gaia/test/cfd773c8-371f-425c-b081-f254f96c0530.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/cfd773c8-371f-425c-b081-f254f96c0530.xlsx -------------------------------------------------------------------------------- /data/gaia/test/d366cc70-86f1-4dca-bf12-440479c825fe.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d366cc70-86f1-4dca-bf12-440479c825fe.pptx -------------------------------------------------------------------------------- /data/gaia/test/d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.pdf -------------------------------------------------------------------------------- /data/gaia/test/d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d50b8ecb-a8aa-4696-ad84-403ef15e2c8b.png -------------------------------------------------------------------------------- /data/gaia/test/d62cbee6-47c7-4918-825d-3b73b1af7e85.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d62cbee6-47c7-4918-825d-3b73b1af7e85.png -------------------------------------------------------------------------------- /data/gaia/test/d8434132-f196-4048-82c3-c06facff53c0.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d8434132-f196-4048-82c3-c06facff53c0.png -------------------------------------------------------------------------------- /data/gaia/test/d8434132-f196-4048-82c3-c06facff53c0.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d8434132-f196-4048-82c3-c06facff53c0.xlsx -------------------------------------------------------------------------------- /data/gaia/test/d89733a3-7d86-4ed8-b5a3-bf4831b06e3c.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/d89733a3-7d86-4ed8-b5a3-bf4831b06e3c.jpg -------------------------------------------------------------------------------- /data/gaia/test/dbb02ff7-f947-491b-9ce2-41e3df16dbb8.txt: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | arr = np.array([15, 68, 44, 34]) 3 | arr1 = np.std(arr) 4 | print(arr1) -------------------------------------------------------------------------------- /data/gaia/test/dd024dd9-8da6-4d4e-aee1-ed0d999035a9.txt: -------------------------------------------------------------------------------- 1 | Worker A: 6:45 pm - 8 pm 2 | Worker B: 10 am - 11:45 am 3 | Worker C: 12:30 pm - 5 pm 4 | Worker D: 8 am - 4:30 pm 5 | Worker E: 5:30 pm - 8 pm 6 | Worker F: 1 pm - 3:15 pm -------------------------------------------------------------------------------- /data/gaia/test/dfa03d6c-402b-43fc-9222-5738f8bdfd0c.txt: -------------------------------------------------------------------------------- 1 | Creamy Garlic Chicken 2 | Garlic Butter Shrimp & Asparagus 3 | ‎Shrimp Creole 4 | Chocolate Chia 
Pudding 5 | Rice Cooker Chicken and Rice with Feta and Mint 6 | Casserole with Broccoli 7 | Paneer Curry Recipe (Dhaba Style) 8 | Cajun Shrimp Pasta 9 | Stuffing Meatloaf 10 | Vegan Brownies 11 | One-Pan Lemon Chicken & Orzo 12 | Vegan Black Bean and Sweet Potato Soup 13 | Savory Dutch Baby 14 | Crunchwrap Supreme 15 | Instant Pot Oatmeal 16 | Al Pastor 17 | Garlicky Greek Chicken 18 | Beef and Cabbage Skillet Supper 19 | Garlic Lemon Shrimp 20 | Baked Lemon Herb Salmon 21 | Dal Fry Recipe (Restaurant Style Dal) 22 | Chicken Lasagna 23 | Fusilli with Roasted Eggplant and Goat Cheese 24 | Grilled BBQ Tempeh Steaks 25 | Ricotta Meatballs 26 | Smoky Black Bean and Quinoa Soup 27 | Tofu Scramble 28 | Braised pig's trotters 29 | Quick Hot Dog Chili 30 | Cowboy Steak 31 | Korean-style curry rice 32 | Ultimate Veggie Burgers 33 | Instant Pot Jambalaya 34 | Copycat Crunchwrap Supreme 35 | Skillet Shrimp Destin with Orzo 36 | Roasted Carrots and Red 37 | Slow Cooker Pork Chops 38 | Bombay Potatoes Recipe (Bombay Aloo) 39 | Garlic Shrimp Bacon Alfredo 40 | BBQ Pulled Jackfruit Sandwiches 41 | Shrimp Tempura Roll 42 | Party-Ready Pork Roast 43 | Vegan Mac 'n' Cheese 44 | Vegetable Beef Pot Pie 45 | Chili Mac 46 | One-Pan Shrimp Fajitas 47 | Snap Pea and Chicken Salad 48 | Instant Pot Shrimp Biryani 49 | Al Pastor 50 | Bacon-Cheddar Skillet Cornbread 51 | Vegetable Biryani 52 | Grilled Shrimp Panzanella 53 | Korean-style mapo tofu 54 | Falafel 55 | Taco Pizza 56 | Fajita Burgers 57 | Easy Stuffed Peppers 58 | Best Chicken Quesadillas 59 | Chicken Primavera 60 | Air Fryer Chicken Wings 61 | Easy baked oatmeal muffins 62 | Roasted Portobello Tacos 63 | Tacos Al Pastor 64 | Classic Tofu Scramble 65 | Mustard Glazed Pork Chops 66 | Taco Soup 67 | Bacon-Shrimp Skewers 68 | Sheet-Pan Ranch Pork and Veggies 69 | Ultimate Vegetarian Italian Sub 70 | The Best Gazpacho 71 | Spicy pork stew 72 | Kadhi Recipe (Kadhi Pakora) 73 | Shrimp Scampi 74 | Instant Pot Hummus 75 | Slow-Cooker Vegan 
Pasta 76 | Vegetable Sandwich with Dill Sauce 77 | Salisbury Steak 78 | One-Pan Honey Garlic Chicken Lettuce Wraps 79 | Pork Banh Mi Lettuce Wraps 80 | Pork Belly Asado 81 | Coconut Shrimp 82 | ‎Shrimp Scampi 83 | Homemade Pasta Sauce 84 | Creamy Rosé Pasta with Roasted Tomatoes 85 | Air Fryer Fried Chicken 86 | Chipotle Chili and Cornbread Dumplings 87 | Chili Mac and Cheese 88 | Slow Cooker Pulled Pork 89 | Hasselback Chicken 90 | Best-Ever Chorizo 91 | Sloppy Joe Sliders 92 | Eggplant Parmesan 93 | Garlic Butter Shrimp 94 | Ceviche Style Shrimp 95 | Classic Chicken Pot Pie 96 | Sheet Pan Ratatouille 97 | Pork Chops With Bloody Mary Sauce 98 | Baked Pork Chops 99 | Mediterranean Chicken 100 | Ravioli Nudi in Tomato Sauce 101 | Ramen Noodle Salad 102 | Instant Pot Carnitas with Cornbread 103 | Vegan Mac & Cheese 104 | Cheeseburger Soup 105 | Vegetarian Enchiladas 106 | Stuffed Cabbage Rolls 107 | Slow Cooker Pulled Pork 108 | Perfect Chicken Alfredo 109 | Chicken Satay with Spiralized-Carrot Salad 110 | Stir-fried noodles and vegetables 111 | Eggplant Parmesan 112 | Casserole with Stuffing 113 | Creamy Tomato Vegan Pasta 114 | Creamy Tuscan Chicken 115 | Hamburger Stroganoff 116 | Creamy Chipotle Shrimp Pasta 117 | Vegetable Rainbow Salad 118 | Kimchi pork buns 119 | Crispy, Creamy Chicken Cordon Bleu 120 | Almond Cherry Pepita Bars 121 | Stir-fried chives, meat, and vegetables 122 | Taco Stuffed Shells 123 | Beef Wellington 124 | One-Pot Taco Spaghetti 125 | One-Pot Cheeseburger Pasta 126 | No-Bake Vegetarian Enchiladas 127 | Grilled Pork Chops 128 | Melt-In-Your-Mouth Chicken 129 | Vegan Cream of Broccoli Soup 130 | Bang Bang Shrimp Sandwich With Slaw 131 | Ground Beef Stroganoff 132 | Cheeseburger Pie 133 | Beef Stroganoff 134 | Cheesy Chicken Alfredo Pasta Bake 135 | Lemon Garlic Shrimp Salad 136 | Chili Cheese Sweet Potato Casserole 137 | Game Day Chili 138 | Blueberry Baked Oatmeal 139 | Skillet Lemony Chicken and Artichokes 140 | Crustless Caprese Quiche 
141 | Shrimp Rangoon 142 | Noodles & black bean sauce platter 143 | Cheesy Cabbage & Beef Skillet 144 | Chicken Saltimbocca 145 | Easiest Instant Pot Steel Cut Oats 146 | Easy Sheet Pan Vegan Burritos 147 | Pork and Veggie Stir Fry 148 | Lemon Pepper Chicken 149 | Sheet Pan BBQ Mini Meatloaves 150 | Creamy Chicken and Gnocchi 151 | Ground Beef Bulgogi 152 | Caprese Chicken with Zucchini 153 | Green Tahini Dip 154 | Cauliflower Manchurian 155 | Slutty Vegan's One Night Stand Burger 156 | Spicy Southwestern Beef 157 | Homemade Bolognese Sauce (Pappardelle) 158 | Easy Shepherd's Pie Recipe 159 | Chicken Taquitos 160 | Garlic Shrimp 161 | Sinarsahang Pork Ribs 162 | Chipotle-Inspired Vegetarian Burrito Bowl 163 | Broccoli Beef Braids 164 | Green Soup with Crispy Tofu 165 | Instant Pot Collard Greens 166 | Bang Bang Shrimp 167 | Cheesy Shrimp and Grits 168 | Spaghetti and Meatballs 169 | Double-Pork Carnitas 170 | Taco Lasagna 171 | Vegetable Curry Recipe 172 | Easy Shepherd's Pie 173 | Easy Swedish Meatballs 174 | One Pot Lemon Orzo Shrimp 175 | Uncle Pooh's Shrimp, Sausage, and Grits 176 | One-Pot Beef Goulash 177 | Easy Tomato Soup 178 | Quick Pork Ragu With Ravioli 179 | Slow Cooker Pork Chops 180 | Dad's Stuffed Bell Peppers 181 | Hoppin' John 182 | Easy Greek Salad 183 | Cashew Chicken 184 | Spicy Ground Beef 185 | Chickpeas and Kale in Spicy Pomodoro Sauce 186 | Homemade Chinese-Style Orange Chicken 187 | Grilled Pork Tenderloin with Broccolini 188 | One-Pot Swedish Meatball Pasta 189 | Thanksgiving Leftovers Turkey Club 190 | Asparagus Frittata 191 | Beef Stew 192 | Paneer Jalfrezi Recipe 193 | Coconut Shrimp 194 | Pork Banh Mi Lettuce Wraps 195 | Vegetable Pakora Recipe 196 | All-American Sloppy Joes 197 | Easy Thai Shrimp Soup 198 | Pork Milanese Sandwich With Tomato Salad 199 | Tandoori-Spiced Cauliflower Chicken Flatbreads 200 | Brie Spaghetti 201 | Grilled Bacon BLTs 202 | Classic Lasagna 203 | Grilled Chicken Fajitas 204 | Easiest-Ever Chicken Alfredo 205 
| Grilled Pork Chops With Soy Glaze 206 | Overnight Steel Cut Oats 207 | Spicy Pork 208 | Supreme Pizza Casserole 209 | Boeuf en Croûte 210 | Creamy Pumpkin Vodka Sauce & Penne 211 | Quinoa Breakfast Meal Prep 212 | Charred Lemon Shrimp Skewers 213 | Chicken-Bacon Smash Burgers 214 | Low-Carb Burger 215 | Shrimp Tacos with Mango Salsa 216 | Vietnamese Braised Pork Belly 217 | Pan-Roasted Brined Pork Chop 218 | Grilled Pork Tenderloin with Broccolini 219 | Grilled Spatchcock Chicken 220 | Anti-inflammatory Turmeric Smoothie with Pineapple 221 | Gobi Manchurian Recipe 222 | Creole Seafood Jambalaya 223 | Spicy Parmesan Shrimp Pasta 224 | Roasted Cauliflower Enchiladas 225 | Creamy Orzo with Mushrooms 226 | Chicken and Black Bean 227 | Veggie Tacos 228 | Cheesy Fiesta Beef Casserole 229 | Vegan Pizza 230 | Paprika Chicken & Rice Bake 231 | Butternut Squash Curry 232 | Sausage Rolls 233 | Buckwheat Noodle Salad 234 | Chili Cheese Burger Bowls 235 | Vegan Meatballs 236 | Air Fryer Pork Chops 237 | Vegan Charred Lemon-Asparagus Risotto 238 | Mung bean pancakes 239 | Casserole with Rice 240 | Meatballs 241 | Lemony Lentil Soup 242 | Kimchi pork ribs 243 | Southwest Chicken Soup 244 | Slow-Roasted Pork And Peppers 245 | Instant Pot Chicken Thighs 246 | Frito Taco Pie 247 | Focaccia Bread 248 | Spaghetti and Meatballs 249 | Sausage Rolls 250 | Cold Sesame Peanut Noodles 251 | Grilled Lemon-Lime Corn Salad 252 | Thai Shrimp Soup 253 | Mediterranean Orzo with Shrimp 254 | Shrimp Mango Salad 255 | Lemony Chicken & Potatoes With Feta 256 | Italian-Marinated Chicken 257 | Sheet-Pan Ranch Pork and Veggies 258 | Walking Tacos 259 | Garlic Parmesan Roasted Shrimp 260 | Pork Binagoongan sa Gata 261 | Smoky Cedar-Planked Burgers 262 | Lettuce Wraps 263 | Bacon Brunch Ring 264 | Spice-Rubbed Shrimp Tacos 265 | Cauliflower Parmesan 266 | Shaking Tofu 267 | Braised Chipotle Sweet Potatoes 268 | Corned Beef 269 | Lemony Risotto With Asparagus and Shrimp 270 | Easy Taco Salad 271 | The 
BEST Black Bean Chili 272 | Vegan Lentil Burgers 273 | Dumplings 274 | Honey-Garlic Shrimp Chow Mein 275 | Cauliflower Gratin with Manchego and Almond Sauce 276 | One Skillet Ground Beef Stroganoff 277 | Creamy Tortellini Minestrone 278 | Healthy Steel Cut Oats 279 | Seafood Boil With Shrimp, Corn, and Sausage 280 | Easy Enchiladas 281 | Easy Tabbouleh 282 | Chicken Taquitos 283 | Strawberry-Cabernet Barbecue Sauce 284 | Vegan Pho 285 | Shrimp Dumplings 286 | Shrimp Lo Mein 287 | Sheet-Pan Pork Chops With Apples 288 | Cheeseburger Casserole 289 | How to Brine Shrimp 290 | Beet Green and Walnut Pesto 291 | Beef Bourguignon 292 | Classic Pasta Primavera 293 | Easy Hamburger Soup 294 | Spicy Cashew Shrimp 295 | Pot-au-Feu 296 | Szechuan Shrimp 297 | Pesto Shrimp 298 | Corn Chowder With Shrimp 299 | Cincinnati Chili 300 | Grilled Pork Tenderloin 301 | Peanut Sauce Soba with Crispy Tofu 302 | Penne Pasta With Meat Sauce 303 | Tomato and Charred Pepper Farro Salad 304 | Stovetop Taco Soup 305 | Creamy Kale & Gnocchi Bake 306 | Shrimp Katsu Burgers 307 | Best Classic White Chicken Chili 308 | Beef Enchilada Casserole 309 | Classic Banh Mi 310 | Bacon Cheeseburger Pasta 311 | Baked Falafel with Cucumbers and Tahini 312 | Homemade Hamburger Helper 313 | Honey Garlic Shrimp Stir-Fry 314 | Vegan Chickpea Crab Cakes 315 | Cheap & Easy Chicken 316 | Creamy One-Pot Spinach Shrimp Pasta 317 | Garlic Chicken 318 | Creamy Balsamic Chicken 319 | Special Lunch Menu 320 | Creamy Lemon Chicken 321 | Beef Tenderloin Steak au Poivre 322 | Instant Pot Shrimp Fried Rice 323 | Chilled Fennel-Grapefruit Velouté with Lemon Olive Oil 324 | Stuffed Cabbage Casserole 325 | Mushroom Bolognese 326 | Crab Cakes 327 | Butter Chicken 328 | ‎Tuscan Butter Shrimp 329 | ‎Shrimp Salad 330 | Instant Pot Carnitas with Cornbread 331 | Chili Beans With Rice 332 | Cheesy Beef and Sweet Potato Taco Casserole 333 | Bulgogi-Inspired Beef Tacos 334 | Shrimp Cheung Fun (Rice Rolls) 335 | One-Pot Dinner 336 | 
Philly Cheesesteak Sloppy Joes 337 | One-Pan Garlic-Butter Shrimp with Orzo 338 | Mum's Everyday Red Lentils 339 | Ground Beef Enchiladas 340 | Indian Chickpea Curry 341 | Buffalo Shrimp Lettuce Wraps 342 | Honey Walnut Shrimp 343 | Shrimp Étouffée 344 | Pork Milanese 345 | Meat Loaf 346 | Cilantro Chutney (Coriander Chutney 347 | Vegetarian Eggplant Lasagna 348 | Cheesy Taco Pasta 349 | Tandoori Chicken 350 | Garlic Broccoli Shrimp Stir Fry 351 | Sweet & Sour Chicken 352 | Aloo Gobi Recipe 353 | Bacon-Shrimp Skewers 354 | Weeknight Tex-Mex Chicken Enchiladas 355 | Steak and Bacon Salad with Chipotle 356 | Vegetarian Biscuits and Gravy 357 | Shrimp Parmesan 358 | Ranch Chicken Drumsticks 359 | Taco Ring 360 | Hearty Beef Stew 361 | Shrimp Salad 362 | Easy Weeknight Chicken and Dumplings 363 | Pork Chops With Bok Choy and Coconut Rice 364 | Best Baked Beans Ever 365 | Mushroom Piccata Pasta 366 | Baked Shrimp Scampi Linguine Pasta 367 | Air Fryer Longganisa Frittata Recipe 368 | Grilled Shrimp Foil Packets 369 | Roasted Chicken and Tomatoes 370 | Lemon Chicken-Orzo Soup 371 | Shrimp Fried Rice 372 | Pepper Pork Belly 373 | Roast Chicken with Tarragon-Braised Fennel 374 | Italian Chicken Sheet 375 | Ribs On The Grill 376 | Hamburger Casserole 377 | Sausage Stuffed Peppers 378 | Silky Pork and Cumin Stew 379 | Slow-Cooker Black Bean Beef Stew 380 | Greek Baked Shrimp with Feta 381 | Crispy Chicken Thighs with Garlic and Rosemary 382 | Cilantro Lime Chicken 383 | Strawberry Jalapeño Chicken 384 | Habanero BBQ Shrimp 385 | Casserole with Bacon Ranch 386 | Air Fryer Brussels Sprouts 387 | Vegan Meatloaf 388 | Chicken & Veggie Stir-Fry 389 | Masala Kheema 390 | Dirty Shrimp in Butter-Beer Sauce 391 | Sheet Pan Spanakopita Quesadillas 392 | Classic Bacon Cheeseburger 393 | Chili con Tofu 394 | Instant Pot Chicken Breast 395 | Air Fryer Tostones with Cilantro Dip 396 | 3-Ingredient Teriyaki Chicken 397 | Classic Buffalo Chicken Dip 398 | Shepherd's Pie 399 | Tofu Katsu 
Curry 400 | Crispy Pork Cutlets with Fennel Salad 401 | Southwestern Tofu Scramble & Breakfast Sweet Potatoes 402 | Stuffed Bell Peppers 403 | Mushroom Stroganoff 404 | Honey Walnut Shrimp 405 | Slow Cooker Chili 406 | Roasted Cauliflower 407 | Buffalo Chicken with Blue Cheese Salad 408 | Green Chili Pork Stew 409 | Creamy Lemon Butter Chicken 410 | Pork Fried Rice 411 | Taco Casserole 412 | Shrimp Cobb Salad 413 | Taco Soup 414 | Taco Stuffed Zucchini Boats 415 | Zucchini "Linguini" with Roasted Shrimp 416 | Cheesy Ham Steak 417 | Chicken Spaghetti 418 | Vegan Wild-Rice-Stuffed Butternut Squash 419 | Beef 'n' Biscuit Bake 420 | Spicy Shrimp Stir-Fry with Zucchini Noodles 421 | Tuscan Butter Shrimp 422 | Spicy pork and vegetables over rice 423 | One-Pan Coconut-Lime Chicken 424 | BBQ Spaghetti Squash Sliders 425 | Bacon-Wrapped Hot Dogs 426 | Air Fryer Pork Chops 427 | Whole Cauliflower Wellington 428 | Curry with Yoghurt 429 | Braised Beef Short Ribs 430 | Baked Spaghetti and Meatballs 431 | Vegetarian Chili 432 | Butter Bean Salad with Herby Vinaigrette 433 | Pork Loin Roast 434 | Buffalo Chicken Meatballs 435 | Butter Chicken 436 | Grilled Pork Chops 437 | Black Eyed Peas Curry Recipe 438 | Ricotta Meatballs 439 | Pork Milanese 440 | Copycat Cheesecake Factory Shrimp Scampi 441 | Creamy Chicken Penne Pasta 442 | Vegetarian Chili 443 | Balsamic Chicken with Veggies 444 | Roasted Corn, Tomato, and Zucchini Tart 445 | Ginger Mushroom 446 | Greek Stuffed Peppers 447 | Shrimp Fried Rice 448 | Lemon Garlic Shrimp Kabobs 449 | Spring Greens With Hot Bacon Vinaigrette 450 | Sheet Pan Shrimp Boil 451 | Shortcut Crispy Chicken Sandwich -------------------------------------------------------------------------------- /data/gaia/test/e14448e9-5243-4b07-86e1-22e657f96bcf.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/e14448e9-5243-4b07-86e1-22e657f96bcf.jpg -------------------------------------------------------------------------------- /data/gaia/test/e51753c7-3ef3-4404-a352-11a18e5760c9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/e51753c7-3ef3-4404-a352-11a18e5760c9.png -------------------------------------------------------------------------------- /data/gaia/test/f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf.png -------------------------------------------------------------------------------- /data/gaia/test/f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/f5d0b1c6-5e15-4c55-b60c-9fc855dda5cf.xlsx -------------------------------------------------------------------------------- /data/gaia/test/f6d29ef1-0e4d-41cb-ac25-e60023b3bd96.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/f6d29ef1-0e4d-41cb-ac25-e60023b3bd96.png -------------------------------------------------------------------------------- /data/gaia/test/f6d29ef1-0e4d-41cb-ac25-e60023b3bd96.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/f6d29ef1-0e4d-41cb-ac25-e60023b3bd96.xlsx -------------------------------------------------------------------------------- /data/gaia/test/fcd80879-4f1d-49d8-b6d6-2993607432c2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/fcd80879-4f1d-49d8-b6d6-2993607432c2.png -------------------------------------------------------------------------------- /data/gaia/test/fcd80879-4f1d-49d8-b6d6-2993607432c2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/fcd80879-4f1d-49d8-b6d6-2993607432c2.xlsx -------------------------------------------------------------------------------- /data/gaia/test/fe8f4748-5d00-4a27-9070-090a0cfdeac4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/fe8f4748-5d00-4a27-9070-090a0cfdeac4.png -------------------------------------------------------------------------------- /data/gaia/test/fe8f4748-5d00-4a27-9070-090a0cfdeac4.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/test/fe8f4748-5d00-4a27-9070-090a0cfdeac4.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/.DS_Store: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/.DS_Store -------------------------------------------------------------------------------- /data/gaia/validation/076c8171-9b3b-49b9-a477-244d2a532826.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/076c8171-9b3b-49b9-a477-244d2a532826.png -------------------------------------------------------------------------------- /data/gaia/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/076c8171-9b3b-49b9-a477-244d2a532826.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/1f975693-876d-457b-a649-393859e79bf3.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/1f975693-876d-457b-a649-393859e79bf3.mp3 -------------------------------------------------------------------------------- /data/gaia/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/2b3ef98c-cc05-450b-a719-711aee40ac65.mp3 -------------------------------------------------------------------------------- /data/gaia/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.png -------------------------------------------------------------------------------- /data/gaia/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/32102e3e-d12a-4209-9163-7b3a104efe5d.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.pdf -------------------------------------------------------------------------------- /data/gaia/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/366e2f2b-8632-4ef2-81eb-bc3877489217.png -------------------------------------------------------------------------------- /data/gaia/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.png -------------------------------------------------------------------------------- /data/gaia/validation/389793a7-ca17-4e82-81cb-2b3a2391b4b9.txt: 
-------------------------------------------------------------------------------- 1 | H H H 2 | -------------------------------- 3 | H H H H -------------------------------------------------------------------------------- /data/gaia/validation/3da89939-209c-4086-8520-7eb734e6b4ef.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/3da89939-209c-4086-8520-7eb734e6b4ef.png -------------------------------------------------------------------------------- /data/gaia/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/3da89939-209c-4086-8520-7eb734e6b4ef.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.png -------------------------------------------------------------------------------- /data/gaia/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/4d0aa727-86b1-406b-9b33-f870dd14a4a5.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.png -------------------------------------------------------------------------------- /data/gaia/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/4d51c4bf-4b0e-4f3d-897b-3f6687a7d9f2.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/54612da3-fd56-4941-80f4-5eb82330de25.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/54612da3-fd56-4941-80f4-5eb82330de25.png -------------------------------------------------------------------------------- /data/gaia/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/54612da3-fd56-4941-80f4-5eb82330de25.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/5b2a14e8-6e59-479c-80e3-4696e8980152.jpg -------------------------------------------------------------------------------- /data/gaia/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.png -------------------------------------------------------------------------------- /data/gaia/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/5cfb274c-0207-4aa7-9575-6ac0bd95d9b2.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/6359a0b1-8f7b-499b-9336-840f9ab90688.png -------------------------------------------------------------------------------- /data/gaia/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.png -------------------------------------------------------------------------------- /data/gaia/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/65afbc8a-89ca-4ad5-8d62-355bb401f61d.xlsx -------------------------------------------------------------------------------- 
/data/gaia/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.pdf -------------------------------------------------------------------------------- /data/gaia/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/67e8878b-5cef-4375-804e-e6291fdbe78a.png -------------------------------------------------------------------------------- /data/gaia/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/7bd855d8-463d-4ed5-93ca-5fe35145f733.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.png -------------------------------------------------------------------------------- /data/gaia/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/7cc4acfa-63fd-4acc-a1a1-e8e529e0a97f.xlsx 
-------------------------------------------------------------------------------- /data/gaia/validation/8d46b8d6-b38a-47ff-ac74-cda14cf2d19b.csv: -------------------------------------------------------------------------------- 1 | species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex 2 | Adelie,Torgersen,39.1,18.7,181,3750,MALE 3 | Adelie,Torgersen,39.5,17.4,186,3800,FEMALE 4 | Adelie,Torgersen,40.3,18,195,3250,FEMALE 5 | Adelie,Torgersen,,,,, 6 | Adelie,Torgersen,36.7,19.3,193,3450,FEMALE 7 | Adelie,Torgersen,39.3,20.6,190,3650,MALE 8 | Adelie,Torgersen,38.9,17.8,181,3625,FEMALE 9 | Adelie,Torgersen,39.2,19.6,195,4675,MALE 10 | Adelie,Torgersen,34.1,18.1,193,3475, 11 | Adelie,Torgersen,42,20.2,190,4250, 12 | Adelie,Torgersen,37.8,17.1,186,3300, 13 | Adelie,Torgersen,37.8,17.3,180,3700, 14 | Adelie,Torgersen,41.1,17.6,182,3200,FEMALE 15 | Adelie,Torgersen,38.6,21.2,191,3800,MALE 16 | Adelie,Torgersen,34.6,21.1,198,4400,MALE 17 | Adelie,Torgersen,36.6,17.8,185,3700,FEMALE 18 | Adelie,Torgersen,38.7,19,195,3450,FEMALE 19 | Adelie,Torgersen,42.5,20.7,197,4500,MALE 20 | Adelie,Torgersen,34.4,18.4,184,3325,FEMALE 21 | Adelie,Torgersen,46,21.5,194,4200,MALE 22 | Adelie,Biscoe,37.8,18.3,174,3400,FEMALE 23 | Adelie,Biscoe,37.7,18.7,180,3600,MALE 24 | Adelie,Biscoe,35.9,19.2,189,3800,FEMALE 25 | Adelie,Biscoe,38.2,18.1,185,3950,MALE 26 | Adelie,Biscoe,38.8,17.2,180,3800,MALE 27 | Adelie,Biscoe,35.3,18.9,187,3800,FEMALE 28 | Adelie,Biscoe,40.6,18.6,183,3550,MALE 29 | Adelie,Biscoe,40.5,17.9,187,3200,FEMALE 30 | Adelie,Biscoe,37.9,18.6,172,3150,FEMALE 31 | Adelie,Biscoe,40.5,18.9,180,3950,MALE 32 | Adelie,Dream,39.5,16.7,178,3250,FEMALE 33 | Adelie,Dream,37.2,18.1,178,3900,MALE 34 | Adelie,Dream,39.5,17.8,188,3300,FEMALE 35 | Adelie,Dream,40.9,18.9,184,3900,MALE 36 | Adelie,Dream,36.4,17,195,3325,FEMALE 37 | Adelie,Dream,39.2,21.1,196,4150,MALE 38 | Adelie,Dream,38.8,20,190,3950,MALE 39 | Adelie,Dream,42.2,18.5,180,3550,FEMALE 40 | 
Adelie,Dream,37.6,19.3,181,3300,FEMALE 41 | Adelie,Dream,39.8,19.1,184,4650,MALE 42 | Adelie,Dream,36.5,18,182,3150,FEMALE 43 | Adelie,Dream,40.8,18.4,195,3900,MALE 44 | Adelie,Dream,36,18.5,186,3100,FEMALE 45 | Adelie,Dream,44.1,19.7,196,4400,MALE 46 | Adelie,Dream,37,16.9,185,3000,FEMALE 47 | Adelie,Dream,39.6,18.8,190,4600,MALE 48 | Adelie,Dream,41.1,19,182,3425,MALE 49 | Adelie,Dream,37.5,18.9,179,2975, 50 | Adelie,Dream,36,17.9,190,3450,FEMALE 51 | Adelie,Dream,42.3,21.2,191,4150,MALE 52 | Adelie,Biscoe,39.6,17.7,186,3500,FEMALE 53 | Adelie,Biscoe,40.1,18.9,188,4300,MALE 54 | Adelie,Biscoe,35,17.9,190,3450,FEMALE 55 | Adelie,Biscoe,42,19.5,200,4050,MALE 56 | Adelie,Biscoe,34.5,18.1,187,2900,FEMALE 57 | Adelie,Biscoe,41.4,18.6,191,3700,MALE 58 | Adelie,Biscoe,39,17.5,186,3550,FEMALE 59 | Adelie,Biscoe,40.6,18.8,193,3800,MALE 60 | Adelie,Biscoe,36.5,16.6,181,2850,FEMALE 61 | Adelie,Biscoe,37.6,19.1,194,3750,MALE 62 | Adelie,Biscoe,35.7,16.9,185,3150,FEMALE 63 | Adelie,Biscoe,41.3,21.1,195,4400,MALE 64 | Adelie,Biscoe,37.6,17,185,3600,FEMALE 65 | Adelie,Biscoe,41.1,18.2,192,4050,MALE 66 | Adelie,Biscoe,36.4,17.1,184,2850,FEMALE 67 | Adelie,Biscoe,41.6,18,192,3950,MALE 68 | Adelie,Biscoe,35.5,16.2,195,3350,FEMALE 69 | Adelie,Biscoe,41.1,19.1,188,4100,MALE 70 | Adelie,Torgersen,35.9,16.6,190,3050,FEMALE 71 | Adelie,Torgersen,41.8,19.4,198,4450,MALE 72 | Adelie,Torgersen,33.5,19,190,3600,FEMALE 73 | Adelie,Torgersen,39.7,18.4,190,3900,MALE 74 | Adelie,Torgersen,39.6,17.2,196,3550,FEMALE 75 | Adelie,Torgersen,45.8,18.9,197,4150,MALE 76 | Adelie,Torgersen,35.5,17.5,190,3700,FEMALE 77 | Adelie,Torgersen,42.8,18.5,195,4250,MALE 78 | Adelie,Torgersen,40.9,16.8,191,3700,FEMALE 79 | Adelie,Torgersen,37.2,19.4,184,3900,MALE 80 | Adelie,Torgersen,36.2,16.1,187,3550,FEMALE 81 | Adelie,Torgersen,42.1,19.1,195,4000,MALE 82 | Adelie,Torgersen,34.6,17.2,189,3200,FEMALE 83 | Adelie,Torgersen,42.9,17.6,196,4700,MALE 84 | Adelie,Torgersen,36.7,18.8,187,3800,FEMALE 85 | 
Adelie,Torgersen,35.1,19.4,193,4200,MALE 86 | Adelie,Dream,37.3,17.8,191,3350,FEMALE 87 | Adelie,Dream,41.3,20.3,194,3550,MALE 88 | Adelie,Dream,36.3,19.5,190,3800,MALE 89 | Adelie,Dream,36.9,18.6,189,3500,FEMALE 90 | Adelie,Dream,38.3,19.2,189,3950,MALE 91 | Adelie,Dream,38.9,18.8,190,3600,FEMALE 92 | Adelie,Dream,35.7,18,202,3550,FEMALE 93 | Adelie,Dream,41.1,18.1,205,4300,MALE 94 | Adelie,Dream,34,17.1,185,3400,FEMALE 95 | Adelie,Dream,39.6,18.1,186,4450,MALE 96 | Adelie,Dream,36.2,17.3,187,3300,FEMALE 97 | Adelie,Dream,40.8,18.9,208,4300,MALE 98 | Adelie,Dream,38.1,18.6,190,3700,FEMALE 99 | Adelie,Dream,40.3,18.5,196,4350,MALE 100 | Adelie,Dream,33.1,16.1,178,2900,FEMALE 101 | Adelie,Dream,43.2,18.5,192,4100,MALE 102 | Adelie,Biscoe,35,17.9,192,3725,FEMALE 103 | Adelie,Biscoe,41,20,203,4725,MALE 104 | Adelie,Biscoe,37.7,16,183,3075,FEMALE 105 | Adelie,Biscoe,37.8,20,190,4250,MALE 106 | Adelie,Biscoe,37.9,18.6,193,2925,FEMALE 107 | Adelie,Biscoe,39.7,18.9,184,3550,MALE 108 | Adelie,Biscoe,38.6,17.2,199,3750,FEMALE 109 | Adelie,Biscoe,38.2,20,190,3900,MALE 110 | Adelie,Biscoe,38.1,17,181,3175,FEMALE 111 | Adelie,Biscoe,43.2,19,197,4775,MALE 112 | Adelie,Biscoe,38.1,16.5,198,3825,FEMALE 113 | Adelie,Biscoe,45.6,20.3,191,4600,MALE 114 | Adelie,Biscoe,39.7,17.7,193,3200,FEMALE 115 | Adelie,Biscoe,42.2,19.5,197,4275,MALE 116 | Adelie,Biscoe,39.6,20.7,191,3900,FEMALE 117 | Adelie,Biscoe,42.7,18.3,196,4075,MALE 118 | Adelie,Torgersen,38.6,17,188,2900,FEMALE 119 | Adelie,Torgersen,37.3,20.5,199,3775,MALE 120 | Adelie,Torgersen,35.7,17,189,3350,FEMALE 121 | Adelie,Torgersen,41.1,18.6,189,3325,MALE 122 | Adelie,Torgersen,36.2,17.2,187,3150,FEMALE 123 | Adelie,Torgersen,37.7,19.8,198,3500,MALE 124 | Adelie,Torgersen,40.2,17,176,3450,FEMALE 125 | Adelie,Torgersen,41.4,18.5,202,3875,MALE 126 | Adelie,Torgersen,35.2,15.9,186,3050,FEMALE 127 | Adelie,Torgersen,40.6,19,199,4000,MALE 128 | Adelie,Torgersen,38.8,17.6,191,3275,FEMALE 129 | Adelie,Torgersen,41.5,18.3,195,4300,MALE 
130 | Adelie,Torgersen,39,17.1,191,3050,FEMALE 131 | Adelie,Torgersen,44.1,18,210,4000,MALE 132 | Adelie,Torgersen,38.5,17.9,190,3325,FEMALE 133 | Adelie,Torgersen,43.1,19.2,197,3500,MALE 134 | Adelie,Dream,36.8,18.5,193,3500,FEMALE 135 | Adelie,Dream,37.5,18.5,199,4475,MALE 136 | Adelie,Dream,38.1,17.6,187,3425,FEMALE 137 | Adelie,Dream,41.1,17.5,190,3900,MALE 138 | Adelie,Dream,35.6,17.5,191,3175,FEMALE 139 | Adelie,Dream,40.2,20.1,200,3975,MALE 140 | Adelie,Dream,37,16.5,185,3400,FEMALE 141 | Adelie,Dream,39.7,17.9,193,4250,MALE 142 | Adelie,Dream,40.2,17.1,193,3400,FEMALE 143 | Adelie,Dream,40.6,17.2,187,3475,MALE 144 | Adelie,Dream,32.1,15.5,188,3050,FEMALE 145 | Adelie,Dream,40.7,17,190,3725,MALE 146 | Adelie,Dream,37.3,16.8,192,3000,FEMALE 147 | Adelie,Dream,39,18.7,185,3650,MALE 148 | Adelie,Dream,39.2,18.6,190,4250,MALE 149 | Adelie,Dream,36.6,18.4,184,3475,FEMALE 150 | Adelie,Dream,36,17.8,195,3450,FEMALE 151 | Adelie,Dream,37.8,18.1,193,3750,MALE 152 | Adelie,Dream,36,17.1,187,3700,FEMALE 153 | Adelie,Dream,41.5,18.5,201,4000,MALE 154 | Chinstrap,Dream,46.5,17.9,192,3500,FEMALE 155 | Chinstrap,Dream,50,19.5,196,3900,MALE 156 | Chinstrap,Dream,51.3,19.2,193,3650,MALE 157 | Chinstrap,Dream,45.4,18.7,188,3525,FEMALE 158 | Chinstrap,Dream,52.7,19.8,197,3725,MALE 159 | Chinstrap,Dream,45.2,17.8,198,3950,FEMALE 160 | Chinstrap,Dream,46.1,18.2,178,3250,FEMALE 161 | Chinstrap,Dream,51.3,18.2,197,3750,MALE 162 | Chinstrap,Dream,46,18.9,195,4150,FEMALE 163 | Chinstrap,Dream,51.3,19.9,198,3700,MALE 164 | Chinstrap,Dream,46.6,17.8,193,3800,FEMALE 165 | Chinstrap,Dream,51.7,20.3,194,3775,MALE 166 | Chinstrap,Dream,47,17.3,185,3700,FEMALE 167 | Chinstrap,Dream,52,18.1,201,4050,MALE 168 | Chinstrap,Dream,45.9,17.1,190,3575,FEMALE 169 | Chinstrap,Dream,50.5,19.6,201,4050,MALE 170 | Chinstrap,Dream,50.3,20,197,3300,MALE 171 | Chinstrap,Dream,58,17.8,181,3700,FEMALE 172 | Chinstrap,Dream,46.4,18.6,190,3450,FEMALE 173 | Chinstrap,Dream,49.2,18.2,195,4400,MALE 174 | 
Chinstrap,Dream,42.4,17.3,181,3600,FEMALE 175 | Chinstrap,Dream,48.5,17.5,191,3400,MALE 176 | Chinstrap,Dream,43.2,16.6,187,2900,FEMALE 177 | Chinstrap,Dream,50.6,19.4,193,3800,MALE 178 | Chinstrap,Dream,46.7,17.9,195,3300,FEMALE 179 | Chinstrap,Dream,52,19,197,4150,MALE 180 | Chinstrap,Dream,50.5,18.4,200,3400,FEMALE 181 | Chinstrap,Dream,49.5,19,200,3800,MALE 182 | Chinstrap,Dream,46.4,17.8,191,3700,FEMALE 183 | Chinstrap,Dream,52.8,20,205,4550,MALE 184 | Chinstrap,Dream,40.9,16.6,187,3200,FEMALE 185 | Chinstrap,Dream,54.2,20.8,201,4300,MALE 186 | Chinstrap,Dream,42.5,16.7,187,3350,FEMALE 187 | Chinstrap,Dream,51,18.8,203,4100,MALE 188 | Chinstrap,Dream,49.7,18.6,195,3600,MALE 189 | Chinstrap,Dream,47.5,16.8,199,3900,FEMALE 190 | Chinstrap,Dream,47.6,18.3,195,3850,FEMALE 191 | Chinstrap,Dream,52,20.7,210,4800,MALE 192 | Chinstrap,Dream,46.9,16.6,192,2700,FEMALE 193 | Chinstrap,Dream,53.5,19.9,205,4500,MALE 194 | Chinstrap,Dream,49,19.5,210,3950,MALE 195 | Chinstrap,Dream,46.2,17.5,187,3650,FEMALE 196 | Chinstrap,Dream,50.9,19.1,196,3550,MALE 197 | Chinstrap,Dream,45.5,17,196,3500,FEMALE 198 | Chinstrap,Dream,50.9,17.9,196,3675,FEMALE 199 | Chinstrap,Dream,50.8,18.5,201,4450,MALE 200 | Chinstrap,Dream,50.1,17.9,190,3400,FEMALE 201 | Chinstrap,Dream,49,19.6,212,4300,MALE 202 | Chinstrap,Dream,51.5,18.7,187,3250,MALE 203 | Chinstrap,Dream,49.8,17.3,198,3675,FEMALE 204 | Chinstrap,Dream,48.1,16.4,199,3325,FEMALE 205 | Chinstrap,Dream,51.4,19,201,3950,MALE 206 | Chinstrap,Dream,45.7,17.3,193,3600,FEMALE 207 | Chinstrap,Dream,50.7,19.7,203,4050,MALE 208 | Chinstrap,Dream,42.5,17.3,187,3350,FEMALE 209 | Chinstrap,Dream,52.2,18.8,197,3450,MALE 210 | Chinstrap,Dream,45.2,16.6,191,3250,FEMALE 211 | Chinstrap,Dream,49.3,19.9,203,4050,MALE 212 | Chinstrap,Dream,50.2,18.8,202,3800,MALE 213 | Chinstrap,Dream,45.6,19.4,194,3525,FEMALE 214 | Chinstrap,Dream,51.9,19.5,206,3950,MALE 215 | Chinstrap,Dream,46.8,16.5,189,3650,FEMALE 216 | Chinstrap,Dream,45.7,17,195,3650,FEMALE 217 | 
Chinstrap,Dream,55.8,19.8,207,4000,MALE 218 | Chinstrap,Dream,43.5,18.1,202,3400,FEMALE 219 | Chinstrap,Dream,49.6,18.2,193,3775,MALE 220 | Chinstrap,Dream,50.8,19,210,4100,MALE 221 | Chinstrap,Dream,50.2,18.7,198,3775,FEMALE 222 | Gentoo,Biscoe,46.1,13.2,211,4500,FEMALE 223 | Gentoo,Biscoe,50,16.3,230,5700,MALE 224 | Gentoo,Biscoe,48.7,14.1,210,4450,FEMALE 225 | Gentoo,Biscoe,50,15.2,218,5700,MALE 226 | Gentoo,Biscoe,47.6,14.5,215,5400,MALE 227 | Gentoo,Biscoe,46.5,13.5,210,4550,FEMALE 228 | Gentoo,Biscoe,45.4,14.6,211,4800,FEMALE 229 | Gentoo,Biscoe,46.7,15.3,219,5200,MALE 230 | Gentoo,Biscoe,43.3,13.4,209,4400,FEMALE 231 | Gentoo,Biscoe,46.8,15.4,215,5150,MALE 232 | Gentoo,Biscoe,40.9,13.7,214,4650,FEMALE 233 | Gentoo,Biscoe,49,16.1,216,5550,MALE 234 | Gentoo,Biscoe,45.5,13.7,214,4650,FEMALE 235 | Gentoo,Biscoe,48.4,14.6,213,5850,MALE 236 | Gentoo,Biscoe,45.8,14.6,210,4200,FEMALE 237 | Gentoo,Biscoe,49.3,15.7,217,5850,MALE 238 | Gentoo,Biscoe,42,13.5,210,4150,FEMALE 239 | Gentoo,Biscoe,49.2,15.2,221,6300,MALE 240 | Gentoo,Biscoe,46.2,14.5,209,4800,FEMALE 241 | Gentoo,Biscoe,48.7,15.1,222,5350,MALE 242 | Gentoo,Biscoe,50.2,14.3,218,5700,MALE 243 | Gentoo,Biscoe,45.1,14.5,215,5000,FEMALE 244 | Gentoo,Biscoe,46.5,14.5,213,4400,FEMALE 245 | Gentoo,Biscoe,46.3,15.8,215,5050,MALE 246 | Gentoo,Biscoe,42.9,13.1,215,5000,FEMALE 247 | Gentoo,Biscoe,46.1,15.1,215,5100,MALE 248 | Gentoo,Biscoe,44.5,14.3,216,4100, 249 | Gentoo,Biscoe,47.8,15,215,5650,MALE 250 | Gentoo,Biscoe,48.2,14.3,210,4600,FEMALE 251 | Gentoo,Biscoe,50,15.3,220,5550,MALE 252 | Gentoo,Biscoe,47.3,15.3,222,5250,MALE 253 | Gentoo,Biscoe,42.8,14.2,209,4700,FEMALE 254 | Gentoo,Biscoe,45.1,14.5,207,5050,FEMALE 255 | Gentoo,Biscoe,59.6,17,230,6050,MALE 256 | Gentoo,Biscoe,49.1,14.8,220,5150,FEMALE 257 | Gentoo,Biscoe,48.4,16.3,220,5400,MALE 258 | Gentoo,Biscoe,42.6,13.7,213,4950,FEMALE 259 | Gentoo,Biscoe,44.4,17.3,219,5250,MALE 260 | Gentoo,Biscoe,44,13.6,208,4350,FEMALE 261 | 
Gentoo,Biscoe,48.7,15.7,208,5350,MALE 262 | Gentoo,Biscoe,42.7,13.7,208,3950,FEMALE 263 | Gentoo,Biscoe,49.6,16,225,5700,MALE 264 | Gentoo,Biscoe,45.3,13.7,210,4300,FEMALE 265 | Gentoo,Biscoe,49.6,15,216,4750,MALE 266 | Gentoo,Biscoe,50.5,15.9,222,5550,MALE 267 | Gentoo,Biscoe,43.6,13.9,217,4900,FEMALE 268 | Gentoo,Biscoe,45.5,13.9,210,4200,FEMALE 269 | Gentoo,Biscoe,50.5,15.9,225,5400,MALE 270 | Gentoo,Biscoe,44.9,13.3,213,5100,FEMALE 271 | Gentoo,Biscoe,45.2,15.8,215,5300,MALE 272 | Gentoo,Biscoe,46.6,14.2,210,4850,FEMALE 273 | Gentoo,Biscoe,48.5,14.1,220,5300,MALE 274 | Gentoo,Biscoe,45.1,14.4,210,4400,FEMALE 275 | Gentoo,Biscoe,50.1,15,225,5000,MALE 276 | Gentoo,Biscoe,46.5,14.4,217,4900,FEMALE 277 | Gentoo,Biscoe,45,15.4,220,5050,MALE 278 | Gentoo,Biscoe,43.8,13.9,208,4300,FEMALE 279 | Gentoo,Biscoe,45.5,15,220,5000,MALE 280 | Gentoo,Biscoe,43.2,14.5,208,4450,FEMALE 281 | Gentoo,Biscoe,50.4,15.3,224,5550,MALE 282 | Gentoo,Biscoe,45.3,13.8,208,4200,FEMALE 283 | Gentoo,Biscoe,46.2,14.9,221,5300,MALE 284 | Gentoo,Biscoe,45.7,13.9,214,4400,FEMALE 285 | Gentoo,Biscoe,54.3,15.7,231,5650,MALE 286 | Gentoo,Biscoe,45.8,14.2,219,4700,FEMALE 287 | Gentoo,Biscoe,49.8,16.8,230,5700,MALE 288 | Gentoo,Biscoe,46.2,14.4,214,4650, 289 | Gentoo,Biscoe,49.5,16.2,229,5800,MALE 290 | Gentoo,Biscoe,43.5,14.2,220,4700,FEMALE 291 | Gentoo,Biscoe,50.7,15,223,5550,MALE 292 | Gentoo,Biscoe,47.7,15,216,4750,FEMALE 293 | Gentoo,Biscoe,46.4,15.6,221,5000,MALE 294 | Gentoo,Biscoe,48.2,15.6,221,5100,MALE 295 | Gentoo,Biscoe,46.5,14.8,217,5200,FEMALE 296 | Gentoo,Biscoe,46.4,15,216,4700,FEMALE 297 | Gentoo,Biscoe,48.6,16,230,5800,MALE 298 | Gentoo,Biscoe,47.5,14.2,209,4600,FEMALE 299 | Gentoo,Biscoe,51.1,16.3,220,6000,MALE 300 | Gentoo,Biscoe,45.2,13.8,215,4750,FEMALE 301 | Gentoo,Biscoe,45.2,16.4,223,5950,MALE 302 | Gentoo,Biscoe,49.1,14.5,212,4625,FEMALE 303 | Gentoo,Biscoe,52.5,15.6,221,5450,MALE 304 | Gentoo,Biscoe,47.4,14.6,212,4725,FEMALE 305 | Gentoo,Biscoe,50,15.9,224,5350,MALE 306 | 
Gentoo,Biscoe,44.9,13.8,212,4750,FEMALE 307 | Gentoo,Biscoe,50.8,17.3,228,5600,MALE 308 | Gentoo,Biscoe,43.4,14.4,218,4600,FEMALE 309 | Gentoo,Biscoe,51.3,14.2,218,5300,MALE 310 | Gentoo,Biscoe,47.5,14,212,4875,FEMALE 311 | Gentoo,Biscoe,52.1,17,230,5550,MALE 312 | Gentoo,Biscoe,47.5,15,218,4950,FEMALE 313 | Gentoo,Biscoe,52.2,17.1,228,5400,MALE 314 | Gentoo,Biscoe,45.5,14.5,212,4750,FEMALE 315 | Gentoo,Biscoe,49.5,16.1,224,5650,MALE 316 | Gentoo,Biscoe,44.5,14.7,214,4850,FEMALE 317 | Gentoo,Biscoe,50.8,15.7,226,5200,MALE 318 | Gentoo,Biscoe,49.4,15.8,216,4925,MALE 319 | Gentoo,Biscoe,46.9,14.6,222,4875,FEMALE 320 | Gentoo,Biscoe,48.4,14.4,203,4625,FEMALE 321 | Gentoo,Biscoe,51.1,16.5,225,5250,MALE 322 | Gentoo,Biscoe,48.5,15,219,4850,FEMALE 323 | Gentoo,Biscoe,55.9,17,228,5600,MALE 324 | Gentoo,Biscoe,47.2,15.5,215,4975,FEMALE 325 | Gentoo,Biscoe,49.1,15,228,5500,MALE 326 | Gentoo,Biscoe,47.3,13.8,216,4725, 327 | Gentoo,Biscoe,46.8,16.1,215,5500,MALE 328 | Gentoo,Biscoe,41.7,14.7,210,4700,FEMALE 329 | Gentoo,Biscoe,53.4,15.8,219,5500,MALE 330 | Gentoo,Biscoe,43.3,14,208,4575,FEMALE 331 | Gentoo,Biscoe,48.1,15.1,209,5500,MALE 332 | Gentoo,Biscoe,50.5,15.2,216,5000,FEMALE 333 | Gentoo,Biscoe,49.8,15.9,229,5950,MALE 334 | Gentoo,Biscoe,43.5,15.2,213,4650,FEMALE 335 | Gentoo,Biscoe,51.5,16.3,230,5500,MALE 336 | Gentoo,Biscoe,46.2,14.1,217,4375,FEMALE 337 | Gentoo,Biscoe,55.1,16,230,5850,MALE 338 | Gentoo,Biscoe,44.5,15.7,217,4875, 339 | Gentoo,Biscoe,48.8,16.2,222,6000,MALE 340 | Gentoo,Biscoe,47.2,13.7,214,4925,FEMALE 341 | Gentoo,Biscoe,,,,, 342 | Gentoo,Biscoe,46.8,14.3,215,4850,FEMALE 343 | Gentoo,Biscoe,50.4,15.7,222,5750,MALE 344 | Gentoo,Biscoe,45.2,14.8,212,5200,FEMALE 345 | Gentoo,Biscoe,49.9,16.1,213,5400,MALE 346 | -------------------------------------------------------------------------------- /data/gaia/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/8f80e01c-1296-4371-9486-bb3d68651a60.png -------------------------------------------------------------------------------- /data/gaia/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/9318445f-fe6a-4e1b-acbf-c68228c9906a.png -------------------------------------------------------------------------------- /data/gaia/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/99c9cc74-fdc8-46c6-8f8d-3ce2d3bfeea3.mp3 -------------------------------------------------------------------------------- /data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f.zip -------------------------------------------------------------------------------- /data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f/food_duplicates.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/9b54f9d9-35ee-4a14-b62f-d130ea00317f/food_duplicates.xls -------------------------------------------------------------------------------- /data/gaia/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/a3fbeb63-0e8c-4a11-bff6-0e3b484c3e9c.pptx -------------------------------------------------------------------------------- /data/gaia/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/b2c257e0-3ad7-4f05-b8e3-d9da973be36e.jpg -------------------------------------------------------------------------------- /data/gaia/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/b7f857e4-d8aa-4387-af2a-0e844df5b9d8.png -------------------------------------------------------------------------------- /data/gaia/validation/bec74516-02fc-48dc-b202-55e78d0e17cf.jsonld: -------------------------------------------------------------------------------- 1 | { 2 | "@context": "http://schema.org", 3 | "@type": "Collection", 4 | "@id": "https://doi.org/10.5447/ipk/2022/29", 5 | "url": "https://doi.ipk-gatersleben.de:443/DOI/64fb788c-7495-4800-8568-fd562b07017e/fbda7260-8307-485e-a9b7-d84292e3eb04/2", 6 | "additionalType": "directory", 7 | "name": "GLOBAL STRATEGY FOR THE CONSERVATION OF POTATO", 8 | "author": { 9 | "name": "Manuela Nagel", 10 | "givenName": "Manuela", 11 | "familyName": "Nagel", 12 | "affiliation": { 13 | "@type": "Organization", 14 | "name": "Leibniz Institute of Plant Genetics and Crop Plant Research (IPK), Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany" 15 | }, 16 | "@id": "https://orcid.org/0000-0003-0396-0333" 
17 | }, 18 | "editor": [ 19 | { 20 | "name": "Ehsan Dulloo", 21 | "givenName": "Ehsan", 22 | "familyName": "Dulloo", 23 | "affiliation": { 24 | "@type": "Organization", 25 | "name": "International Consultant, ," 26 | }, 27 | "contributorType": "Researcher" 28 | }, 29 | { 30 | "name": "Prishnee Bissessur", 31 | "givenName": "Prishnee", 32 | "familyName": "Bissessur", 33 | "affiliation": { 34 | "@type": "Organization", 35 | "name": "International Consultant, ," 36 | }, 37 | "contributorType": "Researcher" 38 | }, 39 | { 40 | "name": "Tatjana Gavrilenko", 41 | "givenName": "Tatjana", 42 | "familyName": "Gavrilenko", 43 | "affiliation": { 44 | "@type": "Organization", 45 | "name": "N.I. Vavilov All-Russian Institute of Plant Genetic Resources, , Russia" 46 | }, 47 | "contributorType": "Researcher", 48 | "@id": "https://orcid.org/0000-0002-2605-6569" 49 | }, 50 | { 51 | "name": "John Bamberg", 52 | "givenName": "John", 53 | "familyName": "Bamberg", 54 | "affiliation": { 55 | "@type": "Organization", 56 | "name": "U. S. Potato Genebank, , USA" 57 | }, 58 | "contributorType": "Researcher", 59 | "@id": "https://orcid.org/0000-0001-6102-7846" 60 | }, 61 | { 62 | "name": "David Ellis", 63 | "givenName": "David", 64 | "familyName": "Ellis", 65 | "affiliation": { 66 | "@type": "Organization", 67 | "name": "International Potato Center (CIP), , Peru" 68 | }, 69 | "contributorType": "Researcher", 70 | "@id": "https://orcid.org/0000-0002-0209-2784" 71 | }, 72 | { 73 | "name": "Peter Giovannini", 74 | "givenName": "Peter", 75 | "familyName": "Giovannini", 76 | "affiliation": { 77 | "@type": "Organization", 78 | "name": "Global Crop Diversity Trust, ," 79 | }, 80 | "contributorType": "Researcher", 81 | "@id": "https://orcid.org/0000-0002-1053-2030" 82 | } 83 | ], 84 | "description": "Cultivated potato, Solanum tuberosum ssp. 
tuberosum, is the third most consumed crop globally and important not only for food but also for for the animal feed, pharmaceutical, textile and paper industries. To gain an overview on the current state of the conservation and use of potato genetic resources, the Global Crop Diversity Trust (Crop Trust), commissioned an update of the ‘Global conservation strategy for potato genetic resources’. This updated strategy aims to support the efficiency and effectiveness of potato diversity conservation at national, regional and international levels, and to identify priorities for strengthening the conservation and use of potato genetic resources.", 85 | "keywords": "ex situ conservation, plant genetic resources, potato, Solanum tuberosum, global strategy, conservation strategy, wild potato, Andigenum group, Chilotanum group, native potato variety, genebank, accession, true potato seed, potato tuber, late blight", 86 | "inLanguage": "en", 87 | "contentSize": "0 B", 88 | "datePublished": "2022", 89 | "schemaVersion": "http://datacite.org/schema/kernel-4", 90 | "publisher": { 91 | "@type": "Organization", 92 | "name": "e!DAL - Plant Genomics and Phenomics Research Data Repository (PGP), IPK Gatersleben, Seeland OT Gatersleben, Corrensstraße 3, 06466, Germany" 93 | }, 94 | "provider": { 95 | "@type": "Organization", 96 | "name": "datacite" 97 | } 98 | } -------------------------------------------------------------------------------- /data/gaia/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/bfcd99e1-0690-4b53-a85c-0174a8629083.zip -------------------------------------------------------------------------------- /data/gaia/validation/bfcd99e1-0690-4b53-a85c-0174a8629083/Applicants.xlsx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/bfcd99e1-0690-4b53-a85c-0174a8629083/Applicants.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/bfcd99e1-0690-4b53-a85c-0174a8629083/Job Listing.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/bfcd99e1-0690-4b53-a85c-0174a8629083/Job Listing.pdf -------------------------------------------------------------------------------- /data/gaia/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.png -------------------------------------------------------------------------------- /data/gaia/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/c526d8d6-5987-4da9-b24c-83466fa172f3.xlsx -------------------------------------------------------------------------------- /data/gaia/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/cca530fc-4052-43b2-b130-b30968d8aa44.png -------------------------------------------------------------------------------- 
/data/gaia/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/cca70ce6-1952-45d2-acd4-80c903b0bc49.png -------------------------------------------------------------------------------- /data/gaia/validation/cffe0e32-c9a6-4c52-9877-78ceb4aaa9fb.docx: -------------------------------------------------------------------------------- 1 | Employees 2 | 3 | Harry 4 | Rebecca 5 | Georgette 6 | Micah 7 | Perry 8 | Tyson 9 | Lucy 10 | Jun 11 | Sara 12 | Miguel 13 | Fred 14 | Alex -------------------------------------------------------------------------------- /data/gaia/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/d8152ad6-e4d5-4c12-8bb7-8d57dc10c6de.png -------------------------------------------------------------------------------- /data/gaia/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.png -------------------------------------------------------------------------------- /data/gaia/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.xlsx -------------------------------------------------------------------------------- 
/data/gaia/validation/df6561b2-7ee5-4540-baab-5095f742716a.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/df6561b2-7ee5-4540-baab-5095f742716a.png -------------------------------------------------------------------------------- /data/gaia/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.pdf -------------------------------------------------------------------------------- /data/gaia/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/e9a2c537-8232-4c3f-85b0-b52de6bcba99.png -------------------------------------------------------------------------------- /data/gaia/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.png -------------------------------------------------------------------------------- /data/gaia/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/data/gaia/validation/edd4d4f2-1a58-45c4-b038-67337af4e029.xlsx 
-------------------------------------------------------------------------------- /data/gaia/validation/f918266a-b3e0-4914-865d-4faa564f1aef.py: -------------------------------------------------------------------------------- 1 | from random import randint 2 | import time 3 | 4 | class UhOh(Exception): 5 | pass 6 | 7 | class Hmm: 8 | def __init__(self): 9 | self.value = randint(-100, 100) 10 | 11 | def Yeah(self): 12 | if self.value == 0: 13 | return True 14 | else: 15 | raise UhOh() 16 | 17 | def Okay(): 18 | while True: 19 | yield Hmm() 20 | 21 | def keep_trying(go, first_try=True): 22 | maybe = next(go) 23 | try: 24 | if maybe.Yeah(): 25 | return maybe.value 26 | except UhOh: 27 | if first_try: 28 | print("Working...") 29 | print("Please wait patiently...") 30 | time.sleep(0.1) 31 | return keep_trying(go, first_try=False) 32 | 33 | if __name__ == "__main__": 34 | go = Okay() 35 | print(f"{keep_trying(go)}") 36 | -------------------------------------------------------------------------------- /figures/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/figures/.DS_Store -------------------------------------------------------------------------------- /figures/aggregate_errors.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/figures/aggregate_errors.png -------------------------------------------------------------------------------- /figures/aggregate_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/figures/aggregate_score.png 
# --- preceding entries of the repository dump (binary assets, unchanged) ---
# /figures/aggregate_score_GPT-4-Turbo.png: https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/figures/aggregate_score_GPT-4-Turbo.png
# /figures/aggregate_score_Llama3-70B-Instruct.png: https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/figures/aggregate_score_Llama3-70B-Instruct.png
# /figures/aggregate_score_vs_langchain.png: https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/figures/aggregate_score_vs_langchain.png

# /gaia.py
import asyncio
import os
from openai import OpenAI
from typing import Optional
import json
import pandas as pd
from dotenv import load_dotenv
import datasets
from huggingface_hub import login
from transformers.agents.llm_engine import MessageRole, get_clean_message_list
from transformers.agents import ReactCodeAgent, ReactJsonAgent, HfEngine
from transformers.agents.prompts import DEFAULT_REACT_CODE_SYSTEM_PROMPT, DEFAULT_REACT_JSON_SYSTEM_PROMPT
from transformers.agents.default_tools import Tool, PythonInterpreterTool
from scripts.tools.web_surfer import (
    SearchInformationTool,
    NavigationalSearchTool,
    VisitTool,
    PageUpTool,
    PageDownTool,
    FinderTool,
    FindNextTool,
    ArchiveSearchTool,
)
from scripts.tools.mdconvert import MarkdownConverter
from scripts.reformulator import prepare_response
from scripts.run_agents import answer_questions
from scripts.tools.visual_qa import VisualQATool, VisualQAGPT4Tool

# Load credentials from .env (overriding the shell) and log in to the Hub.
load_dotenv(override=True)
login(os.getenv("HUGGINGFACEHUB_API_TOKEN"))

### IMPORTANT: EVALUATION SWITCHES

print("Make sure you deactivated Tailscale VPN, else some URLs will be blocked!")

OUTPUT_DIR = "output_gaia"
USE_OS_MODELS = False  # True: open-source engine via HfEngine; False: OpenAI
USE_JSON = False  # True: ReactJsonAgent; False: ReactCodeAgent

SET = "validation"  # dataset split; also used in attachment paths and the output folder

### BUILD LLM ENGINES

# The OpenAI engine sends tool responses with the user role.
openai_role_conversions = {
    MessageRole.TOOL_RESPONSE: MessageRole.USER,
}


class OpenAIModel:
    """Callable LLM engine backed by the OpenAI chat-completions API."""

    def __init__(self, model_name="gpt-4o", temperature=0.5):
        """Create the client.

        Args:
            model_name: Name of the OpenAI chat model to query.
            temperature: Sampling temperature sent with every request
                (defaults to the previously hard-coded 0.5).
        """
        self.model_name = model_name
        self.temperature = temperature
        self.client = OpenAI(
            api_key=os.getenv("OPENAI_API_KEY"),
        )

    def __call__(self, messages, stop_sequences=None):
        """Send ``messages`` to the model and return the reply text.

        Args:
            messages: Agent message list; it is passed through
                ``get_clean_message_list`` with ``openai_role_conversions``.
            stop_sequences: Optional list of stop strings. ``None`` is
                treated as an empty list (a mutable ``[]`` default would be
                shared between calls).

        Returns:
            The content of the first completion choice.
        """
        if stop_sequences is None:
            stop_sequences = []
        messages = get_clean_message_list(messages, role_conversions=openai_role_conversions)

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            stop=stop_sequences,
            temperature=self.temperature,
        )
        return response.choices[0].message.content


oai_llm_engine = OpenAIModel()


url_llama3 = "meta-llama/Meta-Llama-3-70B-Instruct"
url_qwen2 = "https://azbwihkodyacoe54.us-east-1.aws.endpoints.huggingface.cloud"
url_command_r = "CohereForAI/c4ai-command-r-plus"

### LOAD EVALUATION DATASET

eval_ds = datasets.load_dataset("gaia-benchmark/GAIA", "2023_all")[SET]
eval_ds = eval_ds.rename_columns(
    {"Question": "question", "Final answer": "true_answer", "Level": "task"}
)


def preprocess_file_paths(row):
    """Prefix a non-empty ``file_name`` with the local ``data/gaia/<SET>/`` folder.

    Rows with an empty (or missing/None) file name are returned unchanged.
    """
    if row["file_name"]:
        row["file_name"] = f"data/gaia/{SET}/" + row["file_name"]
    return row
eval_ds = eval_ds.map(preprocess_file_paths)

eval_df = pd.DataFrame(eval_ds)
print("Loaded evaluation dataset:")
print(pd.Series(eval_ds["task"]).value_counts())


websurfer_llm_engine = HfEngine(
    model=url_qwen2,
)  # chosen for its high context length

# Replace with OAI if needed
if not USE_OS_MODELS:
    websurfer_llm_engine = oai_llm_engine

### BUILD AGENTS & TOOLS

WEB_TOOLS = [
    SearchInformationTool(),
    NavigationalSearchTool(),
    VisitTool(),
    PageUpTool(),
    PageDownTool(),
    FinderTool(),
    FindNextTool(),
    ArchiveSearchTool(),
]


class TextInspectorTool(Tool):
    """Tool that converts a file to markdown text and optionally answers a question about it."""

    name = "inspect_file_as_text"
    description = """
You cannot load files yourself: instead call this tool to read a file as markdown text and ask questions about it.
This tool handles the following file extensions: [".html", ".htm", ".xlsx", ".pptx", ".wav", ".mp3", ".flac", ".pdf", ".docx"], and all other types of text files. IT DOES NOT HANDLE IMAGES."""

    inputs = {
        "question": {
            "description": "[Optional]: Your question, as a natural language sentence. Provide as much context as possible. Do not pass this parameter if you just want to directly return the content of the file.",
            "type": "text",
        },
        "file_path": {
            "description": "The path to the file you want to read as text. Must be a '.something' file, like '.pdf'. If it is an image, use the visualizer tool instead! DO NOT USE THIS TOOL FOR A WEBPAGE: use the search tool instead!",
            "type": "text",
        },
    }
    output_type = "text"
    md_converter = MarkdownConverter()

    def forward(self, file_path, question: Optional[str] = None, initial_exam_mode: Optional[bool] = False) -> str:
        """Return the file as text, or an LLM answer about it.

        Args:
            file_path: Path of the file to convert.
            question: Optional question about the file; when empty the raw
                converted text is returned.
            initial_exam_mode: When truthy, the file is sent first and the
                question second, with no formatting instructions.

        Returns:
            The converted text (no question, or a path containing ".zip"),
            otherwise the reply of ``websurfer_llm_engine``.

        Raises:
            Exception: If ``file_path`` ends with ".png" or ".jpg".
        """
        # Reject images before converting: the conversion was wasted work and
        # could mask this message with its own error.
        if file_path[-4:] in ['.png', '.jpg']:
            raise Exception("Cannot use inspect_file_as_text tool with images: use visualizer instead!")

        result = self.md_converter.convert(file_path)

        if ".zip" in file_path or not question:
            return result.text_content

        # Title plus at most the first 70000 characters of the converted text.
        file_body = "\n### " + str(result.title) + "\n\n" + result.text_content[:70000]

        if initial_exam_mode:
            messages = [
                {"role": "user", "content": "Here is a file:" + file_body},
                {"role": "user", "content": question},
            ]
        else:
            messages = [
                {
                    "role": "user",
                    "content": "You will have to write a short caption for this file, then answer this question:"
                    + question,
                },
                {"role": "user", "content": "Here is the complete file:" + file_body},
                {
                    "role": "user",
                    "content": "Now answer the question below. Use these three headings: '1. Short answer', '2. Extremely detailed answer', '3. Additional Context on the document and question asked'."
                    + question,
                },
            ]
        return websurfer_llm_engine(messages)


surfer_agent = ReactJsonAgent(
    llm_engine=websurfer_llm_engine,
    tools=WEB_TOOLS,
    max_iterations=12,
    verbose=2,
    system_prompt=DEFAULT_REACT_JSON_SYSTEM_PROMPT + "\nAdditionally, if after some searching you find out that you need more information to answer the question, you can use `final_answer` with your request for clarification as argument to request for more information.",
    planning_interval=4,
)


class SearchTool(Tool):
    """Tool that delegates a web-search request to ``surfer_agent`` and reports back."""

    name = "ask_search_agent"
    description = """
This will send a message to a team member that will browse the internet to answer your question.
Ask him for all your web-search related questions, but he's unable to do problem-solving.
Provide him as much context as possible, in particular if you need to search on a specific timeframe!
And don't hesitate to provide them with a complex search task, like finding a difference between two webpages.
"""

    inputs = {
        "query": {
            "description": "Your question, as a natural language sentence with a verb! You are talking to an human, so provide them with as much context as possible! DO NOT ASK a google-like query like 'paper about fish species 2011': instead ask a real sentence like: 'What appears on the last figure of a paper about fish species published in 2011?'",
            "type": "text",
        }
    }
    output_type = "text"

    def forward(self, query: str) -> str:
        """Run the surfer agent on ``query`` and return a condensed report.

        The report lists the agent's inner-memory messages (long ones are
        shortened) followed by the agent's final answer.
        """
        final_answer = surfer_agent.run(f"""
You've been submitted this request by your manager: '{query}'

You're helping your manager solve a wider task: so make sure to not provide a one-line answer, but give as much information as possible so that they have a clear understanding of the answer.

Your final_answer WILL HAVE to contain these parts:
### 1. Search outcome (short version):
### 2. Search outcome (extremely detailed version):
### 3. Additional context:

Put all these in your final_answer, everything that you do not pass as an argument to final_answer will be lost.

You can navigate to .txt or .pdf online files using your 'visit_page' tool.
If it's another format, you can return the url of the file, and your manager will handle the download and inspection from there.

And even if your search is unsuccessful, please return as much context as possible, so they can act upon this feedback.
""")
        parts = ["Here is the report from your team member's search:\n"]
        for message in surfer_agent.write_inner_memory_from_logs():
            content = message['content']
            text = str(content)
            if 'tool_arguments' in text:
                # Tool call: keep short ones (and fact lists) whole, otherwise
                # only the tool name.
                if len(text) < 1000 or "[FACTS]" in text:
                    parts.append(text + "\n")
                else:
                    try:
                        parts.append(f"{json.loads(content)['tool_name']}\n")
                    except (ValueError, TypeError, KeyError):
                        # Not a JSON object with a tool name: fall back to a
                        # truncated dump (sliced on the string form so that
                        # non-string content cannot raise here).
                        parts.append(f"{text[:1000]}(...)\n")
            elif len(text) > 2000:
                parts.append(">>> Tool output too long to show, showing only the beginning:\n" + text[:500] + '\n(...)\n\n')
            else:
                parts.append(">>> " + text + "\n\n")
        parts.append("\nNow here is the team member's final answer deducted from the above:\n")
        parts.append(str(final_answer))
        return "".join(parts)


ti_tool = TextInspectorTool()

TASK_SOLVING_TOOLBOX = [
    SearchTool(),
    VisualQAGPT4Tool(),  # VisualQATool(),
    ti_tool,
]

if USE_JSON:
    TASK_SOLVING_TOOLBOX.append(PythonInterpreterTool())

hf_llm_engine = HfEngine(model=url_qwen2)

llm_engine = hf_llm_engine if USE_OS_MODELS else oai_llm_engine

# Build only the agent flavour that is selected, instead of constructing a
# code agent and then discarding it when USE_JSON is set.
if USE_JSON:
    react_agent = ReactJsonAgent(
        llm_engine=llm_engine,
        tools=TASK_SOLVING_TOOLBOX,
        max_iterations=15,
        verbose=0,
        memory_verbose=True,
        system_prompt=DEFAULT_REACT_JSON_SYSTEM_PROMPT,
        planning_interval=2,
    )
else:
    react_agent = ReactCodeAgent(
        llm_engine=llm_engine,
        tools=TASK_SOLVING_TOOLBOX,
        max_iterations=15,
        verbose=0,
        memory_verbose=True,
        system_prompt=DEFAULT_REACT_CODE_SYSTEM_PROMPT,
        additional_authorized_imports=[
            "requests",
            "zipfile",
            "os",
            "pandas",
            "numpy",
            "sympy",
            "json",
            "bs4",
            "pubchempy",
            "xml",
            "yahoo_finance",
            "Bio",
            "sklearn",
            "scipy",
            "pydub",
            "io",
            "PIL",
            "chess",
            "PyPDF2",
            "pptx",
            "torch",
            "datetime",
            "csv",
            "fractions",
        ],
        planning_interval=2,
    )

### EVALUATE


async def call_transformers(agent, question: str, **kwargs) -> dict:
    """Run ``agent`` on ``question`` and package the outcome.

    Returns:
        A dict with ``"output"`` (the reformulated final answer as a string,
        or the raw agent result if reformulation fails) and
        ``"intermediate_steps"`` (the agent logs without their
        ``"agent_memory"`` entries).
    """
    result = agent.run(question, **kwargs)
    agent_memory = agent.write_inner_memory_from_logs(summary_mode=True)
    try:
        final_result = prepare_response(question, agent_memory, llm_engine)
    except Exception as e:
        # Deliberate best-effort: a failed reformulation must not lose the run.
        print(e)
        final_result = result
    return {
        "output": str(final_result),
        "intermediate_steps": [
            {key: value for key, value in log.items() if key != "agent_memory"}
            for log in agent.logs
        ],
    }


results = asyncio.run(answer_questions(
    eval_ds,
    react_agent,
    "react_code_claude_28-june_planning2_newprompt5",
    output_folder=f"{OUTPUT_DIR}/{SET}",
    agent_call_function=call_transformers,
    visual_inspection_tool=VisualQAGPT4Tool(),
    text_inspector_tool=ti_tool,
))

# --- next entry of the repository dump (its URL follows on the next dump line) ---
# /scripts/.DS_Store:
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/.DS_Store -------------------------------------------------------------------------------- /scripts/__pycache__/agents.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/agents.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/agents.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/agents.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/browser.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/browser.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/create_agents.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/create_agents.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/evaluation.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/evaluation.cpython-310.pyc 
-------------------------------------------------------------------------------- /scripts/__pycache__/evaluation.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/evaluation.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/gaia_scorer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/gaia_scorer.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/gaia_scorer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/gaia_scorer.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/gaia_scorer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/gaia_scorer.cpython-39.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/mdconvert.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/mdconvert.cpython-310.pyc -------------------------------------------------------------------------------- 
/scripts/__pycache__/modified_calculator.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/modified_calculator.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/new_browser.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/new_browser.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/optimize_prompt.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/optimize_prompt.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/prompts.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/prompts.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/prompts.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/prompts.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/python_evaluator.cpython-311.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/python_evaluator.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/reformulator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/reformulator.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/run_agents.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/run_agents.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/run_agents.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/run_agents.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/run_agents.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/run_agents.cpython-39.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/serpapi_browser.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/serpapi_browser.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/visual_qa.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/visual_qa.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/visual_qa.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/visual_qa.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/web_surfer.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/web_surfer.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/web_surfer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/web_surfer.cpython-311.pyc -------------------------------------------------------------------------------- /scripts/__pycache__/web_surfer.cpython-39.pyc: -------------------------------------------------------------------------------- 
# --- end of the preceding repository dump entry (binary asset, unchanged) ---
# https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/__pycache__/web_surfer.cpython-39.pyc

# /scripts/create_agents.py
from typing import List, Optional, Dict
import numexpr
import math

from langchain.agents.output_parsers import (
    ReActJsonSingleInputOutputParser,
    OpenAIFunctionsAgentOutputParser,
)
from langchain.llms import HuggingFaceEndpoint
from langchain.chat_models import ChatOpenAI
from langchain.tools.render import (
    render_text_description_and_args,
    format_tool_to_openai_function,
)
from langchain.agents.format_scratchpad import (
    format_to_openai_function_messages,
    format_log_to_str,
)
from langchain.prompts import (
    ChatPromptTemplate,
    HumanMessagePromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    MessagesPlaceholder,
)
from langchain.agents import AgentExecutor, load_tools
from langchain.schema import HumanMessage
from langchain.chat_models.base import BaseChatModel
from langchain_community.chat_models.huggingface import ChatHuggingFace
from transformers.agents import Tool

from scripts.prompts import HUMAN_PROMPT, SYSTEM_PROMPT, SCRATCHPAD_PROMPT


class CalculatorTool(Tool):
    """Tool that evaluates an arithmetic expression with ``numexpr``."""

    name = "calculator"
    description = "This is a tool that calculates. It can be used to perform simple arithmetic operations."

    inputs = {
        "expression": {
            "type": "text",
            "description": "The expression to be evaluated.The variables used CANNOT be placeholders like 'x' or 'mike's age', they must be numbers",
        }
    }
    output_type = "text"

    def forward(self, expression):
        """Evaluate ``expression`` and return the result as a string.

        Args:
            expression: The expression text, or a dict carrying it under the
                ``"expression"`` key. ``^`` is treated as exponentiation.
        """
        if isinstance(expression, dict):
            expression = expression["expression"]
        local_dict = {"pi": math.pi, "e": math.e}
        output = str(
            numexpr.evaluate(
                expression.strip().replace("^", "**"),
                global_dict={},  # restrict access to globals
                local_dict=local_dict,  # expose the constants pi and e
            )
        )
        return output


def init_tools_with_llm(llm: BaseChatModel):
    """Load the serpapi and llm-math tools and rename them ``search`` / ``calculator``."""
    tools = load_tools(["serpapi", "llm-math"], llm=llm)
    # Rename tools in the same format used by other tools
    # (relies on load_tools returning them in the requested order).
    tools[0].name = "search"
    tools[1].name = "calculator"
    return tools


def build_openai_agent_with_tools(model_id: Optional[str] = "gpt-4-1106-preview") -> AgentExecutor:
    """Build an OpenAI function-calling agent equipped with the search and calculator tools.

    Args:
        model_id: Name of the OpenAI chat model.

    Returns:
        AgentExecutor: Executor limited to 7 iterations that returns its
        intermediate steps and tolerates parsing errors.
    """
    llm = ChatOpenAI(model=model_id, temperature=0.1)
    tools = init_tools_with_llm(llm)

    llm_with_tools = llm.bind(
        functions=[format_tool_to_openai_function(t) for t in tools],
        stop=["Observation:", "<|eot_id|>"],
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant. Answer the following question:"),
            ("user", "{input}"),
            MessagesPlaceholder(variable_name="agent_scratchpad"),
        ]
    )
    agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_to_openai_function_messages(
                x["intermediate_steps"]
            ),
        }
        | prompt
        | llm_with_tools
        | OpenAIFunctionsAgentOutputParser()
    )
    return AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=True,
        return_intermediate_steps=True,
        handle_parsing_errors=True,
        max_iterations=7,
    )


def build_hf_agent_with_tools(hf_endpoint_url: Optional[str] = None, repo_id: Optional[str] = None) -> AgentExecutor:
    """
    Build a zero-shot ReAct chat agent from a Hugging Face endpoint or repo.

    Exactly one of the two arguments must be given.

    Args:
        hf_endpoint_url (str): The endpoint URL for the Hugging Face model.
        repo_id (str): The Hub repository id of the model.

    Returns:
        AgentExecutor: An agent executor object that can be used to run the agent.

    Raises:
        AssertionError: If neither or both arguments are provided.
    """
    # NOTE(review): asserts are stripped under `python -O`; they are kept so
    # the exception type seen by existing callers does not change.
    assert hf_endpoint_url or repo_id, "hf_endpoint_url or repo_id must be provided."
    assert not (hf_endpoint_url and repo_id), "Only one of hf_endpoint_url or repo_id can be provided."

    # instantiate LLM and chat model; only the target argument differs
    # between the endpoint and the repo variants.
    target = {"endpoint_url": hf_endpoint_url} if hf_endpoint_url else {"repo_id": repo_id}
    llm = HuggingFaceEndpoint(
        **target,
        task="text-generation",
        max_new_tokens=512,
        do_sample=False,
        repetition_penalty=1.03,
    )

    chat_model = ChatHuggingFace(llm=llm)
    tools = init_tools_with_llm(llm)

    # define the prompt depending on whether the chat model supports system prompts
    system_prompt_supported = check_supports_system_prompt(chat_model)

    if system_prompt_supported:
        prompt = ChatPromptTemplate.from_messages(
            [
                SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
                HumanMessagePromptTemplate.from_template(HUMAN_PROMPT),
                SystemMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT),
            ]
        )
    else:
        prompt = ChatPromptTemplate.from_messages(
            [
                HumanMessagePromptTemplate.from_template(
                    SYSTEM_PROMPT + "\nSo, here is my question:" + HUMAN_PROMPT
                ),
                AIMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT),
                HumanMessage(content="Now give your next thoughts: "),
            ]
        )

    prompt = prompt.partial(
        tool_description_with_args=render_text_description_and_args(tools),
        tool_names=", ".join([t.name for t in tools]),
    )

    # define the agent
    chat_model_with_stop = chat_model.bind(stop=["Observation:", "<|eot_id|>"])
    agent = (
        {
            "input": lambda x: x["input"],
            "agent_scratchpad": lambda x: format_log_to_str(x["intermediate_steps"]),
        }
        | prompt
        | chat_model_with_stop
        | ReActJsonSingleInputOutputParser()
    )

    return AgentExecutor(
        agent=agent,
        tools=tools,
        verbose=True,
        return_intermediate_steps=True,
        handle_parsing_errors=True,
        max_iterations=7,
    )


def check_supports_system_prompt(chat_model):
    """
    Checks if the given chat model supports system prompts.

    Args:
        chat_model: The chat model to be checked.

    Returns:
        True if the chat model supports system prompts, False otherwise.
    """
    # NOTE(review): this passes a ChatPromptTemplate (whose last entry is a
    # system template) to a private method; if that method expects a list of
    # messages ending with a human message, the probe would always fail and
    # report "not supported" - confirm against the installed langchain version.
    messages = ChatPromptTemplate.from_messages(
        [
            SystemMessagePromptTemplate.from_template(SYSTEM_PROMPT),
            HumanMessagePromptTemplate.from_template(HUMAN_PROMPT),
            SystemMessagePromptTemplate.from_template(SCRATCHPAD_PROMPT),
        ]
    )
    try:
        chat_model._to_chat_prompt(messages)
        print("System prompt supported")
        return True
    except Exception as e:
        # Any failure of the probe is read as "not supported".
        print(e)
        print("System prompt not supported")
        return False


# --- next entries of the repository dump (unchanged) ---
# /scripts/evaluation/__pycache__/gaia_scorer.cpython-310.pyc: https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/evaluation/__pycache__/gaia_scorer.cpython-310.pyc
# /scripts/evaluation/__pycache__/unsolved_questions.cpython-310.pyc: https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/evaluation/__pycache__/unsolved_questions.cpython-310.pyc
# /scripts/evaluation/evaluation.py (first two lines; the file continues on the next dump line)
import os
import re
async def evaluate_single_example(
    example: dict,
    evaluator,
    eval_prompt_template,
    evaluator_name,
    eval_split_string="[RESULT]",
    writer_queue: Optional[Queue] = None,
):
    """
    Grade one example with an LLM judge and store the verdict on the example.

    If the example already carries a finite numeric
    ``eval_score_<evaluator_name>``, it is returned untouched (and is not
    queued) so that previously graded rows are not re-evaluated.

    Args:
        example: Dict holding at least "question", "prediction" and
            "true_answer". It is mutated in place.
        evaluator: Object with an async ``ainvoke(messages)`` method whose
            result exposes the judge's text as ``.content``.
        eval_prompt_template: Object with ``format_messages(instruction=...,
            response=..., reference_answer=...)``.
        evaluator_name: Suffix of the ``eval_score_*`` / ``eval_feedback_*`` keys.
        eval_split_string: Marker separating the feedback from the score.
        writer_queue: Optional queue that receives the annotated example.

    Returns:
        The same ``example`` dict. When the judge reply splits into exactly
        two parts, the score is the stripped text after the marker (a string).
        Otherwise the last purely-numeric segment is used as an ``int`` score
        with empty feedback. If no score can be found at all, the score is
        ``None`` and the raw judge reply is kept as feedback.
    """
    score_key = f"eval_score_{evaluator_name}"
    feedback_key = f"eval_feedback_{evaluator_name}"

    if score_key in example:
        try:
            previous_score = float(example[score_key])
        except (TypeError, ValueError):
            # Unusable stored value (None, "None", free text...): grade again.
            previous_score = float("nan")
        if not np.isnan(previous_score):
            return example

    eval_prompt = eval_prompt_template.format_messages(
        instruction=example["question"],
        response=example["prediction"],
        reference_answer=example["true_answer"],
    )
    print("Evaluating example")
    eval_result = (await evaluator.ainvoke(eval_prompt)).content

    parts = [item.strip() for item in eval_result.split(eval_split_string)]
    if len(parts) == 2:
        feedback, score = parts
    else:
        # Malformed reply: show it, then salvage a numeric segment if any.
        print(eval_result)
        # Defaults guarantee both names are bound even when nothing numeric
        # is found (this case previously raised UnboundLocalError).
        feedback, score = eval_result, None
        for segment in parts:
            if segment.isdigit():
                feedback, score = "", int(segment)

    example[score_key] = score
    example[feedback_key] = feedback
    if writer_queue is not None:
        writer_queue.put(example)
    return example
100 | """ 101 | examples_to_do = examples 102 | previous_evaluations = pd.DataFrame() 103 | 104 | if output_file_path and os.path.isfile(output_file_path): 105 | previous_evaluations = pd.read_json(output_file_path, lines=True) 106 | print(f'Found {len(previous_evaluations)} previous evaluations!') 107 | if f"eval_score_{evaluator_name}" in previous_evaluations.columns: 108 | previous_evaluations = previous_evaluations.loc[previous_evaluations[f"eval_score_{evaluator_name}"].notna()] 109 | 110 | examples_to_do = [example for example in examples if not len(previous_evaluations.loc[ 111 | (previous_evaluations["question"] == example["question"]) & (previous_evaluations["agent_name"] == example["agent_name"]) 112 | ]) > 0] 113 | 114 | print(f"Launching evaluation for {len(examples_to_do)} examples...") 115 | writer_queue = Queue() 116 | 117 | with open(output_file_path, "a") as output_file: 118 | def write_line(): 119 | while True: 120 | if not writer_queue.empty(): 121 | annotated_example = writer_queue.get() 122 | 123 | if annotated_example is _SENTINEL_KILL_CONSUMERS: 124 | writer_queue.put(_SENTINEL_KILL_CONSUMERS) # put it back so that other consumers see it 125 | return 126 | 127 | annotated_example = {k: str(v) for k, v in annotated_example.items()} 128 | 129 | # Row comes out of writer_queue; JSON writing goes here 130 | json.dump(annotated_example, output_file) 131 | output_file.write('\n') 132 | 133 | consumer = Thread(target=write_line) 134 | consumer.setDaemon(True) 135 | consumer.start() 136 | 137 | tasks = [ 138 | evaluate_single_example( 139 | example, 140 | evaluator, 141 | eval_prompt_template, 142 | evaluator_name, 143 | eval_split_string, 144 | writer_queue, 145 | ) 146 | for example in examples_to_do 147 | ] 148 | 149 | evaluation_results = [await f for f in tqdm.tqdm(asyncio.as_completed(tasks), total=len(tasks))] 150 | writer_queue.put(_SENTINEL_KILL_CONSUMERS) 151 | 152 | return evaluation_results + previous_evaluations.to_dict(orient="records") 
def extract_numbers(output):
    """
    Extract every unsigned number that appears in ``output``.

    A number is a run of digits that may contain commas (treated as thousand
    separators and dropped), optionally followed by a decimal part. Signs are
    ignored, so "10-3" yields 10 and 3.

    The previous pattern used an unescaped ``.`` and kept letters, which fused
    neighbouring numbers ("2^0.27" -> 20 and 27, "10-3" -> 103) and made the
    whole call return [] as soon as one match contained a letter ("12kg").

    Args:
        output: A string to scan, or an ``int``/``float`` returned as-is.

    Returns:
        ``[output]`` when ``output`` is already an int or float; otherwise the
        list of floats found, in order of appearance. Returns [] (after
        printing the error) when ``output`` is neither a number nor a string.
    """
    if isinstance(output, (float, int)):
        return [output]
    try:
        # Leading digit, then digits/commas, then an optional ".digits" part.
        matches = re.findall(r"\d[\d,]*(?:\.\d+)?", output)
    except TypeError as e:
        print("Error when extracting string:", e)
        return []
    return [float(match.replace(",", "")) for match in matches]
def score_naive_match(prediction: str, true_answer: str):
    """
    Return True when ``true_answer`` occurs in ``prediction``, ignoring case.

    The previous sliding-window loop stopped one offset early, so an answer
    located at the very end of the prediction ("The answer is Paris") was
    never matched. A plain substring test covers every offset.

    Args:
        prediction: Text produced by the model.
        true_answer: Reference answer to look for.

    Returns:
        bool: True if the lower-cased answer is a substring of the
        lower-cased prediction (which includes the exact-match case).
    """
    return true_answer.lower() in prediction.lower()
def check_prediction_contains_answer_letters_in_order(prediction, true_answer):
    """
    Check that the characters of ``true_answer`` appear, in order, in ``prediction``.

    The comparison is case-insensitive. Predictions more than three times as
    long as the answer are rejected outright.

    Each character of the answer must consume its own character of the
    prediction. The previous implementation left the cursor on the matched
    character, so one character could satisfy repeated letters (for example
    "bok" was accepted for "book").

    Args:
        prediction: Text produced by the model.
        true_answer: Reference answer.

    Returns:
        bool: True if ``true_answer`` is a subsequence of ``prediction``.
    """
    prediction = prediction.lower()
    true_answer = true_answer.lower()
    if len(prediction) > len(true_answer) * 3:
        return False

    cursor = 0
    for letter in true_answer:
        position = prediction.find(letter, cursor)
        if position == -1:
            return False
        # Move past the matched character so it cannot be reused.
        cursor = position + 1
    return True
def get_better_prompt(best_example: Dict, validation_set_with_answers: pd.DataFrame, teacher_agent, examples_other_less_good_prompts = None) -> str:
    """
    Ask a teacher model to analyse past results and propose an improved prompt.

    The request sent to the teacher lists the previously tried prompts with
    their scores, the current best prompt, and the first 7 rows of the
    validation set (every column rendered as a "Feature: ..." line).

    Args:
        best_example: Dict with keys "prompt" and "score" for the best prompt so far.
        validation_set_with_answers: DataFrame whose column names are strings;
            only its first 7 rows are shown to the teacher.
        teacher_agent: Object with ``invoke(prompt)`` returning an object that
            exposes the reply as ``.content``.
        examples_other_less_good_prompts: Optional list of dicts with keys
            "prompt" and "score". ``None`` (the default) means "no history";
            it used to raise ``TypeError`` because it was iterated directly.

    Returns:
        The teacher's raw reply text.
    """
    if examples_other_less_good_prompts is None:
        examples_other_less_good_prompts = []

    # Collect the pieces and join once instead of repeated string +=.
    sections = [
        """
You are trying to optimize the prompt of a LLM to maximize its score on a task.

You have already tried a few prompts, with these results:
"""
    ]
    sections.extend(
        f"Prompt: '{example['prompt']}':\nAverage score: {example['score']}\n\n---\n"
        for example in examples_other_less_good_prompts
    )

    sections.append(
        f"""
The best prompt for now is: {best_example['prompt']}. It achieves score {best_example['score']}. Could you improve this prompt?

Here are the examples of the validation set with the answers for this best prompt, to help you come up with an even better prompt:
"""
    )
    for i, example in validation_set_with_answers.iloc[:7].iterrows():
        sections.append(f'--- Example {i}:\n')
        sections.extend(
            f"Feature: {feature.capitalize()}: has value '{value}'.\n"
            for feature, value in example.to_dict().items()
        )

    # NOTE: "\\Improved" yields the same text as the original invalid escape
    # "\I" (a literal backslash followed by "I") without the syntax warning.
    sections.append(
        """
---
Please provide an analysis of the error cases, and suggest a possible cause.

Then only at the end, based on the causes for error, come up with an improved prompt. Your improved prompt should contain the placeholder '{question}' to indicate where the question should be inserted.
Preface your suggestion of this improved prompt with '\\Improved prompt:\n', and add at the end: '\nEnd of improved prompt'.

Now begin!
"""
    )
    prompt = "".join(sections)

    print('='*10+ 'Here is the new full prompt'+'='*10)
    print(prompt)
    print('='*10 + 'End new full prompt' + '='*10)
    return teacher_agent.invoke(prompt).content
class CalculatorTool(Tool):
    """Agent tool that evaluates an arithmetic expression with numexpr."""

    name = "calculator"
    description = "This is a tool that performs simple arithmetic operations."

    # Declared inputs. "useless_expression" is accepted by __call__ but never read.
    inputs = {
        "expression": {
            "type": "text",
            "description": "The expression to be evaluated.The variables used CANNOT be placeholders like 'x' or 'mike's age', they must be numbers",
        },
        "useless_expression": {
            "type": "text",
            "description": "The expression to not be evaluated.The variables used CANNOT be placeholders like 'x' or 'mike's age', they must be numbers",
        },
    }
    output_type = "text"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def __call__(self, expression, useless_expression):
        """
        Evaluate ``expression`` and return the result as a string.

        ``expression`` may be the expression text itself or a dict carrying it
        under the "expression" key. ``^`` is rewritten to ``**`` before
        evaluation, and only the names ``pi`` and ``e`` are provided.
        """
        # Unwrap a dict payload to get at the actual expression text.
        if isinstance(expression, Dict):
            expression = expression["expression"]

        known_constants = {"pi": math.pi, "e": math.e}
        normalized = expression.strip().replace("^", "**")
        result = numexpr.evaluate(
            normalized,
            global_dict={},  # restrict access to globals
            local_dict=known_constants,  # add common mathematical constants
        )
        return str(result)
class TextExample(Scene):
    """Minimal scene: write one line of text, then hold it on screen."""

    def construct(self):
        # Build the text object first, then animate it being written.
        label = Text("Here is a text", font_size=90, font="Consolas")
        writing = Write(label)
        self.play(writing)
        self.wait(3)
height=2.5, color=WHITE).scale(SCALE) 29 | system_prompt_template_text = Text( 30 | 'System prompt:\n"Solve this task in an iterative way with a Thought/Action/Observation loop.\nYou can use these tools: ["calculator", "web_search"]\nTask: how much is 2^0.27?"', 31 | ).scale(SCALE).move_to(system_prompt_template_box.get_center()) 32 | system_prompt_template_group = VGroup(system_prompt_template_box, system_prompt_template_text).to_edge(UP) 33 | self.play(FadeIn(system_prompt_template_group)) 34 | self.wait(0.3) 35 | 36 | # Step 2: Initialize memory and prompt 37 | memory_box = Rectangle(width=11, height=3.5, color=WHITE).scale(SCALE) 38 | memory_text = Text("Memory: []").scale(SCALE).move_to(memory_box.get_center()) 39 | memory_group = VGroup(memory_box, memory_text).next_to(system_prompt_template_group, DOWN, buff=0) 40 | 41 | prompt_box = Rectangle(width=11, height=1, color=WHITE).scale(SCALE) 42 | prompt_text = Text("Prompt = System prompt + Memory (empty)", t2c={'Prompt = System prompt':YELLOW}, t2w={'Prompt = System prompt':BOLD}).scale(SCALE).move_to(prompt_box.get_center()) 43 | prompt_group = VGroup(prompt_box, prompt_text).next_to(memory_group, DOWN, buff=0) 44 | 45 | self.play(FadeIn(memory_box), Write(memory_text), FadeIn(prompt_box), Write(prompt_text)) 46 | 47 | # Step 3: Call LLM 48 | call_llm_box = Rectangle(width=3, height=1, color=ORANGE).scale(SCALE) 49 | call_llm_text = Text("Run LLM").scale(SCALE).move_to(call_llm_box.get_center()) 50 | call_llm_group = VGroup(call_llm_box, call_llm_text).next_to(prompt_group, DOWN*SCALE, buff=1) 51 | 52 | 53 | arrow1 = Arrow(start=prompt_group.get_bottom(), end=call_llm_group.get_top(), buff=0.1) 54 | self.play(FadeIn(arrow1), FadeIn(call_llm_box), Write(call_llm_text)) 55 | 56 | # Step 3.5: LLM Output 57 | llm_output_box = Rectangle(width=10, height=2, color=WHITE).scale(SCALE) 58 | llm_output_text = Text("LLM output:\nThought: I should use the calculator.\nAction: 
calculator(2^0.27)").scale(SCALE).move_to(llm_output_box.get_center()) 59 | llm_output_group = VGroup(llm_output_box, llm_output_text).next_to(call_llm_group, DOWN*SCALE, buff=1) 60 | 61 | arrow1_5 = Arrow(start=call_llm_group.get_bottom(), end=llm_output_group.get_top(), buff=0.1) 62 | self.play(FadeIn(arrow1_5), FadeIn(llm_output_box), Write(llm_output_text)) 63 | 64 | # Step 4: Parse tool call(s) from output 65 | parse_box = Rectangle(width=9, height=1, color=BLUE).scale(SCALE) 66 | parse_text = Text("Parse tool call(s) from output").scale(SCALE).move_to(parse_box.get_center()) 67 | parse_group = VGroup(parse_box, parse_text).next_to(llm_output_group, DOWN*SCALE, buff=1) 68 | 69 | arrow2 = Arrow(start=llm_output_group.get_bottom(), end=parse_group.get_top(), buff=0.1) 70 | self.play(FadeIn(arrow2), FadeIn(parse_box), Write(parse_text)) 71 | 72 | # Step 5: Resulting tool call(s) 73 | parsed_tool_call_box = Rectangle(width=6, height=1.5, color=WHITE).scale(SCALE) 74 | parsed_tool_call_text = Text("Tool calls:\ncalculator(2^0.27)").scale(SCALE).move_to(parsed_tool_call_box.get_center()) 75 | parsed_tool_call_group = VGroup(parsed_tool_call_box, parsed_tool_call_text).next_to(parse_group, DOWN*SCALE, buff=1) 76 | 77 | arrow3 = Arrow(start=parse_group.get_bottom(), end=parsed_tool_call_group.get_top(), buff=0.1) 78 | self.play(FadeIn(arrow3), FadeIn(parsed_tool_call_box), Write(parsed_tool_call_text)) 79 | 80 | 81 | # Step 6: Decision 82 | arrow_decision_no = Arrow(start=parsed_tool_call_group.get_right(), end=parsed_tool_call_group.get_right() + RIGHT*2, buff=0.1, tip_length=0.15) 83 | no_text = Text("Normal tool call", color=BLUE).scale(SCALE).next_to(arrow_decision_no, UP*SCALE*0.5) 84 | self.play(FadeIn(arrow_decision_no), Write(no_text)) 85 | 86 | # Execute call 87 | tool_call_box = Rectangle(width=5, height=1, color=BLUE).scale(SCALE) 88 | tool_call_text = Text("Execute call").scale(SCALE).move_to(tool_call_box.get_center()) 89 | tool_call_group = 
VGroup(tool_call_box, tool_call_text).next_to(arrow_decision_no.get_end(), RIGHT) 90 | 91 | self.play(FadeIn(tool_call_box), Write(tool_call_text)) 92 | 93 | 94 | # Adding "Observation" text 95 | observation_text = Text("Observation: 1.2058", color=YELLOW).scale(SCALE) 96 | observation_text.move_to(tool_call_group.get_top() + UP * 0.5) 97 | 98 | # Moving "Observation" text along the CurvedArrow 99 | loop_arrow = CurvedArrow(start_point=tool_call_group.get_top(), end_point=memory_group.get_right() + RIGHT*SCALE, angle=TAU/4, tip_length=0.15) 100 | self.play(FadeIn(observation_text)) 101 | self.play(FadeIn(loop_arrow), MoveAlongPath(observation_text, loop_arrow), run_time=2) 102 | 103 | # Adding the "Observation" text to the memory 104 | updated_memory_text = Text('Memory: [\nStep 1:\n(Step 1 LLM output + Step 1 Observation)\n]').scale(SCALE).move_to(memory_box.get_center()) 105 | self.play(Transform(memory_text, updated_memory_text), FadeOut(observation_text)) 106 | self.wait(0.5) 107 | 108 | 109 | updated_memory_text = Text('Memory: [\n"LLM output:\nThought: I should use the calculator.\nAction: calculator(2+2)\nObservation: 1.2058"\n]').scale(SCALE).move_to(memory_box.get_center()) 110 | self.play(Transform(memory_text, updated_memory_text)) 111 | self.wait(0.3) 112 | 113 | updated_prompt_text = Text("Prompt = System prompt + Memory (1 step)", color=YELLOW, weight=BOLD).scale(SCALE).move_to(prompt_box.get_center()) 114 | self.play(Transform(prompt_text, updated_prompt_text)) 115 | self.wait(0.5) 116 | 117 | # Going again through the loop 118 | # Create a pulsing effect 119 | self.animate_pulsing_effect(arrow1) 120 | self.animate_pulsing_effect(arrow1_5) 121 | 122 | # Update LLM output: 123 | updated_llm_output_text = Text("LLM output:\nThought: I should return the result.\nAction: final_answer(1.2058)").scale(SCALE).move_to(llm_output_box.get_center()) 124 | self.play(Transform(llm_output_text, updated_llm_output_text)) 125 | 126 | 
def prepare_response(original_task, inner_messages, llm_engine):
    """
    Turn an agent conversation into a short, strictly formatted final answer.

    The conversation is replayed to ``llm_engine`` as user messages, framed by
    an introduction that restates the task and a closing request for a
    "FINAL ANSWER: ..." line. If the extracted answer says it is unable to
    determine the result, a second call asks for an "EDUCATED GUESS: ...".

    Args:
        original_task: The question as originally asked.
        inner_messages: Iterable of message dicts. Entries with empty or
            missing "content" are skipped; the rest are deep-copied, so the
            caller's messages are left unmodified.
        llm_engine: Callable taking the list of messages and returning the
            reply as a string.

    Returns:
        The stripped text following the last "FINAL ANSWER: " marker (or
        "EDUCATED GUESS: " when the fallback was used). If the marker is
        absent, the whole stripped reply is returned.
    """
    intro_text = f"""Earlier you were asked the following:

{original_task}

Your team then worked diligently to address that request. Here is a transcript of that conversation:"""

    final_answer_request = f"""
Read the above conversation and output a FINAL ANSWER to the question. The question is repeated here for convenience:

{original_task}

To output the final answer, use the following template: FINAL ANSWER: [YOUR FINAL ANSWER]
Your FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings.
ADDITIONALLY, your FINAL ANSWER MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and DO NOT INCLUDE UNITS such as $ or USD or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
If you are unable to determine the final answer, output 'FINAL ANSWER: Unable to determine'
"""

    educated_guess_request = """
I understand that a definitive answer could not be determined. Please make a well-informed EDUCATED GUESS based on the conversation.

To output the educated guess, use the following template: EDUCATED GUESS: [YOUR EDUCATED GUESS]
Your EDUCATED GUESS should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. DO NOT OUTPUT 'I don't know', 'Unable to determine', etc.
ADDITIONALLY, your EDUCATED GUESS MUST adhere to any formatting instructions specified in the original question (e.g., alphabetization, sequencing, units, rounding, decimal places, etc.)
If you are asked for a number, express it numerically (i.e., with digits rather than words), don't use commas, and don't include units such as $ or percent signs unless specified otherwise.
If you are asked for a string, don't use articles or abbreviations (e.g. for cities), unless specified otherwise. Don't output any final sentence punctuation such as '.', '!', or '?'.
If you are asked for a comma separated list, apply the above rules depending on whether the elements are numbers or strings.
""".strip()

    messages = [{"role": "user", "content": intro_text}]

    # Replay the transcript, re-labelling every non-empty message as "user".
    for original in inner_messages:
        if original.get("content"):
            replayed = copy.deepcopy(original)
            replayed["role"] = "user"
            messages.append(replayed)

    messages.append({"role": "user", "content": final_answer_request})

    response = llm_engine(messages)
    final_answer = response.split("FINAL ANSWER: ")[-1].strip()
    print("Reformulated answer is: ", final_answer)

    if "unable to determine" not in final_answer.lower():
        return final_answer

    # The model gave up: keep its reply in context and insist on a guess.
    messages.append({"role": "assistant", "content": response})
    messages.append({"role": "user", "content": educated_guess_request})

    response = llm_engine(messages)
    print("\n>>>Making an educated guess.\n", response)
    return response.split("EDUCATED GUESS: ")[-1].strip()
AgentExecutor, 27 | agent_name: str, 28 | agent_call_function: Callable, 29 | writer_queue: Queue = None, 30 | **kwargs 31 | ) -> dict: 32 | start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 33 | augmented_question = example["augmented_question"] 34 | try: 35 | # run executor agent 36 | response = await agent_call_function(agent_executor, augmented_question, **kwargs) 37 | 38 | # check for parsing errors which indicate the LLM failed to follow the ReACT format 39 | # this could be due to an issue with the tool calling format or ReACT formatting (i.e. Thought, Action, Observation, etc.) 40 | parsing_error = ( 41 | True 42 | if any( 43 | [ 44 | "Could not parse LLM output" in step 45 | for step in response["intermediate_steps"] 46 | ] 47 | ) 48 | else False 49 | ) 50 | 51 | # check if iteration limit exceeded 52 | iteration_limit_exceeded = ( 53 | True 54 | if "Agent stopped due to iteration limit or time limit." in response["output"] 55 | else False 56 | ) 57 | raised_exception = False 58 | 59 | except (ValueError, ToolException) as e: 60 | print("Error on ", augmented_question, e) 61 | response = {"output": None, "intermediate_steps": None} 62 | parsing_error = False 63 | iteration_limit_exceeded = False 64 | exception = e 65 | raised_exception = True 66 | end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 67 | intermediate_steps = response["intermediate_steps"] 68 | annotated_example = { 69 | "agent_name": agent_name, 70 | "question": example['question'], 71 | "augmented_question": augmented_question, 72 | "prediction": response["output"], 73 | "intermediate_steps": intermediate_steps, 74 | "parsing_error": parsing_error, 75 | "iteration_limit_exceeded": iteration_limit_exceeded, 76 | "agent_error": str(exception) if raised_exception else None, 77 | "start_time": start_time, 78 | "end_time": end_time, 79 | "task": example["task"], 80 | "true_answer": example["true_answer"], 81 | } 82 | if writer_queue: 83 | writer_queue.put(annotated_example) 84 | 
return annotated_example 85 | 86 | 87 | def run_agent( 88 | question: str, 89 | agent_executor: AgentExecutor, 90 | agent_name: str, 91 | agent_call_function: Callable, 92 | ) -> dict: 93 | """ 94 | Runs the execution process for a given question and ground truth answer. 95 | 96 | Args: 97 | question (str): The input question to be evaluated. 98 | agent_executor (AgentExecutor): The agent executor object used to run the agent. 99 | agent_name (str): The name of the agent model. 100 | 101 | Returns: 102 | dict: A dictionary containing the evaluation results, including the agent model ID, evaluator model ID, 103 | question, ground truth answer, prediction, intermediate steps, evaluation score, evaluation feedback, 104 | tool call parsing error flag, iteration limit exceeded flag, and agent error (if any). 105 | """ 106 | start_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 107 | try: 108 | # run executor agent 109 | response = agent_call_function(agent_executor, question) 110 | 111 | # check for parsing errors which indicate the LLM failed to follow the ReACT format 112 | # this could be due to an issue with the tool calling format or ReACT formatting (i.e. Thought, Action, Observation, etc.) 113 | parsing_error = ( 114 | True 115 | if any( 116 | [ 117 | "Could not parse LLM output" in step[0].log 118 | for step in response["intermediate_steps"] 119 | ] 120 | ) 121 | else False 122 | ) 123 | 124 | # check if iteration limit exceeded 125 | iteration_limit_exceeded = ( 126 | True 127 | if "Agent stopped due to iteration limit or time limit." 
in response["output"] 128 | else False 129 | ) 130 | raised_exception = False 131 | 132 | except Exception as e: 133 | response = {"output": None, "intermediate_steps": None} 134 | parsing_error = False 135 | iteration_limit_exceeded = False 136 | exception = e 137 | raised_exception = True 138 | 139 | end_time = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 140 | # collect results 141 | if response["intermediate_steps"] is not None: 142 | intermediate_steps = [ 143 | { 144 | "tool": response[0].tool, 145 | "tool_input": response[0].tool_input, 146 | "tool_output": response[1], 147 | } 148 | for response in response["intermediate_steps"] 149 | ] 150 | else: 151 | intermediate_steps = None 152 | return { 153 | "agent_name": agent_name, 154 | "question": question, 155 | "prediction": response["output"], 156 | "intermediate_steps": intermediate_steps, 157 | "parsing_error": parsing_error, 158 | "iteration_limit_exceeded": iteration_limit_exceeded, 159 | "agent_error": repr(exception) if raised_exception else None, 160 | "start_time": start_time, 161 | "end_time": end_time, 162 | } 163 | 164 | 165 | def serialize_agent_error(obj): 166 | if isinstance(obj, AgentError): 167 | return {"error_type": obj.__class__.__name__, "message": obj.message} 168 | else: 169 | return str(obj) 170 | 171 | 172 | async def answer_questions( 173 | dataset: Dataset, 174 | agent: AgentExecutor, 175 | agent_name: str, 176 | output_folder: str = "output", 177 | agent_call_function: Callable = call_langchain_agent, 178 | visual_inspection_tool: Tool = None, 179 | text_inspector_tool: Tool = None, 180 | ) -> List[Dict[str, Any]]: 181 | """ 182 | Evaluates the agent on a given dataset. 183 | 184 | Args: 185 | dataset (Dataset): The dataset to test the agent on. 186 | agent: The agent. 187 | agent_name (str): The name of the agent model. 188 | 189 | Returns: 190 | List[Dict[str, Any]]: A list of dictionaries containing the evaluation results for each example in the dataset. 
191 | Each dictionary includes the agent model ID, evaluator model ID, question, ground truth answer, prediction, 192 | intermediate steps, evaluation score, evaluation feedback, tool call parsing error flag, iteration limit 193 | exceeded flag, agent error (if any), and example metadata (task). 194 | """ 195 | output_path = f"{output_folder}/{agent_name}.jsonl" 196 | print(f"Loading answers from {output_path}...") 197 | try: 198 | results = pd.read_json(output_path, lines=True).to_dict(orient="records") 199 | print(f"Found {len(results)} previous results!") 200 | except Exception as e: 201 | print("Error when loading records: ", e) 202 | print("Found no usable records! 🤔 Starting new.") 203 | results = [] 204 | 205 | results_df = pd.DataFrame(results) 206 | 207 | for _, example in tqdm(enumerate(dataset), total=len(dataset)): 208 | if len(results_df) > 0: 209 | if example["question"] in results_df["question"].unique(): 210 | continue 211 | prompt_use_files = "" 212 | if example['file_name']: 213 | if '.MOV' in example['file_name']: 214 | continue 215 | prompt_use_files += f"\n\nTo answer the question above, you will have to use these attached files:" 216 | if example['file_name'].split('.')[-1] in ['pdf', 'xlsx']: 217 | image_path = example['file_name'].split('.')[0] + '.png' 218 | if os.path.exists(image_path): 219 | prompt_use_files += f"\nAttached image: {image_path}" 220 | else: 221 | prompt_use_files += f"\nAttached file: {example['file_name']}" 222 | elif example['file_name'].split('.')[-1] == "zip": 223 | import shutil 224 | 225 | folder_name = example['file_name'].replace(".zip", "") 226 | os.makedirs(folder_name, exist_ok=True) 227 | shutil.unpack_archive(example['file_name'], folder_name) 228 | 229 | # Convert the extracted files 230 | prompt_use_files = "\n\nYou have been given a zip archive of supporting files. 
We extracted it into a directory: find the extracted files at the following paths:\n" 231 | for root, dirs, files in os.walk(folder_name): 232 | for file in files: 233 | file_path = os.path.join(root, file) 234 | prompt_use_files += f"- {file_path}\n" 235 | if file.split('.')[-1] in ['png', 'jpg', 'jpeg'] and visual_inspection_tool is not None: 236 | prompt = f"""Write a caption of 5 sentences maximum for this image. Pay special attention to any details that might be useful for someone answering the following question: 237 | {example['question']}. But do not try to answer the question directly! 238 | Do not add any information that is not present in the image. 239 | """.strip() 240 | prompt_use_files += "> Description of this image: " + visual_inspection_tool(image_path=file_path, question=prompt) + '\n\n' 241 | else: 242 | prompt = f"""Write a short caption (5 sentences maximum) for this file. Pay special attention to any details that might be useful for someone answering the following question: 243 | {example['question']}. But do not try to answer the question directly! 244 | Do not add any information that is not present in the file. 245 | """.strip() 246 | prompt_use_files += "> Description of this file: " + text_inspector_tool(file_path=file_path, question=prompt, initial_exam_mode=True) + '\n\n' 247 | elif example['file_name'].split('.')[-1] in ['png', 'jpg', 'jpeg']: 248 | prompt_use_files += f"\nAttached image: {example['file_name']}" 249 | elif example['file_name'].split('.')[-1] in ['mp3', 'm4a', 'wav']: 250 | prompt_use_files += f"\nAttached audio: {example['file_name']}" 251 | else: 252 | prompt_use_files += f"\nAttached file: {example['file_name']}" 253 | 254 | if example['file_name'].split('.')[-1] in ['png', 'jpg', 'jpeg'] and visual_inspection_tool is not None: 255 | prompt = f"""Write a caption of 5 sentences maximum for this image. 
Pay special attention to any details that might be useful for someone answering the following question: 256 | {example['question']}. But do not try to answer the question directly! 257 | Do not add any information that is not present in the image. 258 | """.strip() 259 | prompt_use_files += "\n> Description of this image: " + visual_inspection_tool(image_path=example['file_name'], question=prompt) 260 | elif '.zip' not in example['file_name'] and text_inspector_tool is not None: 261 | prompt = f"""Write a short caption (5 sentences maximum) for this file. Pay special attention to any details that might be useful for someone answering the following question: 262 | {example['question']}. But do not try to answer the question directly! 263 | Do not add any information that is not present in the file. 264 | """.strip() 265 | prompt_use_files += "\n> Description of this file: " + text_inspector_tool(file_path=example['file_name'], question=prompt, initial_exam_mode=True) 266 | else: 267 | prompt_use_files += "\n\nYou have been given no local files to access." 268 | example['augmented_question'] = f"""It is paramount that you complete this task and provide a correct answer. 269 | Give it all you can: I know for a fact that you have access to all the relevant tools to solve it. Failure or 'I cannot answer' will not be tolerated, success will be rewarded. 
270 | Here is the task: 271 | """ + example['question'] + prompt_use_files 272 | 273 | # run agent 274 | result = await arun_agent( 275 | example=example, 276 | agent_executor=agent, 277 | agent_name=agent_name, 278 | agent_call_function=agent_call_function, 279 | ) 280 | 281 | # add in example metadata 282 | result.update( 283 | { 284 | "true_answer": example["true_answer"], 285 | "task": example["task"], 286 | } 287 | ) 288 | results.append(result) 289 | 290 | with open(output_path, 'w') as f: 291 | for d in results: 292 | json.dump(d, f, default=serialize_agent_error) 293 | f.write('\n') # add a newline for JSONL format 294 | # except Exception as e: 295 | # print("EXCEPTION!!!!=================\nFIND THE EXCEPTION LOG BELOW:\n", e) 296 | return results 297 | 298 | 299 | async def run_full_tests( 300 | dataset: Dataset, 301 | agents: Dict[str, AgentExecutor], 302 | agent_call_function: Callable = acall_langchain_agent, 303 | output_folder: str = "output", 304 | ) -> pd.DataFrame: 305 | """ 306 | Run a full evaluation on the given dataset using multiple agent models. 307 | 308 | Args: 309 | dataset (Dataset): The dataset to test on. 310 | agents (Dict[str, AgentExecutor]): A dictionary of agent executors to test on the dataset 311 | 312 | Returns: 313 | pd.DataFrame: The evaluation results as a pandas DataFrame. 
314 | """ 315 | results = [] 316 | 317 | tasks = [ 318 | answer_questions( 319 | dataset=dataset, 320 | agent=agent_executor, 321 | agent_name=agent_name, 322 | agent_call_function=agent_call_function, 323 | output_folder=output_folder, 324 | ) 325 | for agent_name, agent_executor in agents.items() 326 | ] 327 | 328 | results = await asyncio.gather(*tasks) 329 | 330 | return pd.DataFrame([element for sublist in results for element in sublist]) 331 | -------------------------------------------------------------------------------- /scripts/tools/__pycache__/browser.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/tools/__pycache__/browser.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/tools/__pycache__/cookies.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/tools/__pycache__/cookies.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/tools/__pycache__/mdconvert.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/tools/__pycache__/mdconvert.cpython-310.pyc -------------------------------------------------------------------------------- /scripts/tools/__pycache__/serpapi_browser.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/tools/__pycache__/serpapi_browser.cpython-310.pyc 
# --------------------------------------------------------------------------------
# /scripts/tools/__pycache__/visual_qa.cpython-310.pyc:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/tools/__pycache__/visual_qa.cpython-310.pyc
# --------------------------------------------------------------------------------
# /scripts/tools/__pycache__/web_surfer.cpython-310.pyc:
# --------------------------------------------------------------------------------
# https://raw.githubusercontent.com/aymeric-roucher/agent_reasoning_benchmark/814af85d4e796e74b14a83192554d73a0aa34a78/scripts/tools/__pycache__/web_surfer.cpython-310.pyc
# --------------------------------------------------------------------------------
# /scripts/tools/visual_qa.py:
# --------------------------------------------------------------------------------
from PIL import Image
import base64
from io import BytesIO
import json
import os
import requests
from typing import Optional
from huggingface_hub import InferenceClient
from transformers import AutoProcessor, Tool
import uuid
import mimetypes
from dotenv import load_dotenv

load_dotenv(override=True)

idefics_processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")


def _encode_local_image(image_path):
    """Return the image at *image_path* as a base64 JPEG ``data:`` URI."""
    # load image
    image = Image.open(image_path).convert('RGB')

    # Convert the image to a base64 string
    buffer = BytesIO()
    image.save(buffer, format="JPEG")
    base64_image = base64.b64encode(buffer.getvalue()).decode('utf-8')

    # add string formatting required by the endpoint
    return f"data:image/jpeg;base64,{base64_image}"


def process_images_and_text(image_path, query, client):
    """Send one image plus a text query to the inference endpoint.

    Args:
        image_path: Path of a local image file.
        query: The text question or instruction about the image.
        client: Object whose ``post(json=payload)`` returns the raw response
            bytes (an ``InferenceClient`` in this module).

    Returns:
        The first element of the JSON-decoded response.
    """
    messages = [
        {
            "role": "user", "content": [
                {"type": "image"},
                {"type": "text", "text": query},
            ]
        },
    ]

    prompt_with_template = idefics_processor.apply_chat_template(messages, add_generation_prompt=True)

    # encode the image to a string which can be sent to the endpoint
    image_string = _encode_local_image(image_path)
    # Swap the template's image placeholder for a markdown image link.
    # The link is inserted directly rather than through str.format, so braces
    # in the query cannot be mistaken for format fields.
    prompt_with_images = prompt_with_template.replace("<image>", f"![]({image_string}) ")

    payload = {
        "inputs": prompt_with_images,
        "parameters": {
            "return_full_text": False,
            "max_new_tokens": 200,
        }
    }

    return json.loads(client.post(json=payload).decode())[0]


# Function to encode the image
def encode_image(image_path):
    """Return the base64 encoding of an image given by local path or URL.

    A path starting with ``"http"`` is first downloaded into the local
    ``downloads`` folder under a random file name.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    if image_path.startswith("http"):
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"
        request_kwargs = {
            "headers": {"User-Agent": user_agent},
            "stream": True,
        }

        # Send a HTTP request to the URL
        response = requests.get(image_path, **request_kwargs)
        response.raise_for_status()
        content_type = response.headers.get("content-type", "")

        # Drop parameters such as "; charset=..." which would otherwise make
        # the extension lookup fail.
        extension = mimetypes.guess_extension(content_type.split(";")[0].strip())
        if extension is None:
            extension = ".download"

        fname = str(uuid.uuid4()) + extension
        download_path = os.path.abspath(os.path.join("downloads", fname))
        os.makedirs(os.path.dirname(download_path), exist_ok=True)

        with open(download_path, "wb") as fh:
            for chunk in response.iter_content(chunk_size=512):
                fh.write(chunk)

        image_path = download_path

    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


headers = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"
}


def resize_image(image_path):
    """Save a half-size copy of the image and return the copy's path.

    The copy is written next to the original with a ``resized_`` prefix on
    the file name (not on the whole path).
    """
    img = Image.open(image_path)
    width, height = img.size
    # Never shrink a dimension to zero pixels.
    img = img.resize((max(1, width // 2), max(1, height // 2)))
    directory, file_name = os.path.split(image_path)
    new_image_path = os.path.join(directory, f"resized_{file_name}")
    img.save(new_image_path)
    return new_image_path


class VisualQATool(Tool):
    """Tool answering questions about an image through the idefics2 endpoint."""

    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "question": {"description": "the question to answer", "type": "text"},
        "image_path": {
            "description": "The path to the image on which to answer the question",
            "type": "text",
        },
    }
    output_type = "text"

    client = InferenceClient("HuggingFaceM4/idefics2-8b-chatty")

    def forward(self, image_path: str, question: Optional[str] = None) -> str:
        """Answer *question* about the image, or caption it when no question is given.

        When the endpoint rejects the request as "Payload Too Large", the
        image is halved in size and the request retried once; any other
        error is re-raised.
        """
        add_note = False
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."
        try:
            output = process_images_and_text(image_path, question, self.client)
        except Exception as e:
            print(e)
            if "Payload Too Large" not in str(e):
                # Nothing to retry: surface the real error rather than
                # failing later on an unbound result.
                raise
            new_image_path = resize_image(image_path)
            output = process_images_and_text(new_image_path, question, self.client)

        if add_note:
            output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

        return output


class VisualQAGPT4Tool(Tool):
    """Tool answering questions about an image through the OpenAI chat API."""

    name = "visualizer"
    description = "A tool that can answer questions about attached images."
    inputs = {
        "question": {"description": "the question to answer", "type": "text"},
        "image_path": {
            "description": "The path to the image on which to answer the question. This should be a local path to downloaded image.",
            "type": "text",
        },
    }
    output_type = "text"

    def forward(self, image_path: str, question: Optional[str] = None) -> str:
        """Answer *question* about the image, or caption it when no question is given.

        Raises:
            Exception: If ``image_path`` is not a string, or if the API
                response does not have the expected structure.
        """
        add_note = False
        if not question:
            add_note = True
            question = "Please write a detailed caption for this image."
        if not isinstance(image_path, str):
            raise Exception("You should provide only one string as argument to this tool!")

        base64_image = encode_image(image_path)

        payload = {
            "model": "gpt-4o",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "text",
                            "text": question
                        },
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            }
                        }
                    ]
                }
            ],
            "max_tokens": 500
        }
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=headers, json=payload)
        # Decode once, so that a non-JSON body is reported instead of raising
        # a second time while the error message is being built.
        try:
            response_json = response.json()
        except ValueError as e:
            raise Exception(f"Response format unexpected: {response.text}") from e
        try:
            output = response_json['choices'][0]['message']['content']
        except Exception as e:
            raise Exception(f"Response format unexpected: {response_json}") from e

        if add_note:
            output = f"You did not provide a particular question, so here is a detailed caption for the image: {output}"

        return output


# --------------------------------------------------------------------------------
# /scripts/tools/web_surfer.py:
# --------------------------------------------------------------------------------
# Shamelessly stolen from Microsoft Autogen team: thanks to them for this great resource!
# https://github.com/microsoft/autogen/blob/gaia_multiagent_v01_march_1st/autogen/browser_utils.py
import os
import re
from typing import Tuple, Optional
from transformers.agents.agents import Tool
import time
from dotenv import load_dotenv
import requests
from pypdf import PdfReader
from markdownify import markdownify as md
import mimetypes
from .browser import SimpleTextBrowser

load_dotenv(override=True)

user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0"

browser_config = {
    "viewport_size": 1024 * 5,
    "downloads_folder": "coding",
    "request_kwargs": {
        "headers": {"User-Agent": user_agent},
        "timeout": 300,
    },
}

browser_config["serpapi_key"] = os.environ["SERPAPI_API_KEY"]

browser = SimpleTextBrowser(**browser_config)

_DOWNLOADS_DIR = "./downloads"
_STATE_SEPARATOR = "\n=======================\n"


# Helper functions
def _browser_state() -> Tuple[str, str]:
    """Return ``(header, viewport)`` describing the shared browser.

    The header lists the address, the page title when there is one, how long
    ago the same address was last visited (if it appears earlier in the
    history) and the viewport position.
    """
    header = f"Address: {browser.address}\n"
    if browser.page_title is not None:
        header += f"Title: {browser.page_title}\n"

    current_page = browser.viewport_current_page
    total_pages = len(browser.viewport_pages)

    address = browser.address
    # Walk the history backwards, skipping the last entry (the current visit).
    for entry in reversed(browser.history[:-1]):
        if entry[0] == address:
            header += f"You previously visited this page {round(time.time() - entry[1])} seconds ago.\n"
            break

    header += f"Viewport position: Showing page {current_page+1} of {total_pages}.\n"
    return (header, browser.viewport)


def _render_browser_state() -> str:
    """Return the browser header and viewport content as one tool output."""
    header, content = _browser_state()
    return header.strip() + _STATE_SEPARATOR + content


class SearchInformationTool(Tool):
    """Run a web search and return the results page."""

    name="informational_web_search"
    description="Perform an INFORMATIONAL web search query then return the search results."
    inputs = {
        "query": {
            "type": "text",
            "description": "The informational web search query to perform."
        },
        "filter_year": {
            "type": "text",
            "description": "[Optional parameter]: filter the search results to only include pages from a specific year. For example, '2020' will only include pages from 2020. Make sure to use this parameter if you're trying to search for articles from a specific date!"
        },
    }
    output_type = "text"

    def forward(self, query: str, filter_year: Optional[int] = None) -> str:
        browser.visit_page(f"google: {query}", filter_year=filter_year)
        return _render_browser_state()


class NavigationalSearchTool(Tool):
    """Run a web search and open its first linked result."""

    name="navigational_web_search"
    description="Perform a NAVIGATIONAL web search query then immediately navigate to the top result. Useful, for example, to navigate to a particular Wikipedia article or other known destination. Equivalent to Google's \"I'm Feeling Lucky\" button."
    inputs = {"query": {"type": "text", "description": "The navigational web search query to perform."}}
    output_type = "text"

    def forward(self, query: str) -> str:
        browser.visit_page(f"google: {query}")

        # Follow the first markdown link with an http(s) target, if any.
        m = re.search(r"\[.*?\]\((http.*?)\)", browser.page_content)
        if m:
            browser.visit_page(m.group(1))

        # Return where we ended up
        return _render_browser_state()


class VisitTool(Tool):
    """Open a URL in the shared browser and return the viewport."""

    name="visit_page"
    description="Visit a webpage at a given URL and return its text."
    inputs = {"url": {"type": "text", "description": "The relative or absolute url of the webapge to visit."}}
    output_type = "text"

    def forward(self, url: str) -> str:
        browser.visit_page(url)
        return _render_browser_state()


class DownloadTool(Tool):
    """Download a non-text file into the local downloads folder."""

    name="download_file"
    description="""
Download a file at a given URL. The file should be of this format: [".xlsx", ".pptx", ".wav", ".mp3", ".png", ".docx"]
After using this tool, for further inspection of this page you should return the download path to your manager via final_answer, and they will be able to inspect it.
DO NOT use this tool for .pdf or .txt or .htm files: for these types of files use visit_page with the file url instead."""
    inputs = {"url": {"type": "text", "description": "The relative or absolute url of the file to be downloaded."}}
    output_type = "text"

    def forward(self, url: str) -> str:
        """Download *url* and return a message naming the saved path.

        Raises:
            Exception: If the response's content type maps to a pdf, txt or
                htm(l) extension; nothing is written in that case.
        """
        if "arxiv" in url:
            url = url.replace("abs", "pdf")
        response = requests.get(url)
        content_type = response.headers.get("content-type", "")
        # Drop parameters such as "; charset=utf-8" before the lookup;
        # guess_extension returns None for unknown types.
        extension = mimetypes.guess_extension(content_type.split(";")[0].strip())

        # Reject unsupported types before touching the disk.
        if extension and any(kind in extension for kind in ("pdf", "txt", "htm")):
            raise Exception("Do not use this tool for pdf or txt or html files: use visit_page instead.")

        if extension:
            new_path = f"{_DOWNLOADS_DIR}/file{extension}"
        else:
            new_path = f"{_DOWNLOADS_DIR}/file.object"

        os.makedirs(_DOWNLOADS_DIR, exist_ok=True)
        with open(new_path, "wb") as f:
            f.write(response.content)

        return f"File was downloaded and saved under path {new_path}."


class PageUpTool(Tool):
    """Scroll the shared browser up one page."""

    name="page_up"
    description="Scroll the viewport UP one page-length in the current webpage and return the new viewport content."
    output_type = "text"

    def forward(self) -> str:
        browser.page_up()
        return _render_browser_state()


class ArchiveSearchTool(Tool):
    """Open the Wayback Machine snapshot of a URL closest to a date."""

    name="find_archived_url"
    description="Given a url, searches the Wayback Machine and returns the archived version of the url that's closest in time to the desired date."
    inputs={
        "url": {"type": "text", "description": "The url you need the archive for."},
        "date": {"type": "text", "description": "The date that you want to find the archive for. Give this date in the format 'YYYYMMDD', for instance '27 June 2008' is written as '20080627'."}
    }
    output_type = "text"

    def forward(self, url, date) -> str:
        """Visit the closest archived snapshot and return the browser state.

        Raises:
            Exception: If the availability API reports no snapshot for *url*.
        """
        # Passing the query through ``params`` lets requests escape the target
        # url, so '&', '?' or '#' inside it cannot break the API query.
        response = requests.get(
            "https://archive.org/wayback/available",
            params={"url": url, "timestamp": date},
        ).json()
        try:
            closest = response["archived_snapshots"]["closest"]
        except (KeyError, TypeError) as e:
            raise Exception("Your url was not archived on Wayback Machine, try a different url.") from e
        target_url = closest["url"]
        browser.visit_page(target_url)
        return f"Web archive for url {url}, snapshot taken at date {closest['timestamp'][:8]}:\n" + _render_browser_state()


class PageDownTool(Tool):
    """Scroll the shared browser down one page."""

    name="page_down"
    description="Scroll the viewport DOWN one page-length in the current webpage and return the new viewport content."
    output_type = "text"

    def forward(self, ) -> str:
        browser.page_down()
        return _render_browser_state()


class FinderTool(Tool):
    """Jump to the first occurrence of a string on the current page."""

    name="find_on_page_ctrl_f"
    description="Scroll the viewport to the first occurrence of the search string. This is equivalent to Ctrl+F."
    inputs = {"search_string": {"type": "text", "description": "The string to search for on the page. This search string supports wildcards like '*'" }}
    output_type = "text"

    def forward(self, search_string: str) -> str:
        find_result = browser.find_on_page(search_string)
        header, content = _browser_state()

        if find_result is None:
            return header.strip() + f"\n=======================\nThe search string '{search_string}' was not found on this page."
        else:
            return header.strip() + _STATE_SEPARATOR + content


class FindNextTool(Tool):
    """Jump to the next occurrence of the current search string."""

    name="find_next"
    description="Scroll the viewport to next occurrence of the search string. This is equivalent to finding the next match in a Ctrl+F search."
    inputs = {}
    output_type = "text"

    def forward(self, ) -> str:
        find_result = browser.find_next()
        header, content = _browser_state()

        if find_result is None:
            return header.strip() + "\n=======================\nThe search string was not found on this page."
        else:
            return header.strip() + _STATE_SEPARATOR + content

# --------------------------------------------------------------------------------