├── LICENSE
├── MODEL_LICENSE
├── README.md
├── conda-env.yml
├── data
│   ├── gpt-3.5-turbo-testset-v1.json
│   ├── pandalm-7b-testset-v1.json
│   ├── pipeline-sanity-check.json
│   ├── testset-inference-v1.json
│   └── testset-v1.json
├── figures
│   ├── inst-tune-pipeline.png
│   ├── logo.png
│   ├── main-figure.png
│   └── pandalm-webui.png
├── pandalm
│   ├── __init__.py
│   ├── assets
│   │   ├── __init__.py
│   │   ├── ds_config_zero2.json
│   │   └── ds_config_zero2_linear_lr.json
│   ├── core
│   │   ├── __init__.py
│   │   ├── customtrainer.py
│   │   ├── datasets.py
│   │   ├── global_var.py
│   │   └── nets.py
│   ├── inst-tune.py
│   ├── run-gradio.py
│   ├── scripts
│   │   └── inst-tune.sh
│   └── utils
│       ├── __init__.py
│       ├── candidate_model_inference.py
│       ├── evaluation_pipeline.py
│       └── pandalm_inference.py
├── requirements.txt
└── test.py
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright 2023 Yidong Wang, Zhuohao Yu
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MODEL_LICENSE:
--------------------------------------------------------------------------------
1 | LLaMA LICENSE AGREEMENT
2 | This License Agreement (as may be amended in accordance with this License Agreement, “License”), between you, or your employer or other entity (if you are entering into this agreement on behalf of your employer or other entity) (“Licensee” or “you”) and Meta Platforms, Inc. (“Meta” or “we”) applies to your use of any computer program, algorithm, source code, object code, or software that is made available by Meta under this License (“Software”) and any specifications, manuals, documentation, and other written information provided by Meta related to the Software (“Documentation”).
3 |
4 | By clicking “I Accept” below or by using the Software, you agree to the terms of this License. If you do not agree to this License, then you do not have any rights to use the Software or Documentation (collectively, the “Software Products”), and you must immediately cease using the Software Products. If you are agreeing to be bound by the terms of this License on behalf of your employer or other entity, you represent and warrant to Meta that you have full legal authority to bind your employer or such entity to this License. If you do not have the requisite authority, you may not accept the License or access the Software Products on behalf of your employer or other entity.
5 |
6 |
7 |
8 | LICENSE GRANT
9 |
10 | a. Subject to your compliance with the Documentation and Sections 2, 3, and 5, Meta grants you a non-exclusive, worldwide, non-transferable, non-sublicensable, revocable, royalty free and limited license under Meta’s copyright interests to reproduce, distribute, and create derivative works of the Software solely for your non-commercial research purposes. The foregoing license is personal to you, and you may not assign or sublicense this License or any other rights or obligations under this License without Meta’s prior written consent; any such assignment or sublicense will be void and will automatically and immediately terminate this License.
11 |
12 | b. You may make a reasonable number of copies of the Documentation solely for use in connection with the license to the Software granted above.
13 |
14 | c. The grant of rights expressly set forth in this Section 1 (License Grant) are the complete grant of rights to you in the Software Products, and no other licenses are granted, whether by waiver, estoppel, implication, equity or otherwise. Meta and its licensors reserve all rights not expressly granted by this License.
15 |
16 |
17 | RESTRICTIONS
18 |
19 | You will not, and will not permit, assist or cause any third party to:
20 |
21 | a. use, modify, copy, reproduce, create derivative works of, or distribute the Software Products (or any derivative works thereof, works incorporating the Software Products, or any data produced by the Software), in whole or in part, for (i) any commercial or production purposes, (ii) military purposes or in the service of nuclear technology, (iii) purposes of surveillance, including any research or development relating to surveillance, (iv) biometric processing, (v) in any manner that infringes, misappropriates, or otherwise violates any third-party rights, or (vi) in any manner that violates any applicable law, including accessing the Software Products from an embargoed country as prohibited by the U.S. government, and violating any privacy or security laws, rules, regulations, directives, or governmental requirements (including the General Data Privacy Regulation (Regulation (EU) 2016/679), the California Consumer Privacy Act, and any and all laws governing the processing of biometric information), as well as all amendments and successor laws to any of the foregoing;
22 |
23 | b. alter or remove copyright and other proprietary notices which appear on or in the Software Products;
24 |
25 | c. utilize any equipment, device, software, or other means to circumvent or remove any security or protection used by Meta in connection with the Software, or to circumvent or remove any usage restrictions, or to enable functionality disabled by Meta; or
26 |
27 | d. offer or impose any terms on the Software Products that alter, restrict, or are inconsistent with the terms of this License.
28 |
29 |
30 | ATTRIBUTION
31 |
32 | Together with any copies of the Software Products (as well as derivative works thereof or works incorporating the Software Products) that you distribute, you must provide (i) a copy of this License, and (ii) the following attribution notice: “LLaMA is licensed under the LLaMA license, Copyright (c) Meta Platforms, Inc. All Rights Reserved.”
33 |
34 |
35 | DISCLAIMERS
36 |
37 | THE SOFTWARE PRODUCTS ARE PROVIDED “AS IS” and “WITH ALL FAULTS” WITH NO WARRANTY OF ANY KIND, EXPRESS OR IMPLIED. META EXPRESSLY DISCLAIMS ALL REPRESENTATIONS AND WARRANTIES, EXPRESS OR IMPLIED, WHETHER BY STATUTE, CUSTOM, USAGE OR OTHERWISE AS TO ANY MATTERS RELATED TO THE SOFTWARE PRODUCTS, INCLUDING BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE, SATISFACTORY QUALITY, OR NON-INFRINGEMENT. META MAKES NO WARRANTIES OR REPRESENTATIONS THAT THE SOFTWARE PRODUCTS WILL BE ERROR FREE OR FREE OF VIRUSES OR OTHER HARMFUL COMPONENTS, OR PRODUCE ANY PARTICULAR RESULTS.
38 |
39 |
40 | LIMITATION OF LIABILITY
41 |
42 | TO THE FULLEST EXTENT PERMITTED BY LAW, IN NO EVENT WILL META BE LIABLE TO YOU (A) UNDER ANY THEORY OF LIABILITY, WHETHER BASED IN CONTRACT, TORT, NEGLIGENCE, STRICT LIABILITY, WARRANTY, OR OTHERWISE UNDER THIS LICENSE, OR (B) FOR ANY INDIRECT, CONSEQUENTIAL, EXEMPLARY, INCIDENTAL, PUNITIVE OR SPECIAL DAMAGES OR LOST PROFITS, EVEN IF META HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGES. THE SOFTWARE PRODUCTS, THEIR CONSTITUENT COMPONENTS, AND ANY OUTPUT (COLLECTIVELY, “SOFTWARE MATERIALS”) ARE NOT DESIGNED OR INTENDED FOR USE IN ANY APPLICATION OR SITUATION WHERE FAILURE OR FAULT OF THE SOFTWARE MATERIALS COULD REASONABLY BE ANTICIPATED TO LEAD TO SERIOUS INJURY OF ANY PERSON, INCLUDING POTENTIAL DISCRIMINATION OR VIOLATION OF AN INDIVIDUAL’S PRIVACY RIGHTS, OR TO SEVERE PHYSICAL, PROPERTY, OR ENVIRONMENTAL DAMAGE (EACH, A “HIGH-RISK USE”). IF YOU ELECT TO USE ANY OF THE SOFTWARE MATERIALS FOR A HIGH-RISK USE, YOU DO SO AT YOUR OWN RISK. YOU AGREE TO DESIGN AND IMPLEMENT APPROPRIATE DECISION-MAKING AND RISK-MITIGATION PROCEDURES AND POLICIES IN CONNECTION WITH A HIGH-RISK USE SUCH THAT EVEN IF THERE IS A FAILURE OR FAULT IN ANY OF THE SOFTWARE MATERIALS, THE SAFETY OF PERSONS OR PROPERTY AFFECTED BY THE ACTIVITY STAYS AT A LEVEL THAT IS REASONABLE, APPROPRIATE, AND LAWFUL FOR THE FIELD OF THE HIGH-RISK USE.
43 |
44 |
45 | INDEMNIFICATION
46 |
47 | You will indemnify, defend and hold harmless Meta and our subsidiaries and affiliates, and each of our respective shareholders, directors, officers, employees, agents, successors, and assigns (collectively, the “Meta Parties”) from and against any losses, liabilities, damages, fines, penalties, and expenses (including reasonable attorneys’ fees) incurred by any Meta Party in connection with any claim, demand, allegation, lawsuit, proceeding, or investigation (collectively, “Claims”) arising out of or related to: (a) your access to or use of the Software Products (as well as any results or data generated from such access or use), including any High-Risk Use (defined below); (b) your violation of this License; or (c) your violation, misappropriation or infringement of any rights of another (including intellectual property or other proprietary rights and privacy rights). You will promptly notify the Meta Parties of any such Claims, and cooperate with Meta Parties in defending such Claims. You will also grant the Meta Parties sole control of the defense or settlement, at Meta’s sole option, of any Claims. This indemnity is in addition to, and not in lieu of, any other indemnities or remedies set forth in a written agreement between you and Meta or the other Meta Parties.
48 |
49 |
50 | TERMINATION; SURVIVAL
51 |
52 | a. This License will automatically terminate upon any breach by you of the terms of this License.
53 |
54 | b. We may terminate this License, in whole or in part, at any time upon notice (including electronic) to you.
55 |
56 | c. The following sections survive termination of this License: 2 (Restrictions), 3 (Attribution), 4 (Disclaimers), 5 (Limitation on Liability), 6 (Indemnification) 7 (Termination; Survival), 8 (Third Party Materials), 9 (Trademarks), 10 (Applicable Law; Dispute Resolution), and 11 (Miscellaneous).
57 |
58 |
59 | THIRD PARTY MATERIALS
60 |
61 | The Software Products may contain third-party software or other components (including free and open source software) (all of the foregoing, “Third Party Materials”), which are subject to the license terms of the respective third-party licensors. Your dealings or correspondence with third parties and your use of or interaction with any Third Party Materials are solely between you and the third party. Meta does not control or endorse, and makes no representations or warranties regarding, any Third Party Materials, and your access to and use of such Third Party Materials are at your own risk.
62 |
63 |
64 | TRADEMARKS
65 |
66 | Licensee has not been granted any trademark license as part of this License and may not use any name or mark associated with Meta without the prior written permission of Meta, except to the extent necessary to make the reference required by the “ATTRIBUTION” section of this Agreement.
67 |
68 |
69 | APPLICABLE LAW; DISPUTE RESOLUTION
70 |
71 | This License will be governed and construed under the laws of the State of California without regard to conflicts of law provisions. Any suit or proceeding arising out of or relating to this License will be brought in the federal or state courts, as applicable, in San Mateo County, California, and each party irrevocably submits to the jurisdiction and venue of such courts.
72 |
73 |
74 | MISCELLANEOUS
75 |
76 | If any provision or part of a provision of this License is unlawful, void or unenforceable, that provision or part of the provision is deemed severed from this License, and will not affect the validity and enforceability of any remaining provisions. The failure of Meta to exercise or enforce any right or provision of this License will not operate as a waiver of such right or provision. This License does not confer any third-party beneficiary rights upon any other person or entity. This License, together with the Documentation, contains the entire understanding between you and Meta regarding the subject matter of this License, and supersedes all other written or oral agreements and understandings between you and Meta regarding such subject matter. No change or addition to any provision of this License will be binding unless it is in writing and signed by an authorized representative of both you and Meta.
77 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PandaLM: ReProducible and Automated Language Model Assessment
2 |
3 |
31 |
32 |
33 | ## Overview
34 | This is the official repository for PandaLM: Re**P**roducible **and** **A**utomated **L**anguage **M**odel Assessment.
35 |
36 | Paper: [PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization](https://arxiv.org/abs/2306.05087)
37 |
38 | PandaLM aims to provide reproducible and automated comparisons between different large language models (LLMs). Given the same context, PandaLM compares the responses of different LLMs, provides a reason for its decision, and supplies a reference answer. The target audience for PandaLM is organizations that hold confidential data and research labs with limited funds that seek reproducibility. Such organizations may not want to disclose their data to third parties, or may be unable to afford the cost of hiring human annotators or the remediation costs of secret data leaking through third-party APIs. With PandaLM, they can perform evaluations without compromising data security or incurring high costs, and obtain reproducible results. To demonstrate the reliability and consistency of our tool, we have created a diverse human-annotated test dataset of approximately 1,000 samples, where the contexts and the labels are all created by humans. **Our results indicate that PandaLM-7B achieves 93.75% of GPT-3.5's evaluation ability and 88.28% of GPT-4's in terms of F1-score on our test dataset.** **More papers and features are coming soon.**
39 |
40 | 
41 |
42 | This repository contains:
43 |
44 | - The code for training PandaLM
45 | - The human-annotated test dataset with ~1k samples for validating PandaLM's ability to evaluate LLMs
46 | - The model weights of PandaLM
47 | - The code and configs for instruction tuning other foundation models such as Bloom, OPT, and LLaMA
48 |
49 | ## News
50 | - [2024/05/21] Fixed the training-data 404 problem.
51 | - [2024/01/16] PandaLM was accepted by ICLR 2024!
52 | - [2023/07/25] We share our tuned Alpaca and PandaLM-Alpaca at [Huggingface](https://huggingface.co/WeOpenML). We also uploaded both models to the [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard); the results show that using PandaLM can boost instruction-tuning performance.
53 | - [2023/06/08] We released our paper on arXiv: [PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization](https://arxiv.org/abs/2306.05087)
54 | - [2023/04/30] We are pleased to announce the release of PandaLM 1.0 as an open-source tool for reliably evaluating LLMs. To further demonstrate the effectiveness of PandaLM, we are also sharing a human-annotated test dataset.
55 |
56 | ## **Table of Contents**
57 |
58 | - [PandaLM: ReProducible and Automated Language Model Assessment](#pandalm-reproducible-and-automated-language-model-assessment)
59 | - [News](#News)
60 | - [Table of Contents](#table-of-contents)
61 | - [Motivation](#motivation)
62 | - [Installation](#installation)
63 | - [Usage](#usage)
64 | - [Data](#data)
65 | - [Train data](#train-data)
66 | - [Test data](#test-data)
67 | - [Experimental Results](#experimental-results)
68 | - [Contributing](#contributing)
69 | - [Citation](#citation)
70 | - [License](#license)
71 |
72 | ## Motivation
73 |
74 | 
75 |
76 | As shown above, an iteration of the instruction tuning of LLMs includes training and evaluation, where each iteration corresponds to a change in hyperparameters or fine-tuning algorithms. Thanks to parameter-efficient tuning methods, the instruction tuning of LLMs can be done within a few GPU hours, even on a consumer-grade GPU. However, human-based and API-based evaluations can be far more expensive and time-consuming. Furthermore, they can be inconsistent and unreproducible, due to a lack of transparency regarding LLM changelogs and subjectivity in human annotations. Moreover, API-based evaluations can result in potentially high remediation costs after [secret data leakage](https://mashable.com/article/samsung-chatgpt-leak-details). Formally, the total cost of instruction tuning LLMs is $N \times (C_{train}+C_{eval})$, where $C_{train}$ can be significantly smaller than $C_{eval}$ if human-based or API-based evaluations are adopted.
77 |
78 | To address these challenges, we propose an evaluation model named PandaLM, which can ensure reproducibility, safety, and efficiency in evaluation. By automating the evaluation process, our model can achieve efficient and consistent evaluations while maintaining high evaluation ability.
79 |
80 | ## **Installation**
81 |
82 | To install PandaLM, follow these steps:
83 |
84 | 1. Clone the repository: `git clone https://github.com/WeOpenML/PandaLM.git`
85 | 2. Navigate to the project directory: `cd PandaLM`
86 | 3. Install the required dependencies: `pip install -r requirements.txt` or use `conda env create -f conda-env.yml` if you prefer conda. Note that it's required to modify `prefix` in `conda-env.yml` to your conda path.
87 |
88 | To instruction-tune a foundation model, follow these steps:
89 | 1. Install PandaLM.
90 | 2. Navigate to the project directory: `cd PandaLM/pandalm`
91 | 3. Run the demo scripts: `bash scripts/inst-tune.sh`
92 |
93 | Due to copyright concerns, we do not provide the instruction-tuned models; they can be easily reproduced by running `PandaLM/pandalm/scripts/inst-tune.sh`.
94 |
95 | ## **Usage**
96 |
97 | We have uploaded PandaLM-7B to Hugging Face; you can initialize the model and tokenizer with:
98 |
99 | ```python
100 | from transformers import AutoTokenizer, AutoModelForCausalLM
101 |
102 | tokenizer = AutoTokenizer.from_pretrained("WeOpenML/PandaLM-7B-v1", use_fast=False)
103 |
104 | model = AutoModelForCausalLM.from_pretrained("WeOpenML/PandaLM-7B-v1")
105 | ```
106 |
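Once the model and tokenizer are loaded, inference amounts to formatting the two candidate responses into a single comparison prompt and generating. The helper below is an illustrative sketch only: the exact prompt template PandaLM was trained with is an assumption on our part, and the authoritative version lives in `pandalm/utils/pandalm_inference.py`.

```python
# Hypothetical helper -- the template below is an illustrative assumption,
# NOT the verified training template; see pandalm/utils/pandalm_inference.py
# for the one actually used.
def build_comparison_prompt(instruction, input_text, response1, response2):
    """Format an (instruction, input, response pair) into a single prompt."""
    return (
        f"### Instruction:\n{instruction}\n\n"
        f"### Input:\n{input_text}\n\n"
        f"### Response 1:\n{response1}\n\n"
        f"### Response 2:\n{response2}\n\n"
        "### Evaluation:\n"
    )

prompt = build_comparison_prompt(
    "Find an example of the given kind of data",
    "Qualitative data",
    "An example of qualitative data is customer feedback.",
    "An example of qualitative data is a customer review.",
)
# With the model and tokenizer loaded as above, generation could then be:
#   batch = tokenizer(prompt, return_tensors="pt")
#   out = model.generate(**batch, max_new_tokens=512)
#   print(tokenizer.decode(out[0], skip_special_tokens=True))
```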
107 |
108 | We offer several ways to experience PandaLM. (More code is in preparation; please be patient.)
109 | 1. Try PandaLM on your local machine (with a GPU having at least 24G VRAM) using a Web UI:
110 | ```shell
111 | cd PandaLM/pandalm/
112 | CUDA_VISIBLE_DEVICES=0 python3 run-gradio.py --base_model=WeOpenML/PandaLM-7B-v1 --server_port=<port> --server_name=<host>
113 | ```
114 | By default the program listens on port 31228 on all network interfaces; access http://localhost:31228/ if you are running on a local machine, or http://\<server_ip\>:\<server_port\>/ if on a remote server.
115 |
120 |
121 | 2. We provide a class called `EvaluationPipeline` that can evaluate multiple candidate models using PandaLM. The constructor of this class takes in a list of candidate model paths, or output JSON files if some models are not open-sourced. Optionally, it also takes the path of the PandaLM model, an input data path to load the test data, and an output data path to save the test results. **Note that this demo just shows the evaluation of LLMs before instruction tuning. In practice, _we need to instruction-tune them first_, and then pass the tuned models into `EvaluationPipeline`.** For more details, see the code. You can test the candidate models in just three lines:
122 |
123 | ```python
124 | from pandalm import EvaluationPipeline
125 |
126 | pipeline = EvaluationPipeline(candidate_paths=["huggyllama/llama-7b", "bigscience/bloom-7b1", "facebook/opt-6.7b"], input_data_path="data/pipeline-sanity-check.json")
127 |
128 | print(pipeline.evaluate())
129 | ```
130 |
131 |
132 | ## **Data**
133 |
134 | This section introduces the train and test data for training and evaluating PandaLM. **We will continuously update and open-source the data to improve PandaLM.**
135 |
136 | ### **Train data**
137 |
138 | We aim to force our model not only to evaluate different responses for a given context, but also to generate a reference response utilizing the given context. Thus, each instance of the training data consists of an input tuple (instruction, input, response1, response2) and an output tuple (evaluation_result, evaluation_reason, reference_response). Specifically, in the input tuple, the instructions and inputs are sampled from the [Alpaca 52K data](https://github.com/tatsu-lab/stanford_alpaca#data-release), and the response pairs are provided by LLaMA-7B, Bloom-7B, Cerebras-GPT-6.7B, OPT-7B, and Pythia-6.9B, tuned by ourselves with the same instruction data and hyperparameters. We chose these foundation models because they are similar in size and their model weights are publicly available. The corresponding output tuple includes an evaluation result, a brief explanation for the evaluation, and a reference response. Note that "1" or "2" in the evaluation result means response 1 or 2 is better, and "Tie" means they are similar in quality. Since it is unaffordable to obtain *millions* of output tuples from human annotators, and ChatGPT can evaluate LLMs to a certain extent, we follow [self-instruct](https://github.com/yizhongw/self-instruct/blob/main/human_eval) to get output tuples from ChatGPT (gpt-3.5-turbo) and then adopt a heuristic data-filtering strategy to filter out noisy samples. The filtered train dataset contains 300K samples, while the original unfiltered dataset has 1M samples. **The train data can be found at [train-data](https://1drv.ms/u/c/1d37ede6eaa974dd/Ed10qerm7TcggB2rAgAAAAABNam36aQ16ZcMMH21ZUO9fQ?e=gN6nxR).** Here is a demonstration of the train data:
139 |
140 | ```Plain
141 | {
142 | "inputs": {
143 | "instruction": "Find an example of the given kind of data",
144 | "input": "Qualitative data",
145 | "response1": "An example of qualitative data is customer feedback.",
        "response2": "An example of qualitative data is a customer review."
147 | },
148 | "outputs": {
149 | "evaluation_result": "Tie",
150 | "evaluation_reason": "Both responses are correct and provide similar examples of qualitative data.",
151 | "reference_response": "An example of qualitative data is an interview transcript."
152 | }
153 | }
154 | ```
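The input and output tuples above can be flattened into a prompt/target text pair for training. The sketch below shows one way to do this; the exact prompt template is an illustrative assumption, not necessarily the one PandaLM uses.

```python
# Flatten a train instance into (prompt, target) text. The field names match
# the released train data; the wording of the template is hypothetical.
sample = {
    "inputs": {
        "instruction": "Find an example of the given kind of data",
        "input": "Qualitative data",
        "response1": "An example of qualitative data is customer feedback.",
        "response2": "An example of qualitative data is a customer review.",
    },
    "outputs": {
        "evaluation_result": "Tie",
        "evaluation_reason": "Both responses are correct and provide similar examples of qualitative data.",
        "reference_response": "An example of qualitative data is an interview transcript.",
    },
}

def build_prompt(inputs: dict) -> str:
    """Turn the input tuple into a single comparison prompt."""
    return (
        f"Instruction: {inputs['instruction']}\n"
        f"Input: {inputs['input']}\n"
        f"Response 1: {inputs['response1']}\n"
        f"Response 2: {inputs['response2']}\n"
        "Which response is better? Answer '1', '2', or 'Tie'."
    )

def build_target(outputs: dict) -> str:
    """Turn the output tuple into the text the model should generate."""
    return (
        f"{outputs['evaluation_result']}\n"
        f"Reason: {outputs['evaluation_reason']}\n"
        f"Reference: {outputs['reference_response']}"
    )

print(build_prompt(sample["inputs"]))
print(build_target(sample["outputs"]))
```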
155 |
156 | ### **Test data**
157 |
To prove the reliability of PandaLM, we created a **human-labeled test dataset** that is aligned with human preferences. Each instance of the test dataset consists of an instruction, an input, and two responses generated by different instruction-tuned LLMs; the task is to compare the quality of the two responses. As with the train data, the responses are provided by LLaMA-7B, Bloom-7B, Cerebras-GPT-6.7B, OPT-7B, and Pythia-6.9B, instruction-tuned by ourselves with the same instruction data and hyper-parameters. Given the human-labeled test dataset, we can compare ChatGPT and PandaLM in terms of evaluation performance.
159 |
The test data is generated and sampled from the human evaluation data of [self-instruct](https://github.com/yizhongw/self-instruct/blob/main/human_eval). Its inputs and labels are purely human-generated and cover diverse tasks and contents. The labels are annotated independently by three human evaluators. Each instance includes an instruction, an input sentence, two responses, and a label indicating the preferred response. Note that "1" or "2" means response 1 or 2 is better, and "0" means they are similar in quality. We exclude examples with large annotation divergence from the originally annotated test data so that each annotator's IAA (Inter-Annotator Agreement) on the remaining data is close to 0.85; these filtered samples require additional knowledge or hard-to-obtain information, which makes them difficult for humans to evaluate. The filtered test dataset contains 1K samples, while the original unfiltered dataset has 2.5K samples. Below is an annotated example from the test set. *The test data is available in `./data/testset-v1.json`.* We also release the test results of gpt-3.5-turbo and PandaLM-7B in `./data/gpt-3.5-turbo-testset-v1.json` and `./data/pandalm-7b-testset-v1.json`.
161 |
162 | ```Plain
163 | {
164 | "index": "749",
165 | "motivation_app": "CNN News",
166 | "task_id": "user_oriented_task_165",
    "cmp_key": "opt-7b_pythia-6.9b", # Response 1 is from OPT-7B and response 2 is from Pythia-6.9B
168 | "instruction": "Give the news title a category. Pick a category from the list of News & Buzz, Travel, Style, Arts & Culture, Politics, Tech, and Science & Health.",
169 | "input": "The #Banksy Exhibit in Cambridge, MA is absolutely terrific.",
    "reference_response": "Arts & Culture", # Copied directly from the self-instruct repo.
171 | "response1": "Politics",
172 | "response2": "Arts & Culture",
173 | "label_0": "2", # Label from Human annotator No.1
174 | "label_1": "2", # Label from Human annotator No.2
    "label_2": "2" # Label from Human annotator No.3
176 | }
177 | ```
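A gold label for each test instance can be derived from the three annotator fields by majority vote. This is a minimal sketch of that aggregation (the paper may aggregate differently); the 3-way-split fallback to "0" is an assumption.

```python
from collections import Counter

def majority_label(example: dict) -> str:
    """Aggregate the three annotator labels ('0' tie, '1'/'2' preference)
    into a single gold label by majority vote."""
    votes = Counter(example[k] for k in ("label_0", "label_1", "label_2"))
    label, count = votes.most_common(1)[0]
    # Fall back to a tie when all three annotators disagree (assumption).
    return label if count >= 2 else "0"

example = {"label_0": "2", "label_1": "2", "label_2": "2"}
print(majority_label(example))  # "2"
```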
178 |
We calculate the pairwise IAA between annotators using Cohen's kappa. The results are shown below:
180 |
181 | | Cohen’s kappa| Annotator #1 | Annotator #2 | Annotator #3 |
182 | | ------------ | ------------ | ------------ | ------------ |
183 | | Annotator #1 | 1 | 0.85 | 0.88 |
184 | | Annotator #2 | 0.85 | 1 | 0.86 |
185 | | Annotator #3 | 0.88 | 0.86 | 1 |
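Cohen's kappa compares the observed agreement between two annotators against the agreement expected by chance. The pairwise values above can be reproduced with a short function like this (the labels here are toy data, not the real annotations):

```python
from collections import Counter

def cohens_kappa(a: list, b: list) -> float:
    """Cohen's kappa between two annotators labeling the same items."""
    n = len(a)
    p_o = sum(x == y for x, y in zip(a, b)) / n          # observed agreement
    ca, cb = Counter(a), Counter(b)
    labels = set(a) | set(b)
    p_e = sum(ca[l] * cb[l] for l in labels) / (n * n)   # chance agreement
    return (p_o - p_e) / (1 - p_e)

# Toy labels for two annotators (not the released annotations).
ann1 = ["1", "2", "2", "0", "1", "2"]
ann2 = ["1", "2", "1", "0", "1", "2"]
print(round(cohens_kappa(ann1, ann2), 3))  # 0.739
```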
186 |
The label distribution of the test data is:
| Label  | 0   | 1   | 2   |
| ------ | --- | --- | --- |
| Number | 105 | 422 | 472 |
191 |
192 | ## **Experimental Results**
Please refer to our paper: [PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization](https://arxiv.org/abs/2306.05087)
194 |
## **Contributing**
196 |
197 | We welcome contributions to PandaLM! If you'd like to contribute, please follow these steps:
198 |
199 | 1. Fork the repository.
200 | 2. Create a new branch with your changes.
201 | 3. Submit a pull request with a clear description of your changes.
202 |
203 |
204 | ## **Citation**
205 |
206 | ```Plain
@inproceedings{pandalm2024,
208 | title={PandaLM: An Automatic Evaluation Benchmark for LLM Instruction Tuning Optimization},
209 | author={Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and Ye, Wei and Zhang, Shikun and Zhang, Yue},
210 | booktitle={International Conference on Learning Representations (ICLR)},
211 | year={2024}
212 | }
213 |
214 | @misc{PandaLM,
215 | author = {Wang, Yidong and Yu, Zhuohao and Zeng, Zhengran and Yang, Linyi and Heng, Qiang and Wang, Cunxiang and Chen, Hao and Jiang, Chaoya and Xie, Rui and Wang, Jindong and Xie, Xing and Ye, Wei and Zhang, Shikun and Zhang, Yue},
216 | title = {PandaLM: Reproducible and Automated Language Model Assessment},
217 | year = {2023},
218 | publisher = {GitHub},
219 | journal = {GitHub repository},
220 | howpublished = {\url{https://github.com/WeOpenML/PandaLM}},
221 | }
222 | ```
223 |
224 | ## **License**
225 |
The model weights of PandaLM follow the LLaMA license. See MODEL_LICENSE.
227 |
The train data license will be added when we upload the train data.
229 |
230 | The rest of this repo is under Apache License 2.0. See LICENSE.
231 |
--------------------------------------------------------------------------------
/conda-env.yml:
--------------------------------------------------------------------------------
1 | name: pandalm
2 | channels:
3 | - pytorch
4 | - nvidia
5 | - conda-forge
6 | dependencies:
7 | - _libgcc_mutex=0.1=conda_forge
8 | - _openmp_mutex=4.5=2_kmp_llvm
9 | - blas=2.116=mkl
10 | - blas-devel=3.9.0=16_linux64_mkl
11 | - brotlipy=0.7.0=py310h5764c6d_1005
12 | - bzip2=1.0.8=h7f98852_4
13 | - ca-certificates=2022.12.7=ha878542_0
14 | - certifi=2022.12.7=pyhd8ed1ab_0
15 | - cffi=1.15.1=py310h255011f_3
16 | - charset-normalizer=3.1.0=pyhd8ed1ab_0
17 | - cryptography=40.0.2=py310h34c0648_0
18 | - cuda-cudart=11.8.89=0
19 | - cuda-cupti=11.8.87=0
20 | - cuda-libraries=11.8.0=0
21 | - cuda-nvrtc=11.8.89=0
22 | - cuda-nvtx=11.8.86=0
23 | - cuda-runtime=11.8.0=0
24 | - ffmpeg=4.3=hf484d3e_0
25 | - filelock=3.12.0=pyhd8ed1ab_0
26 | - freetype=2.12.1=hca18f0e_1
27 | - gmp=6.2.1=h58526e2_0
28 | - gmpy2=2.1.2=py310h3ec546c_1
29 | - gnutls=3.6.13=h85f3911_1
30 | - icu=72.1=hcb278e6_0
31 | - idna=3.4=pyhd8ed1ab_0
32 | - jinja2=3.1.2=pyhd8ed1ab_1
33 | - jpeg=9e=h0b41bf4_3
34 | - lame=3.100=h166bdaf_1003
35 | - lcms2=2.15=hfd0df8a_0
36 | - ld_impl_linux-64=2.40=h41732ed_0
37 | - lerc=4.0.0=h27087fc_0
38 | - libblas=3.9.0=16_linux64_mkl
39 | - libcblas=3.9.0=16_linux64_mkl
40 | - libcublas=11.11.3.6=0
41 | - libcufft=10.9.0.58=0
42 | - libcufile=1.6.0.25=0
43 | - libcurand=10.3.2.56=0
44 | - libcusolver=11.4.1.48=0
45 | - libcusparse=11.7.5.86=0
46 | - libdeflate=1.17=h0b41bf4_0
47 | - libffi=3.4.2=h7f98852_5
48 | - libgcc-ng=12.2.0=h65d4601_19
49 | - libgfortran-ng=12.2.0=h69a702a_19
50 | - libgfortran5=12.2.0=h337968e_19
51 | - libgomp=12.2.0=h65d4601_19
52 | - libhwloc=2.9.1=hd6dc26d_0
53 | - libiconv=1.17=h166bdaf_0
54 | - liblapack=3.9.0=16_linux64_mkl
55 | - liblapacke=3.9.0=16_linux64_mkl
56 | - libnpp=11.8.0.86=0
57 | - libnsl=2.0.0=h7f98852_0
58 | - libnvjpeg=11.9.0.86=0
59 | - libpng=1.6.39=h753d276_0
60 | - libsqlite=3.40.0=h753d276_0
61 | - libstdcxx-ng=12.2.0=h46fd767_19
62 | - libtiff=4.5.0=h6adf6a1_2
63 | - libuuid=2.38.1=h0b41bf4_0
64 | - libwebp-base=1.3.0=h0b41bf4_0
65 | - libxcb=1.13=h7f98852_1004
66 | - libxml2=2.10.4=hfdac1af_0
67 | - libzlib=1.2.13=h166bdaf_4
68 | - llvm-openmp=16.0.1=h417c0b6_0
69 | - markupsafe=2.1.2=py310h1fa729e_0
70 | - mkl=2022.1.0=h84fe81f_915
71 | - mkl-devel=2022.1.0=ha770c72_916
72 | - mkl-include=2022.1.0=h84fe81f_915
73 | - mpc=1.3.1=hfe3b2da_0
74 | - mpfr=4.2.0=hb012696_0
75 | - mpmath=1.3.0=pyhd8ed1ab_0
76 | - ncurses=6.3=h27087fc_1
77 | - nettle=3.6=he412f7d_0
78 | - networkx=3.1=pyhd8ed1ab_0
79 | - numpy=1.24.2=py310h8deb116_0
80 | - openh264=2.1.1=h780b84a_0
81 | - openjpeg=2.5.0=hfec8fc6_2
82 | - openssl=3.1.0=h0b41bf4_0
83 | - pillow=9.4.0=py310h023d228_1
84 | - pip=23.1=pyhd8ed1ab_0
85 | - pthread-stubs=0.4=h36c2ea0_1001
86 | - pycparser=2.21=pyhd8ed1ab_0
87 | - pyopenssl=23.1.1=pyhd8ed1ab_0
88 | - pysocks=1.7.1=pyha2e5f31_6
89 | - python=3.10.10=he550d4f_0_cpython
90 | - python_abi=3.10=3_cp310
91 | - pytorch=2.0.0=py3.10_cuda11.8_cudnn8.7.0_0
92 | - pytorch-cuda=11.8=h7e8668a_3
93 | - pytorch-mutex=1.0=cuda
94 | - readline=8.2=h8228510_1
95 | - requests=2.28.2=pyhd8ed1ab_1
96 | - setuptools=67.6.1=pyhd8ed1ab_0
97 | - sympy=1.11.1=pypyh9d50eac_103
98 | - tbb=2021.9.0=hf52228f_0
99 | - tk=8.6.12=h27826a3_0
100 | - torchaudio=2.0.0=py310_cu118
101 | - torchtriton=2.0.0=py310
102 | - torchvision=0.15.0=py310_cu118
103 | - typing_extensions=4.5.0=pyha770c72_0
104 | - urllib3=1.26.15=pyhd8ed1ab_0
105 | - wheel=0.40.0=pyhd8ed1ab_0
106 | - xorg-libxau=1.0.9=h7f98852_0
107 | - xorg-libxdmcp=1.1.3=h7f98852_0
108 | - xz=5.2.6=h166bdaf_0
109 | - zlib=1.2.13=h166bdaf_4
110 | - zstd=1.5.2=h3eb15da_6
111 | - pip:
112 | - accelerate==0.18.0
113 | - aiofiles==23.1.0
114 | - aiohttp==3.8.4
115 | - aiosignal==1.3.1
116 | - altair==4.2.2
117 | - anyio==3.6.2
118 | - appdirs==1.4.4
119 | - async-timeout==4.0.2
120 | - attrs==23.1.0
121 | - click==8.1.3
122 | - contourpy==1.0.7
123 | - cycler==0.11.0
124 | - deepspeed==0.9.0
125 | - docker-pycreds==0.4.0
126 | - entrypoints==0.4
127 | - fastapi==0.95.1
128 | - ffmpy==0.3.0
129 | - fire==0.5.0
130 | - fonttools==4.39.3
131 | - frozenlist==1.3.3
132 | - fsspec==2023.4.0
133 | - gitdb==4.0.10
134 | - gitpython==3.1.31
135 | - gradio==3.28.1
136 | - gradio-client==0.1.4
137 | - h11==0.14.0
138 | - hjson==3.1.0
139 | - httpcore==0.17.0
140 | - httpx==0.24.0
141 | - huggingface-hub==0.13.4
142 | - jsonschema==4.17.3
143 | - kiwisolver==1.4.4
144 | - linkify-it-py==2.0.0
145 | - markdown-it-py==2.2.0
146 | - matplotlib==3.7.1
147 | - mdit-py-plugins==0.3.3
148 | - mdurl==0.1.2
149 | - multidict==6.0.4
150 | - ninja==1.11.1
151 | - openai==0.27.4
152 | - orjson==3.8.11
153 | - packaging==23.1
154 | - pandas==2.0.1
155 | - pathtools==0.1.2
156 | - peft==0.2.0
157 | - protobuf==4.22.3
158 | - psutil==5.9.5
159 | - py-cpuinfo==9.0.0
160 | - pydantic==1.10.7
161 | - pydub==0.25.1
162 | - pyparsing==3.0.9
163 | - pyrsistent==0.19.3
164 | - python-dateutil==2.8.2
165 | - python-multipart==0.0.6
166 | - pytz==2023.3
167 | - pyyaml==6.0
168 | - regex==2023.3.23
169 | - semantic-version==2.10.0
170 | - sentencepiece==0.1.98
171 | - sentry-sdk==1.21.1
172 | - setproctitle==1.3.2
173 | - six==1.16.0
174 | - smmap==5.0.0
175 | - sniffio==1.3.0
176 | - starlette==0.26.1
177 | - termcolor==2.3.0
178 | - tokenizers==0.13.3
179 | - toolz==0.12.0
180 | - tqdm==4.65.0
181 | - transformers==4.28.1
182 | - tzdata==2023.3
183 | - uc-micro-py==1.0.1
184 | - uvicorn==0.22.0
185 | - wandb==0.15.0
186 | - websockets==11.0.2
187 | - yarl==1.8.2
188 | prefix: /{your-conda-path}/envs/pandalm
189 |
--------------------------------------------------------------------------------
/data/pipeline-sanity-check.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "motivation_app": "Google Meet",
4 | "instruction": "Summarize a meeting from the given list of bullet points. Be sure to convert shorthand into a first-hand account.",
5 | "input": "Rose: Analyze data and presents findings\nJohn: propose new idea\nJane: appointed to head project\nTom: need more time to fix software bug"
6 | },
7 | {
8 | "motivation_app": "Amazon",
9 | "instruction": "Make a list of adjectives that can be used to describe the given brand.",
10 | "input": "a creative tech startup"
11 | },
12 | {
13 | "motivation_app": "Wolfram alpha",
14 | "instruction": "Verify the correctness of the given statement.",
15 | "input": "\"For all integers j and k, if j and k are odd, then jk is odd.\""
16 | },
17 | {
18 | "motivation_app": "Amazon",
19 | "instruction": "What other Amazon products might interest someone who visited the given product?",
20 | "input": "Zeroll Zerolon Hardcoat Anodized Commercial Ice Cream Scoop with Unique Liquid Filled Heat Conductive Handle Easy Release Made in USA, 1.5-Ounce, Black"
21 | },
22 | {
23 | "motivation_app": "Coursera",
24 | "instruction": "Come up with the courses that one is supposed to take in order to be an expert in a given field.",
25 | "input": "Graphic Design"
26 | },
27 | {
28 | "motivation_app": "Google Search",
29 | "instruction": "Based on the given query, suggest some related search queries.",
30 | "input": "learning french"
31 | },
32 | {
33 | "motivation_app": "Jira",
34 | "instruction": "A user story is an informal, general explanation of a software feature written from the perspective of the end user or customer. Write a user story for a given software.",
35 | "input": "Gmail"
36 | },
37 | {
38 | "motivation_app": "Tasty",
39 | "instruction": "Describe how to prepare the given food in your own words. Note down the ingredients you will need and the steps you will take to prepare them.",
40 | "input": "Chewy Chocolate Chip Cookies"
41 | },
42 | {
43 | "motivation_app": "YouTube",
44 | "instruction": "You should choose a YouTube video title based on the video's content. A video's title tells viewers what to expect from it. It should be direct, honest, and clear. The title of the video needs to capture the attention of viewers, so do not use an unclear or ambiguous one.",
45 | "input": "A research study has been conducted to determine if exercise really can \"boost\" your metabolism."
46 | },
47 | {
48 | "motivation_app": "(Wolfram alpha)?",
49 | "instruction": "Using a given amount, determine an appropriate tip.",
50 | "input": "14% tip on $47.50"
51 | }
52 | ]
--------------------------------------------------------------------------------
/data/testset-inference-v1.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "motivation_app": "Google Meet",
4 | "instruction": "Summarize a meeting from the given list of bullet points. Be sure to convert shorthand into a first-hand account.",
5 | "input": "Rose: Analyze data and presents findings\nJohn: propose new idea\nJane: appointed to head project\nTom: need more time to fix software bug"
6 | },
7 | {
8 | "motivation_app": "Amazon",
9 | "instruction": "Make a list of adjectives that can be used to describe the given brand.",
10 | "input": "a creative tech startup"
11 | },
12 | {
13 | "motivation_app": "Wolfram alpha",
14 | "instruction": "Verify the correctness of the given statement.",
15 | "input": "\"For all integers j and k, if j and k are odd, then jk is odd.\""
16 | },
17 | {
18 | "motivation_app": "Amazon",
19 | "instruction": "What other Amazon products might interest someone who visited the given product?",
20 | "input": "Zeroll Zerolon Hardcoat Anodized Commercial Ice Cream Scoop with Unique Liquid Filled Heat Conductive Handle Easy Release Made in USA, 1.5-Ounce, Black"
21 | },
22 | {
23 | "motivation_app": "Coursera",
24 | "instruction": "Come up with the courses that one is supposed to take in order to be an expert in a given field.",
25 | "input": "Graphic Design"
26 | },
27 | {
28 | "motivation_app": "Google Search",
29 | "instruction": "Based on the given query, suggest some related search queries.",
30 | "input": "learning french"
31 | },
32 | {
33 | "motivation_app": "Jira",
34 | "instruction": "A user story is an informal, general explanation of a software feature written from the perspective of the end user or customer. Write a user story for a given software.",
35 | "input": "Gmail"
36 | },
37 | {
38 | "motivation_app": "Tasty",
39 | "instruction": "Describe how to prepare the given food in your own words. Note down the ingredients you will need and the steps you will take to prepare them.",
40 | "input": "Chewy Chocolate Chip Cookies"
41 | },
42 | {
43 | "motivation_app": "YouTube",
44 | "instruction": "You should choose a YouTube video title based on the video's content. A video's title tells viewers what to expect from it. It should be direct, honest, and clear. The title of the video needs to capture the attention of viewers, so do not use an unclear or ambiguous one.",
45 | "input": "A research study has been conducted to determine if exercise really can \"boost\" your metabolism."
46 | },
47 | {
48 | "motivation_app": "(Wolfram alpha)?",
49 | "instruction": "Using a given amount, determine an appropriate tip.",
50 | "input": "14% tip on $47.50"
51 | },
52 | {
53 | "motivation_app": "Weather",
54 | "instruction": "In relation to the given weather scenario, give some tips on how to adjust the travel plans with it.",
55 | "input": "a sudden temperature change"
56 | },
57 | {
58 | "motivation_app": "Socratic by Google",
59 | "instruction": "Answer the following literature question as accurately as possible based on your knowledge of literature for high school students.",
60 | "input": "What does the green light symbolize at the end of the book?"
61 | },
62 | {
63 | "motivation_app": "Netflix",
64 | "instruction": "Summarize the movie in a snarky way. Try to explain the movie in just one sentence.",
65 | "input": "The Shining"
66 | },
67 | {
68 | "motivation_app": "Google Docs",
69 | "instruction": "Include important study notes and key points that someone should know about the given subject.",
70 | "input": "history of the USA"
71 | },
72 | {
73 | "motivation_app": "Coursera",
74 | "instruction": "List the personality traits that are required to be successful in the given job.",
75 | "input": "Social Media Marketer"
76 | },
77 | {
78 | "motivation_app": "Sudoku",
79 | "instruction": "Solve the given Sudoku puzzle.",
80 | "input": "The Sudoku puzzle is:\n|1| |5| | | | |8| |\n| | | | | |3| |6|4|\n|8|3|4|6|7| |9| | |\n|2|9|1| | |8|3|7|6|\n| | |6| | |9|8|1| |\n| | | | | |2| | | |\n| |2| | |9|7|6| | |\n|5|4| | | | | |9|8|\n|6|1| | | |5|4|3|7|"
81 | },
82 | {
83 | "motivation_app": "tripadvisor.com",
84 | "instruction": "Someone gives you some information about where they want to go and what they enjoy seeing and doing. Describe a proper road trip for them and find the best stops and sctivities in the way.",
85 | "input": "I'm looking for a west coast road trip. My goal is to start near San Diego and travel to Los Angeles."
86 | },
87 | {
88 | "motivation_app": "Indeed",
89 | "instruction": "You will be required to write a few questions from the given company's interview section.",
90 | "input": "Kohl's"
91 | },
92 | {
93 | "motivation_app": "Doulingo",
94 | "instruction": "What sound does this make?",
95 | "input": "\u597d"
96 | },
97 | {
98 | "motivation_app": "Blogger",
99 | "instruction": "Design an outline for a blog post based on the given information and list the sections accordingly.",
100 | "input": "This blog post is going to about making a list of last-minute gift ideas. Also, the tone of the post is going to be relaxed and casual."
101 | },
102 | {
103 | "motivation_app": "Amazon",
104 | "instruction": "You should write a Product Description that will persuade the buyer that the product is beneficial.",
105 | "input": "Flipslide Game, Electronic Handheld Game | Flip, Slide, and Match the Colors to Beat the Clock - 4 Game Modes - Multiplayer Fun"
106 | },
107 | {
108 | "motivation_app": "Reddit",
109 | "instruction": "You should take the time to read the situation and problem carefully and give the best advice you can.",
110 | "input": "I'm roadtripping down New Zealand with a friend and he is driving me insane. He hasn't enjoyed all the tours we've been on and doesn't want to chat about ANYTHING. We're a week into our trip and this is the quietest I've ever been when hanging out with another person as he rebuffs all my attempts at conversation. He wears his airpods in the car and around the accommodation - not that I've kept trying to talk to him. Honestly, he is bringing some seriously bad vibes to everything and it's ruining my trip.\nWe don't have anything booked in yet for the South Island, so I'm planning to ditch him in Wellington at the ferry. Probably book my own car in Picton and just relax alone for the rest of the trip. (Just thinking about it is such sharp relief that it makes me feel emotional. I'll pay whatever price I have to just to get away from him.)\nObviously telling him he is ruining my trip with his negative energy isn't the best way to approach this.\nAny advice for the best approach so we can split amicably? Or at least not have an argument that ends with very hurt feelings?"
111 | },
112 | {
113 | "motivation_app": "Meetup",
114 | "instruction": "Recommend some helpful activities or conversation starters to use at meetups to break the ice.",
115 | "input": ""
116 | },
117 | {
118 | "motivation_app": "merriam-webster.com",
119 | "instruction": "Find the answer that best describes the underlined SAT word. Select the correct option and explain the meaning of the underlined word.",
120 | "input": "Despite the _cacophony, the student tried to study. \nA. Loud sounds\nB. Difficult subject\nC. Late hour\nD. Low lighting"
121 | },
122 | {
123 | "motivation_app": "Gmail",
124 | "instruction": "Write an email to attendees as a reminder that the event is coming up.",
125 | "input": ""
126 | },
127 | {
128 | "motivation_app": "tripadvisor.com",
129 | "instruction": "Make a list of the top places in the U.S. to visit at the given time.",
130 | "input": "November"
131 | },
132 | {
133 | "motivation_app": "https://cohere.ai/",
134 | "instruction": "Give a brief summary of the intention of the dialogue that just happened.",
135 | "input": "Customer: Hi there, I'm looking for a new phone.\nAI: Hi! What type of phone are you looking for?\nCustomer: I'm not sure. Maybe something with a good camera?\nAI: We have a few phones with great cameras. Would you like to see some options?\nCustomer: Yeah, that would be great."
136 | },
137 | {
138 | "motivation_app": "MS Powerpoint",
139 | "instruction": "Design a soothing pastel color palette for your slides. Pastel colors generally come across as pretty and delicate, so you\u2019ll want to make sure your presentation calls for a similar mood. Choose up to five colors or color codes.",
140 | "input": ""
141 | },
142 | {
143 | "motivation_app": "Spotify",
144 | "instruction": "How would you describe the meaning of this lyrics? Describe some of the metaphors.",
145 | "input": "My lover\u2019s got humor\nShe\u2019s the giggle at a funeral\nKnows everybody\u2019s disapproval\nI should\u2019ve worshipped her sooner"
146 | },
147 | {
148 | "motivation_app": "Redfin",
149 | "instruction": "List some of the top real estate marketing words to add value to the listing and engage more potential buyers.",
150 | "input": ""
151 | },
152 | {
153 | "motivation_app": "merriam-webster.com",
154 | "instruction": "Invent a new word based on a description of the word.",
155 | "input": "The feeling of knowing, while you are still in the moment, that something occurring will be remembered forever."
156 | },
157 | {
158 | "motivation_app": "merriam-webster.com",
159 | "instruction": "Decide which part of speech the underlined word belongs to.",
160 | "input": "school will _keep through the winter"
161 | },
162 | {
163 | "motivation_app": "Spotify",
164 | "instruction": "Make a list of the most popular podcasts.",
165 | "input": ""
166 | },
167 | {
168 | "motivation_app": "Jira",
169 | "instruction": "Create a template in markdown to create scope for Jira tickets. Members should use this template as a checklist to ensure they have included all the necessary information when creating a ticket.",
170 | "input": ""
171 | },
172 | {
173 | "motivation_app": "instructables",
174 | "instruction": "Think of an idea to help you win a challenge given to you. Write the steps you need to take to develop the idea as well.",
175 | "input": "What can you make by recycling and repurposing some old, previously used materials? We want to see it!\nProjects must incorporate some kind of used materials - take that old used thing, and turn it into something new and different!\nE-waste is a huge problem. What can you make using old electronics and other salvageable electronic materials? \nWe have a special judges prize for our favorite Instructable that incorporates or is related to reusing e-waste.\nBe smart and be safe; note that potentially deadly projects (like Lichtenberg/fractal wood burning machines) will not be allowed to be published or submitted to this contest."
176 | },
177 | {
178 | "motivation_app": "LinkedIn",
179 | "instruction": "Write a template for First-Person LinkedIn profile summary.",
180 | "input": ""
181 | },
182 | {
183 | "motivation_app": "Spotify",
184 | "instruction": "Curate a Spotify playlist based on the vibe. Publish this playlist as a song list.",
185 | "input": "Vibe: coding on weekend"
186 | },
187 | {
188 | "motivation_app": "Redfin",
189 | "instruction": "Given a real estate listing description, extract the information and details of the property from the text. The details consist of address, basic house attributes, etc.",
190 | "input": "WALK TO THE BEACH FROM CAPITOLA KNOLLS! Single-level upstairs condo with your own large private deck overlooking nature. Adorable home that is move-in ready and waiting for you to put your own personal touches on. Gorgeous and well-maintained landscaped park-like setting with lush green grasses with a community pool and spa to enjoy. Desirable location...walk to Capitola Village, Gayle's Bakery, restaurants, shopping, schools or jump on Hwy 1 right around the corner."
191 | },
192 | {
193 | "motivation_app": "Indeed",
194 | "instruction": "A job description is a document that clearly states the requirements, duties, responsibilities, and skills required to perform a specific role. You need to write a job description based on the given information. You should describe responsibilities and duties in your job description in a logical order. Remember that potential employees often get their first impression of your company from your job description. So, the job description should convey the attractiveness of your company.",
195 | "input": "Company: [Company name]\nJob: Software Engineer\nJob Responsibilities:\nReceive and perform code reviews with other engineers.\nWrite unit, integration, and end-to-end tests to verify functionality using automated testing frameworks such as Pytest.\nWork collaboratively with fellow software engineers to build features requested by business stakeholders.\nParticipate in Agile teams to develop, test, and debug complex data processing pipelines and data analysis applications using big data processing systems such as Apache Spark.\nDiagnose, debug, and perform root cause analysis of issues and defects.\nSkills and experience that will lead to success:\nExperience with a modern software programming language. We use Python, and Pyspark extensively.\nExperience with cloud systems like AWS or Azure\nExperience writing SQL queries \nStrong written and verbal communication skills\nEagerness to work in a highly-collaborative environment\nSome visualization experience such as Tableau or PowerBI is helpful"
196 | },
197 | {
198 | "motivation_app": "Wolfram alpha",
199 | "instruction": "Provide a formula for computing the nth term in the given sequence",
200 | "input": "5, 14, 23, 32, 41, ..."
201 | },
202 | {
203 | "motivation_app": "Quora",
204 | "instruction": "Choose an appealing title for your post.",
205 | "input": "The typical avocado is over 300 calories from the oil in it. That\u2019s the amount of calories in a large candy bar. If you get enough exercise to eat a large candy bar every day without gaining weight, it wouldn\u2019t be a problem to eat an avocado every day. Other wise you should probably eat them sparingly."
206 | },
207 | {
208 | "motivation_app": "Spotify",
209 | "instruction": "List a few popular songs from the given album.",
210 | "input": "Back To Black"
211 | },
212 | {
213 | "motivation_app": "Messenger",
214 | "instruction": "Respond to the last text in the chat using the correct emojis to convey your feelings.",
215 | "input": "+ Hey, how did your exams go?\n- They were alright. I'm just glad they're over.\n+ Yeah, I know how you feel. I'm just glad I don't have to worry about them anymore."
216 | },
217 | {
218 | "motivation_app": "merriam-webster.com",
219 | "instruction": "Discuss the origins and history of the word that has been given to you.",
220 | "input": "oblique"
221 | },
222 | {
223 | "motivation_app": "National Geographic",
224 | "instruction": "You can easily make your holidays more eco-friendly by following these simple steps:",
225 | "input": ""
226 | },
227 | {
228 | "motivation_app": "Grammarly",
229 | "instruction": "Rewrite the text and correct the spelling errors.",
230 | "input": "It solves problems comon and uniqe to every team."
231 | },
232 | {
233 | "motivation_app": "Netflix",
234 | "instruction": "The story of a movie has been described using many emojis. You need to guess the name of the movie based on the emojis.",
235 | "input": ""
236 | },
237 | {
238 | "motivation_app": "Gmail",
239 | "instruction": "Decide if the given email belongs to the Promotions or Social category.",
240 | "input": "Subject: You have an invitation\nEmail: Hi, I'd like to join your LinkedIn network."
241 | },
242 | {
243 | "motivation_app": "Twitter",
244 | "instruction": "You are given a tweet and you should decide whether it's offensive or not.",
245 | "input": "She has certainly taken some heat for being such an....well idiot."
246 | },
247 | {
248 | "motivation_app": "Weather",
249 | "instruction": "Please provide us with a list of the best conservation starter questions related to the weather.",
250 | "input": ""
251 | },
252 | {
253 | "motivation_app": "Blogger",
254 | "instruction": "Write a section for a blog post and try to cover all of the provided information about this section in your text.",
255 | "input": "Blog Topic: 7 Fall Fashion Trends Worth Taking on Your Trip\nSection Title: Trusty Jeans\nMain point of the section: jeans can be worn all year and look good with everything ."
256 | },
257 | {
258 | "motivation_app": "Twitter",
259 | "instruction": "Write a social media post about the call for collaboration on a crowdsourcing project in a persuasive way.",
260 | "input": ""
261 | },
262 | {
263 | "motivation_app": "Amazon",
264 | "instruction": "Write a review based on the given information.",
265 | "input": "- Product: Persil Discs Laundry Detergent Pacs, Original Scent, High Efficiency (HE) Compatible, Laundry Soap, 62 Count\n- Sentiment: Positive"
266 | },
267 | {
268 | "motivation_app": "ludwig.guru",
269 | "instruction": "Think of alternatives and paraphrases for the underlined word.",
270 | "input": "what we have _expected"
271 | },
272 | {
273 | "motivation_app": "Play Store",
274 | "instruction": "Provide an ESRB rating for the following game.",
275 | "input": "This is an action-adventure game in which players help Kratos and his son on a dangerous quest. Players explore various realms and engage in frenetic hand-to-hand combat against human-like raiders and fantastical creatures (e.g., centaurs, trolls, dragons). Players use axes and chained blades to battle enemies, often resulting in large blood-splatter effects and dismemberment. Players can perform finishing attacks that depict close-up impalement via bladed and hand-held weapons; repeated axe strikes to a creature's neck results in decapitation. The words \u201cf**k\u201d and \u201csh*t\u201d are heard in the game."
276 | },
277 | {
278 | "motivation_app": "tripadvisor.com",
279 | "instruction": "Name the top cities in France that should not be missed. Include the best aspects of each place as well.",
280 | "input": ""
281 | },
282 | {
283 | "motivation_app": "Yelp",
284 | "instruction": "Categorize the Business into one of the Restaurants, Home Services, Auto Services and Miscellaneous based on its Specialties.",
285 | "input": "Call us at 650-636-4884 or visit our website to receive a quote. This shop specializes in New Tires and General Auto Repair. We carry all tires in-house and have a wide range to fit any budget or car specifics. If you are unsure what tires you need there are experts on hand and available to help you pick the best tire for your needs. We also carry commercial vehicle tires and can serve a wide array of fleets."
286 | },
287 | {
288 | "motivation_app": "Goodreads",
289 | "instruction": "Think of a motivational quote that you have read in a book. Try to keep it positive and sweet.",
290 | "input": ""
291 | },
292 | {
293 | "motivation_app": "Coursera",
294 | "instruction": "List the concepts that should be learned before approaching the given complex concept.",
295 | "input": "Deep Learning"
296 | },
297 | {
298 | "motivation_app": "Netflix",
299 | "instruction": "Think of some series or movies that would be enjoyable for someone who liked the given series.",
300 | "input": "Squid Game"
301 | },
302 | {
303 | "motivation_app": "Yelp",
304 | "instruction": "Give the provided brand a motto to use as a part of its marketing strategies.",
305 | "input": "The Ice cream shop is a small, quaint shop located in the heart of the historic district. They offer a variety of ice cream flavors, as well as Italian ice, gelato, and sorbet. The shop has a few tables and chairs, but most of their business is to-go. They offer a loyalty program where you can get a free scoop of ice cream after 10 purchases."
306 | },
307 | {
308 | "motivation_app": "Grammarly",
309 | "instruction": "Expand the given sentence and give it more details and depth.",
310 | "input": "It changed my life."
311 | },
312 | {
313 | "motivation_app": "Messenger",
314 | "instruction": "Currently, there is a chat in which people are arranging a meeting in the future. Retrieve the meeting information from the conversation.",
315 | "input": "Person 1: Hey, when are you free? I'd like to meet up.\nPerson 2: I'm free tomorrow afternoon.\nPerson 1: Great, what time?\nPerson 2: How about 3 pm?\nPerson 1: I have another meeting at that time. what about 5 pm?\nPerson 2: That works. Where should we meet?\nPerson 1: How about the coffee shop on Main Street?\nPerson 2: Sounds good. I've heard they have great coffee there. See you tomorrow!"
316 | },
317 | {
318 | "motivation_app": "Indeed",
319 | "instruction": "Provide a list of the skills that may help you find the job.",
320 | "input": "Crew Members"
321 | },
322 | {
323 | "motivation_app": "ESPN",
324 | "instruction": "Using a table, compare the career overviews of the given players in Major League Baseball. Use \"|\" for separating the columns in the table.",
325 | "input": "Derek Jeter, Albert Pujols"
326 | },
327 | {
328 | "motivation_app": "Wikipedia",
329 | "instruction": "An evaluation of the article's quality should be carried out. In order to do so, you should review the quality of the writing and the explanation of the topic.",
330 | "input": "The 20th century saw a revolution in music listening as the radio gained popularity worldwide and new media and technologies were developed to record, edit and distribute music. Music performances became increasingly visual with the broadcast and recording of performances.\n20th-century music brought new freedom and wide experimentation with new musical styles and forms that challenged the accepted rules of music of earlier periods. The invention of musical amplification and electronic instruments, especially the synthesizer, in the mid-20th century revolutionized classical and popular music, and accelerated the development of new forms of music."
331 | },
332 | {
333 |     "motivation_app": "Duolingo",
334 | "instruction": "Answer the following question.",
335 |     "input": "How do you say \"good evening\" in French?"
336 | },
337 | {
338 | "motivation_app": "Indeed",
339 |     "instruction": "Based on the information provided, you need to estimate the average salary for the given job.",
340 | "input": "Data entry clerk in United States"
341 | },
342 | {
343 | "motivation_app": "merriam-webster.com",
344 | "instruction": "Create alliterations by finding synonyms for words in the given sentence.",
345 | "input": "David wears a hat everyday."
346 | },
347 | {
348 | "motivation_app": "Instagram",
349 | "instruction": "Create a captivating Instagram caption based on the provided details. Try to think of as many captions as you can.",
350 | "input": "I am sharing images of the first snow of the year taken from a window in an office"
351 | },
352 | {
353 | "motivation_app": "ludwig.guru",
354 | "instruction": "During writing, we added an asterisk for the word that did not come to mind. You will need to provide several examples to demonstrate all the words that can be used in the sentence instead of the asterisk.",
355 | "input": "we * to know"
356 | },
357 | {
358 | "motivation_app": "YouTube",
359 | "instruction": "Using the provided topic as a starting point, brainstorm ideas for videos that can be made about it on YouTube.",
360 | "input": "In spite of the fact that procrastination feels bad to you, why do you do it?"
361 | },
362 | {
363 | "motivation_app": "Spotify",
364 | "instruction": "Choose Spotify playlists that match the given mood. Provide a link to these playlists for easy access.",
365 | "input": "Feeling Good"
366 | },
367 | {
368 | "motivation_app": "Jira",
369 | "instruction": "Write a Jira ticket for the given task.",
370 | "input": "New Employee onboarding"
371 | },
372 | {
373 | "motivation_app": "Sudoku",
374 | "instruction": "Design a medium-level sudoku puzzle.",
375 | "input": ""
376 | },
377 | {
378 | "motivation_app": "MS Excel",
379 | "instruction": "We have entered the home supplies budget in the following table. Calculate the last column with the given function and rewrite the completed table.\nNote that, columns in a row are separated using a comma, and rows are separated using a \"|\".",
380 | "input": "Item, Qty, Price, Subtotal | mango, 5, 500, ? | apple, 4, 150, ? | oil, 2, 1500, ? \nFunction: column2 * column 3"
381 | },
382 | {
383 | "motivation_app": "Netflix",
384 | "instruction": "Give examples of popular shows and movies in the genre.",
385 | "input": "Genre: Crime TV Shows"
386 | },
387 | {
388 | "motivation_app": "Spotify",
389 | "instruction": "Suggest some Audiobooks for First-Time Listeners. You can attract them by choosing popular works.",
390 | "input": ""
391 | },
392 | {
393 | "motivation_app": "tripadvisor.com",
394 | "instruction": "Make a questionnaire to help hotel guests write hotel reviews.",
395 | "input": ""
396 | },
397 | {
398 | "motivation_app": "https://cohere.ai/",
399 | "instruction": "Take a look at the contract and extract the parties of the agreement from it.",
400 | "input": "As of the 13th day of December 2021, this Music Recording Agreement (\"Agreement\") is made between Good Kid, a Toronto-based musical group (\"Artist\"), and Universal Music Group, a record label with license number 545345 (\"Record Label\"). The Artist and the Recording Label may be referred to in this Agreement individually as \"Parties\" and collectively as \"Parties.\" Work under this Agreement will begin on March 15, 2022, and will last for the duration of the Agreement."
401 | },
402 | {
403 | "motivation_app": "IMDB",
404 | "instruction": "The movie tagline is the film's advertising slogan, usually printed on posters prior to the official release. It is a short description or comment on a movie to capture the essence of the movie, and ultimately make you watch the movie. Here is an example:",
405 | "input": "Movie: Joker (2019)"
406 | },
407 | {
408 | "motivation_app": "instructables",
409 | "instruction": "Make a list of the materials that will be required to build the given tool.",
410 | "input": "Coffee Darkness Meter"
411 | },
412 | {
413 |     "motivation_app": "Duolingo",
414 | "instruction": "This is a test for the reading ability of French. Read the given story carefully, and then answer the question in the end.",
415 | "input": "Lucie est \u00e0 la maison avec sa petite-fille, Lin.\nLucie: Oh, non! Nous n'avons pas de pain!\nLin: Ah! Tu vas au supermarch\u00e9?\nLucie: Oui, je veux une baguette pour mon sandwich.\nDo Lin and Lucie have a lot of bread?"
416 | },
417 | {
418 | "motivation_app": "Instagram",
419 | "instruction": "Create a detailed caption for an Instagram post about a discount. A proper caption should explain the product, offer the discount, and tell the user how to claim it.",
420 | "input": "Product: Golang course\nProduct description: teaching the foundational skills for building cloud services, web applications, DevOps tools, and backend development\nDiscount: 50%\nHow to claim: Use the coupon code: GOLANG"
421 | },
422 | {
423 | "motivation_app": "tripadvisor.com",
424 | "instruction": "Please let me know your thoughts on the given place and why you think it deserves to be visited.",
425 | "input": "Barcelona, Spain"
426 | },
427 | {
428 | "motivation_app": "Reddit",
429 | "instruction": "Provide a pleasant compliment based on the quality you have been given.",
430 | "input": "Truthfulness"
431 | },
432 | {
433 | "motivation_app": "Messenger",
434 | "instruction": "The given text must be classified as offensive or not offensive.",
435 | "input": "You acted really crazy last night."
436 | },
437 | {
438 | "motivation_app": "Gmail",
439 | "instruction": "If you could help me write an email to my friends inviting them to dinner on Friday, it would be greatly appreciated.",
440 | "input": ""
441 | },
442 | {
443 | "motivation_app": "IMDB",
444 | "instruction": "Decide whether children can watch the given movie. Mention what makes it inappropriate if you think they can't.",
445 | "input": "The Dark Knight (2008)"
446 | },
447 | {
448 | "motivation_app": "LinkedIn",
449 | "instruction": "You will be asked to answer the following question as part of the LinkedIn Skill assessments.",
450 | "input": "You find that your project has a tag and branch both named push-notifications, which causes confusion when trying to print out given reference. How can you specify which branch you want to look at?"
451 | },
452 | {
453 | "motivation_app": "Netflix",
454 | "instruction": "Give a brief description of the given category of movies and shows.",
455 | "input": "Period Dramas"
456 | },
457 | {
458 | "motivation_app": "Instagram",
459 | "instruction": "You are given a topic for an Instagram post. Help the post reach a broader audience by suggesting hashtags related to the post.",
460 | "input": "Another episode of women in science is out now"
461 | },
462 | {
463 | "motivation_app": "Grammarly",
464 | "instruction": "Notify me of any suggestions you may have for making the text more grammatically correct.",
465 | "input": "This is the second time I've been here, and the vibes, and the food were exquisite."
466 | },
467 | {
468 | "motivation_app": "LinkedIn",
469 |     "instruction": "Design a skill assessment questionnaire for R (Programming Language).",
470 | "input": ""
471 | },
472 | {
473 | "motivation_app": "Gmail",
474 | "instruction": "Prepare an email signature template for the given person or company.",
475 | "input": "a graduate student of computer science"
476 | },
477 | {
478 | "motivation_app": "YouTube",
479 | "instruction": "Choosing a name for your product or business YouTube channel is an important part of the process. Based on the description of the product or business, you should come up with some interesting names. Take some time to brainstorm your ideas.",
480 | "input": "Here you will find videos and content that will help students prepare for the application process to graduate schools as well as how to apply to graduate schools"
481 | },
482 | {
483 | "motivation_app": "Google Search",
484 | "instruction": "Come up with some search queries on google about coding stuff.",
485 | "input": ""
486 | },
487 | {
488 | "motivation_app": "IMDB",
489 | "instruction": "Give a brief summary of the movie or series' plot.",
490 | "input": "Breaking Bad (TV Series 2008\u20132013)"
491 | },
492 | {
493 | "motivation_app": "Grammarly",
494 | "instruction": "The sentence you are given might be too wordy, complicated, or unclear. Rewrite the sentence and make your writing clearer by keeping it concise. Whenever possible, break complex sentences into multiple sentences and eliminate unnecessary words.",
495 | "input": "If you have any questions about my rate or if you find it necessary to increase or decrease the scope for this project, please let me know."
496 | },
497 | {
498 | "motivation_app": "CNN News",
499 | "instruction": "You are given a description that provides a set of facts or a scenario. It is up to you to craft a story from these facts and scenarios. The missing pieces must be filled in with imaginative but logical information.",
500 | "input": "Ten European football teams \u2013 the Netherlands, England, Belgium, Denmark, France, Germany, Norway, Sweden, Switzerland and Wales \u2013 will participate in a season-long \u201cOneLove\u201d campaign promoting inclusion and opposing discrimination."
501 | },
502 | {
503 | "motivation_app": "yelp",
504 | "instruction": "React properly to reviews from your customers. In your responses, you can highlight your business' policies and customer service.",
505 | "input": "Another repeat visit and no disappointment from us. Great place for breakfast or brunch - and you won't need to eat again for awhile. The restaurant is close to the Tenderloin so be aware of your surroundings.\nWe enjoyed the Succotash Omelet (at least my wife enjoyed it), the Pulled Pork Benedict and the Hangtown Fry. Portions are large and pricing is fair. I wish we could have managed a Beignet Flight!\nDefinitely a spot to seek out. It can be crowed at times so be patient. Well worth any wait."
506 | },
507 | {
508 | "motivation_app": "Quora",
509 | "instruction": "Create a list of subtopics for the given topic.",
510 | "input": "Music theory"
511 | },
512 | {
513 | "motivation_app": "YouTube",
514 | "instruction": "The topic of a YouTube post has been described and based on the information, you need to write a hook for starting the post. A catchy hook will keep your readers interested so they keep reading. It would be great if you could come up with as many hooks as you can.",
515 | "input": "A video showing how to make a tasty cup of coffee"
516 | },
517 | {
518 | "motivation_app": "IMDB",
519 | "instruction": "Write a short analysis of the cinematography in the movie.",
520 | "input": "Dead Poets Society"
521 | },
522 | {
523 | "motivation_app": "https://cohere.ai/",
524 | "instruction": "Correct the transcription of an excerpt containing errors.",
525 | "input": "I got got charged interest on ly credit card but I paid my pull balance one day due date. I not missed a pavement year yet. Man you reverse the interest charge?"
526 | },
527 | {
528 | "motivation_app": "IMDB",
529 |     "instruction": "You are given some reviews for a movie. Each review might have mentioned some negative or positive points about the movie. Read them carefully and extract the positive and negative points mentioned in all of the reviews. Then, make a list of these points and summarize them as positive points and negative points.",
530 | "input": "Review 1: I'm not sure if it's just the fact that Ed Norton really grates on me in this film, but I never really bought into the idea of Fight Club being some kind of cultural phenomenon. Yes it's an innovative story, told via a strong script, engaging characters and intense cinematography. But is it really worthy of such high praise? Personally, I find it starts to fall apart halfway through and actually becomes quite tedious towards the end. Everything just becomes a little bit childish and contrived as the story loses touch with its base elements and the rage against the machine vibe takes over. I have huge respect for the way this film was made but, much like The Matrix, I feel like people attach way more importance to this movie than it deserves.\nReview 2: The film tells the story of men who cannot adapt to the social order in today's world, under the leadership of Tyler Durden, to discover their natural instinct, masculinity, and to escape from their unhappy lives.\nReview 3: Despite a good theme, great acting and important messages that this movie convey in an unorthodox way, I think it fails to connect the audience with the storyline and leaves him in a world of confusion. Although, majority of reviews find this movie entertaining and interesting, yet I would choose to be a minority that believes that this movie is extremely overrated.\nReview 4: Is creating chaos, explosions and membership in a militant sect the way to get rid of material shackles ?! The visual effects are fascinating and the story is compelling until the second half. Unfortunately, it doesn't end well."
531 | },
532 | {
533 | "motivation_app": "National Geographic",
534 | "instruction": "Give students tips on how to keep their nerves under control during class presentations.",
535 | "input": ""
536 | },
537 | {
538 | "motivation_app": "IMDB",
539 | "instruction": "My favorite witty review of The Rings of Power series is this:",
540 | "input": ""
541 | },
542 | {
543 | "motivation_app": "Tasty",
544 | "instruction": "Provide a cooking hack for improving the flavor of the given food.",
545 | "input": "popcorn"
546 | },
547 | {
548 | "motivation_app": "MS Excel",
549 | "instruction": "Please write the Excel function name associated with each description.",
550 | "input": "- Returns the number of days between two dates\n- Returns the starting position of a text string within another text string.\n- Returns the number in the middle of the set of given numbers"
551 | },
552 | {
553 | "motivation_app": "Netflix",
554 | "instruction": "Come up with an interesting idea for a new movie plot. Your plot should be described with a title and a summary.",
555 | "input": ""
556 | },
557 | {
558 | "motivation_app": "MS Excel",
559 | "instruction": "I bought two shirts from the store and each one cost me $10. I also bought a pair of pants and they cost me $20. Then, I bought a dress and it cost me $30. Also, I bought a pair of shoes and they cost me $40. Lastly, I bought a jacket and it cost me $50.\n\nMake a table chart showing items, the number, and the price of these items. Separate columns with \"|\".",
560 | "input": ""
561 | },
562 | {
563 | "motivation_app": "(Wolfram alpha)?",
564 | "instruction": "Look for poems that mention the given object or character. The names of the poems and their poets should be written.",
565 | "input": "santa claus"
566 | },
567 | {
568 | "motivation_app": "https://abcnotation.com/",
569 | "instruction": "You will need to compose the ABC notation for a given song.",
570 | "input": "The South Wind"
571 | },
572 | {
573 | "motivation_app": "Yelp",
574 | "instruction": "Based on the given keywords, write a review of a restaurant.",
575 | "input": "Name: Pizzeria\ngood prices, quiet place, tasty, nice staff"
576 | },
577 | {
578 | "motivation_app": "Wysa",
579 | "instruction": "Develop a mental exercise that can help people manage their anxiety and explain how it works.",
580 | "input": ""
581 | },
582 | {
583 | "motivation_app": "Tasty",
584 | "instruction": "Suggest a recipe for a vegan dessert.",
585 | "input": ""
586 | },
587 | {
588 | "motivation_app": "https://cohere.ai/",
589 | "instruction": "Classify the questions in the FAQ into Finding policy details, Change account settings, Filing a claim and viewing status, or Cancelling coverage.",
590 | "input": "Could you deposit money into my account rather than mailing me a physical cheque?"
591 | },
592 | {
593 | "motivation_app": "Google Search",
594 | "instruction": "You are given a search query and a document. Classify whether the document is relevant to the search query or not relevant.",
595 | "input": "Search: why sky is blue\nDocument: The Short Answer: Sunlight reaches Earth's atmosphere and is scattered in all directions by all the gases and particles in the air. Blue light is scattered more than the other colors because it travels as shorter, smaller waves. This is why we see a blue sky most of the time."
596 | },
597 | {
598 | "motivation_app": "Grammarly",
599 | "instruction": "Desk jobs require writing a lot of emails, so it isn't surprising we get tired of repeating ourselves. Come up with several synonyms for the given word.",
600 | "input": "Sincerely"
601 | },
602 | {
603 | "motivation_app": "IMDB",
604 | "instruction": "A list of all movies that meet the criteria given should be compiled.",
605 | "input": "movies directed by Spike Lee by release date"
606 | },
607 | {
608 | "motivation_app": "CNN News",
609 | "instruction": "Give the news title a category. Pick a category from the list of News & Buzz, Travel, Style, Arts & Culture, Politics, Tech, and Science & Health.",
610 | "input": "The #Banksy Exhibit in Cambridge, MA is absolutely terrific."
611 | },
612 | {
613 | "motivation_app": "Gmail",
614 | "instruction": "You need to write an email to negotiate your salary.",
615 | "input": ""
616 | },
617 | {
618 | "motivation_app": "Quora",
619 | "instruction": "Give some examples of what people usually say in the given social situation.",
620 | "input": "when someone arrives safely"
621 | },
622 | {
623 | "motivation_app": "Goodreads",
624 | "instruction": "Choose the best books from the given genre.",
625 | "input": "Crime & Mystery"
626 | },
627 | {
628 | "motivation_app": "Amazon",
629 | "instruction": "Compare the given item with similar products based on its properties such as price, rating, etc. For product comparison, use a table and separate the columns with \"|\".",
630 | "input": "Item: iBayam Journal Planner Pens Colored Pens Fine Point Markers Fine Tip Drawing Pens Porous Fineliner Pen for Journaling Writing Note Taking Calendar Coloring Art Office Back to School Supplies, 18 Color"
631 | },
632 | {
633 | "motivation_app": "Indeed",
634 | "instruction": "Describe the responsibilities of the given job.",
635 | "input": "Security Officer"
636 | },
637 | {
638 | "motivation_app": "Yelp",
639 | "instruction": "Predict how many stars the author will give to the restaurant from a Yelp review.",
640 | "input": "The reviews were great, but honestly i felt it was just ok. Seemed like a typical tourist spot were the food was mediocre. The service was ok, not great, but not the worst.\nThe bread was very good. I ordered the muscle fries. They had a sweet Vidalia onion flavor. No need for fries bc they were soggy wo much flavor. My boyfriend ordered a pesto chicken pasta and it was again mediocre."
641 | },
642 | {
643 | "motivation_app": "merriam-webster.com",
644 | "instruction": "Define what the underlined word means for kids.",
645 | "input": "_keep a promise"
646 | },
647 | {
648 | "motivation_app": "merriam-webster.com",
649 | "instruction": "Write down antonyms for the given word.",
650 | "input": "laureating"
651 | },
652 | {
653 | "motivation_app": "Wikipedia",
654 |     "instruction": "Make the article available in a second language by translating it into that language.",
655 | "input": "Dentistry, also known as dental medicine and oral medicine, is the branch of medicine focused on the teeth, gums, and mouth. It consists of the study, diagnosis, prevention, management, and treatment of diseases, disorders, and conditions of the mouth, most commonly focused on dentition (the development and arrangement of teeth) as well as the oral mucosa. Dentistry may also encompass other aspects of the craniofacial complex including the temporomandibular joint. The practitioner is called a dentist.\nTranslate to French:"
656 | },
657 | {
658 | "motivation_app": "Wolfram alpha",
659 | "instruction": "Based on the facts that have been provided, prove the following statement.",
660 | "input": "Statement: \"For every integer k, k^2 + 2k + 1 \u2265 0\"\nFact: The square of any real number is non-negative."
661 | },
662 | {
663 | "motivation_app": "Messenger",
664 | "instruction": "Change the response to have a more empathic tone in the chat.",
665 | "input": "+ How are you doing?\n- Fine, I had a bunch of problems today.\n+ What kind of problems?"
666 | },
667 | {
668 | "motivation_app": "https://cohere.ai/",
669 | "instruction": "Describe the content of the article in a brief manner.",
670 | "input": "A study published earlier this year by Zee and her team examined the role of light in sleep for healthy adults in their 20s. Sleeping for only one night with a dim light, such as a TV set with the sound off, raised the blood sugar and heart rate of the young people during the sleep lab experiment. An elevated heart rate at night has been shown in prior studies to be a risk factor for future heart disease and early death, while higher blood sugar levels are a sign of insulin resistance, which can ultimately lead to type 2 diabetes."
671 | },
672 | {
673 | "motivation_app": "merriam-webster.com",
674 | "instruction": "Come up with words that rhyme with the given word.",
675 | "input": "instruct"
676 | },
677 | {
678 | "motivation_app": "Messenger",
679 | "instruction": "The last message in the chat is a partial response. You should complete and rewrite it.",
680 | "input": "+ What are going to do now?\n- I don't know. I had to give up my dream of being a mathematician.\n+ Why"
681 | },
682 | {
683 | "motivation_app": "Facebook",
684 | "instruction": "Take the product description and write a creative ad for it.",
685 | "input": "Document Cameras allow teachers to display documents, books, or other materials for their students to see."
686 | },
687 | {
688 | "motivation_app": "Wikipedia",
689 | "instruction": "Summarize the article you have been given in a brief manner.",
690 | "input": "Mathematics and art are related in a variety of ways. Mathematics has itself been described as an art motivated by beauty. Mathematics can be discerned in arts such as music, dance, painting, architecture, sculpture, and textiles. This article focuses, however, on mathematics in the visual arts.\nMathematics and art have a long historical relationship. Artists have used mathematics since the 4th century BC when the Greek sculptor Polykleitos wrote his Canon, prescribing proportions conjectured to have been based on the ratio 1:\u221a2 for the ideal male nude. Persistent popular claims have been made for the use of the golden ratio in ancient art and architecture, without reliable evidence. In the Italian Renaissance, Luca Pacioli wrote the influential treatise De divina proportione (1509), illustrated with woodcuts by Leonardo da Vinci, on the use of the golden ratio in art. Another Italian painter, Piero della Francesca, developed Euclid's ideas on perspective in treatises such as De Prospectiva Pingendi, and in his paintings. The engraver Albrecht D\u00fcrer made many references to mathematics in his work Melencolia I. In modern times, the graphic artist M. C. Escher made intensive use of tessellation and hyperbolic geometry, with the help of the mathematician H. S. M. Coxeter, while the De Stijl movement led by Theo van Doesburg and Piet Mondrian explicitly embraced geometrical forms. Mathematics has inspired textile arts such as quilting, knitting, cross-stitch, crochet, embroidery, weaving, Turkish and other carpet-making, as well as kilim. In Islamic art, symmetries are evident in forms as varied as Persian girih and Moroccan zellige tilework, Mughal jali pierced stone screens, and widespread muqarnas vaulting."
691 | },
692 | {
693 | "motivation_app": "Amazon",
694 | "instruction": "Suggest some product bundles that are often purchased with a given product.",
695 | "input": "MacBook Air"
696 | },
697 | {
698 | "motivation_app": "Grammarly",
699 | "instruction": "You should capitalize the sentence according to the guide.",
700 | "input": "Guide: Every other letter alternates between lower case and upper case.\nSentence: A giant spider blocks your path."
701 | },
702 | {
703 | "motivation_app": "merriam-webster.com",
704 | "instruction": "Enter the words that satisfy the given condition.",
705 | "input": "5 Countries that Start with S"
706 | },
707 | {
708 | "motivation_app": "LinkedIn",
709 | "instruction": "Write a LinkedIn post to announce that you have accepted a new job offer.",
710 | "input": ""
711 | },
712 | {
713 | "motivation_app": "Grammarly",
714 | "instruction": "Rewrite the given text and correct grammar, spelling, and punctuation errors.",
715 | "input": "If you'd told me year ago that today I would finish a marathon, I would of laughed. Your support had a huge affect on me!"
716 | },
717 | {
718 | "motivation_app": "Tasty",
719 | "instruction": "Provide a name for the dish given the ingredients and instructions.",
720 | "input": "INGREDIENTS:\n2 (5 oz) cans Bumble Bee\u00ae Solid White Albacore Tuna, drained\n1 avocado\n2 Tbsp Sriracha\n1 Tbsp Dijon mustard\n2 to 3 Tbsp celery, chopped\n2 Tbsp red onion, chopped\n2 green onions, chopped\n1 Tbsp fresh cilantro, chopped\nSalt and pepper, to taste\n2 heaping cups leafy green lettuce\n1 cup matchstick carrots\n4 (10 inch) whole wheat tortillas\nINSTRUCTIONS:\nIn a medium bowl, mash together tuna and avocado until combined. Add in the rest of the ingredients through the salt and pepper, mixing well.\nTo assemble, top each tortilla with a 1/2 cup leafy greens, 1/4 cup matchstick carrots and divide the tuna mixture evenly among the wraps. Tightly roll up the tortilla, slice and enjoy!"
721 | },
722 | {
723 | "motivation_app": "Wolfram alpha",
724 | "instruction": "Solve this equation.",
725 | "input": "x^3 - 4x^2 + 6x - 24 = 0"
726 | },
727 | {
728 | "motivation_app": "Twitter",
729 | "instruction": "Write a good Twitter Bio. Try to cover all the provided main points in your generated bio.",
730 | "input": "Name: Jane\nMain points:\n- Architect\n- Interested in Tech\nWritten in first person\nUse casual tone"
731 | },
732 | {
733 | "motivation_app": "Twitter",
734 | "instruction": "We need to write a thematic tweet to share company news and facts. Please take a careful look at the facts and details provided and create a tweet based on them.",
735 | "input": "- news: Applications for summer internships are open now\n- Start with a thematic tie-in to Halloween"
736 | },
737 | {
738 | "motivation_app": "merriam-webster.com",
739 | "instruction": "Find synonyms for the given word. You need to write down how the provided synonyms differ from the original word in terms of meaning, usage, etc.",
740 | "input": "adversity"
741 | },
742 | {
743 | "motivation_app": "Tasty",
744 | "instruction": "Make a list of snacks and foods to serve as party snacks on a game day!",
745 | "input": ""
746 | },
747 | {
748 | "motivation_app": "Grammarly",
749 |     "instruction": "Change the first person to the third person in the given sentence. The meaning should be kept, but you can paraphrase it or expand it in order to have better prose.",
750 | "input": "We were recently able to increase the amount of stock we hold with the same supplier thereby reducing our risk."
751 | },
752 | {
753 | "motivation_app": "Redfin",
754 | "instruction": "Review the property description and tell us what is good and what is bad about it and list its key takeaways.",
755 | "input": "There's real magic in combining inherently compelling Victorian architecture with a modern aesthetic. Add in an extraordinary multi-level garden & the result is the undeniable elegance of 444 29th St. The front of the home is light-filled w/ a seamless connection between living, dining & kitchen. Newly updated kitchen w/ quartz counters & upgraded appliances. A lg dining rm has a bay of windows &, when paired w/ the adjacent living rm, provides an ideal space to entertain. 2 lg bdrms on this level are at the rear. Both w/ lg closets & walk out to the garden. Stylishly remodeled full bath on this level. 3rd bdrm/2nd full bath are located on the ground floor - perfect as a guest rm, home office, fitness area, etc. The enormous garden occupies an extra deep lot & is beautifully landscaped & terraced. Laundry/storage rm, 1-car garage pkg, Tesla solar panels. Prime, flat part of Noe Valley - short walk to all that Noe has to offer. Steps from the J-Church & convenient to shuttles/freeways."
756 | },
757 | {
758 | "motivation_app": "Amazon",
759 |         "instruction": "Take the title of the product and extract its attributes. The attributes in this case refer to the characteristics of the products, such as their brand and color. Your answer must be a list following the format of \"attribute: value\".",
760 | "input": "SAMSUNG 980 PRO SSD 2TB PCIe NVMe Gen 4 Gaming M.2 Internal Solid State Hard Drive Memory Card, Maximum Speed, Thermal Control, MZ-V8P2T0B"
761 | },
762 | {
763 | "motivation_app": "Coursera",
764 | "instruction": "Please list the courses that someone interested in the first course might like.",
765 | "input": "Bitcoin and Cryptocurrency Technologies"
766 | },
767 | {
768 | "motivation_app": "ESPN",
769 | "instruction": "Create a table listing all games that meet the specified criteria in the National Football League. Use the season, local time, game, and score as columns of the table.",
770 | "input": "Ravens home games in 2011"
771 | },
772 | {
773 | "motivation_app": "sth related to real estate?",
774 | "instruction": "Write an engaging and well-written property listing description for selling a house. Address of the house and some of the details are given to you. Fill in the information gap with hallucinations if needed.",
775 | "input": "Property Address: 412 Monterey Ave, Capitola, CA 95010\nParking: 3 cars + a finished single car garage\nDetails:\n- open floorplan\n- window with views to park/mountains\n- Kitchen with shaker cabinetry and white Calcatta quartz counters"
776 | },
777 | {
778 | "motivation_app": "Reddit",
779 |         "instruction": "Explain the meaning of the given phrase in simple terms. Use an example if possible.",
780 | "input": "\"With a little give in them\""
781 | },
782 | {
783 | "motivation_app": "Telegram",
784 | "instruction": "Suggest some names for a friendly group in telegram.",
785 | "input": ""
786 | },
787 | {
788 | "motivation_app": "Gmail",
789 | "instruction": "A confirmation email should be written appropriately for the situation.",
790 | "input": "A meeting has been scheduled, and the sender expects the other to review the slides."
791 | },
792 | {
793 | "motivation_app": "Wikipedia",
794 | "instruction": "Please answer the following question based on the information provided in the article.",
795 | "input": "Development studies is an interdisciplinary branch of social science. Development studies is offered as a specialized master's degree in a number of reputed universities around the world. It has grown in popularity as a subject of study since the early 1990s, and has been most widely taught and researched in developing countries and countries with a colonial history, such as the UK, where the discipline originated. Students of development studies often choose careers in international organisations such as the United Nations, World Bank, non-governmental organisations (NGOs), media and journalism houses, private sector development consultancy firms, corporate social responsibility (CSR) bodies and research centers.\nWhat are some popular careers for students of development studies?"
796 | },
797 | {
798 | "motivation_app": "Blogger",
799 | "instruction": "We've already decided on the title and topic for our blog post. I would appreciate it if you wrote an introduction paragraph to attract the readers' attention.",
800 | "input": "Blog title: Healthy Apple Muffins\nBlog Topic: recipe and tips for cooking a good apple muffin"
801 | },
802 | {
803 | "motivation_app": "National Geographic",
804 | "instruction": "Create a daily itinerary based on the given information.",
805 | "input": "Our family is looking for a 9-day Morocco trip that has light to moderate activity levels."
806 | },
807 | {
808 | "motivation_app": "Reddit",
809 | "instruction": " ",
810 | "input": "Truthfulness"
811 | },
812 | {
813 | "motivation_app": "LinkedIn",
814 | "instruction": "Make a list of desirable Skills for software engineers to add to LinkedIn.",
815 | "input": ""
816 | },
817 | {
818 | "motivation_app": "Gmail",
819 | "instruction": "Considering the reason for the rejection, write a polite rejection letter for rejecting a candidate.",
820 | "input": "Reason: You came in second"
821 | },
822 | {
823 | "motivation_app": "Google Docs",
824 | "instruction": "Make a list of common phrases for the given section of the paper.",
825 | "input": "Introduction"
826 | },
827 | {
828 | "motivation_app": "(Wolfram alpha)?",
829 | "instruction": "You will be tested on your knowledge of classic witticisms and aphorisms by completing the given aphorism. Write the original quote as the answer.",
830 | "input": "two things are infinite"
831 | },
832 | {
833 | "motivation_app": "Coursera",
834 | "instruction": "Design a syllabus for the given course. Students should be given a list of the chapters with brief explanations of each chapter's purpose.",
835 | "input": "Programming for Everybody (Getting Started with Python)"
836 | },
837 | {
838 | "motivation_app": "Tasty",
839 | "instruction": "Using the ingredients that are available, create a dinner recipe.",
840 | "input": "Kale + almond butter + apple cider vinegar + agave nectar + tamari"
841 | },
842 | {
843 | "motivation_app": "Messenger",
844 | "instruction": "For answering the chat, think of a funny response you can use.",
845 | "input": "+ how are you?"
846 | },
847 | {
848 | "motivation_app": "Google Sheet",
849 | "instruction": "Make a list of the pros and cons of the given decision.",
850 | "input": "Implementing a Remote Working Policy"
851 | }
852 | ]
--------------------------------------------------------------------------------
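Each entry in the testset above is a flat object with `motivation_app`, `instruction`, and `input` keys, where an empty `input` marks an instruction-only task. A minimal sketch (the schema check itself is hypothetical, not part of the repo) of validating entries before feeding them to the pipeline:

```python
import json

# Keys every testset entry is expected to carry.
REQUIRED_KEYS = {"motivation_app", "instruction", "input"}

def validate(entries):
    """Keep only entries that carry all three testset keys."""
    return [e for e in entries if REQUIRED_KEYS.issubset(e)]

sample = json.loads(
    '[{"motivation_app": "Tasty", "instruction": "Make a list of snacks.", "input": ""}]'
)
ok = validate(sample)
```

In practice the list would come from `json.load` on one of the files under `data/`.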
/figures/inst-tune-pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeOpenML/PandaLM/3871806e72b4832f815ecafd24d3503c73403f48/figures/inst-tune-pipeline.png
--------------------------------------------------------------------------------
/figures/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeOpenML/PandaLM/3871806e72b4832f815ecafd24d3503c73403f48/figures/logo.png
--------------------------------------------------------------------------------
/figures/main-figure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeOpenML/PandaLM/3871806e72b4832f815ecafd24d3503c73403f48/figures/main-figure.png
--------------------------------------------------------------------------------
/figures/pandalm-webui.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeOpenML/PandaLM/3871806e72b4832f815ecafd24d3503c73403f48/figures/pandalm-webui.png
--------------------------------------------------------------------------------
/pandalm/__init__.py:
--------------------------------------------------------------------------------
1 | from .utils import (
2 | EvaluationPipeline,
3 | CandidateBatchInferenceProvider,
4 | PandaLMBatchInferenceProvider,
5 | )
6 |
--------------------------------------------------------------------------------
/pandalm/assets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/WeOpenML/PandaLM/3871806e72b4832f815ecafd24d3503c73403f48/pandalm/assets/__init__.py
--------------------------------------------------------------------------------
/pandalm/assets/ds_config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 |
11 | "bf16": {
12 | "enabled": "auto"
13 | },
14 |
15 | "optimizer": {
16 | "type": "AdamW",
17 | "params": {
18 | "lr": "auto",
19 | "betas": "auto",
20 | "eps": "auto",
21 | "weight_decay": "auto"
22 | }
23 | },
24 |
25 | "zero_optimization": {
26 | "stage": 2,
27 | "offload_optimizer": {
28 | "device": "cpu",
29 | "pin_memory": true
30 | },
31 | "allgather_partitions": true,
32 | "allgather_bucket_size": 2e8,
33 | "overlap_comm": true,
34 | "reduce_scatter": true,
35 | "reduce_bucket_size": 2e8,
36 | "contiguous_gradients": true
37 | },
38 |
39 | "gradient_accumulation_steps": "auto",
40 | "gradient_clipping": "auto",
41 | "steps_per_print": 2000,
42 | "train_batch_size": "auto",
43 | "train_micro_batch_size_per_gpu": "auto",
44 | "wall_clock_breakdown": false
45 | }
46 |
--------------------------------------------------------------------------------
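Most fields in `ds_config_zero2.json` are set to `"auto"` so that the Hugging Face `Trainer` can fill them in from its own `TrainingArguments` when the config is passed via `--deepspeed`. A small sketch (the helper function is illustrative, not repo code) that checks the fixed, non-`"auto"` ZeRO settings:

```python
import json

# Inline copy of the key fields from ds_config_zero2.json; in a real run you
# would json.load the file from pandalm/assets/ instead.
cfg = json.loads("""
{
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu", "pin_memory": true}
    },
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto"
}
""")

def uses_zero2_cpu_offload(cfg):
    """True if the config requests ZeRO stage 2 with the optimizer offloaded to CPU."""
    zero = cfg.get("zero_optimization", {})
    return zero.get("stage") == 2 and zero.get("offload_optimizer", {}).get("device") == "cpu"
```

The `"auto"` batch-size fields stay untouched here; DeepSpeed rejects them unless the HF Trainer integration resolves them first.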
/pandalm/assets/ds_config_zero2_linear_lr.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 |
11 | "bf16": {
12 | "enabled": "auto"
13 | },
14 |
15 | "optimizer": {
16 | "type": "AdamW",
17 | "params": {
18 | "lr": "auto",
19 | "betas": "auto",
20 | "eps": "auto",
21 | "weight_decay": "auto"
22 | }
23 | },
24 |
25 | "scheduler": {
26 | "type": "WarmupDecayLR",
27 | "params": {
28 | "last_batch_iteration": -1,
29 | "total_num_steps": "auto",
30 | "warmup_min_lr": "auto",
31 | "warmup_max_lr": "auto",
32 | "warmup_num_steps": "auto"
33 | }
34 | },
35 |
36 | "zero_optimization": {
37 | "stage": 2,
38 | "offload_optimizer": {
39 | "device": "cpu",
40 | "pin_memory": true
41 | },
42 | "allgather_partitions": true,
43 | "allgather_bucket_size": 2e8,
44 | "overlap_comm": true,
45 | "reduce_scatter": true,
46 | "reduce_bucket_size": 2e8,
47 | "contiguous_gradients": true
48 | },
49 |
50 | "gradient_accumulation_steps": "auto",
51 | "gradient_clipping": "auto",
52 | "steps_per_print": 2000,
53 | "train_batch_size": "auto",
54 | "train_micro_batch_size_per_gpu": "auto",
55 | "wall_clock_breakdown": false
56 | }
57 |
--------------------------------------------------------------------------------
/pandalm/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .customtrainer import CustomTrainer
2 | from .nets import Nets
3 | from .datasets import Datasets, DataCollatorForDataset
4 |
--------------------------------------------------------------------------------
/pandalm/core/customtrainer.py:
--------------------------------------------------------------------------------
1 | from transformers import Trainer
2 | import logging
3 | class CustomTrainer(Trainer):
4 | def placeholder_func(self):
5 |         logging.warning("Using CustomTrainer. This is a placeholder function...")
6 |
7 |
--------------------------------------------------------------------------------
/pandalm/core/datasets.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 |
4 | # import global variables
5 | from core.global_var import IGNORE_INDEX, DEFAULT_PAD_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BOS_TOKEN, DEFAULT_UNK_TOKEN, PROMPT_DICT, RAW_DATA_PROMPT_DICT
6 | from dataclasses import dataclass, field
7 | from typing import Dict, Sequence
8 | import logging
9 | import copy
10 | import json
11 |
12 | @dataclass
13 | class DataCollatorForDataset(object):
14 | """Collate examples for supervised fine-tuning."""
15 |
16 | tokenizer: transformers.PreTrainedTokenizer
17 |
18 | def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
19 | input_ids, labels = tuple([instance[key] for instance in instances] for key in ("input_ids", "labels"))
20 | input_ids = torch.nn.utils.rnn.pad_sequence(
21 | input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
22 | )
23 | labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)
24 | return dict(
25 | input_ids=input_ids,
26 | labels=labels,
27 | attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
28 | )
29 |
30 |
31 |
32 | class Datasets:
33 | def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer, use_raw_data: bool):
34 | super(Datasets, self).__init__()
35 | logging.warning("Loading data...")
36 | with open(data_path, mode='r') as data_file:
37 | list_data_dict = json.load(data_file)
38 |
39 | logging.warning("Formatting inputs...")
40 | if use_raw_data is False:
41 | prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
42 | else:
43 | prompt_input, prompt_no_input = RAW_DATA_PROMPT_DICT["prompt_input"], RAW_DATA_PROMPT_DICT["prompt_no_input"]
44 | sources = [
45 | prompt_input.format_map(example) if example.get("input", "") != "" else prompt_no_input.format_map(example)
46 | for example in list_data_dict
47 | ]
48 | if use_raw_data is False:
49 | targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]
50 | else:
51 | targets = [f"{example['output_sequence']}{tokenizer.eos_token}" for example in list_data_dict]
52 | logging.warning("Tokenizing inputs... This may take some time...")
53 | data_dict = self.preprocess(sources, targets, tokenizer)
54 |
55 | self.input_ids = data_dict["input_ids"]
56 | self.labels = data_dict["labels"]
57 | def __len__(self):
58 | return len(self.input_ids)
59 |
60 | def __getitem__(self, i) -> Dict[str, torch.Tensor]:
61 | return dict(input_ids=self.input_ids[i], labels=self.labels[i])
62 | def _tokenize_fn(self, strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
63 | """Tokenize a list of strings."""
64 | tokenized_list = [
65 | tokenizer(
66 | text,
67 | return_tensors="pt",
68 | padding="longest",
69 | max_length=tokenizer.model_max_length,
70 | truncation=True,
71 | )
72 | for text in strings
73 | ]
74 | input_ids = labels = [tokenized.input_ids[0] for tokenized in tokenized_list]
75 | input_ids_lens = labels_lens = [
76 | tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item() for tokenized in tokenized_list
77 | ]
78 | return dict(
79 | input_ids=input_ids,
80 | labels=labels,
81 | input_ids_lens=input_ids_lens,
82 | labels_lens=labels_lens,
83 | )
84 |
85 |
86 | def preprocess(
87 | self,
88 | sources: Sequence[str],
89 | targets: Sequence[str],
90 | tokenizer: transformers.PreTrainedTokenizer,
91 | ) -> Dict:
92 | """Preprocess the data by tokenizing."""
93 | examples = [s + t for s, t in zip(sources, targets)]
94 | examples_tokenized, sources_tokenized = [self._tokenize_fn(strings, tokenizer) for strings in (examples, sources)]
95 | input_ids = examples_tokenized["input_ids"]
96 | labels = copy.deepcopy(input_ids)
97 | for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
98 | label[:source_len] = IGNORE_INDEX
99 | return dict(input_ids=input_ids, labels=labels)
100 |
101 |
102 |
103 |
104 |
105 |
106 |
--------------------------------------------------------------------------------
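`DataCollatorForDataset` right-pads `input_ids` with the pad token and `labels` with `IGNORE_INDEX`, then derives the attention mask from the non-pad positions. The same logic, sketched on plain Python lists (no torch dependency; `PAD_ID = 0` is an assumed pad id for illustration):

```python
IGNORE_INDEX = -100  # label positions the loss should skip
PAD_ID = 0           # assumed pad_token_id, for illustration only

def collate(input_ids_batch, labels_batch):
    """Right-pad each sequence to the batch maximum, as the collator does."""
    max_len = max(len(seq) for seq in input_ids_batch)
    input_ids = [seq + [PAD_ID] * (max_len - len(seq)) for seq in input_ids_batch]
    labels = [seq + [IGNORE_INDEX] * (max_len - len(seq)) for seq in labels_batch]
    attention_mask = [[tok != PAD_ID for tok in seq] for seq in input_ids]
    return {"input_ids": input_ids, "labels": labels, "attention_mask": attention_mask}

batch = collate([[5, 6, 7], [8, 9]], [[-100, 6, 7], [-100, 9]])
```

As in the real collator, a genuine token whose id equals the pad id would also be masked; that is why the code guards `tokenizer.pad_token is None` elsewhere and adds a dedicated `[PAD]` token.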
/pandalm/core/global_var.py:
--------------------------------------------------------------------------------
1 | IGNORE_INDEX = -100
2 | DEFAULT_PAD_TOKEN = "[PAD]"
3 | DEFAULT_EOS_TOKEN = "</s>"
4 | DEFAULT_BOS_TOKEN = "<s>"
5 | DEFAULT_UNK_TOKEN = "<unk>"
6 | PROMPT_DICT = {
7 | "prompt_input": (
8 | "Below is an instruction that describes a task, paired with an input that provides further context. "
9 | "Write a response that appropriately completes the request.\n\n"
10 | "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
11 | ),
12 | "prompt_no_input": (
13 | "Below is an instruction that describes a task. "
14 | "Write a response that appropriately completes the request.\n\n"
15 | "### Instruction:\n{instruction}\n\n### Response:"
16 | ),
17 | }
18 |
19 | RAW_DATA_PROMPT_DICT = {
20 | "prompt_input": (
21 | "{input_sequence}\n"
22 | ),
23 | "prompt_no_input": (
24 | "{input_sequence}\n"
25 | ),
26 | }
27 |
28 |
29 |
--------------------------------------------------------------------------------
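`Datasets` picks `prompt_input` when an example carries a non-empty `input` field and `prompt_no_input` otherwise, filling the template with `str.format_map`. That selection, using the templates from `global_var.py` verbatim:

```python
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}

def build_source(example: dict) -> str:
    """Mirror the template selection done in Datasets.__init__ (helper name assumed)."""
    template = PROMPT_DICT["prompt_input"] if example.get("input", "") != "" else PROMPT_DICT["prompt_no_input"]
    return template.format_map(example)

src = build_source({"instruction": "Summarize the text.", "input": "Pandas eat bamboo."})
```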
/pandalm/core/nets.py:
--------------------------------------------------------------------------------
1 | import transformers
2 | from typing import Dict
3 | from peft import TaskType, LoraConfig, get_peft_model, prepare_model_for_int8_training, get_peft_model_state_dict
4 | import logging
5 |
6 | # import global variables
7 | from core.global_var import IGNORE_INDEX, DEFAULT_PAD_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BOS_TOKEN, DEFAULT_UNK_TOKEN, PROMPT_DICT
8 |
9 |
10 | class Nets:
11 | def __init__(self, model_args, training_args, peft_args, **kwargs):
12 | super(Nets, self).__init__()
13 |
14 | # init model
15 | if 'glm' in model_args.model_name_or_path:
16 | model = transformers.AutoModelForSeq2SeqLM.from_pretrained(
17 | model_args.model_name_or_path,
18 | cache_dir=training_args.cache_dir,
19 | trust_remote_code=True,
20 | )
21 | else:
22 | model = transformers.AutoModelForCausalLM.from_pretrained(
23 | model_args.model_name_or_path,
24 | cache_dir=training_args.cache_dir,
25 | )
26 | model = self.get_peft_model(peft_args,model)
27 | # init tokenizer
28 | if 'llama' in model_args.model_name_or_path:
29 | tokenizer = transformers.LlamaTokenizer.from_pretrained(
30 | model_args.model_name_or_path,
31 | cache_dir=training_args.cache_dir,
32 | model_max_length=training_args.model_max_length,
33 | padding_side="right",
34 | use_fast=False,
35 | )
36 | elif 'glm' in model_args.model_name_or_path:
37 | tokenizer = transformers.AutoTokenizer.from_pretrained(
38 | model_args.model_name_or_path,
39 | cache_dir=training_args.cache_dir,
40 | model_max_length=training_args.model_max_length,
41 | padding_side="right",
42 | use_fast=False,
43 | trust_remote_code=True,
44 | )
45 | elif 'pythia' in model_args.model_name_or_path:
46 | tokenizer = transformers.GPTNeoXTokenizerFast.from_pretrained(
47 | model_args.model_name_or_path,
48 | cache_dir=training_args.cache_dir,
49 | model_max_length=training_args.model_max_length,
50 | padding_side="right",
51 | use_fast=False,
52 | )
53 |
54 | else:
55 | tokenizer = transformers.AutoTokenizer.from_pretrained(
56 | model_args.model_name_or_path,
57 | cache_dir=training_args.cache_dir,
58 | model_max_length=training_args.model_max_length,
59 | padding_side="right",
60 | use_fast=False,
61 | )
62 |
63 | if tokenizer.pad_token is None:
64 | self.smart_tokenizer_and_embedding_resize(
65 | special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
66 | tokenizer=tokenizer,
67 | model=model,
68 | )
69 | if "llama" in model_args.model_name_or_path:
70 | tokenizer.add_special_tokens(
71 | {
72 | "eos_token": DEFAULT_EOS_TOKEN,
73 | "bos_token": DEFAULT_BOS_TOKEN,
74 | "unk_token": DEFAULT_UNK_TOKEN,
75 | }
76 | )
77 |
78 |         # finish initialization
79 | self.model = model
80 | self.tokenizer = tokenizer
81 |
82 | def get_model(self):
83 | return self.model
84 | def get_tokenizer(self):
85 | return self.tokenizer
86 |     def get_peft_model(self, peft_args, model):
87 | if peft_args.peft_model == 'none':
88 | logging.warning("Full finetuning...")
89 | return model
90 | elif peft_args.peft_model == 'lora':
91 | logging.warning("Using lora...")
92 | peft_config = LoraConfig(
93 | r=8,
94 | lora_alpha=16,
95 | lora_dropout=0.05,
96 | bias="none",
97 | target_modules=["q_proj","v_proj"],
98 | task_type=TaskType.CAUSAL_LM,
99 | )
100 |
101 |
102 | model = get_peft_model(model, peft_config)
103 | model.print_trainable_parameters()
104 |
105 | return model
106 |
107 |
108 | def smart_tokenizer_and_embedding_resize(
109 | self,
110 | special_tokens_dict: Dict,
111 | tokenizer: transformers.PreTrainedTokenizer,
112 | model: transformers.PreTrainedModel,
113 | ):
114 | """Resize tokenizer and embedding.
115 |
116 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
117 | """
118 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
119 | model.resize_token_embeddings(len(tokenizer))
120 |
121 | if num_new_tokens > 0:
122 | input_embeddings = model.get_input_embeddings().weight.data
123 | output_embeddings = model.get_output_embeddings().weight.data
124 |
125 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
126 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
127 |
128 | input_embeddings[-num_new_tokens:] = input_embeddings_avg
129 | output_embeddings[-num_new_tokens:] = output_embeddings_avg
130 |
131 | def safe_save_model_for_hf_trainer(self, trainer: transformers.Trainer, output_dir: str):
132 | """Collects the state dict and dump to disk."""
133 | state_dict = trainer.model.state_dict()
134 | if trainer.args.should_save:
135 | cpu_state_dict = {key: value.cpu() for key, value in state_dict.items()}
136 | del state_dict
137 | trainer._save(output_dir, state_dict=cpu_state_dict) # noqa
138 |
--------------------------------------------------------------------------------
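`smart_tokenizer_and_embedding_resize` grows the embedding matrix and initializes each new row with the mean of all pre-existing rows, which gives new special tokens a "typical" starting embedding instead of random noise. The arithmetic, sketched on plain Python lists (function name assumed, no torch):

```python
def resize_with_mean_init(embeddings, num_new_tokens):
    """Append num_new_tokens rows, each set to the column-wise mean of the old rows."""
    if num_new_tokens <= 0:
        return embeddings
    n, dim = len(embeddings), len(embeddings[0])
    mean_row = [sum(row[j] for row in embeddings) / n for j in range(dim)]
    return embeddings + [mean_row[:] for _ in range(num_new_tokens)]

emb = resize_with_mean_init([[1.0, 3.0], [3.0, 5.0]], num_new_tokens=1)
```

The real method does the same for both the input and output embedding matrices, slicing with `[:-num_new_tokens]` so the freshly added rows are excluded from the mean.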
/pandalm/inst-tune.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional
3 | import transformers
4 |
5 | # import core classes
6 | from core import CustomTrainer, Nets, Datasets, DataCollatorForDataset
7 | # import global variables
8 | from core.global_var import IGNORE_INDEX, DEFAULT_PAD_TOKEN, DEFAULT_EOS_TOKEN, DEFAULT_BOS_TOKEN, DEFAULT_UNK_TOKEN, PROMPT_DICT
9 |
10 | @dataclass
11 | class ModelArguments:
12 | model_name_or_path: Optional[str] = field(default="")
13 |
14 | @dataclass
15 | class PeftArguments:
16 | peft_model: Optional[str] = field(default="lora")
17 |
18 | @dataclass
19 | class DataArguments:
20 | data_path: str = field(default=None, metadata={"help": "Path to the training data."})
21 | use_raw_data: bool = field(default=False, metadata={"help": "Add prompt or not."})
22 |
23 | @dataclass
24 | class TrainingArguments(transformers.TrainingArguments):
25 | deepspeed: Optional[str] = field(default=None)
26 | cache_dir: Optional[str] = field(default=None)
27 | optim: str = field(default="adamw_torch")
28 | model_max_length: int = field(
29 | default=1024,
30 | metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
31 | )
32 |
33 | def train():
34 | parser = transformers.HfArgumentParser((ModelArguments, PeftArguments, DataArguments, TrainingArguments))
35 | model_args, peft_args, data_args, training_args = parser.parse_args_into_dataclasses()
36 |
37 | #prepare model and tokenizer for training
38 | nets = Nets(model_args, training_args, peft_args)
39 | model = nets.get_model()
40 | tokenizer = nets.get_tokenizer()
41 |
42 |
43 | #prepare datasets for training and validation
44 | train_dataset = Datasets(tokenizer=tokenizer, data_path=data_args.data_path, use_raw_data=data_args.use_raw_data)
45 | data_collator = DataCollatorForDataset(tokenizer=tokenizer)
46 |
47 | # prepare trainer and train model
48 | trainer = CustomTrainer(model=model, tokenizer=tokenizer, args=training_args, train_dataset=train_dataset, eval_dataset=None, data_collator=data_collator)
49 | trainer.train()
50 | trainer.save_state()
51 |
52 | # save model after train
53 | nets.safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
54 |
55 |
56 | if __name__ == "__main__":
57 | train()
58 |
--------------------------------------------------------------------------------
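`train()` wires the pieces together: `HfArgumentParser` splits the command line into the four dataclasses, `Nets` builds the model and tokenizer, and `CustomTrainer` runs the loop. The dataclass defaults can be exercised without transformers, using only the standard library:

```python
from dataclasses import dataclass, field
from typing import Optional

# Copies of the argument dataclasses from inst-tune.py, minus the
# transformers.TrainingArguments base class.
@dataclass
class ModelArguments:
    model_name_or_path: Optional[str] = field(default="")

@dataclass
class PeftArguments:
    peft_model: Optional[str] = field(default="lora")

@dataclass
class DataArguments:
    data_path: Optional[str] = field(default=None, metadata={"help": "Path to the training data."})
    use_raw_data: bool = field(default=False, metadata={"help": "Add prompt or not."})

args = (ModelArguments(), PeftArguments(), DataArguments())
```

So with no flags, LoRA fine-tuning with templated prompts is the default; passing `--peft_model none` switches to full fine-tuning, as handled in `Nets.get_peft_model`.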
/pandalm/run-gradio.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sys
3 | import re
4 | import fire
5 | import gradio as gr
6 | import torch
7 | import transformers
8 | import traceback
9 |
10 | from transformers import GenerationConfig, AutoTokenizer, AutoModelForCausalLM
11 | from queue import Queue
12 | from threading import Thread
13 |
14 |
15 | DEFAULT_PAD_TOKEN = "[PAD]"
16 | DEFAULT_EOS_TOKEN = "</s>"
17 | DEFAULT_BOS_TOKEN = "<s>"
18 | DEFAULT_UNK_TOKEN = "<unk>"
19 |
20 | class Stream(transformers.StoppingCriteria):
21 | def __init__(self, callback_func=None):
22 | self.callback_func = callback_func
23 |
24 | def __call__(self, input_ids, scores) -> bool:
25 | if self.callback_func is not None:
26 | self.callback_func(input_ids[0])
27 | return False
28 |
29 |
30 | class Iteratorize:
31 |
32 | """
33 | Transforms a function that takes a callback
34 | into a lazy iterator (generator).
35 | """
36 |
37 | def __init__(self, func, kwargs={}, callback=None):
38 | self.mfunc = func
39 | self.c_callback = callback
40 | self.q = Queue()
41 | self.sentinel = object()
42 | self.kwargs = kwargs
43 | self.stop_now = False
44 |
45 | def _callback(val):
46 | if self.stop_now:
47 | raise ValueError
48 | self.q.put(val)
49 |
50 |         def gentask():
51 |             ret = None
52 |             try:
53 |                 ret = self.mfunc(callback=_callback, **self.kwargs)
54 |             except ValueError:
55 |                 pass
56 |             except Exception:
57 |                 traceback.print_exc()
58 |
59 | self.q.put(self.sentinel)
60 | if self.c_callback:
61 | self.c_callback(ret)
62 |
63 | self.thread = Thread(target=gentask)
64 | self.thread.start()
65 |
66 | def __iter__(self):
67 | return self
68 |
69 | def __next__(self):
70 | obj = self.q.get(True, None)
71 | if obj is self.sentinel:
72 | raise StopIteration
73 | else:
74 | return obj
75 |
76 | def __enter__(self):
77 | return self
78 |
79 | def __exit__(self, exc_type, exc_val, exc_tb):
80 | self.stop_now = True
81 |
82 |
83 | if torch.cuda.is_available():
84 | device = "cuda"
85 | else:
86 | device = "cpu"
87 |
88 | try:
89 | if torch.backends.mps.is_available():
90 | device = "mps"
91 | except: # noqa: E722
92 | pass
93 |
94 |
95 | def build_prompt(instruction, input, resp1, resp2, result=None, explain=None, ref=None):
96 | rsp = f"### Response 1:\n{resp1}\n\n### Response 2:\n{resp2}"
97 |
98 | if input:
99 | input_sequence = f"Below are two responses for a given task. The task is defined by the Instruction with an Input that provides further context. Evaluate the responses and generate a reference answer for the task.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n{rsp}\n\n### Evaluation:\n"
100 | else:
101 | input_sequence = f"Below are two responses for a given task. The task is defined by the Instruction. Evaluate the responses and generate a reference answer for the task.\n\n### Instruction:\n{instruction}\n\n{rsp}\n\n### Evaluation:\n"
102 |
103 | if result:
104 | output_sequence = f"{result}\n\n### Reason: {explain}\n\n### Reference: {ref}\n"
105 | return input_sequence, output_sequence
106 | else:
107 | return input_sequence
108 |
109 |
110 | def smart_tokenizer_and_embedding_resize(
111 | special_tokens_dict,
112 | tokenizer: transformers.PreTrainedTokenizer,
113 | model: transformers.PreTrainedModel,
114 | ):
115 | """Resize tokenizer and embedding.
116 |
117 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
118 | """
119 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
120 | model.resize_token_embeddings(len(tokenizer))
121 |
122 | if num_new_tokens > 0:
123 | input_embeddings = model.get_input_embeddings().weight.data
124 | output_embeddings = model.get_output_embeddings().weight.data
125 |
126 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
127 | dim=0, keepdim=True
128 | )
129 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
130 | dim=0, keepdim=True
131 | )
132 |
133 | input_embeddings[-num_new_tokens:] = input_embeddings_avg
134 | output_embeddings[-num_new_tokens:] = output_embeddings_avg
135 |
136 |
137 | def post_process_output(text):
138 | text = text.strip().split("### Evaluation:")[1].strip()
139 |     pattern = re.compile(
140 |         r"</s>|<s>|<unk>|<pad>|\[PAD\]|<\|endoftext\|>|\[UNK\]|\[CLS\]|\[MASK\]|<\|startofpiece\|>|<\|endofpiece\|>|\[gMASK\]|\[sMASK\]"
141 |     )
142 |     text = pattern.sub("", text.strip()).strip()
143 | return text
144 |
145 |
146 | def main(
147 | load_8bit: bool = False,
148 | base_model: str = "WeOpenML/PandaLM-7B-v1",
149 |     server_name: str = "0.0.0.0",  # listen on all interfaces
150 | share_gradio: bool = False,
151 | server_port: int = 31228,
152 | ):
153 | base_model = base_model or os.environ.get("BASE_MODEL", "")
154 | assert (
155 | base_model
156 | ), "Please specify a --base_model, e.g. --base_model='WeOpenML/PandaLM-7B-v1'"
157 |
158 |     model = AutoModelForCausalLM.from_pretrained(
159 |         base_model,
160 |         load_in_8bit=load_8bit,
161 |         torch_dtype=torch.bfloat16,
162 |         device_map="auto",
163 |     )
164 |     tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=False)
165 |     if tokenizer.pad_token is None:
166 |         smart_tokenizer_and_embedding_resize(
167 |             special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
168 |             tokenizer=tokenizer,
169 |             model=model,
170 |         )
171 |     tokenizer.add_special_tokens(
172 |         {
173 |             "eos_token": DEFAULT_EOS_TOKEN,
174 |             "bos_token": DEFAULT_BOS_TOKEN,
175 |             "unk_token": DEFAULT_UNK_TOKEN,
176 |         }
177 |     )
178 |
179 | # unwind broken decapoda-research config
180 | model.config.pad_token_id = tokenizer.pad_token_id = 0 # unk
181 | model.config.bos_token_id = 1
182 | model.config.eos_token_id = 2
183 |
184 | if not load_8bit:
185 | model.half() # seems to fix bugs for some users.
186 |
187 | model.eval()
188 | if torch.__version__ >= "2" and sys.platform != "win32":
189 | model = torch.compile(model)
190 |
191 | def evaluate(
192 | instruction,
193 | input=None,
194 | response1=None,
195 | response2=None,
196 | temperature=0,
197 | top_p=1,
198 | top_k=1,
199 | num_beams=4,
200 | max_new_tokens=150,
201 | stream_output=True,
202 | repetition_penalty=1.2,
203 | early_stopping=True,
204 | **kwargs,
205 | ):
206 | # prompt = prompter.generate_prompt(instruction, input)
207 | prompt = build_prompt(instruction, input, response1, response2)
208 | inputs = tokenizer(prompt, return_tensors="pt")
209 | input_ids = inputs["input_ids"].to(device)
210 | generation_config = GenerationConfig(
211 | temperature=temperature,
212 | top_p=top_p,
213 | top_k=top_k,
214 | num_beams=num_beams,
215 | early_stopping=early_stopping,
216 | repetition_penalty=repetition_penalty,
217 | **kwargs,
218 | )
219 | generate_params = {
220 | "input_ids": input_ids,
221 | "generation_config": generation_config,
222 | "return_dict_in_generate": True,
223 | "output_scores": True,
224 | "max_new_tokens": max_new_tokens,
225 | }
226 |
227 | if stream_output:
228 | # Stream the reply 1 token at a time.
229 | # This is based on the trick of using 'stopping_criteria' to create an iterator,
230 | # from https://github.com/oobabooga/text-generation-webui/blob/ad37f396fc8bcbab90e11ecf17c56c97bfbd4a9c/modules/text_generation.py#L216-L243.
231 |
232 | def generate_with_callback(callback=None, **kwargs):
233 | kwargs.setdefault(
234 | "stopping_criteria", transformers.StoppingCriteriaList()
235 | )
236 | kwargs["stopping_criteria"].append(Stream(callback_func=callback))
237 | with torch.no_grad():
238 | model.generate(**kwargs)
239 |
240 | def generate_with_streaming(**kwargs):
241 | return Iteratorize(generate_with_callback, kwargs, callback=None)
242 |
243 | with generate_with_streaming(**generate_params) as generator:
244 | for output in generator:
245 | # new_tokens = len(output) - len(input_ids[0])
246 | decoded_output = tokenizer.decode(output)
247 |
248 | if output[-1] in [tokenizer.eos_token_id]:
249 | break
250 |
251 | yield post_process_output(decoded_output)
252 | return # early return for stream_output
253 |
254 | # Without streaming
255 | with torch.no_grad():
256 | generation_output = model.generate(
257 | input_ids=input_ids,
258 | generation_config=generation_config,
259 | return_dict_in_generate=True,
260 | output_scores=True,
261 | max_new_tokens=max_new_tokens,
262 | )
263 | s = generation_output.sequences[0]
264 | output = tokenizer.decode(s)
265 | yield post_process_output(output)
266 | with gr.Blocks(title="PandaLM", description="Compare different responses with a given context.") as demo:
267 | gr.Markdown('# PandaLM')
268 | with gr.Row():
269 | with gr.Column():
270 | instruction = gr.components.Textbox(
271 | lines=3,
272 | label="Instruction",
273 | value="Build a Java program to output the following message",
274 | )
275 | input = gr.components.Textbox(lines=3, label="Input", value="Hello World!")
276 | response1 = gr.components.Textbox(
277 | lines=3, label="Response 1", value='System.out.println("Hello World");'
278 | )
279 | response2 = gr.components.Textbox(
280 | lines=3,
281 | label="Response 2",
282 | value='public class HelloWorld {\n public static void main(String[] args) {\n System.out.println("Hello World!");\n }\n}',
283 | )
284 | with gr.Column():
285 |                 output = gr.components.Textbox(
286 | lines=12,
287 | label="Which response is better?",
288 | )
289 |
290 | eval_btn = gr.Button("Evaluate")
291 | with gr.Row():
292 | with gr.Column():
293 | temp = gr.components.Slider(minimum=0, maximum=1, value=0, label="Temperature")
294 | top_p = gr.components.Slider(minimum=0, maximum=1, value=1, label="Top p")
295 | top_k = gr.components.Slider(minimum=0, maximum=100, step=1, value=1, label="Top k")
296 | early_stopping = gr.components.Checkbox(label="Early stopping", value=True)
297 | with gr.Column():
298 | num_beams = gr.components.Slider(minimum=1, maximum=8, step=1, value=4, label="Beams")
299 | max_tokens = gr.components.Slider(minimum=1, maximum=1024, step=1, value=512, label="Max new tokens")
300 | repetition_penalty = gr.components.Slider(minimum=0.0, maximum=2.0, value=1.2, label="Repetition penalty")
301 | stream = gr.components.Checkbox(label="Stream output", value=True)
302 | eval_btn.click(fn=evaluate, inputs=[instruction, input, response1, response2, temp, top_p, top_k, num_beams, max_tokens, stream, repetition_penalty, early_stopping], outputs=[output], api_name="eval")
303 | demo.queue().launch(
304 | server_name=server_name, share=share_gradio, server_port=server_port
305 | )
306 |
307 | if __name__ == "__main__":
308 | fire.Fire(main)
309 |
--------------------------------------------------------------------------------
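The `stream_output` branch in `run-gradio.py` relies on the `Stream`/`Iteratorize` helpers to turn the per-step stopping-criteria callback of `model.generate` into a Python iterator. A minimal, self-contained sketch of that callback-to-iterator pattern using a thread and a queue (simplified; the repo's `Iteratorize` additionally handles interruption and cleanup, and `fake_generate` here is a stand-in for the real model call):

```python
import threading
import queue

def iteratorize(func):
    """Run func(callback=...) in a thread; yield each value passed to callback."""
    q = queue.Queue()
    sentinel = object()

    def callback(value):
        q.put(value)

    def run():
        func(callback=callback)
        q.put(sentinel)  # signal that generation has finished

    threading.Thread(target=run, daemon=True).start()
    while True:
        item = q.get()
        if item is sentinel:
            break
        yield item

def fake_generate(callback):
    # Stand-in for model.generate invoking a StoppingCriteria each step.
    for tok in [1, 2, 3]:
        callback(tok)

print(list(iteratorize(fake_generate)))
# [1, 2, 3]
```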
/pandalm/scripts/inst-tune.sh:
--------------------------------------------------------------------------------
1 | #export CUDA_VISIBLE_DEVICES=4,5,6,7
2 | if [ ! -f "./assets/alpaca_data.json" ]; then
3 | echo "alpaca data does not exist"
4 | wget -P ./assets https://raw.githubusercontent.com/tatsu-lab/stanford_alpaca/main/alpaca_data.json
5 | else
6 | echo "alpaca data exists"
7 | fi
8 | deepspeed --master_port=2023 inst-tune.py --model_name_or_path huggyllama/llama-7b --output_dir ./output/llama-7b-tuned --data_path ./assets/alpaca_data.json --bf16 True --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 8 --evaluation_strategy "no" --save_strategy "steps" --save_steps 2000 --save_total_limit 1 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --peft_model none --deepspeed assets/ds_config_zero2.json --model_max_length 1024 --report_to wandb --use_raw_data False
9 |
10 | deepspeed --master_port=2023 inst-tune.py --model_name_or_path bigscience/bloom-7b1 --output_dir ./output/bloom-7b-tuned --data_path ./assets/alpaca_data.json --bf16 True --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 8 --evaluation_strategy "no" --save_strategy "steps" --save_steps 2000 --save_total_limit 1 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --peft_model none --deepspeed assets/ds_config_zero2.json --model_max_length 1024 --report_to wandb --use_raw_data False
11 |
12 | deepspeed --master_port=2023 inst-tune.py --model_name_or_path facebook/opt-6.7b --output_dir ./output/opt-6.7b-tuned --data_path ./assets/alpaca_data.json --bf16 True --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 8 --evaluation_strategy "no" --save_strategy "steps" --save_steps 2000 --save_total_limit 1 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --peft_model none --deepspeed assets/ds_config_zero2.json --model_max_length 1024 --report_to wandb --use_raw_data False
13 |
14 | deepspeed --master_port=2023 inst-tune.py --model_name_or_path cerebras/Cerebras-GPT-6.7B --output_dir ./output/cerebras-gpt-6.7b-tuned --data_path ./assets/alpaca_data.json --bf16 True --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 8 --evaluation_strategy "no" --save_strategy "steps" --save_steps 2000 --save_total_limit 1 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --peft_model none --deepspeed assets/ds_config_zero2.json --model_max_length 1024 --report_to wandb --use_raw_data False
15 |
16 | deepspeed --master_port=2023 inst-tune.py --model_name_or_path EleutherAI/pythia-6.9b --output_dir ./output/pythia-6.9b-tuned --data_path ./assets/alpaca_data.json --bf16 True --num_train_epochs 3 --per_device_train_batch_size 2 --per_device_eval_batch_size 2 --gradient_accumulation_steps 8 --evaluation_strategy "no" --save_strategy "steps" --save_steps 2000 --save_total_limit 1 --learning_rate 2e-5 --weight_decay 0. --warmup_ratio 0.03 --lr_scheduler_type "cosine" --logging_steps 1 --peft_model none --deepspeed assets/ds_config_zero2.json --model_max_length 1024 --report_to wandb --use_raw_data False
17 |
18 |
--------------------------------------------------------------------------------
/pandalm/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluation_pipeline import EvaluationPipeline
2 | from .candidate_model_inference import CandidateBatchInferenceProvider
3 | from .pandalm_inference import PandaLMBatchInferenceProvider
4 |
--------------------------------------------------------------------------------
/pandalm/utils/candidate_model_inference.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import transformers
4 | from transformers import GenerationConfig, AutoModelForCausalLM, AutoTokenizer
5 | import json
6 | import os, sys
7 | import logging
8 | from typing import Union, Dict
9 | from tqdm import tqdm
10 | import re, random
11 |
14 |
15 | def seed_everything(seed):
16 | torch.manual_seed(seed)
17 | torch.cuda.manual_seed_all(seed)
18 | random.seed(seed)
19 |
20 |
21 | prompt_templates = {
22 | "alpaca": {
23 | "description": "Template used by Alpaca-LoRA.",
24 | "prompt_input": "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
25 | "prompt_no_input": "Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Response:\n",
26 | "response_split": "### Response:",
27 | }
28 | }
29 |
30 |
31 | class CandidateBatchInferenceProvider(object):
32 | """
33 | Batch inference provider for candidate generation models.
34 | """
35 |
36 | def __init__(self, model_path, prompt_template_name="alpaca") -> None:
37 | super().__init__()
38 | self.template = prompt_templates[prompt_template_name]
39 | try:
40 | tokenizer = AutoTokenizer.from_pretrained(model_path)
41 |         except Exception:  # fall back to the slow tokenizer
42 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
43 | model = AutoModelForCausalLM.from_pretrained(
44 | model_path,
45 | torch_dtype=torch.bfloat16,
46 | device_map="auto",
47 | )
48 | if tokenizer.pad_token is None:
49 | self.smart_tokenizer_and_embedding_resize(
50 | special_tokens_dict=dict(pad_token="[PAD]"),
51 | tokenizer=tokenizer,
52 | model=model,
53 | )
54 | if "llama" in model_path:
55 | tokenizer.add_special_tokens(
56 | {
57 |                     "eos_token": "</s>",
58 |                     "bos_token": "<s>",
59 |                     "unk_token": "<unk>",
60 | }
61 | )
62 | self.tokenizer = tokenizer
63 |
64 | model.config.pad_token_id = self.tokenizer.pad_token_id = 0 # unk
65 | model.config.bos_token_id = 1
66 | model.config.eos_token_id = 2
67 | model.eval()
68 |
69 | if torch.__version__ >= "2" and sys.platform != "win32":
70 | model = torch.compile(model)
71 |
72 | self.model = model
73 | self.prepared = []
74 | self.pattern = re.compile(
75 |             r"<unk>|<s>|</s>|<pad>|\[PAD\]|<\|endoftext\|>|\[UNK\]|\[CLS\]|\[MASK\]|<\|startofpiece\|>|<\|endofpiece\|>|\[gMASK\]|\[sMASK\]"
76 | )
77 |
78 | def generate_prompt(self, instruction, input=None, label=None):
79 | if input:
80 | res = self.template["prompt_input"].format(
81 | instruction=instruction, input=input
82 | )
83 | else:
84 | res = self.template["prompt_no_input"].format(instruction=instruction)
85 | if label:
86 | res = f"{res}{label}"
87 | return res
88 |
89 | def post_process_output(self, text):
90 | text = text.split(self.template["response_split"])[1].strip()
91 | text = self.pattern.sub("", text.strip()).strip()
92 | return text
93 |
94 | def smart_tokenizer_and_embedding_resize(
95 | self,
96 | special_tokens_dict: Dict,
97 | tokenizer: transformers.PreTrainedTokenizer,
98 | model: transformers.PreTrainedModel,
99 | ):
100 | """Resize tokenizer and embedding.
101 |
102 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
103 | """
104 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
105 | model.resize_token_embeddings(len(tokenizer))
106 |
107 | if num_new_tokens > 0:
108 | input_embeddings = model.get_input_embeddings().weight.data
109 | output_embeddings = model.get_output_embeddings().weight.data
110 |
111 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
112 | dim=0, keepdim=True
113 | )
114 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
115 | dim=0, keepdim=True
116 | )
117 |
118 | input_embeddings[-num_new_tokens:] = input_embeddings_avg
119 | output_embeddings[-num_new_tokens:] = output_embeddings_avg
120 |
121 | def preprocess_input(self, instruction, input):
122 | prompt = self.generate_prompt(instruction, input)
123 | # self.prepared.append(self.tokenizer(prompt, return_tensors="pt", padding=True))
124 | self.prepared.append(prompt)
125 |
126 | def filter_special_token(self, text):
127 | return self.pattern.sub("", text.strip()).strip()
128 |
129 | def inference(
130 | self,
131 | temperature=0,
132 | top_p=1,
133 | top_k=1,
134 | num_beams=4,
135 | max_new_tokens=300,
136 | repetition_penalty=1.2,
137 | ):
138 | generated = []
139 | for idx in tqdm(range(len(self.prepared))):
140 | inputs = self.tokenizer(self.prepared[idx], return_tensors="pt")
141 | input_ids = inputs["input_ids"].to(self.model.device)
142 | generation_config = GenerationConfig(
143 | temperature=temperature,
144 | top_p=top_p,
145 | top_k=top_k,
146 | num_beams=num_beams,
147 | early_stopping=True,
148 | repetition_penalty=repetition_penalty,
149 | )
150 | with torch.no_grad():
151 | generation_output = self.model.generate(
152 | input_ids=input_ids,
153 | generation_config=generation_config,
154 | return_dict_in_generate=True,
155 | output_scores=True,
156 | max_new_tokens=max_new_tokens,
157 | )
158 |
159 | for j in range(len(generation_output.sequences)):
160 | s = generation_output.sequences[j]
161 | output = self.tokenizer.decode(s)
162 | resp = self.post_process_output(output)
163 | resp = self.filter_special_token(resp)
164 | generated.append(resp)
165 | return generated
166 |
167 |
168 | if __name__ == "__main__":
169 | logging.basicConfig(
170 | format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO
171 | )
172 |
173 | parser = argparse.ArgumentParser(
174 | description="Candidate model batch inference script"
175 | )
176 | parser.add_argument("-s", "--seed", type=int, default=2023)
177 | parser.add_argument(
178 | "-m", "--model_name", default="/ssdwork/wyd/test/llm/output/llama-7b/"
179 | )
180 | parser.add_argument(
181 | "-i",
182 | "--input_path",
183 | default="/home/yzh/llm/PandaLM/data/testset-inference-v1.json",
184 | )
185 | parser.add_argument("-o", "--output_path", default=None)
186 |
187 | parser.add_argument("-r", "--data_root", default=None)
188 |
189 | args = parser.parse_args()
190 | if args.data_root is None:
191 | args.data_root = os.path.join(args.model_name, "batch_eval_outputs")
192 |
193 | logging.info(args)
194 |
195 | seed_everything(args.seed)
196 |
197 | logging.info(f"Loading model from {args.model_name}")
198 | handler = CandidateBatchInferenceProvider(
199 | model_path=args.model_name,
200 | )
201 |     logging.info(f"Loading inference data from {args.input_path}")
202 | with open(args.input_path) as f:
203 | input_data = json.load(f)
204 |     logging.info(f"Loaded {len(input_data)} instance(s) for inference")
205 | results = []
206 | for item in tqdm(input_data):
207 |         handler.preprocess_input(
208 | instruction=item["instruction"],
209 | input=item["input"],
210 | )
211 | generated = handler.inference()
212 | for idx, item in enumerate(input_data):
213 | results.append([item, generated[idx]])
214 |
215 | if args.output_path:
216 | with open(args.output_path, "w") as f:
217 | json.dump(results, f)
218 | else:
219 | print(results)
220 |
--------------------------------------------------------------------------------
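The `generate_prompt` and `post_process_output` methods above wrap each instruction in the Alpaca template and then keep only the text after the `### Response:` marker. A small round-trip sketch of that templating, with a simulated completion standing in for real model generation:

```python
# Alpaca "prompt_input" template, as defined in candidate_model_inference.py
PROMPT_INPUT = (
    "Below is an instruction that describes a task, paired with an input that "
    "provides further context. Write a response that appropriately completes "
    "the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n"
    "### Response:\n"
)

prompt = PROMPT_INPUT.format(instruction="Translate to French", input="Hello")
full_output = prompt + "Bonjour"  # pretend the model generated "Bonjour"

# post_process_output keeps only what follows the response marker
response = full_output.split("### Response:")[1].strip()
print(response)
# Bonjour
```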
/pandalm/utils/evaluation_pipeline.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import json
3 | import logging
4 | import random
5 | from typing import Optional, List
6 | from tqdm import tqdm
7 | import gc
8 |
9 | from .candidate_model_inference import CandidateBatchInferenceProvider
10 | from .pandalm_inference import PandaLMBatchInferenceProvider
11 |
12 |
13 | class EvaluationPipeline:
14 | def __init__(
15 | self,
16 | candidate_paths: List[str],
17 | pandalm_path: str = "WeOpenML/PandaLM-7B-v1",
18 | input_data_path: Optional[str] = None,
19 | output_data_path: Optional[str] = None,
20 | seed: Optional[int] = 2023,
21 | log_level: Optional[int] = logging.INFO,
22 | ):
23 | logging.basicConfig(
24 | format="%(asctime)s %(levelname)s %(message)s", level=log_level
25 | )
26 | self.input_data_path = input_data_path
27 | self.output_data_path = output_data_path
28 | self.pandalm_path = pandalm_path
29 | self.seed = seed
30 |
31 | if len(candidate_paths) < 2:
32 | raise ValueError(
33 | f"At least two candidate models are required, provided candidate_paths: {candidate_paths}."
34 | )
35 |
36 | if self.input_data_path:
37 | with open(input_data_path) as f:
38 | self.input_data = json.load(f)
39 | else:
40 |             logging.info("No input_data_path provided, skipping candidate inference")
41 | for candidate in candidate_paths:
42 | if not candidate.endswith(".json"):
43 | raise ValueError(
44 |                         f"Without input_data_path, candidate inference is skipped; pass a .json inference result instead of {candidate}"
45 | )
46 |
47 | self.candidate_paths = candidate_paths
48 | self.candidate_results = {}
49 | self.pandalm_results = {}
50 | self.pandalm_results_parsed = {}
51 |
52 | def seed_everything(self, seed):
53 | torch.manual_seed(seed)
54 | torch.cuda.manual_seed_all(seed)
55 | random.seed(seed)
56 |
57 | def inference_candidate(self, candidate_path):
58 | candidate = CandidateBatchInferenceProvider(candidate_path)
59 | for item in self.input_data:
60 | candidate.preprocess_input(
61 | instruction=item["instruction"],
62 | input=item["input"],
63 | )
64 | logging.info(f"Running inference on candidate model: {candidate_path}")
65 | generated = candidate.inference().copy()
66 | del candidate
67 | gc.collect()
68 | torch.cuda.empty_cache()
69 | return generated
70 |
71 | def collect_all_candidates(self):
72 | """
73 | Run inference on all candidate models.
74 | """
75 | for candidate_path in self.candidate_paths:
76 | if candidate_path.endswith(".json"):
77 | logging.info(
78 |                     f"Loading candidate inference result and skipping inference: {candidate_path}"
79 | )
80 | with open(candidate_path) as f:
81 | self.candidate_results[candidate_path] = json.load(f)
82 | else:
83 | logging.info(
84 | f"Loading candidate model and inferencing: {candidate_path}"
85 | )
86 | generated = self.inference_candidate(candidate_path)
87 | self.candidate_results[candidate_path] = generated
88 |
89 | def pandalm_inference(self):
90 | """
91 | Run inference on the PandaLM model.
92 | """
93 | logging.info(f"Loading PandaLM model: {self.pandalm_path}")
94 | pandalm = PandaLMBatchInferenceProvider(self.pandalm_path)
95 |
96 | for i in range(len(self.candidate_paths)):
97 | for j in range(i + 1, len(self.candidate_paths)):
98 | pandalm.prepared = []
99 | candidate1 = self.candidate_paths[i]
100 | candidate2 = self.candidate_paths[j]
101 | logging.info(
102 | f"Running inference on PandaLM model with candidate1:{candidate1}, candidate2:{candidate2}"
103 | )
104 | cand1_results = self.candidate_results[candidate1]
105 | cand2_results = self.candidate_results[candidate2]
106 |
107 | assert len(cand1_results) == len(cand2_results)
108 |
109 | for idx in range(len(cand1_results)):
110 | pandalm.preprocess_input(
111 | instruction=self.input_data[idx]["instruction"],
112 | input=self.input_data[idx]["input"],
113 | response1=cand1_results[idx],
114 | response2=cand2_results[idx],
115 | )
116 | generated = pandalm.inference().copy()
117 |
118 | self.pandalm_results[(candidate1, candidate2)] = generated
119 | parsed = []
120 | for item in generated:
121 | parsed.append(pandalm.parse_pandalm_response(item))
122 | self.pandalm_results_parsed[(candidate1, candidate2)] = parsed
123 | del pandalm
124 | gc.collect()
125 | torch.cuda.empty_cache()
126 | if self.output_data_path:
127 | try:
128 |                 with open(self.output_data_path, "w") as f:
129 |                     json.dump({str(k): v for k, v in self.pandalm_results.items()}, f)  # tuple keys -> str for JSON
130 |             except OSError:
131 |                 logging.error(f"Failed to output at: {self.output_data_path}")
132 | return self.pandalm_results_parsed
133 |
134 | def evaluate(self):
135 | self.seed_everything(self.seed)
136 | self.collect_all_candidates()
137 | parsed_results = self.pandalm_inference()
138 | return parsed_results
139 |
--------------------------------------------------------------------------------
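The nested `i`/`j` loop in `pandalm_inference` above compares every unordered pair of candidates exactly once, i.e. k·(k-1)/2 PandaLM runs for k candidates. The same pairing can be expressed with `itertools.combinations` (the helper name here is illustrative, not part of the repo):

```python
import itertools

def candidate_pairs(candidate_paths):
    # Every unordered pair (i < j), matching the double loop in
    # EvaluationPipeline.pandalm_inference.
    return list(itertools.combinations(candidate_paths, 2))

pairs = candidate_pairs(["llama-7b", "bloom-7b", "opt-6.7b"])
print(len(pairs))  # 3 candidates -> 3 pairwise comparisons
```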
/pandalm/utils/pandalm_inference.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import torch
3 | import transformers
4 | from transformers import GenerationConfig, AutoModelForCausalLM, AutoTokenizer
5 | import json
6 | import sys
7 | import logging
8 | from typing import Union, Dict
9 | from tqdm import tqdm
10 | import re, random
11 |
14 |
15 | def seed_everything(seed):
16 | torch.manual_seed(seed)
17 | torch.cuda.manual_seed_all(seed)
18 | random.seed(seed)
19 |
20 |
21 | class PandaLMBatchInferenceProvider(object):
22 | """
23 | Evaluate batch responses with PandaLM
24 | """
25 |
26 | def __init__(self, model_path) -> None:
27 | super().__init__()
28 | try:
29 | tokenizer = AutoTokenizer.from_pretrained(model_path)
30 |         except Exception:  # fall back to the slow tokenizer
31 | tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
32 | model = AutoModelForCausalLM.from_pretrained(
33 | model_path,
34 | load_in_8bit=False,
35 | torch_dtype=torch.bfloat16,
36 | device_map="auto",
37 | )
38 | if tokenizer.pad_token is None:
39 | self.smart_tokenizer_and_embedding_resize(
40 | special_tokens_dict=dict(pad_token="[PAD]"),
41 | tokenizer=tokenizer,
42 | model=model,
43 | )
44 | tokenizer.add_special_tokens(
45 | {
46 |                 "eos_token": "</s>",
47 |                 "bos_token": "<s>",
48 |                 "unk_token": "<unk>",
49 | }
50 | )
51 | self.tokenizer = tokenizer
52 |
53 | model.config.pad_token_id = self.tokenizer.pad_token_id = 0 # unk
54 | model.config.bos_token_id = 1
55 | model.config.eos_token_id = 2
56 | model.eval()
57 |
58 | if torch.__version__ >= "2" and sys.platform != "win32":
59 | model = torch.compile(model)
60 |
61 | self.model = model
62 | self.prepared = []
63 | self.pattern = re.compile(
64 |             r"<unk>|<s>|</s>|<pad>|\[PAD\]|<\|endoftext\|>|\[UNK\]|\[CLS\]|\[MASK\]|<\|startofpiece\|>|<\|endofpiece\|>|\[gMASK\]|\[sMASK\]"
65 | )
66 |
67 | def build_pandalm_prompt(
68 | self, instruction, input, resp1, resp2, result=None, explain=None, ref=None
69 | ):
70 | resp1 = self.pattern.sub("", resp1.strip()).strip()
71 | resp2 = self.pattern.sub("", resp2.strip()).strip()
72 | rsp = f"### Response 1:\n{resp1}\n\n### Response 2:\n{resp2}"
73 | if input:
74 | input_sequence = f"Below are two responses for a given task. The task is defined by the Instruction with an Input that provides further context. Evaluate the responses and generate a reference answer for the task.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n{rsp}\n\n### Evaluation:\n"
75 | else:
76 | input_sequence = f"Below are two responses for a given task. The task is defined by the Instruction. Evaluate the responses and generate a reference answer for the task.\n\n### Instruction:\n{instruction}\n\n{rsp}\n\n### Evaluation:\n"
77 | if result:
78 | output_sequence = (
79 | f"{result}\n\n### Reason: {explain}\n\n### Reference: {ref}\n"
80 | )
81 | return input_sequence, output_sequence
82 | else:
83 | return input_sequence
84 |
85 | def parse_pandalm_response(self, text):
86 | sp = text.strip().split("\n")
87 | if sp[0] in ["1", "2"]:
88 | return int(sp[0])
89 | elif sp[0].lower() == "tie":
90 | return 0
91 | else:
92 |             return 0  # unparseable output is treated as a tie
93 |
94 | def smart_tokenizer_and_embedding_resize(
95 | self,
96 | special_tokens_dict: Dict,
97 | tokenizer: transformers.PreTrainedTokenizer,
98 | model: transformers.PreTrainedModel,
99 | ):
100 | """Resize tokenizer and embedding.
101 |
102 | Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
103 | """
104 | num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
105 | model.resize_token_embeddings(len(tokenizer))
106 |
107 | if num_new_tokens > 0:
108 | input_embeddings = model.get_input_embeddings().weight.data
109 | output_embeddings = model.get_output_embeddings().weight.data
110 |
111 | input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
112 | dim=0, keepdim=True
113 | )
114 | output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
115 | dim=0, keepdim=True
116 | )
117 |
118 | input_embeddings[-num_new_tokens:] = input_embeddings_avg
119 | output_embeddings[-num_new_tokens:] = output_embeddings_avg
120 |
121 | def preprocess_input(self, instruction, input, response1, response2):
122 | prompt = self.build_pandalm_prompt(instruction, input, response1, response2)
123 | self.prepared.append(self.tokenizer(prompt, return_tensors="pt", padding=True))
124 |
125 | def postprocess_output(self, text):
126 | text = text.strip().split("### Evaluation:")[1].strip()
127 |         text = self.pattern.sub("", text.strip()).strip()
128 | return text
129 |
130 | def filter_special_token(self, text):
131 | return self.pattern.sub("", text.strip()).strip()
132 |
133 | def inference(
134 | self,
135 | temperature=0,
136 | top_p=1,
137 | top_k=1,
138 | num_beams=4,
139 | max_new_tokens=512,
140 | repetition_penalty=1.2,
141 | ):
142 | generated = []
143 |
144 | for idx in tqdm(range(len(self.prepared))):
145 | inputs = self.prepared[idx]
146 | input_ids = inputs["input_ids"].to(self.model.device)
147 | generation_config = GenerationConfig(
148 | temperature=temperature,
149 | top_p=top_p,
150 | top_k=top_k,
151 | num_beams=num_beams,
152 | early_stopping=True,
153 | repetition_penalty=repetition_penalty,
154 | )
155 | with torch.no_grad():
156 | generation_output = self.model.generate(
157 | input_ids=input_ids,
158 | generation_config=generation_config,
159 | return_dict_in_generate=True,
160 | output_scores=True,
161 | max_new_tokens=max_new_tokens,
162 | )
163 |
164 | for j in range(len(generation_output.sequences)):
165 | s = generation_output.sequences[j]
166 | output = self.tokenizer.decode(s)
167 | resp = self.postprocess_output(output)
168 | resp = self.filter_special_token(resp)
169 | generated.append(resp)
170 |
171 | return generated
172 |
173 |
174 | if __name__ == "__main__":
175 | logging.basicConfig(
176 | format="%(asctime)s %(levelname)s %(message)s", level=logging.INFO
177 | )
178 |
179 | parser = argparse.ArgumentParser(description="PandaLM batch inference script")
180 | parser.add_argument("-s", "--seed", type=int, default=2023)
181 | parser.add_argument("-m", "--model_name", default="/ssdwork/yzh/7b-upload/")
182 | parser.add_argument(
183 | "-i",
184 | "--input_path",
185 | default="/home/yzh/llm/PandaLM/data/compare-sanity-check.json",
186 | )
187 | parser.add_argument("-o", "--output_path", default=None)
188 |
189 | args = parser.parse_args()
190 |
191 | logging.info(args)
192 |
193 | seed_everything(args.seed)
194 |
195 | logging.info(f"Loading model from {args.model_name}")
196 | handler = PandaLMBatchInferenceProvider(
197 | model_path=args.model_name,
198 | )
199 | with open(args.input_path) as f:
200 | input_data = json.load(f)
201 |
202 | results = []
203 | for item in tqdm(input_data):
204 |     handler.preprocess_input(
205 | instruction=item["instruction"],
206 | input=item["input"],
207 | response1=item["response1"],
208 | response2=item["response2"],
209 | )
210 | generated = handler.inference()
211 | for idx, item in enumerate(input_data):
212 | results.append([item, generated[idx]])
213 |
214 | if args.output_path:
215 | with open(args.output_path, "w") as f:
216 | json.dump(results, f)
217 | else:
218 | print(results)
219 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | accelerate==0.18.0
2 | bitsandbytes==0.38.1
3 | datasets==2.12.0
4 | fire==0.5.0
5 | gradio==3.28.1
6 | matplotlib==3.7.1
7 | numpy==1.24.3
8 | pandas==2.0.1
9 | peft==0.2.0
10 | scikit-learn==1.2.2
11 | sentencepiece==0.1.98
12 | tokenizers==0.13.3
13 | torch==2.0.0
14 | torchaudio==2.0.0
15 | torchvision==0.15.0
16 | tqdm==4.65.0
17 | transformers==4.28.1
18 | wandb==0.15.0
--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
1 | from pandalm import EvaluationPipeline
2 | pipeline = EvaluationPipeline(
3 | candidate_paths=["./llama-7b-tuned/", "./opt-7b-tuned", "some-other-model.json"],
4 | input_data_path="data/pipeline-sanity-check.json",
5 | )
6 | print(pipeline.evaluate())
--------------------------------------------------------------------------------
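`EvaluationPipeline.evaluate()` returns a dict mapping each `(candidate1, candidate2)` pair to a list of per-instance verdicts, where `1` means response 1 won, `2` means response 2 won, and `0` is a tie (see `parse_pandalm_response`). A minimal sketch of aggregating one verdict list into win/tie counts (the helper name is illustrative, not part of the repo):

```python
from collections import Counter

def summarize_verdicts(verdicts):
    """Count wins and ties in a list of PandaLM verdicts (0 = tie, 1/2 = winner)."""
    counts = Counter(verdicts)
    return {"cand1_wins": counts[1], "cand2_wins": counts[2], "ties": counts[0]}

print(summarize_verdicts([1, 2, 1, 0, 1]))
# {'cand1_wins': 3, 'cand2_wins': 1, 'ties': 1}
```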