├── .gitignore
├── LICENSE
├── Pipfile
├── Pipfile.lock
├── README.md
├── config
    ├── config_default.yml
    ├── config_diff
    │   ├── config_batch_classification.yml
    │   ├── config_generation.yml
    │   └── config_ranking.yml
    └── llm_env.yml
├── dataset
    └── base_dataset.py
├── docs
    ├── AutoPrompt_Diagram.png
    ├── arch_overview.png
    ├── architecture.md
    ├── argilla_movie_spoilers_example.png
    ├── autoprompt_recording.gif
    ├── contributing.md
    ├── examples.md
    ├── how-it-works.md
    └── installation.md
├── environment_dev.yml
├── estimator
    ├── __init__.py
    ├── estimator_argilla.py
    ├── estimator_llm.py
    └── estimator_llm_batch.py
├── eval
    ├── eval_utils.py
    └── evaluator.py
├── optimization_pipeline.py
├── prompts
    ├── meta_prompts_classification
    │   ├── error_analysis.prompt
    │   ├── initial.prompt
    │   ├── initial_verbose.prompt
    │   ├── output_schemes.py
    │   ├── step_prompt.prompt
    │   ├── step_prompt_verbose.prompt
    │   └── step_samples.prompt
    ├── meta_prompts_completion
    │   ├── error_analysis.prompt
    │   ├── initial.prompt
    │   ├── output_schemes.py
    │   ├── step_prompt.prompt
    │   └── step_samples.prompt
    ├── meta_prompts_generation
    │   ├── error_analysis.prompt
    │   ├── initial.prompt
    │   ├── output_schemes.py
    │   ├── step_prompt.prompt
    │   └── step_samples.prompt
    ├── meta_prompts_ranking
    │   ├── error_analysis.prompt
    │   ├── initial.prompt
    │   ├── initial_verbose.prompt
    │   ├── output_schemes.py
    │   ├── step_prompt.prompt
    │   ├── step_prompt_verbose.prompt
    │   └── step_samples.prompt
    ├── modifiers
    │   ├── modifiers.yml
    │   ├── ranker_prompt_mod.prompt
    │   └── ranker_task_desc_mod.prompt
    ├── predictor
    │   ├── output_schemes.py
    │   └── prediction.prompt
    └── predictor_completion
    │   ├── output_schemes.py
    │   ├── prediction.prompt
    │   ├── prediction_generation.prompt
    │   └── prediction_verbose.prompt
├── requirements.txt
├── run_generation_pipeline.py
├── run_pipeline.py
└── utils
    ├── config.py
    ├── dedup.py
    └── llm_chain.py


/.gitignore:
--------------------------------------------------------------------------------
1 | /venv
2 | __pycache__/
3 | *.log
4 | /wandb
5 | .idea/
6 | dump/
7 | 
8 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Pipfile:
--------------------------------------------------------------------------------
 1 | [[source]]
 2 | url = "https://pypi.org/simple"
 3 | verify_ssl = true
 4 | name = "pypi"
 5 | 
 6 | [packages]
 7 | openai = "*"
 8 | langchain = "*"
 9 | pandas = "*"
10 | wandb = "*"
11 | transformers = "*"
12 | tqdm = "*"
13 | faiss-cpu = "*"
14 | sentence-transformers = "*"
15 | prodict = "*"
16 | schedule = "*"
17 | easydict = "*"
18 | argilla = "*"
19 | langchain-google-genai = "*"
20 | 
21 | [dev-packages]
22 | 
23 | [requires]
24 | python_version = "3.10"
25 | python_full_version = "3.10.13"
26 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | <p align="center">
  2 |     <!-- community badges -->
  3 |     <a href="https://discord.gg/G2rSbAf8uP"><img src="https://img.shields.io/badge/Join-Discord-blue.svg"/></a>
  4 |     <!-- license badge -->
  5 |     <a href="https://github.com/Eladlev/AutoPrompt/blob/main/LICENSE">
  6 |         <img alt="License" src="https://img.shields.io/badge/License-Apache_2.0-blue.svg"></a>
  7 | </p>
  8 | 
  9 | # 📝 AutoPrompt
 10 | 
 11 | 
 12 | <!-- MARKDOWN LINKS & IMAGES -->
 13 | <!-- https://www.markdownguide.org/basic-syntax/#reference-style-links -->
 14 | 
 15 | **Auto Prompt is a prompt optimization framework designed to enhance and perfect your prompts for real-world use cases.**
 16 | 
 17 | The framework automatically generates high-quality, detailed prompts tailored to user intentions. It employs a refinement (calibration) process, where it iteratively builds a dataset of challenging edge cases and optimizes the prompt accordingly. This approach not only reduces manual effort in prompt engineering but also effectively addresses common issues such as prompt [sensitivity](https://arxiv.org/abs/2307.09009) and inherent prompt [ambiguity](https://arxiv.org/abs/2311.04205).
 18 | 
 19 | 
 20 | **Our mission:** Empower users to produce high-quality robust prompts using the power of large language models (LLMs).
 21 | 
 22 | # Why Auto Prompt?
 23 | - **Prompt Engineering Challenges.** The quality of LLMs greatly depends on the prompts used. Even [minor changes](#prompt-sensitivity-example) can significantly affect their performance. 
 24 | - **Benchmarking Challenges.**  Creating a benchmark for production-grade prompts is often labour-intensive and time-consuming.
 25 | - **Reliable Prompts.** Auto Prompt generates robust high-quality prompts, offering measured accuracy and performance enhancement using minimal data and annotation steps.
 26 | - **Modularity and Adaptability.** With modularity at its core, Auto Prompt integrates seamlessly with popular open-source tools such as LangChain, Wandb, and Argilla, and can be adapted for a variety of tasks, including data synthesis and prompt migration.
 27 | 
 28 | ## System Overview
 29 | 
 30 | ![System Overview](./docs/AutoPrompt_Diagram.png)
 31 | 
 32 | The system is designed for real-world scenarios, such as moderation tasks, which are often  challenged by imbalanced data distributions. The system implements the [Intent-based Prompt Calibration](https://arxiv.org/abs/2402.03099) method. The process begins with a user-provided initial prompt and task description, optionally including user examples. The refinement process iteratively generates diverse samples, annotates them via user/LLM, and evaluates prompt performance, after which an LLM suggests an improved prompt.  
 33 | 
 34 | The optimization process can be extended to content generation tasks by first devising a ranker prompt and then performing the prompt optimization with this learned ranker. The optimization concludes upon reaching the budget or iteration limit.  
 35 | 
 36 | 
 37 | This joint synthetic data generation and prompt optimization approach outperforms traditional methods while requiring minimal data and iterations. Learn more in our paper
 38 | [Intent-based Prompt Calibration: Enhancing prompt optimization with synthetic boundary cases](https://arxiv.org/abs/2402.03099) by E. Levi et al. (2024).
 39 | 
 40 | 
 41 | **Using GPT-4 Turbo, this optimization typically completes in just a few minutes at a cost of under $1.** To manage costs associated with GPT-4 LLM's token usage, the framework enables users to set a budget limit for optimization, in USD or token count, configured as illustrated [here](docs/examples.md#steps-to-run-example).
 42 | 
 43 | ## Demo
 44 | 
 45 | ![pipeline_recording](./docs/autoprompt_recording.gif)
 46 | 
 47 | 
 48 | ## 📖 Documentation
 49 |  - [How to install](docs/installation.md) (Setup instructions)
 50 |  - [Prompt optimization examples](docs/examples.md) (Use cases: movie review classification, generation, and chat moderation)
 51 |  - [How it works](docs/how-it-works.md) (Explanation of pipelines)
 52 |  - [Architecture guide](docs/architecture.md) (Overview of main components)
 53 | 
 54 | ## Features
 55 | - 📝 Boosts prompt quality with a minimal amount of data and annotation steps.
 56 | - 🛬 Designed for production use cases like moderation, multi-label classification, and content generation.
 57 | - ⚙️ Enables seamless migration of prompts across model versions or LLM providers.
 58 | - 🎓 Supports prompt squeezing. Combine multiple rules into a single efficient prompt.
 59 | 
 60 | 
 61 | ## QuickStart
 62 | AutoPrompt requires `python <= 3.10`
 63 | <br />
 64 | 
 65 | > **Step 1** - Download the project
 66 | 
 67 | ```bash
 68 | git clone git@github.com:Eladlev/AutoPrompt.git
 69 | cd AutoPrompt
 70 | ```
 71 | 
 72 | <br />
 73 | 
 74 | > **Step 2** - Install dependencies
 75 | 
 76 | Use either Conda or pip, depending on your preference. Using Conda:
 77 | ```bash
 78 | conda env create -f environment_dev.yml
 79 | conda activate AutoPrompt
 80 | ```
 81 | 
 82 | Using pip: 
 83 | ```bash
 84 | pip install -r requirements.txt
 85 | ```
 86 | 
 87 | Using pipenv:
 88 | ```bash
 89 | pip install pipenv
 90 | pipenv sync
 91 | ```
 92 | 
 93 | <br />
 94 | 
 95 | > **Step 3** - Configure your LLM. 
 96 | 
 97 | Set your OpenAI API key  by updating the configuration file `config/llm_env.yml`
 98 | - If you need help locating your API key, visit this [link](https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key).
 99 | 
100 | - We recommend using [OpenAI's GPT-4](https://platform.openai.com/docs/guides/gpt) for the LLM. Our framework also supports other providers and open-source models, as discussed [here](docs/installation.md#configure-your-llm).
101 | 
102 | <br />
103 | 
104 | > **Step 4** - Configure your Annotator
105 | - Select an annotation approach for your project:
106 |     -  We recommend beginning with a human-in-the-loop method, utilizing [Argilla](https://docs.v1.argilla.io/en/v1.11.0). Note that AutoPrompt is compatible with **Argilla V1**, not with the latest V2. Follow the [Argilla setup instructions](https://docs.v1.argilla.io/en/v1.11.0/getting_started/quickstart_installation.html), with the following modifications:
107 |         -  If you are using a local Docker setup, use the `v1.29.0` tag instead of the `latest` tag.
108 |         -  For a quick setup using HF, duplicate the following [space](https://huggingface.co/spaces/Eladlev/test4)
109 |     -  Alternatively, you can set up an LLM as your annotator by following these [configuration steps](docs/installation.md#configure-llm-annotator).
110 | 
111 | - The default predictor LLM, GPT-3.5, for estimating prompt performance, is configured in the `predictor` section of `config/config_default.yml`.
112 | 
113 | - Define your budget in the input config YAML file using the `max_usage` parameter. For OpenAI models, `max_usage` sets the maximum spend in USD. For other LLMs, it limits the maximum token count.
114 | 
115 | <br />
116 | 
117 | 
118 | > **Step 5** - Run the pipeline
119 | 
120 | First, configure your labels by editing `config/config_default.yml`
121 | ```
122 | dataset:
123 |     label_schema: ["Yes", "No"]
124 | ```
125 | 
126 | 
127 | For a **classification pipeline**, use the following command from your terminal within the appropriate working directory: 
128 | ```bash
129 | python run_pipeline.py
130 | ```
131 | If the initial prompt and task description are not provided directly as input, you will be guided to provide these details.  Alternatively, specify them as command-line arguments:
132 | ```bash
133 | python run_pipeline.py \
134 |     --prompt "Does this movie review contain a spoiler? answer Yes or No" \
135 |     --task_description "Assistant is an expert classifier that will classify a movie review, and let the user know if it contains a spoiler for the reviewed movie or not." \
136 |     --num_steps 30
137 | ```
138 | You can track the optimization progress using the [W&B](https://wandb.ai/site) dashboard, with setup instructions available  [here](docs/installation.md#monitoring-weights-and-biases-setup). 
139 | 
140 | 
141 | If you are using pipenv, be sure to activate the environment:
142 | ``` bash
143 | pipenv shell
144 | python run_pipeline.py  
145 | ```
146 | or alternatively prefix your command with `pipenv run`:
147 | ```bash
148 | pipenv run python run_pipeline.py 
149 | ```
150 | 
151 | #### Generation pipeline
152 | To run the generation pipeline, use the following example command:
153 | ```bash
154 | python run_generation_pipeline.py \
155 |     --prompt "Write a good and comprehensive movie review about a specific movie." \
156 |     --task_description "Assistant is a large language model that is tasked with writing movie reviews."
157 | ```
158 | For more information, refer to our [generation task example](docs/examples.md#generating-movie-reviews-generation-task). 
159 | 
160 | <br />
161 | 
162 | Enjoy the results. Completion of these steps yields a **refined (calibrated)
163 | prompt** tailored for your task, alongside a **benchmark** featuring challenging samples,
164 | stored in the default `dump` path.
165 | 
166 | 
167 | 
168 | ## Tips
169 | 
170 | - Prompt accuracy may fluctuate during the optimization. To identify the best prompts, we recommend continuous refinement following the initial generation of the benchmark.  Set the number of optimization iterations with `--num_steps` and control sample generation by specifying `max_samples` in the `dataset` section. For instance, setting `max_samples: 50` and `--num_steps 30` limits the benchmark to 50 samples, allowing for 25 additional refinement iterations, assuming 10 samples per iteration.
171 | 
172 | - The framework supports checkpoints for easy resumption of optimization from the last saved state. It automatically saves the most recent optimization state in a `dump` path. Use `--output_dump` to set this path and `--load_path` to resume from a checkpoint.
173 | - The iterations include multiple calls to the LLM service, with long prompts and requests for a relatively large number of generated tokens. This might take some time, roughly 1 minute (especially in the generative tasks), so please be patient.
174 | - If you encounter Argilla server connection issues or errors, try restarting the space.
175 | <!-- 
176 | Meanwhile, the num_initialize_samples and num_generated_samples fields within the meta_prompts section specify the counts for initial and per iteration sample generation, respectively. -->
177 | 
178 | 
179 | ## Prompt Sensitivity Example
180 | You write a prompt for identifying movie spoilers:
181 | ```
182 | Review the content provided and indicate whether it includes any significant plot revelations or critical points that could reveal important elements of the story or its outcome. Respond with "Yes" if it contains such spoilers or critical insights, and "No" if it refrains from unveiling key story elements.
183 | ```
184 | This prompt scores 81 on your [benchmark](docs/examples.md#filtering-movie-reviews-with-spoilers-classification-task)  using GPT-4 LLM. Then, you make a minor modification:
185 | ```
186 | Review the text and determine if it provides essential revelations or critical details about the story that would constitute a spoiler. Respond with "Yes" for the presence of spoilers, and "No" for their absence.
187 | ```
188 | Surprisingly, the second prompt scores 72, a drop of 9 points (about 11% in relative terms) in accuracy. This illustrates the need for a careful prompt engineering process.
189 | 
190 | ## 🚀 Contributing
191 | 
192 | Your contributions are greatly appreciated! If you're eager to contribute, kindly refer to our [Contributing Guidelines](docs/contributing.md) for detailed information.
193 | 
194 | <!-- For an insight into our future plans, visit our Project Roadmap. -->
195 | If you wish to be a part of our journey, we invite you to connect with us through our [Discord Community](https://discord.gg/G2rSbAf8uP). We're excited to have you onboard! 
196 | 
197 | ## 🛡 Disclaimer
198 | 
199 | The AutoPrompt project is provided on an "as-is" basis without any guarantees or warranties, expressed or implied. 
200 | 
201 | Our perspective on the optimization and usage of prompts:
202 | 
203 | 1. The core objective of AutoPrompt is to refine and perfect prompts to achieve high-quality results. This is achieved through an iterative calibration process, which helps in reducing errors and enhancing the performance of LLMs. However, the framework does not guarantee absolute correctness or unbiased results in every instance.
204 | 
205 | 2. AutoPrompt aims to improve the reliability of prompts and mitigate sensitivity issues, but it does not claim to completely eliminate such issues. 
206 | <!-- Our community is committed to exploring the most effective ways to interact with LLMs, fostering a space for diverse views and approaches. -->
207 | 
208 | Please note that using LLMs like OpenAI's GPT-4, supported by AutoPrompt, may lead to significant costs due to token usage. By using AutoPrompt, you acknowledge your responsibility to monitor and manage your token use and expenses. We advise regularly reviewing your LLM provider's API usage and establishing limits or alerts to prevent unexpected charges.
209 | To manage costs associated with GPT-4 LLM's token usage, the framework enables users to set a budget limit for optimization, in USD or token count, configured as illustrated [here](docs/examples.md#steps-to-run-example).
210 | 
211 | ## Citation
212 | 
213 | If you have used our code in your research, please cite our [paper](https://arxiv.org/abs/2402.03099):
214 | 
215 | ```
216 | @misc{2402.03099,
217 | Author = {Elad Levi and Eli Brosh and Matan Friedmann},
218 | Title = {Intent-based Prompt Calibration: Enhancing prompt optimization with synthetic boundary cases},
219 | Year = {2024},
220 | Eprint = {arXiv:2402.03099},
221 | }
222 | ```
223 | 
224 | 
225 | ## License
226 | 
227 | This framework is licensed under the [Apache License, Version 2.0](http://www.apache.org/licenses/LICENSE-2.0).
228 | 
229 | ## ✉️ Support / Contact us
230 | - [Community Discord](https://discord.gg/G2rSbAf8uP)
231 | - Our email: [autopromptai@gmail.com](mailto:autopromptai@gmail.com)
232 | 
233 | 
234 | 


--------------------------------------------------------------------------------
/config/config_default.yml:
--------------------------------------------------------------------------------
 1 | use_wandb: False
 2 | dataset:
 3 |     name: 'dataset'
 4 |     records_path: null
 5 |     initial_dataset: ''
 6 |     label_schema: ["Yes", "No"]
 7 |     max_samples: 50
  8 |     semantic_sampling: False # Change to True if you are not using an M1 machine; there is currently an issue with faiss on M1
 9 | 
10 | annotator:
11 |     method : 'argilla'
12 |     config:
13 |         api_url: ''
14 |         api_key: 'admin.apikey'
15 |         workspace: 'admin'
16 |         time_interval: 5
17 | 
18 | predictor:
19 |     method : 'llm'
20 |     config:
21 |         llm:
22 |             type: 'OpenAI'
23 |             name: 'gpt-3.5-turbo-1106'
24 | #            async_params:
25 | #                retry_interval: 10
26 | #                max_retries: 2
27 |             model_kwargs: {"seed": 220}
28 |         num_workers: 5
29 |         prompt: 'prompts/predictor_completion/prediction.prompt'
 30 |         mini_batch_size: 1  # Change to >1 if you want to include multiple samples in one prompt
31 |         mode: 'prediction'
32 | 
33 | meta_prompts:
34 |     folder: 'prompts/meta_prompts_classification'
35 |     num_err_prompt: 1  # Number of error examples per sample in the prompt generation
36 |     num_err_samples: 2 # Number of error examples per sample in the sample generation
 37 |     history_length: 4 # Number of samples in the meta-prompt history
38 |     num_generated_samples: 10 # Number of generated samples at each iteration
39 |     num_initialize_samples: 10 # Number of generated samples at iteration 0, in zero-shot case
40 |     samples_generation_batch: 10 # Number of samples generated in one call to the LLM
41 |     num_workers: 5 #Number of parallel workers
42 |     warmup: 4 # Number of warmup steps
43 | 
44 | eval:
45 |     function_name: 'accuracy'
46 |     num_large_errors: 4
47 |     num_boundary_predictions : 0
48 |     error_threshold: 0.5
49 | 
50 | llm:
51 |     name: 'gpt-4-1106-preview' # This is the meta-prompt LLM, it should be a strong model. For example, using GPT-3.5 will cause an error in many cases.
52 |     type: 'OpenAI' # Can be OpenAI, Anthropic, Google, Azure
53 |     temperature: 0.8
54 | 
55 | stop_criteria:
56 |     max_usage: 2 #In $ in case of OpenAI models, otherwise number of tokens
57 |     patience: 10 # Number of patience steps
58 |     min_delta: 0.01 # Delta for the improvement definition
59 | 


--------------------------------------------------------------------------------
/config/config_diff/config_batch_classification.yml:
--------------------------------------------------------------------------------
 1 | use_wandb: True
 2 | dataset:
 3 |     label_schema: ["Yes", "No"]
 4 | 
 5 | annotator:
 6 |     method : 'llm_batch'
 7 |     config:
 8 |         instructions: ['Is there is an address in the text?', 'Is there is a phone number in the text?',
 9 |         'Is there is a password in the text?']
10 |         aggregation_mode: 'exist'  #'majority_vote',  'exist', or 'all'. exist/all is working only in case label_schema: ["Yes", "No"]!
11 |         estimator_config:
12 |             num_workers: 2
13 |             prompt: 'prompts/predictor/prediction.prompt'
14 |             mode: 'annotation'


--------------------------------------------------------------------------------
/config/config_diff/config_generation.yml:
--------------------------------------------------------------------------------
 1 | annotator:
 2 |     method : ''
 3 | 
 4 | dataset:
 5 |     max_samples: 20
 6 |     label_schema:  ["1","2","3","4","5"]
 7 | 
 8 | predictor:
 9 |     method : 'llm'
10 |     config:
11 |         prompt: 'prompts/predictor_completion/prediction_generation.prompt'
12 |         mini_batch_size: 1
13 |         llm:
14 |             type: 'OpenAI'
15 |             name: 'gpt-4-1106-preview' #'gpt-3.5-turbo-1106'
16 |         num_workers: 7
17 | 
18 | meta_prompts:
19 |     folder: 'prompts/meta_prompts_generation'
20 |     warmup: 1
21 | 
22 | eval:
23 |     function_name: 'ranking'
24 |     error_threshold: 4
25 | 
26 | 


--------------------------------------------------------------------------------
/config/config_diff/config_ranking.yml:
--------------------------------------------------------------------------------
1 | dataset:
2 |     label_schema:  ["1","2","3","4","5"]
3 | 
4 | meta_prompts:
5 |     folder: 'prompts/meta_prompts_ranking'


--------------------------------------------------------------------------------
/config/llm_env.yml:
--------------------------------------------------------------------------------
 1 | anthropic:
 2 |   ANTHROPIC_API_KEY: ''
 3 | openai:
 4 |   OPENAI_API_KEY: ''
 5 |   OPENAI_API_BASE: ''
 6 |   OPENAI_ORGANIZATION: ''
 7 | 
 8 | azure:
 9 |   AZURE_OPENAI_API_KEY: ''
10 |   AZURE_OPENAI_ENDPOINT: ''
11 |   OPENAI_API_VERSION: ''
12 | 
13 | google:
14 |   GOOGLE_API_KEY: ''


--------------------------------------------------------------------------------
/dataset/base_dataset.py:
--------------------------------------------------------------------------------
  1 | import os.path
  2 | import logging
  3 | import pandas as pd
  4 | from pathlib import Path
  5 | from datetime import datetime
  6 | import csv
  7 | 
  8 | from utils.dedup import Dedup
  9 | 
 10 | class DatasetBase:
 11 |     """
 12 |     This class store and manage all the dataset records (including the annotations and prediction)
 13 |     """
 14 | 
 15 |     def __init__(self, config):
 16 |         if config.records_path is None:
 17 |             self.records = pd.DataFrame(columns=['id', 'text', 'prediction',
 18 |                                                  'annotation', 'metadata', 'score', 'batch_id'])
 19 |         else:
 20 |             self.records = pd.read_csv(config.records_path)
 21 |         dt_string = datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
 22 | 
 23 |         self.name = config.name + '__' + dt_string
 24 |         self.label_schema = config.label_schema
 25 |         self.dedup = Dedup(config)
 26 |         self.sample_size = config.get("sample_size", 3)
 27 |         self.semantic_sampling = config.get("semantic_sampling", False)
 28 |         if not config.get('dedup_new_samples', False):
 29 |             self.remove_duplicates = self._null_remove
 30 | 
 31 |     def __len__(self):
 32 |         """
 33 |         Return the number of samples in the dataset.
 34 |         """
 35 |         return len(self.records)
 36 | 
 37 |     def __getitem__(self, batch_idx):
 38 |         """
 39 |         Return the batch idx.
 40 |         """
 41 |         extract_records = self.records[self.records['batch_id'] == batch_idx]
 42 |         extract_records = extract_records.reset_index(drop=True)
 43 |         return extract_records
 44 | 
 45 |     def get_leq(self, batch_idx):
 46 |         """
 47 |         Return all the records up to batch_idx (includes).
 48 |         """
 49 |         extract_records = self.records[self.records['batch_id'] <= batch_idx]
 50 |         extract_records = extract_records.reset_index(drop=True)
 51 |         return extract_records
 52 | 
 53 |     def add(self, sample_list: dict = None, batch_id: int = None, records: pd.DataFrame = None):
 54 |         """
 55 |         Add records to the dataset.
 56 |         :param sample_list: The samples to add in a dict structure (only used in case record=None)
 57 |         :param batch_id: The batch_id for the upload records (only used in case record= None)
 58 |         :param records: dataframes, update using pandas
 59 |         """
 60 |         if records is None:
 61 |             records = pd.DataFrame([{'id': len(self.records) + i, 'text': sample, 'batch_id': batch_id} for
 62 |                        i, sample in enumerate(sample_list)])
 63 |         self.records = pd.concat([self.records, records], ignore_index=True)
 64 | 
 65 |     def update(self, records: pd.DataFrame):
 66 |         """
 67 |         Update records in dataset.
 68 |         """
 69 |         # Ignore if records is empty
 70 |         if len(records) == 0:
 71 |             return
 72 | 
 73 |         # Set 'id' as the index for both DataFrames
 74 |         records.set_index('id', inplace=True)
 75 |         self.records.set_index('id', inplace=True)
 76 | 
 77 |         # Update using 'id' as the key
 78 |         self.records.update(records)
 79 | 
 80 |         # Remove null annotations
 81 |         if len(self.records.loc[self.records["annotation"]=="Discarded"]) > 0:
 82 |             discarded_annotation_records = self.records.loc[self.records["annotation"]=="Discarded"]
 83 |             #TODO: direct `discarded_annotation_records` to another dataset to be used later for corner-cases
 84 |             self.records = self.records.loc[self.records["annotation"]!="Discarded"]
 85 | 
 86 |         # Reset index
 87 |         self.records.reset_index(inplace=True)
 88 | 
 89 |     def modify(self, index: int, record: dict):
 90 |         """
 91 |         Modify a record in the dataset.
 92 |         """
 93 |         self.records[index] = record
 94 | 
 95 |     def apply(self, function, column_name: str):
 96 |         """
 97 |         Apply function on each record.
 98 |         """
 99 |         self.records[column_name] = self.records.apply(function, axis=1)
100 | 
101 |     def save_dataset(self, path: Path):
102 |         self.records.to_csv(path, index=False, quoting=csv.QUOTE_NONNUMERIC)
103 | 
104 |     def load_dataset(self, path: Path):
105 |         """
106 |         Loading dataset
107 |         :param path: path for the csv
108 |         """
109 |         if os.path.isfile(path):
110 |             self.records = pd.read_csv(path, dtype={'annotation': str, 'prediction': str, 'batch_id': int})
111 |         else:
112 |             logging.warning('Dataset dump not found, initializing from zero')
113 | 
114 |     def remove_duplicates(self, samples: list) -> list:
115 |         """
116 |         Remove (soft) duplicates from the given samples
117 |         :param samples: The samples
118 |         :return: The samples without duplicates
119 |         """
120 |         dd = self.dedup.copy()
121 |         df = pd.DataFrame(samples, columns=['text'])
122 |         df_dedup = dd.sample(df, operation_function=min)
123 |         return df_dedup['text'].tolist()
124 | 
125 |     def _null_remove(self, samples: list) -> list:
126 |         # Identity function that returns the input unmodified
127 |         return samples
128 | 
129 |     def sample_records(self, n: int = None) -> pd.DataFrame:
130 |         """
131 |         Return a sample of the records after semantic clustering
132 |         :param n: The number of samples to return
133 |         :return: A sample of the records
134 |         """
135 |         n = n or self.sample_size
136 |         if self.semantic_sampling:
137 |             dd = self.dedup.copy()
138 |             df_samples = dd.sample(self.records).head(n)
139 | 
140 |             if len(df_samples) < n:
141 |                 df_samples = self.records.head(n)
142 |         else:
143 |             df_samples = self.records.sample(n)
144 |         return df_samples
145 | 
146 |     @staticmethod
147 |     def samples_to_text(records: pd.DataFrame) -> str:
148 |         """
149 |         Return a string that organize the samples for a meta-prompt
150 |         :param records: The samples for the step
151 |         :return: A string that contains the organized samples
152 |         """
153 |         txt_res = '##\n'
154 |         for i, row in records.iterrows():
155 |             txt_res += f"Sample:\n {row.text}\n#\n"
156 |         return txt_res
157 | 
158 | 
159 | 


--------------------------------------------------------------------------------
/docs/AutoPrompt_Diagram.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Eladlev/AutoPrompt/a55a31e6fd4238ca3d58159b106baa1fd924ff66/docs/AutoPrompt_Diagram.png


--------------------------------------------------------------------------------
/docs/arch_overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Eladlev/AutoPrompt/a55a31e6fd4238ca3d58159b106baa1fd924ff66/docs/arch_overview.png


--------------------------------------------------------------------------------
/docs/architecture.md:
--------------------------------------------------------------------------------
 1 | # Architecture Guide
 2 | <img src="./arch_overview.png" alt="Architecture overview" width="70%">
 3 | 
 4 | This document outlines the system design of AutoPrompt, which is built around four primary components: Dataset, Estimator, Evaluator, and Optimizer Manager. These components collaborate to refine prompts through an iterative process involving sample generation, annotation, prediction, evaluation of scores, and optimization.
 5 | 
 6 | * __Dataset.__ This component manages the dataset and performs operations on the dataset rows, such as insertion, modification, deletion, and applying functions. The component also handles data cleaning by removing semantic duplications and performing semantic sampling. Since the system is optimized for small datasets, the current implementation is based on a local database using [pandas](https://pandas.pydata.org).
 7 | * __Estimator.__ The estimator is responsible for estimating a batch of samples. We implement this component in two forms, once for the predictions and once for the annotations. Such a generic implementation (for both use cases) allows for easy adaptation of the system to diverse use cases, including  prompt calibration, prompt distillation and prompt squashing. The currently supported types of estimators are:
 8 |     1. __Human annotation__: Using [Argilla UI](https://docs.argilla.io/en/latest/index.html#). The system is connected to the Argilla server and is waiting until the annotation task is completed.
 9 |     2. __LLM estimator__: Using an LLM to estimate the sample given a prompt. We support various types of LLMs, using [Langchain](https://python.langchain.com/docs/get_started/introduction) integration. For efficiency, the system supports parallelism using both workers and async calls. The system also supports sending a few samples in one prompt (prompt batching), which can reduce the cost significantly.
10 |     3. __Batch estimator__: The batch estimator runs multiple LLM estimators and adds a policy layer to aggregate the results. It is mainly used for prompt-squashing, aiming to optimize a single prompt that achieves the efficacy of multiple prompts. For example, in case of a user with several moderation rules.
11 | * __Evaluator.__ The evaluator is responsible for evaluating the records after the prediction and annotation stage. The evaluator accepts a function and applies it to each row. It's important to note that the function is generic, for example in the generation pipeline the function is performed by invoking an LLM. The evaluator is also responsible for defining the errors and handling the error analysis using the Analyzer meta-prompt.
12 | * __Optimizer manager (Optimization Pipeline).__ The optimizer manager handles the whole optimization process flow. It performs the iteration steps described in the system flow [documentation](how-it-works.md) and is responsible for stopping and returning the final calibrated prompt. The currently supported stop criteria are either convergence (determined by a patience hyper-parameter), or usage limit (determined by maximal cost if relevant, or by the number of generated tokens).
13 | 
14 | ## Design Considerations
15 | 
16 | - **Modularity and Flexibility**: Each component is designed with modularity in mind, allowing for easy swaps or upgrades to accommodate different use cases.
17 | - **Scalability**: The framework's architecture supports scaling, from handling small datasets efficiently to accommodating the computational demands of parallel processing and batch estimation.
18 | - **Cost-Efficiency**: Features like prompt batching and the use of a batch estimator are specifically included to manage and minimize operational costs associated with LLM usage.
19 | 


--------------------------------------------------------------------------------
/docs/argilla_movie_spoilers_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Eladlev/AutoPrompt/a55a31e6fd4238ca3d58159b106baa1fd924ff66/docs/argilla_movie_spoilers_example.png


--------------------------------------------------------------------------------
/docs/autoprompt_recording.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Eladlev/AutoPrompt/a55a31e6fd4238ca3d58159b106baa1fd924ff66/docs/autoprompt_recording.gif


--------------------------------------------------------------------------------
/docs/contributing.md:
--------------------------------------------------------------------------------
 1 | # Contributing to AutoPrompt
 2 | 
 3 | Thank you for considering contributing to AutoPrompt! We deeply appreciate your interest in improving our project.
 4 | 
 5 | ## Bug Fixes and Documentation Enhancements
 6 | 
 7 | Bug fixes and documentation improvements, including compelling examples and use cases, greatly benefit our project. If you encounter any bugs or identify areas where the documentation could be strengthened, please do not hesitate to submit a pull request (PR) containing your proposed changes.
 8 | 
 9 | ## Feature Requests
10 | 
11 | For significant feature additions, we encourage you to open an issue on GitHub. Additionally, we invite you to join our Discord community and engage in discussions about the feature in the #features-requests channel. This collaborative environment enables us to delve deeper into the proposed features and foster meaningful dialogue.
12 | 
13 | We value your contributions and look forward to working together to enhance AutoPrompt!
14 | 


--------------------------------------------------------------------------------
/docs/examples.md:
--------------------------------------------------------------------------------
  1 | 
  2 | # Prompt Optimization Examples
  3 | 
  4 | This document provides practical examples of using the AutoPrompt pipeline across various scenarios. It focuses on movie review and chat moderation tasks to demonstrate the flexibility and effectiveness of the AutoPrompt framework.
  5 | 
  6 | 
  7 | 1. [Filtering Movie Reviews with Spoilers (Classification task)](#filtering-movie-reviews-with-spoilers-classification-task)
  8 | 2. [Movie Genre Identification (Multi-label classification task)](#movie-genre-identification-multi-label-classification)
  9 | 3. [Rating Movie Reviews (Scoring task)](#rating-movie-reviews-scoring-task)
 10 | 4. [Generating Movie Reviews (Generation task)](#generating-movie-reviews-generation-task)
 11 | 5. [Single Topic Moderation](#single-topic-moderation)
 12 | 6. [Multi-Topic Moderation (Prompt squeezing task)](#multi-topic-moderation-prompt-squeezing-task)
 13 | 
 14 | ### Filtering Movie Reviews with Spoilers (Classification task)
 15 | 
 16 | In this binary classification example, we aim to filter out movie reviews containing spoilers for a specific movie. A correctly implemented filter can be a powerful tool in a large-scale movie review system.
 17 | 
 18 | We'll start with a simple initial prompt and task description: 
 19 |  - Initial prompt: “Does this movie review contain a spoiler? answer Yes or No”<br>
 20 |  - Task description: “Assistant is an expert classifier that will classify a movie review, and let the user know if it contains a spoiler for the reviewed movie or not.”
 21 | 
 22 | #### Steps to Run Example
 23 | 
 24 | 1. Configure your labels by editing `config/config_default.yml`. Modify the `label_schema` in the `dataset` section to include only 'Yes' and 'No' options.
 25 | 
 26 | ```
 27 | dataset:
 28 |     name: 'dataset'
 29 |     records_path: null
 30 |     initial_dataset: 'dump/dataset.csv'
 31 |     label_schema: ["Yes", "No"]
 32 |     max_samples: 50
 33 | ```
 34 | 2. Run the main pipeline from an IDE or the command line
 35 | ```bash
 36 | > python run_pipeline.py
 37 | ```
 38 | 
 39 | *Note*: Without input parameters, the pipeline prompts the user to provide them. Alternatively, specify initial prompt and task description as command-line arguments:
 40 | ```bash
 41 | > python run_pipeline.py \
 42 |     --prompt "Does this movie review contain a spoiler? answer Yes or No" \
 43 |     --task_description "Assistant is an expert classifier that will classify a movie review, and let the user know if it contains a spoiler for the reviewed movie or not."
 44 | ```
 45 | 
 46 | 3. A browser window displaying the Argilla workspace will open for manual annotations
 47 | ![argilla_example](./argilla_movie_spoilers_example.png)
 48 | 
 49 | Annotate the generated examples as they appear and monitor the pipeline's progress. Control the number of optimization iterations with the `num_steps` parameter, specified at start:
 50 | ```bash
 51 | > python run_pipeline.py --num_steps 30
 52 | ```
 53 | The pipeline concludes after reaching `num_steps` or meeting one of the predefined stop criteria, defined in `config/config_default.yml`:
 54 | ```
 55 | stop_criteria:
 56 |     max_usage: 0.5  # Max budget for optimization (USD for OpenAI's LLM model)
 57 |     patience: 3     # Number of iterations to wait for improvement
 58 |     min_delta: 0.05 # Minimum improvement between iterations
 59 | ```
 60 | Note that the framework also supports using an LLM as the annotator, see setup instructions [here](installation.md#configure-llm-annotator).
 61 | 
 62 | 4. After completion, the pipeline outputs a **refined (calibrated) prompt** tailored for the task and a reference **benchmark** with challenging samples. In this example, the final spoiler identification prompt might be:
 63 | 
 64 | ```
 65 | Review Spoiler Identification Protocol: For the task of classifying IMDB reviews for
 66 | the presence of spoilers, the classifier must label reviews with a heightened sensitivity to
 67 | nuanced language and indirect spoiler cues. The classification labels are ’Yes’ for spoilers
 68 | and ’No’ for non-spoilers. Apply the following criteria rigorously: Label ’Yes’ if a review: -
 69 | Contains subtle references or nuanced language that hints at plot developments or character
 70 | arcs, without explicit detail. - Includes emotional responses or descriptive language that
 71 | indirectly reveals plot outcomes or twists. - Employs suggestive language that points to future
 72 | events or endings, even if it does not reveal specific information. Label ’No’ if a review: -
 73 | Discusses technical aspects, acting, direction, or personal viewer impressions in a manner
 74 | that does not hint at or reveal any plot details. - Comments on thematic elements, genre
 75 | characteristics, or storytelling techniques without disclosing or implying crucial plot twists.
 76 | ```
 77 | 
 78 | - The framework automatically saves the benchmark, run log, and a checkpoint file (which stores the state of the optimization, enabling seamless continuation from a previous run) in a default `dump` path, adjustable with the `--output_dump` command line argument.
 79 | - Note that the steps above are relevant to all classification and generation tasks. See the following examples for more use cases. 
 80 | 
 81 | 5. Until now, we've initiated the pipeline with just an initial prompt and task description. However, you can also include a few examples by specifying an initial dataset in the `initial_dataset` field within the `dataset` section of the `config/config_default.yml` file. For example:
 82 | ```
 83 | dataset:
 84 |     initial_dataset: 'dump/dataset.csv'
 85 | ```
 86 | An example of an initial dataset with two samples is shown below:
 87 | ```
 88 | id,text,prediction,annotation,metadata,score,batch_id
 89 | 0,"The cinematography was mesmerizing, especially during the scene where they finally reveal the mysterious room that captivated the main character.",No,Yes,,,0
 90 | 1,"The director's bold choice to leave the world's fate unclear until the final frame will spark audience discussions.",No,Yes,,,0
 91 | ```
 92 | 
 93 | 
 94 | ### Movie Genre Identification (Multi-label classification):
 95 | 
 96 | In this example, we want to segment movie reviews into pre-defined genres. The initial prompt and task description might look like this: 
 97 |  - Initial prompt: "Based on the following movie review, what genre is this movie? Select between Action, Comedy, Drama, Romance or Horror."
 98 |  - Task description: "Assistant is an expert cinema critic for all genres, and is tasked with classifying other movie reviews."
 99 | 
100 | #### Run Example
101 | For this multi-label classification, update the `label_schema` in `config/config_default.yml`
102 | ```
103 | dataset:
104 |     label_schema: ["Action", "Comedy", "Drama", "Romance", "Horror"]
105 | ```
106 | And then simply run the pipeline with the corresponding input parameters:
107 | ```bash
108 | > python run_pipeline.py \
109 |     --prompt "Based on the following movie review, what genre is this movie? Select between Action, Comedy, Drama, Romance or Horror." \
110 |     --task_description "Assistant is an expert cinema critic for all genres, and is tasked with classifying other movie reviews."
111 | ```
112 | Please follow the same annotation and monitoring procedures as shown in the previous example.
113 | 
114 | ### Rating Movie Reviews (Scoring task):
115 | In this example, we aim to score (rank) the movie reviews based on various criteria, assigning a numerical rating to each.
116 | 
117 | We'll start with a simple initial prompt: 
118 |  - Initial prompt: "How well is this movie review written? Give it a score between 1 and 5, with 1 being the lowest score."
119 |  - Task description: "Assistant is an expert cinema reviewer and editor, and is tasked with scoring other movie reviews."
120 | 
121 | Note that although this task involves scoring, it is treated as a classification task, similar to the examples above.
122 | 
123 | #### Run Example
124 | To run this task, update the `label_schema` in the input `config/config_default.yml` config file:
125 | ```
126 | dataset:
127 |     label_schema: ["1", "2", "3", "4", "5"]
128 | ```
129 | And then simply use the input parameters to run the pipeline:
130 | ```bash
131 | > python run_pipeline.py \
132 |     --prompt "How well is this movie review written? Give it a score between 1 and 5, with 1 being the lowest score." \
133 |     --task_description "Assistant is an expert cinema reviewer and editor, and is tasked with scoring other movie reviews."
134 | ```
135 | Follow the same steps as in the simple classification example for running the pipeline and annotating through the Argilla UI.
136 | 
137 | ### Generating Movie Reviews (Generation task):
138 | Here, we aim to generate good (insightful and comprehensive) movie reviews from scratch. The initial prompt might look something like this: 
139 |  - Initial prompt: “Write a good and comprehensive movie review about a specific movie.”
140 |  - Task description: “Assistant is a large language model that is tasked with writing movie reviews.”
141 | 
142 | This time, we'll need to use the `run_generation_pipeline.py` to initiate a generation run. This pipeline is different from but builds on the classification pipeline in our earlier examples.
143 | 
144 | The generation pipeline starts by taking the initial prompt and modifying it for a scoring task, similar to the scoring example above. Once it establishes a robust estimator for high-quality content, in this instance movie reviews, it runs the generation pipeline without the need for human annotation. 
145 | 
146 | To facilitate this, two distinct input config files are employed: `config/config_diff/config_ranking.yml`, and `config/config_diff/config_generation.yml`.
147 | 
148 | Note that the `annotator` section in the generation config yaml file remains empty: 
149 | ```
150 | annotator:
151 |     method : ''
152 | ```
153 | 
154 | #### Run Example
155 | 
156 | Run the generation pipeline with appropriate arguments: 
157 | ```bash
158 | > python run_generation_pipeline.py \
159 |     --prompt "Write a good and comprehensive movie review about a specific movie." \
160 |     --task_description "Assistant is a large language model that is tasked with writing movie reviews."
161 | ```
162 | 
163 | As the pipeline runs, the user will be prompted to annotate ranking examples of movie reviews. The final output will be a calibrated prompt for the generation task.
164 | 
165 | ### Single Topic Moderation:
166 | 
167 | In this example, we aim to monitor user interactions on an Enterprise's chat platform to moderate (filter out) any unsolicited advertisements. This ensures a focused and relevant communication environment.
168 | 
169 | The initial prompt could be as follows:
170 | 
171 | - Initial prompt: “Assess whether the message contains advertising. Answer 'Yes' or 'No'.”
172 |  - Task description: “As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets.”
173 | 
174 | #### Run Example
175 | For the moderation, update the `label_schema` in `config/config_default.yml`
176 | ```
177 | dataset:
178 |     label_schema: ["Yes", "No"]
179 | ```
180 | And then execute the pipeline with the specified input parameters:
181 | ```bash
182 | > python run_pipeline.py \
183 |     --prompt "Assess whether the message contains advertising. Answer 'Yes' or 'No'." \
184 |     --task_description "As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets."
185 | ```
186 | Please follow the same annotation and monitoring procedures as shown in the previous examples.
187 | 
188 | ### Multi Topic Moderation (Prompt squeezing task):
189 | In this example, our goal is to monitor user interactions on an enterprise's chat platform and moderate (filter out) any problematic topics, including disclosing personal information, deceptive practices, spam, illegal activities, conflict of interest, and off-topic content.
190 | 
191 | The initial prompt could be structured as follows:
192 | 
193 | - Initial prompt: “Does this message contain any problematic topics such as disclosing personal information, deceptive practices, spam, illegal activities, conflict of interest, or off-topic content? Respond with 'Yes' or 'No'.”
194 |  - Task description: “As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets.”
195 | 
196 | 
197 | #### Run Example
198 | In a multi-topic moderation setting, we use various moderation rules to annotate a sample. Each rule is evaluated independently, and the outcomes are combined to generate the final labels. We employ an LLM annotator to avoid time-intensive manual annotation.
199 | 
200 | This task utilizes two distinct input configuration files: `config/config_default.yml`, used previously, and `config/config_diff/config_batch_classification.yml`, which specifies the individual moderation rules, the policy for aggregating results, and LLM configuration. The available aggregation policies are 'exist', 'majority', and 'all'. The 'exist' and 'all' policies are suited for scenarios with 'Yes' or 'No' labels, while the 'majority' policy assigns the final label based on the most frequently occurring outcome across the rules.
201 | 
202 | In our case, it can look like this:
203 | ```
204 | dataset:
205 |     label_schema: ["Yes", "No"]
206 | 
207 | annotator:
208 |     method : 'llm_batch'
209 |     config:
210 |         instructions: 
211 |         ['Does the message disclose sensitive personal information? Answer Yes or No', 
212 |          'Does the message involve deceptive practices? Answer Yes or No',
213 |          'Is this message an example of spam? Answer Yes or No',
214 |          'Does the message reference or promote any illegal activities? Answer Yes or No',
215 |          'Does the message come from someone with a potential conflict of interest? Answer Yes or No',
216 |          'Is this message completely irrelevant to the services or products offered? Answer Yes or No'
217 |          ]
218 |         aggregation_mode: 'exist'  #'majority',  'exist', or 'all'. exist/all is working only in case label_schema: ["Yes", "No"]!
219 |         estimator_config:
220 |             num_workers: 2
221 |             prompt: 'prompts/predictor/prediction.prompt'
222 |             mode: 'annotation'
223 |             mini_batch_size: 1
224 |             llm:
225 |               type: 'OpenAI'
226 |               name: 'gpt-4-1106-preview'
227 | ```
228 | 
229 | Also, update the `label_schema` in `config/config_default.yml`
230 | ```
231 | dataset:
232 |     label_schema: ["Yes", "No"]
233 | ```
234 | 
235 | #### Run Example
236 | As before, we'll use the `run_pipeline.py` to initiate a multi-topic moderation run.
237 | ```bash
238 | > python run_pipeline.py \
239 |     --batch_config_path "config/config_diff/config_batch_classification.yml" \
240 |     --prompt "Assess whether the message contains any of the following problematic topics:  disclosing personal information, deceptive practices, spam, illegal activities, conflict of interest, off-topic content. Answer 'Yes' if it does or 'No' otherwise." \
241 |     --task_description "As a moderation expert at FabricFantasia, an online store selling clothes, you meticulously review customer inquiries and support tickets."
242 | ```
243 | Please follow the same annotation and monitoring procedures as shown in the previous examples.
244 | 


--------------------------------------------------------------------------------
/docs/how-it-works.md:
--------------------------------------------------------------------------------
 1 | 
 2 | # How AutoPrompt works
 3 | 
 4 | This document outlines the optimization process flows of AutoPrompt. The framework is designed with modularity and adaptability in mind, allowing for easy extension of the prompt calibration process from classification tasks to generative tasks. 
 5 | 
 6 | 
 7 | ##   Classification Pipeline Overview 
 8 | 
 9 | The classification pipeline executes a calibration process involving the following steps:
10 | 
11 | 1. **User Input:**
12 |    - The user provides an initial prompt and task description to kickstart the calibration process.
13 | 
14 | 2. **Challenging Examples:**
15 |    - A set of challenging examples is proposed to the user to enhance the model's performance.
16 | 
17 | 3. **Annotation:**
18 |    - The provided examples are annotated, utilizing either a human-in-the-loop approach or leveraging Language Model (LLM) capabilities.
19 | 
20 | 4. **Prediction:**
21 |    - The annotated samples are evaluated using the current prompt to assess model performance.
22 | 
23 | 5. **Prompt Analysis:**
24 |    - The pipeline analyzes the prompt scores and identifies instances of large errors.
25 | 
26 | 6. **Prompt Refinement:**
27 |    - A new prompt is suggested based on the evaluation results, aiming to improve model accuracy.
28 | 
29 | 7. **Iteration:**
30 |    - Steps 2-6 are iteratively repeated until convergence, refining the prompt and enhancing the model's performance throughout the process. 
31 | 
32 | 
33 | ## Generation Pipeline Overview 
34 | 
35 | The generation pipeline shares a common structure with the classification flow but introduces a modification step for generation prompts. The process unfolds as follows:
36 | 
37 | 1. **User Input:**
38 |    - The user provides an initial prompt and task description for the generation process.
39 | 
40 | 2. **Prompt Modification (LLM):**
41 |    - The initial prompt is transformed into a classification-compatible input using a Language Model (LLM), creating an intermediary task for boolean classification or ranking.
42 | 
43 | 3. **Annotation (Classification):**
44 |    - Challenging examples are annotated for boolean classification or ranking based on the modified prompts. This step is analogous to the classification flow.
45 | 
46 | 4. **Ranker Calibration (LLM):**
47 |    - Utilizing the annotated examples, a ranking prompt (implemented as an LLM estimator) is fitted.
48 | 
49 | 5. **Calibration (Generation):**
50 |    - The original generation prompt is calibrated using the ranking LLM estimator (now used for evaluation), resulting in enhanced prompt formulations for generation tasks.
51 |    
52 | 
53 | 
54 | The modular architecture of the pipeline demonstrates the flexibility of the core calibration process and effectiveness for both classification and generation tasks. The additional step in the generation flow seamlessly integrates with the overall iterative prompt calibration approach.
55 | 
56 | 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/docs/installation.md:
--------------------------------------------------------------------------------
 1 | # Installation
 2 | 
 3 | This guide provides detailed instructions for setting up your development environment, configuring LLMs, and integrating various tools necessary for your project.
 4 | 
 5 | ## Python version
 6 | We recommend using Python 3.10.13.
 7 | 
 8 | ## Install with Conda
 9 | We recommend installing using Conda:
10 | ```bash
11 | conda env create -f environment_dev.yml
12 | conda activate AutoPrompt
13 | ```
14 | 
15 | ## Install with pip
16 | Install using pip directly:
17 | ```bash
18 | pip install -r requirements.txt
19 | ```
20 | 
21 | ## Install with pipenv
22 | Install using pipenv:
23 | ```bash
24 | pip install pipenv
25 | pipenv sync
26 | ```
27 | 
28 | ### Configure your LLM
29 | 
30 | Set your OpenAI API key in the configuration file `config/llm_env.yml`. For assistance locating your API key, visit this [link](https://help.openai.com/en/articles/4936850-where-do-i-find-my-api-key).
31 | 
32 | - For LLM, we recommend using [OpenAI's GPT-4](https://platform.openai.com/docs/guides/gpt). Alternatively, configure Azure by setting llm type in `config/config_default.yml` to `"Azure"` and specifying the key in `config/llm_env.yml`. Our system also supports various LLMs, including open source models, through [Langchain Pipeline](https://python.langchain.com/docs/integrations/llms/huggingface_pipelines). Change the llm `type` to `"HuggingFacePipeline"` and specify the model ID in the llm `name` field.  
33 | 
34 | - **Configure your Predictor**.  We employ a predictor to estimate prompt performance. The default predictor LLM is GPT-3.5. Configuration is located in the `predictor` section of `config/config_default.yml`.
35 | 
36 | ### Configure Human-in-the-Loop Annotator 
37 | 
38 | Our pipeline incorporates a human-in-the-loop annotation process using [Argilla](https://docs.argilla.io/en/latest/index.html). Follow these steps to set it up:
39 | 
40 | 1. **Set Up Argilla Server and UI:** Follow the [instructions](https://docs.argilla.io/en/latest/getting_started/quickstart_installation.html) to install and set up an Argilla server and user interface.
41 | 
42 | 2. **Quick Installation Option:** For a faster setup, we recommend deploying Argilla on a Hugging Face [space](https://huggingface.co/new-space?template=argilla/argilla-template-space).
43 | 
44 | 3. **Configure API Settings:** After setting up the server, modify the `api_url` and `api_key` in the `config/config_default.yml` file. For instance, if using the recommended Hugging Face space, your API URL should be formatted as follows: `api_url: 'https://<your-argilla-space-name>.hf.space'`.
45 | 
46 | 
47 | ### Configure LLM Annotator 
48 | 
49 | To specify an LLM as the annotation tool in your pipeline, update the `annotator` section in the `config/config_default.yml` file as follows:
50 | 
51 | ``` 
52 | annotator:
53 |     method: 'llm'
54 |     config:
55 |         llm:
56 |             type: 'OpenAI'
57 |             name: 'gpt-4-1106-preview'
58 |         instruction:
59 |             'Assess whether the text contains a harmful topic. 
60 |             Answer Yes if it does and No otherwise.'
61 |         num_workers: 5
62 |         prompt: 'prompts/predictor_completion/prediction.prompt'
63 |         mini_batch_size: 1
64 |         mode: 'annotation'
65 | ```
66 | We recommend using a robust LLM, like GPT-4, for annotation purposes. In the `instruction` field, you specify the task instructions for the annotation. The `mini_batch_size` field determines the number of samples processed in a single annotation pass, allowing you to balance efficiency with LLM token usage.
67 | 
68 | 
69 | ### Monitoring: Weights and Biases Setup
70 | 
71 | To effectively track your optimization process, including metrics like score, prompt instances, and error analysis across iterations, we recommend using [Weights and Biases](https://wandb.ai/site).
72 | 
73 | 1. **Sign Up for Weights and Biases:** Visit their [website](https://wandb.ai/site) and follow the instructions to create an account.
74 | 
75 | 2. **Enable wandb in Your Configuration:** In your project's `config/config_default.yml` file, set `use_wandb` to `True` to activate wandb support.


--------------------------------------------------------------------------------
/environment_dev.yml:
--------------------------------------------------------------------------------
 1 | name: AutoPrompt
 2 | 
 3 | channels:
 4 |   - conda-forge
 5 | dependencies:
 6 |   - python=3.10.13
 7 |   - pip>=2.22.0
 8 |   - openai
 9 |   - anthropic
10 |   - langchain
11 |   - pandas
12 |   - wandb
13 |   - transformers
14 |   - tqdm
15 |   - faiss-cpu
16 |   - sentence-transformers
17 |   - pip:
18 |       - prodict
19 |       - argilla==1.25.0
20 |       - schedule
21 |       - pandas
22 |       - easydict
23 |       - pillow==10.2.0


--------------------------------------------------------------------------------
/estimator/__init__.py:
--------------------------------------------------------------------------------
 1 | import pandas as pd
 2 | 
 3 | from .estimator_argilla import ArgillaEstimator
 4 | from .estimator_llm import LLMEstimator
 5 | from .estimator_llm_batch import LLMBatchEstimator
 6 | from dataset.base_dataset import DatasetBase
 7 | 
 8 | 
 9 | class DummyEstimator:
10 |     """
11 |     A dummy callback for the Estimator class.
12 |     This is a method to handle an empty estimator.
13 |     """
14 | 
15 |     @staticmethod
16 |     def calc_usage():
17 |         """
18 |         Dummy function to calculate the usage of the dummy estimator
19 |         """
20 |         return 0
21 | 
22 |     @staticmethod
23 |     def apply(dataset: DatasetBase, batch_id: int):
24 |         """
25 |         Dummy function to mimic the apply method, returns an empty dataframe
26 |         """
27 |         return pd.DataFrame()
28 | 
def give_estimator(opt):
    """
    Build the estimator selected by the configuration
    :param opt: Configuration object; opt.method selects the estimator type and opt.config is
                passed to the chosen estimator's constructor
    :return: An ArgillaEstimator ('argilla'), LLMEstimator ('llm') or LLMBatchEstimator ('llm_batch');
             a DummyEstimator for any other method value
    """
    method = opt.method
    if method == 'argilla':
        return ArgillaEstimator(opt.config)
    if method == 'llm':
        return LLMEstimator(opt.config)
    if method == 'llm_batch':
        return LLMBatchEstimator(opt.config)
    # Unrecognized method: fall back to the no-op estimator
    return DummyEstimator()
38 | 


--------------------------------------------------------------------------------
/estimator/estimator_argilla.py:
--------------------------------------------------------------------------------
  1 | import argilla as rg
  2 | import time
  3 | import pandas as pd
  4 | from argilla.client.singleton import active_client
  5 | from utils.config import Color
  6 | from dataset.base_dataset import DatasetBase
  7 | import json
  8 | import webbrowser
  9 | import base64
 10 | 
 11 | class ArgillaEstimator:
 12 |     """
 13 |     The ArgillaEstimator class is responsible to generate the GT for the dataset by using Argilla interface.
 14 |     In particular using the text classification mode.
 15 |     """
 16 |     def __init__(self, opt):
 17 |         """
 18 |         Initialize a new instance of the ArgillaEstimator class.
 19 |         """
 20 |         try:
 21 |             self.opt = opt
 22 |             rg.init(
 23 |                 api_url=opt.api_url,
 24 |                 api_key=opt.api_key,
 25 |                 workspace=opt.workspace
 26 |             )
 27 |             self.time_interval = opt.time_interval
 28 |         except:
 29 |             raise Exception("Failed to connect to argilla, check connection details")
 30 | 
 31 |     @staticmethod
 32 |     def initialize_dataset(dataset_name: str, label_schema: set[str]):
 33 |         """
 34 |         Initialize a new dataset in the Argilla system
 35 |         :param dataset_name: The name of the dataset
 36 |         :param label_schema: The list of classes
 37 |         """
 38 |         try:
 39 |             settings = rg.TextClassificationSettings(label_schema=label_schema)
 40 |             rg.configure_dataset_settings(name=dataset_name, settings=settings)
 41 |         except:
 42 |             raise Exception("Failed to create dataset")
 43 | 
 44 |     @staticmethod
 45 |     def upload_missing_records(dataset_name: str, batch_id: int, batch_records: pd.DataFrame):
 46 |         """
 47 |         Update the Argilla dataset by adding missing records from batch_id that appears in batch_records
 48 |         :param dataset_name: The dataset name
 49 |         :param batch_id: The batch id
 50 |         :param batch_records: A dataframe of the batch records
 51 |         """
 52 |         #TODO: sort visualization according to batch_id descending
 53 |         query = "metadata.batch_id:{}".format(batch_id)
 54 |         result = rg.load(name=dataset_name, query=query)
 55 |         df = result.to_pandas()
 56 |         if len(df) == len(batch_records):
 57 |             return
 58 |         if df.empty:
 59 |             upload_df = batch_records
 60 |         else:
 61 |             merged_df = pd.merge(batch_records, df['text'], on='text', how='left', indicator=True)
 62 |             upload_df = merged_df[merged_df['_merge'] == 'left_only'].drop(columns=['_merge'])
 63 |         record_list = []
 64 |         for index, row in upload_df.iterrows():
 65 |             config = {'text': row['text'], 'metadata': {"batch_id": row['batch_id'], 'id': row['id']}, "id": row['id']}
 66 |             # if not (row[['prediction']].isnull().any()):
 67 |             #     config['prediction'] = row['prediction']  # TODO: fix it incorrect type!!!
 68 |             if not(row[['annotation']].isnull().any()):  # TODO: fix it incorrect type!!!
 69 |                 config['annotation'] = row['annotation']
 70 |             record_list.append(rg.TextClassificationRecord(**config))
 71 |         rg.log(records=record_list, name=dataset_name)
 72 | 
 73 |     def calc_usage(self):
 74 |         """
 75 |         Dummy function to calculate the usage of the estimator
 76 |         """
 77 |         return 0
 78 | 
 79 |     def apply(self, dataset: DatasetBase, batch_id: int):
 80 |         """
 81 |         Apply the estimator on the dataset. The function enter to infinite loop until all the records are annotated.
 82 |         Then it update the dataset with all the annotations
 83 |         :param dataset: DatasetBase object, contains all the processed records
 84 |         :param batch_id: The batch id to annotate
 85 |         """
 86 |         current_api = active_client()
 87 |         try:
 88 |             rg_dataset = current_api.datasets.find_by_name(dataset.name)
 89 |         except:
 90 |             self.initialize_dataset(dataset.name, dataset.label_schema)
 91 |             rg_dataset = current_api.datasets.find_by_name(dataset.name)
 92 |         batch_records = dataset[batch_id]
 93 |         if batch_records.empty:
 94 |             return []
 95 |         self.upload_missing_records(dataset.name, batch_id, batch_records)
 96 |         data = {'metadata': {'batch_id': [str(batch_id)]}}
 97 |         json_data = json.dumps(data)
 98 |         encoded_bytes = base64.b64encode(json_data.encode('utf-8'))
 99 |         encoded_string = str(encoded_bytes, "utf-8")
100 |         url_link = self.opt.api_url + '/datasets/' + self.opt.workspace + '/' \
101 |                    + dataset.name + '?query=' + encoded_string
102 |         print(f"{Color.GREEN}Waiting for annotations from batch {batch_id}:\n{url_link}{Color.END}")
103 |         webbrowser.open(url_link)
104 |         while True:
105 |             query = "(status:Validated OR status:Discarded) AND metadata.batch_id:{}".format(batch_id)
106 |             search_results = current_api.search.search_records(
107 |                 name=dataset.name,
108 |                 task=rg_dataset.task,
109 |                 size=0,
110 |                 query_text=query,
111 |             )
112 |             if search_results.total == len(batch_records):
113 |                 result = rg.load(name=dataset.name, query=query)
114 |                 df = result.to_pandas()[['text', 'annotation', 'metadata', 'status']]
115 |                 df["annotation"] = df.apply(lambda x: 'Discarded' if x['status']=='Discarded' else x['annotation'], axis=1)
116 |                 df = df.drop(columns=['status'])
117 |                 df['id'] = df.apply(lambda x: x['metadata']['id'], axis=1)
118 |                 return df
119 |             time.sleep(self.time_interval)
120 | 


--------------------------------------------------------------------------------
/estimator/estimator_llm.py:
--------------------------------------------------------------------------------
 1 | from utils.llm_chain import ChainWrapper, get_chain_metadata
 2 | from pathlib import Path
 3 | from dataset.base_dataset import DatasetBase
 4 | import pandas as pd
 5 | 
 6 | class LLMEstimator:
 7 |     """
 8 |     A wrapper for an estimator using LLM
 9 |     """
10 | 
11 |     def __init__(self, opt):
12 |         """
13 |         Initialize a new instance of the LLMEstimator class.
14 |         :param opt: The configuration file (EasyDict)
15 |         """
16 |         self.opt = opt
17 |         self.chain = None
18 |         self.mini_batch_size = opt.mini_batch_size
19 |         self.mode = opt.mode
20 |         self.num_workers = opt.num_workers
21 |         if 'instruction' in opt.keys():
22 |             self.cur_instruct = opt.instruction
23 |         else:
24 |             self.cur_instruct = None
25 | 
26 |     @staticmethod
27 |     def generate_sample_text(sample_id: int, text: str) -> str:
28 |         """
29 |         Generate a sample text for the chain prompt
30 |         :param sample_id: The sample id
31 |         :param text: The text of the sample
32 |         :return: The sample text for the prompt
33 |         """
34 |         return f"ID: {sample_id};  Sample: {text}\n"
35 | 
36 |     def calc_usage(self) -> float:
37 |         """"
38 |         Calculate the usage of the estimator
39 |         """
40 |         return self.chain.accumulate_usage
41 | 
42 |     def init_chain(self, label_schema: set[str]):
43 |         """
44 |         Initialize the chain
45 |         :param label_schema: The label schema
46 |         """
47 |         chain_metadata = get_chain_metadata(Path(self.opt.prompt), retrieve_module=True)
48 |         if hasattr(chain_metadata['module'], 'update_classification_prediction_schema'):
49 |             chain_metadata['json_schema'] = chain_metadata['module'].update_classification_prediction_schema(
50 |                 chain_metadata['json_schema'],
51 |                 label_schema
52 |             )
53 |         self.chain = ChainWrapper(self.opt.llm, self.opt.prompt, chain_metadata['json_schema'],
54 |                                   chain_metadata['parser_func'])
55 | 
56 |     def apply_dataframe(self, record: pd.DataFrame):
57 |         """
58 |         Apply the estimator on a dataframe
59 |         :param record: The record
60 |         """
61 |         chain_input = ''
62 |         mini_batch_inputs = []
63 |         record[self.mode] = 'Discarded'
64 |         # prepare all the inputs for the chains
65 |         for i, row in record.iterrows():
66 |             chain_input += self.generate_sample_text(i, row['text'])
67 |             if ((i + 1) % self.mini_batch_size) == 0:
68 |                 mini_batch_inputs.append({'batch_size': self.mini_batch_size, 'task_instruction': self.cur_instruct,
69 |                                           'samples': chain_input})
70 |                 chain_input = ''
71 |         if not (chain_input == ''):
72 |             mini_batch_inputs.append({'batch_size': self.mini_batch_size, 'task_instruction': self.cur_instruct,
73 |                                       'samples': chain_input})
74 | 
75 |         all_results = self.chain.batch_invoke(mini_batch_inputs, self.num_workers)
76 |         union_results = [element for sublist in all_results for element in sublist['results']]
77 |         for res in union_results:
78 |             record.loc[res['id'], self.mode] = res['prediction']
79 |         return record
80 | 
81 |     def apply(self, dataset: DatasetBase, idx: int, leq: bool = False):
82 |         """
83 |         Apply the estimator on the batches up to idx (includes), it then updates the annotation field
84 |         if self.mode is 'annotation', otherwise it update the prediction field.
85 |         :param dataset: The dataset
86 |         :param idx: The current batch index
87 |         :param leq: If True, apply on all the batches up to idx (includes), otherwise apply only on idx
88 |         """
89 |         if self.chain is None:
90 |             self.init_chain(dataset.label_schema)
91 |         if leq:
92 |             batch_records = dataset.get_leq(idx)
93 |         else:
94 |             batch_records = dataset[idx]
95 |         return self.apply_dataframe(batch_records)
96 | 


--------------------------------------------------------------------------------
/estimator/estimator_llm_batch.py:
--------------------------------------------------------------------------------
 1 | from estimator.estimator_llm import LLMEstimator
 2 | from dataset.base_dataset import DatasetBase
 3 | import pandas as pd
 4 | 
 5 | 
 6 | class LLMBatchEstimator:
 7 |     """
 8 |     A wrapper for an estimator using aggregation of multiple LLMs estimators
 9 |     """
10 | 
11 |     def __init__(self, opt):
12 |         """
13 |         Initialize a new instance of the LLMEstimator class.
14 |         :param opt: The configuration file (EasyDict)
15 |         """
16 |         self.llm_estimators = [LLMEstimator(opt.estimator_config) for _ in range(len(opt.instructions))]
17 |         for i, estimator in enumerate(self.llm_estimators):
18 |             estimator.cur_instruct = opt.instructions[i]
19 |         self.mode = opt.estimator_config.mode
20 |         self.aggregation_mode = opt.aggregation_mode
21 | 
22 |     def calc_usage(self) -> float:
23 |         """"
24 |         Calculate the usage of the estimator
25 |         """
26 |         return sum([estimator.calc_usage() for estimator in self.llm_estimators])
27 | 
28 |     def get_aggregation_function(self):
29 |         if self.aggregation_mode == 'max':
30 |             return lambda record: max(record)
31 |         elif self.aggregation_mode == 'min':
32 |             return lambda record: min(record)
33 |         elif self.aggregation_mode == 'mean':
34 |             return lambda record: sum(record) / len(record)
35 |         elif self.aggregation_mode == 'median':
36 |             return lambda record: sorted(record)[len(record) // 2]
37 |         elif self.aggregation_mode == 'majority':
38 |             return lambda record: max(set(record), key=record.count)
39 |         elif self.aggregation_mode == 'exist':
40 |             return lambda record: 'Yes' if any([t == 'Yes' for t in record]) else 'No'
41 |         elif self.aggregation_mode == 'all':
42 |             return lambda record: 'Yes' if all([t == 'Yes' for t in record]) else 'No'
43 |         else:
44 |             raise Exception(f'Unknown aggregation class {self.aggregation_mode}')
45 | 
46 |     def apply(self, dataset: DatasetBase, idx: int, leq: bool = False):
47 |         """
48 |         Apply the estimator on the batches up to idx (includes), it then updates the annotation field
49 |         if self.mode is 'annotation', otherwise it update the prediction field.
50 |         :param dataset: The dataset
51 |         :param idx: The current batch index
52 |         :param leq: If True, apply on all the batches up to idx (includes), otherwise apply only on idx
53 |         """
54 |         update_datasets = [estimator.apply(dataset, idx, leq) for estimator in self.llm_estimators]
55 |         res_dataset = update_datasets[0]
56 |         if res_dataset.empty:
57 |             return res_dataset
58 |         for i, df in enumerate(update_datasets[1:]):
59 |             # Merge the dataframes on the 'id' column
60 |             merged_df = pd.merge(res_dataset, df[['id', self.mode]], on='id', how='left', suffixes=('_left', '_right'))
61 |             if i == 0:
62 |                 res_dataset[self.mode] = merged_df.apply(lambda row: [str(row['{}_left'.format(self.mode)])] +
63 |                                                                      [str(row['{}_right'.format(self.mode)])], axis=1)
64 |             else:
65 |                 res_dataset[self.mode] = merged_df.apply(lambda row: row['{}_left'.format(self.mode)] +
66 |                                                                      [str(row['{}_right'.format(self.mode)])], axis=1)
67 |         res_dataset[self.mode] = res_dataset[self.mode].apply(self.get_aggregation_function())
68 |         return res_dataset
69 | 


--------------------------------------------------------------------------------
/eval/eval_utils.py:
--------------------------------------------------------------------------------
 1 | from estimator.estimator_llm import LLMEstimator
 2 | 
 3 | 
 4 | def set_function_from_iterrow(func):
 5 |     def wrapper(dataset):
 6 |         dataset['score'] = dataset.apply(func, axis=1)
 7 |         return dataset
 8 | 
 9 |     return wrapper
10 | 
11 | 
def set_ranking_function(params):
    """
    Build a scoring function that ranks the model predictions using an LLM estimator
    :param params: The estimator configuration; params.label_schema is used to initialize the chain
    :return: A function that gets a dataframe with 'text' and 'prediction' columns,
             fills its 'score' column with the integer rank and returns the same dataframe
    """
    evaluator = LLMEstimator(params)
    evaluator.init_chain(params.label_schema)
    evaluator.mode = 'score'  # the estimator writes its output to the 'score' column

    def wrapper(dataset):
        # Work on a copy: the text sent to the ranker combines the input and the prediction
        generation_dataset = dataset.copy()
        generation_dataset['text'] = '###User input:\n' + generation_dataset['text'] + '\n####model prediction:\n' + generation_dataset['prediction']

        generation_dataset = evaluator.apply_dataframe(generation_dataset)
        # NOTE(review): a sample the estimator leaves as 'Discarded' makes astype(int) raise - confirm
        # Item assignment (not attribute assignment) so the column is created if it does not exist yet
        dataset['score'] = generation_dataset['score'].astype(int)
        return dataset
    return wrapper
25 | 


--------------------------------------------------------------------------------
/eval/evaluator.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | import numpy as np
  3 | from sklearn.metrics import confusion_matrix
  4 | import eval.eval_utils as utils
  5 | 
  6 | class Eval:
  7 |     """
  8 |     The Eval class is responsible to calculate the score and the large errors
  9 |     """
 10 | 
 11 |     def __init__(self, config, analyzer=None, label_schema=None):
 12 |         """
 13 |         Initialize a new instance of the Eval class.
 14 |         :param config: The configuration file (EasyDict)
 15 |         :analyzer (optional): A chain that analyze the errors
 16 |         :label_schema (optional): The label schema
 17 |         """
 18 |         self.score_function_name = config.function_name
 19 |         self.score_func = self.get_eval_function(config)
 20 |         self.num_errors = config.num_large_errors
 21 |         self.error_threshold = config.error_threshold
 22 |         self.dataset = None
 23 |         self.mean_score = None
 24 |         self.label_schema = label_schema
 25 |         self.errors = None
 26 |         self.history = []
 27 |         self.analyzer = analyzer
 28 | 
 29 |     @staticmethod
 30 |     def get_eval_function(config: dict):
 31 |         """
 32 |         Returns the eval function
 33 |         :param config: The eval configuration
 34 |         :return: The function implementation on a record
 35 |         """
 36 |         if config.function_name == 'accuracy':
 37 |             return utils.set_function_from_iterrow(lambda record: record['annotation'] == record['prediction'])
 38 |         elif config.function_name == 'ranking':
 39 |             return utils.set_ranking_function(config.function_params)
 40 |         else:
 41 |             raise NotImplementedError("Eval function not implemented")
 42 | 
 43 |     def eval_score(self) -> float:
 44 |         """
 45 |         Calculate the score on each row and return the mean score.
 46 |         :return: The mean score
 47 |         """
 48 |         # filter out the discarded samples
 49 |         self.dataset = self.dataset[(self.dataset['prediction'] != 'Discarded') &
 50 |                                     (self.dataset['annotation'] != 'Discarded')]
 51 |         self.dataset = self.score_func(self.dataset)
 52 |         self.mean_score = self.dataset['score'].mean()
 53 |         return self.mean_score
 54 | 
 55 |     def get_max_score(self, warmup=0):
 56 |         """
 57 |         Return the maximum 'mean score' (with respect to all history epochs, starting form warmup, up to last) and the epoch index of the maximum score
 58 |         :return: The epoch index of the maximum score, and the maximum score
 59 |         """
 60 |         max_idx = np.argmax([epoch['score'] for epoch in self.history[warmup:-1]])
 61 |         max_idx += warmup
 62 |         return max_idx, self.history[max_idx]['score']
 63 | 
 64 | 
 65 |     def large_error_to_str(self, error_df: pd.DataFrame, num_large_errors_per_label: int) -> str:
 66 |         """
 67 |         Return a string that contains the large errors
 68 |         :param error_df: A dataframe contains all the mislabeled samples
 69 |         :param num_large_errors_per_label: The (maximum) number of large errors per label
 70 |         :return: A string that contains the large errors that is used in the meta-prompt
 71 |         """
 72 |         required_columns = ['annotation', 'text', 'score', 'prediction']
 73 |         label_schema = error_df['annotation'].unique()
 74 |         if self.score_function_name == 'ranker':
 75 |             gt_name = 'Rank:'
 76 |         else:
 77 |             gt_name = 'GT:'
 78 |         error_res_df_list = []
 79 |         txt_res = ''
 80 |         for label in label_schema:
 81 |             cur_df = error_df[error_df['annotation'] == label]
 82 |             cur_df = cur_df.sample(frac=1.0, random_state=42)[:num_large_errors_per_label]
 83 |             error_res_df_list.append(cur_df[required_columns])
 84 |         if len(error_res_df_list) > 0:
 85 |             error_res_df = pd.concat(error_res_df_list, ignore_index=True)
 86 |             error_res_df = error_res_df.sample(frac=1.0, random_state=42)
 87 |             for i, row in error_res_df.iterrows():
 88 |                 txt_res += f"Sample: {row.text}\nPrediction: {row.prediction}, {gt_name}: {row.annotation}\n#\n"
 89 |         return txt_res
 90 | 
 91 |     def sample_to_text(self, sample: dict, num_errors_per_label: int = 0, is_score: bool = True) -> str:
 92 |         """
 93 |         Return a string that organize the information of from the step run for the meta-prompt
 94 |         :param sample: The eval information for specific step
 95 |         :param num_errors_per_label: The max number of large errors per class that will appear in the meta-prompt
 96 |         :param is_score: If True, add the score information to the meta-prompt
 97 |         :return: A string that contains the information of the step run
 98 |         """
 99 |         if is_score:
100 |             return f"####\n##Prompt Score: {sample['score']:.2f}\n##Prompt:\n{sample['prompt']}\n#################\n"
101 |         else:
102 |             return f"####\n##Prompt:\n{sample['prompt']}\n{self.large_error_to_str(sample['errors'], num_errors_per_label)}####\n "
103 | 
104 |     def add_history(self, prompt: str, task_description: str):
105 |         """
106 |         Add the current step information to the history
107 |         :param prompt: The current prompt
108 |         :param task_description: The task description
109 |         """
110 |         conf_matrix = None
111 |         large_error_to_str = self.large_error_to_str(self.errors, self.num_errors)
112 |         prompt_input = {'task_description': task_description, 'accuracy': self.mean_score, 'prompt': prompt,
113 |                                          'failure_cases': large_error_to_str}
114 |         if self.score_function_name == 'accuracy':
115 |             conf_matrix = confusion_matrix(self.dataset['annotation'],
116 |                                            self.dataset['prediction'], labels=self.label_schema)
117 |             conf_text = f"Confusion matrix columns:{self.label_schema} the matrix data:"
118 |             for i, row in enumerate(conf_matrix):
119 |                 conf_text += f"\n{self.label_schema[i]}: {row}"
120 |             prompt_input['confusion_matrix'] = conf_text
121 |         elif self.score_function_name == 'ranking':
122 |             prompt_input['labels'] = self.label_schema
123 |         analysis = self.analyzer.invoke(prompt_input)
124 | 
125 |         self.history.append({'prompt': prompt, 'score': self.mean_score,
126 |                              'errors': self.errors, 'confusion_matrix': conf_matrix, 'analysis': analysis['text']})
127 | 
128 |     def extract_errors(self) -> pd.DataFrame:
129 |         """
130 |         Extract the errors from the dataset
131 |         :return: records that contains the errors
132 |         """
133 |         df = self.dataset
134 |         err_df = df[df['score'] < self.error_threshold]
135 |         err_df.sort_values(by=['score'])
136 |         self.errors = err_df
137 |         return self.errors
138 | 
139 |     def extract_correct(self) -> pd.DataFrame:
140 |         """
141 |         Extract the correct samples from the dataset
142 |         :return: records that contains the correct samples
143 |         """
144 |         df = self.dataset
145 |         return df[df['score'] > self.error_threshold]
146 | 
147 |     def extract_boundary_predictions(self) -> pd.DataFrame:
148 |         """
149 |         Extract boundary samples on which the model is uncertain
150 |         :return: records that contains boundary samples
151 |         """
152 |         pass


--------------------------------------------------------------------------------
/optimization_pipeline.py:
--------------------------------------------------------------------------------
  1 | import pandas as pd
  2 | 
  3 | from eval.evaluator import Eval
  4 | from dataset.base_dataset import DatasetBase
  5 | from utils.llm_chain import MetaChain
  6 | from estimator import give_estimator
  7 | from pathlib import Path
  8 | import pickle
  9 | import os
 10 | import json
 11 | import logging
 12 | import wandb
 13 | 
 14 | 
 15 | class OptimizationPipeline:
 16 |     """
 17 |     The main pipeline for optimization. The pipeline is composed of 4 main components:
 18 |     1. dataset - The dataset handle the data including the annotation and the prediction
 19 |     2. annotator - The annotator is responsible generate the GT
 20 |     3. predictor - The predictor is responsible to generate the prediction
 21 |     4. eval - The eval is responsible to calculate the score and the large errors
 22 |     """
 23 | 
 24 |     def __init__(self, config, task_description: str = None, initial_prompt: str = None, output_path: str = ''):
 25 |         """
 26 |         Initialize a new instance of the ClassName class.
 27 |         :param config: The configuration file (EasyDict)
 28 |         :param task_description: Describe the task that needed to be solved
 29 |         :param initial_prompt: Provide an initial prompt to solve the task
 30 |         :param output_path: The output dir to save dump, by default the dumps are not saved
 31 |         """
 32 | 
 33 |         if config.use_wandb:  # In case of using W&B
 34 |             wandb.login()
 35 |             self.wandb_run = wandb.init(
 36 |                 project="AutoGPT",
 37 |                 config=config,
 38 |             )
 39 |         if output_path == '':
 40 |             self.output_path = None
 41 |         else:
 42 |             if not os.path.isdir(output_path):
 43 |                 os.makedirs(output_path)
 44 |             self.output_path = Path(output_path)
 45 |             logging.basicConfig(filename=self.output_path / 'info.log', level=logging.DEBUG,
 46 |                                 format='%(asctime)s - %(levelname)s - %(message)s', force=True)
 47 | 
 48 |         self.dataset = None
 49 |         self.config = config
 50 |         self.meta_chain = MetaChain(config)
 51 |         self.initialize_dataset()
 52 | 
 53 |         self.task_description = task_description
 54 |         self.cur_prompt = initial_prompt
 55 | 
 56 |         self.predictor = give_estimator(config.predictor)
 57 |         self.annotator = give_estimator(config.annotator)
 58 |         self.eval = Eval(config.eval, self.meta_chain.error_analysis, self.dataset.label_schema)
 59 |         self.batch_id = 0
 60 |         self.patient = 0
 61 | 
 62 |     @staticmethod
 63 |     def log_and_print(message):
 64 |         print(message)
 65 |         logging.info(message)
 66 | 
 67 |     def initialize_dataset(self):
 68 |         """
 69 |         Initialize the dataset: Either empty dataset or loading an existing dataset
 70 |         """
 71 |         logging.info('Initialize dataset')
 72 |         self.dataset = DatasetBase(self.config.dataset)
 73 |         if 'initial_dataset' in self.config.dataset.keys():
 74 |             logging.info(f'Load initial dataset from {self.config.dataset.initial_dataset}')
 75 |             self.dataset.load_dataset(self.config.dataset.initial_dataset)
 76 | 
 77 |     def calc_usage(self):
 78 |         """
 79 |         Calculate the usage of the optimization process (either $ in case of openAI or #tokens the other cases)
 80 |         """
 81 |         total_usage = 0
 82 |         total_usage += self.meta_chain.calc_usage()
 83 |         total_usage += self.annotator.calc_usage()
 84 |         total_usage += self.predictor.calc_usage()
 85 |         return total_usage
 86 | 
 87 |     def extract_best_prompt(self):
 88 |         sorted_history = sorted(
 89 |             self.eval.history[min(self.config.meta_prompts.warmup - 1, len(self.eval.history) - 1):],
 90 |             key=lambda x: x['score'],
 91 |             reverse=False)
 92 |         return {'prompt': sorted_history[-1]['prompt'], 'score': sorted_history[-1]['score']}
 93 | 
 94 |     def run_step_prompt(self):
 95 |         """
 96 |         Run the meta-prompts and get new prompt suggestion, estimated prompt score and a set of challenging samples
 97 |         for the new prompts
 98 |         """
 99 |         step_num = len(self.eval.history)
100 |         if (step_num < self.config.meta_prompts.warmup) or (step_num % 3) > 0:
101 |             last_history = self.eval.history[-self.config.meta_prompts.history_length:]
102 |         else:
103 |             sorted_history = sorted(self.eval.history[self.config.meta_prompts.warmup - 1:], key=lambda x: x['score'],
104 |                                     reverse=False)
105 |             last_history = sorted_history[-self.config.meta_prompts.history_length:]
106 |         history_prompt = '\n'.join([self.eval.sample_to_text(sample,
107 |                                                         num_errors_per_label=self.config.meta_prompts.num_err_prompt,
108 |                                                         is_score=True) for sample in last_history])
109 |         prompt_input = {"history": history_prompt, "task_description": self.task_description,
110 |                         'error_analysis': last_history[-1]['analysis']}
111 |         if 'label_schema' in self.config.dataset.keys():
112 |             prompt_input["labels"] = json.dumps(self.config.dataset.label_schema)
113 |         prompt_suggestion = self.meta_chain.step_prompt_chain.invoke(prompt_input)
114 |         if self.meta_chain.step_prompt_chain.llm_config.type == 'google':
115 |             if isinstance(prompt_suggestion, list) and len(prompt_suggestion) == 1:
116 |                 prompt_suggestion = prompt_suggestion[0]['args']
117 |         self.log_and_print(f'Previous prompt score:\n{self.eval.mean_score}\n#########\n')
118 |         self.log_and_print(f'Get new prompt:\n{prompt_suggestion["prompt"]}')
119 |         self.batch_id += 1
120 |         if len(self.dataset) < self.config.dataset.max_samples:
121 |             batch_input = {"num_samples": self.config.meta_prompts.samples_generation_batch,
122 |                            "task_description": self.task_description,
123 |                            "prompt": prompt_suggestion['prompt']}
124 |             batch_inputs = self.generate_samples_batch(batch_input, self.config.meta_prompts.num_generated_samples,
125 |                                                        self.config.meta_prompts.samples_generation_batch)
126 | 
127 |             if sum([len(t['errors']) for t in last_history]) > 0:
128 |                 history_samples = '\n'.join([self.eval.sample_to_text(sample,
129 |                                                                  num_errors_per_label=self.config.meta_prompts.num_err_samples,
130 |                                                                  is_score=False) for sample in last_history])
131 |                 for batch in batch_inputs:
132 |                     extra_samples = self.dataset.sample_records()
133 |                     extra_samples_text = DatasetBase.samples_to_text(extra_samples)
134 |                     batch['history'] = history_samples
135 |                     batch['extra_samples'] = extra_samples_text
136 |             else:
137 |                 for batch in batch_inputs:
138 |                     extra_samples = self.dataset.sample_records()
139 |                     extra_samples_text = DatasetBase.samples_to_text(extra_samples)
140 |                     batch['history'] = 'No previous errors information'
141 |                     batch['extra_samples'] = extra_samples_text
142 | 
143 |             samples_batches = self.meta_chain.step_samples.batch_invoke(batch_inputs,
144 |                                                                          self.config.meta_prompts.num_workers)
145 |             new_samples = [element for sublist in samples_batches for element in sublist['samples']]
146 |             new_samples = self.dataset.remove_duplicates(new_samples)
147 |             self.dataset.add(new_samples, self.batch_id)
148 |             logging.info('Get new samples')
149 |         self.cur_prompt = prompt_suggestion['prompt']
150 | 
151 |     def stop_criteria(self):
152 |         """
153 |         Check if the stop criteria holds. The conditions for stopping:
154 |         1. Usage is above the threshold
155 |         2. There was no improvement in the last > patient steps
156 |         """
157 |         if 0 < self.config.stop_criteria.max_usage < self.calc_usage():
158 |             return True
159 |         if len(self.eval.history) <= self.config.meta_prompts.warmup:
160 |             self.patient = 0
161 |             return False
162 |         min_batch_id, max_score = self.eval.get_max_score(self.config.meta_prompts.warmup-1)
163 |         if max_score - self.eval.history[-1]['score'] > -self.config.stop_criteria.min_delta:
164 |             self.patient += 1
165 |         else:
166 |             self.patient = 0
167 |         if self.patient > self.config.stop_criteria.patience:
168 |             return True
169 |         return False
170 | 
171 |     @staticmethod
172 |     def generate_samples_batch(batch_input, num_samples, batch_size):
173 |         """
174 |         Generate samples in batch
175 |         """
176 |         batch_num = num_samples // batch_size
177 |         all_batches = [batch_input.copy() for _ in range(batch_num)]
178 |         reminder = num_samples - batch_num * batch_size
179 |         if reminder > 0:
180 |             all_batches.append(batch_input.copy())
181 |             all_batches[-1]['num_samples'] = reminder
182 |         return all_batches
183 | 
184 |     def generate_initial_samples(self):
185 |         """
186 |         In case the initial dataset is empty generate the initial samples
187 |         """
188 |         batch_input = {"num_samples": self.config.meta_prompts.samples_generation_batch,
189 |                        "task_description": self.task_description,
190 |                        "instruction": self.cur_prompt}
191 |         batch_inputs = self.generate_samples_batch(batch_input, self.config.meta_prompts.num_initialize_samples,
192 |                                                    self.config.meta_prompts.samples_generation_batch)
193 | 
194 |         samples_batches = self.meta_chain.initial_chain.batch_invoke(batch_inputs, self.config.meta_prompts.num_workers)
195 |         samples_list = [element for sublist in samples_batches for element in sublist['samples']]
196 |         samples_list = self.dataset.remove_duplicates(samples_list)
197 |         self.dataset.add(samples_list, 0)
198 | 
199 |     def save_state(self):
200 |         """
201 |         Save the process state
202 |         """
203 |         if self.output_path is None:
204 |             return
205 |         logging.info('Save state')
206 |         self.dataset.save_dataset(self.output_path / 'dataset.csv')
207 |         state = {'history': self.eval.history, 'batch_id': self.batch_id,
208 |                  'prompt': self.cur_prompt, 'task_description': self.task_description,
209 |                  'patient': self.patient}
210 |         pickle.dump(state, open(self.output_path / 'history.pkl', 'wb'))
211 | 
212 |     def load_state(self, path: str):
213 |         """
214 |         Load pretrain state
215 |         """
216 |         path = Path(path)
217 |         if (path / 'dataset.csv').is_file():
218 |             self.dataset.load_dataset(path / 'dataset.csv')
219 |         if (path / 'history.pkl').is_file():
220 |             state = pickle.load(open(path / 'history.pkl', 'rb'))
221 |             self.eval.history = state['history']
222 |             self.batch_id = state['batch_id']
223 |             self.cur_prompt = state['prompt']
224 |             self.task_description = state['task_description']
225 |             self.patient = state['patient']
226 | 
227 |     def step(self, current_iter, total_iter):
228 |         """
229 |         This is the main optimization process step.
230 |         """
231 |         self.log_and_print(f'Starting step {self.batch_id}')
232 |         if len(self.dataset.records) == 0:
233 |             self.log_and_print('Dataset is empty generating initial samples')
234 |             self.generate_initial_samples()
235 |         if self.config.use_wandb:
236 |             cur_batch = self.dataset.get_leq(self.batch_id)
237 |             random_subset = cur_batch.sample(n=min(10, len(cur_batch)))[['text']]
238 |             self.wandb_run.log(
239 |                 {"Prompt": wandb.Html(f"<p>{self.cur_prompt}</p>"), "Samples": wandb.Table(dataframe=random_subset)},
240 |                 step=self.batch_id)
241 | 
242 |         logging.info('Running annotator')
243 |         records = self.annotator.apply(self.dataset, self.batch_id)
244 |         self.dataset.update(records)
245 | 
246 |         self.predictor.cur_instruct = self.cur_prompt
247 |         logging.info('Running Predictor')
248 |         records = self.predictor.apply(self.dataset, self.batch_id, leq=True)
249 |         self.dataset.update(records)
250 | 
251 |         self.eval.dataset = self.dataset.get_leq(self.batch_id)
252 |         self.eval.eval_score()
253 |         logging.info('Calculating Score')
254 |         large_errors = self.eval.extract_errors()
255 |         self.eval.add_history(self.cur_prompt, self.task_description)
256 |         if self.config.use_wandb:
257 |             large_errors = large_errors.sample(n=min(6, len(large_errors)))
258 |             correct_samples = self.eval.extract_correct()
259 |             correct_samples = correct_samples.sample(n=min(6, len(correct_samples)))
260 |             vis_data = pd.concat([large_errors, correct_samples])
261 |             self.wandb_run.log({"score": self.eval.history[-1]['score'],
262 |                                 "prediction_result": wandb.Table(dataframe=vis_data),
263 |                                 'Total usage': self.calc_usage()}, step=self.batch_id)
264 |         if self.stop_criteria():
265 |             self.log_and_print('Stop criteria reached')
266 |             return True
267 |         if current_iter != total_iter-1:
268 |             self.run_step_prompt()
269 |         self.save_state()
270 |         return False
271 | 
272 |     def run_pipeline(self, num_steps: int):
273 |         # Run the optimization pipeline for num_steps
274 |         num_steps_remaining = num_steps - self.batch_id
275 |         for i in range(num_steps_remaining):
276 |             stop_criteria = self.step(i, num_steps_remaining)
277 |             if stop_criteria:
278 |                 break
279 |         final_result = self.extract_best_prompt()
280 |         return final_result
281 | 


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/error_analysis.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide a high quality analysis for every task.
 2 | You are given the following task description
 3 | {task_description}
 4 | 
 5 | Here is the prompt instructions that was given to the model:
 6 | {prompt}
 7 | 
 8 | The accuracy for this prompt is: {accuracy}
 9 | The confusion matrix for this prompt is: {confusion_matrix}
10 | ##
11 | Here is a list of failure cases for the given prompt:
12 | ##Failure Cases:
13 | {failure_cases}
14 | 
15 | ###
16 | Note that the ground-truth labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
17 | Your task is to provide a brief analysis of the given prompt performance.
18 | Guidelines:
19 | 1. The analysis should contain only the following information:
20 |     - If there exists abnormal behavior in the confusion matrix, describe it.
21 |     - A summary of the common failure cases, try to cluster the failure cases into groups and describe each group.
22 | 2. The total length of your analysis should be less than 200 tokens!
23 | ###
24 | Analysis:


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/initial.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Generate a list of {num_samples} challenging samples for the following task.
 3 | ### Task description:
 4 | {task_description}
 5 | ### Task Instruction:
 6 | {instruction}
 7 | ###
 8 | ### Requirements for Challenging Samples:
 9 | 1. The generated samples must be challenging and diverse such that using the task instruction as a prompt will result in the wrong result.
10 | 2. The number of generated samples from each class in the task instruction should be balanced (i.e. the same number of samples for each class)
11 | 3. The generated samples should be distinct, realistic, and vary significantly to ensure diversity.


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/initial_verbose.prompt:
--------------------------------------------------------------------------------
 1 | As an advanced language model you should create {num_samples} challenging and unique samples for the task outlined below.
 2 | These samples should be intricately designed to test the limits of the task's instructions, challenging yet relevant to the task description.
 3 | 
 4 | ### Task Description:
 5 | {task_description}
 6 | 
 7 | ### Task Instructions:
 8 | {instruction}
 9 | 
10 | ### Requirements for Challenging Samples:
11 | 1. Each sample must present a unique and intricate challenge.
12 | 2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results.
13 | 3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.
14 | 4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.
15 | 
16 | Generate the samples keeping these requirements in mind.
17 | ###


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/output_schemes.py:
--------------------------------------------------------------------------------
 1 | # A file containing the json schema for the output of all the LLM chains
 2 | 
# Schema of the sample-generation chains: an object with a single required 'samples' array of strings.
# Both names are bound to the same dict object, so mutating one is visible through the other.
initial_schema = step_samples_schema = {
  "description": "A List of all results",
  "properties": {
    "samples": {
      "description": "Each sample is a string containing the sample content, without any additional information like the Prediction or GT",
      "items": {
        "type": "string"
      },
      "title": "Samples",
      "type": "array"
    }
  },
  "required": [
    "samples"
  ],
  "title": "Sample_List",
  "type": "object"
}


# Schema of the classification predictor output: a required 'results' array whose items are
# 'Result' objects holding the sample 'id' (integer) and its 'prediction' (string).
# update_classification_prediction_schema() in this file modifies this dict in place.
classification_prediction_schema = {
  "$defs": {
    "Result": {
      "description": "A single result",
      "properties": {
        "id": {
          "description": "The sample id",
          "title": "Id",
          "type": "integer"
        },
        "prediction": {
          "description": "The prediction of the sample.",
          "title": "Prediction",
          "type": "string"
        }
      },
      "required": [
        "id",
        "prediction"
      ],
      "title": "Result",
      "type": "object"
    }
  },
  "description": "A List of task classification results",
  "properties": {
    "results": {
      "description": "Each item contain the id and the prediction of the sample",
      "items": {
        "$ref": "#/$defs/Result"
      },
      "title": "Results",
      "type": "array"
    }
  },
  "required": [
    "results"
  ],
  "title": "Results_List",
  "type": "object"
}


# Schema of the step-prompt chain output: the suggested 'prompt' (string) and its predicted
# 'score' (number); both fields are required.
step_prompt_schema = {
  "description": "A prompt suggestion which expect to get high score, and the associated score prediction",
  "properties": {
    "prompt": {
      "description": "The prompt prediction",
      "title": "Prompt",
      "type": "string"
    },
    "score": {
      "description": "The score prediction",
      "title": "Score",
      "type": "number"
    }
  },
  "required": [
    "prompt",
    "score"
  ],
  "title": "Suggested_Prompt",
  "type": "object"
}
87 | 
def update_classification_prediction_schema(label_schema:list)->dict:
  """
  Restrict the classification prediction schema to the given labels
  The module-level classification_prediction_schema is updated in place and returned.
  Calling the function again replaces the previous labels instead of piling up hints.
  :param label_schema: The list of the allowed labels
  :return: The updated classification prediction schema
  """
  options_prefix = 'The answer must be one of the following options:'
  prediction = classification_prediction_schema['$defs']['Result']['properties']['prediction']
  prediction['enum'] = label_schema
  # Drop the hint appended by a previous call, otherwise every call would add another
  # (possibly stale) label list to the description.
  base_description = prediction['description'].split(options_prefix, 1)[0]
  prediction['description'] = base_description + '{} {} !!'.format(options_prefix, label_schema)
  return classification_prediction_schema


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/step_prompt.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide the best prompt for every task.
 2 | Below are a few suggested prompts for the task and their score, for the following task:
 3 | {task_description}
 4 | 
 5 | ## Examples
 6 | {history}
 7 | ######
 8 | This is the error analysis for the last prompt:
 9 | {error_analysis}
10 | ######
11 | Your task is to generate:
12 | 1. A new prompt that is
13 |     -Different from all the prompts above
14 |     -Follows exactly the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
15 |     -Has a higher score than all the prompts above.
16 | 2. The predicted score of this prompt
17 | 
18 | You must adhere to the error analysis instructions, even in case it seems there is a contradiction between these instructions and the task. The error analysis is tested on a ground truth, and thus represents the exact intent of the task.
19 | The generated prompt should be phrased as a clear classification instruction! it should not include any instructions and descriptions on the modification that should be done to the prompt.
20 | Note that the previous prompt contains implicit assumptions on the intent of the task that might be incorrect. You should replace these assumptions with more accurate assumptions using the score of the previous prompts and the error analysis.
21 | The result prompt should indicate that the task is a classification task with the following labels {labels}!


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/step_prompt_verbose.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide the best prompt for every task.
 2 | Below are a few suggested prompts for the task and their score, for the following task:
 3 | {task_description}. The exact intent of this task description should be determined by an external annotated dataset of challenging cases. The error analysis of this dataset is provided.
 4 | ## Examples
 5 | {history}
 6 | ######
 7 | For the last prompt we got the following error analysis:
 8 | {error_analysis}
 9 | ######
10 | Your task is to generate:
11 | 1. A new prompt that is
12 |     -Different from all the prompts above
13 |     -Follows exactly after the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
14 |     -Has a higher score than all the prompts above.
15 | 2. The predicted score of this prompt
16 | 
17 | Guidelines for the new prompt:
18 | 1. The prompt is given a 'scratchpad', he can use it to extract from the sample text relevant information to make his prediction and perform a reasoning thought to get to the correct decision
19 | 2. The prompt is intended for a shallow LLM, which does not have access to previous failure cases or the analysis! he has only access to the generated new prompt which should be independent of the previous prompts.
20 | 3. Lists can organize the information and help the prompt (for example a list of rules and a list of samples); the lists should be short and accurate
21 | 4. Note that the prompts and task descriptions may be inaccurate and need modification.
22 | 5. Note that a higher score means a better prompt.
23 | 6. The result prompt should indicate that the task is a classification task with the following labels {labels}!
24 | 
25 | Randomly sample a number between 0 and 3. If the result is zero, __change completely__ the generated prompt, including the instruction, the structure and the phrasing!


--------------------------------------------------------------------------------
/prompts/meta_prompts_classification/step_samples.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Below are a few prompts that were built to answer the given task description, and their failure cases.
 3 | Task description:
 4 | {task_description}
 5 | 
 6 | ## Examples of common failures, each sample is followed by the model prediction and the GT (ground truth)
 7 | {history}
 8 | ######
 9 | Here are few unique samples derived from realistic scenarios for the task outlined above.
10 | ## Realistic Samples
11 | {extra_samples}
12 | #####
13 | This was the new proposed prompt:
14 | ## Prompt
15 | {prompt}
16 | 
17 | Your task is to generate {num_samples} samples by following these guidelines:
18 | 1. The generated samples should be diverse
19 | 2. They should preserve the style and the length of the given examples
20 | 3. The samples must be challenging and hard to classify by the model. This can be achieved by:
21 |     1. targeting the same weakness that the model failed on in the given examples
22 |     2. targeting weaknesses that are different from the existing examples in the failure cases
23 | 4. The number of generated samples from each class should be almost balanced (i.e. the same number of samples for each class)
24 | 5. The generated samples should include only the sample content without additional information! (like the model prediction and the ground truth)


--------------------------------------------------------------------------------
/prompts/meta_prompts_completion/error_analysis.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide a high quality analysis for every task.
 2 | You are given the following task description
 3 | {task_description}
 4 | 
 5 | Here is the prompt instructions that was given to the model:
 6 | {prompt}
 7 | 
 8 | The accuracy for this prompt is: {accuracy}
 9 | The confusion matrix for this prompt is: {confusion_matrix}
10 | ##
11 | Here is a list of failure cases for the given prompt:
12 | ##Failure Cases:
13 | {failure_cases}
14 | 
15 | ###
16 | Note that the ground-truth labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
17 | Your task is to provide a brief analysis of the given prompt performance.
18 | Guidelines:
19 | 1. The analysis should contain only the following information:
20 |     - If there exists abnormal behavior in the confusion matrix, describe it.
21 |     - A summary of the common failure cases, try to cluster the failure cases into groups and describe each group.
22 | 2. The total length of your analysis should be less than 200 tokens!
23 | ###
24 | Analysis:


--------------------------------------------------------------------------------
/prompts/meta_prompts_completion/initial.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Generate a list of {num_samples} challenging samples for the following task.
 3 | ### Task description:
 4 | {task_description}
 5 | ### Task Instruction:
 6 | {instruction}
 7 | ###
 8 | The generated samples should be challenging and diverse such that using the task instruction as a prompt will result in the wrong result.
 9 | 
10 | Answer in the following format:
11 | #### Sample 1:
12 | <text>
13 | #### Sample 2:
14 | <text>
15 | ############
16 | Results:
17 | 


--------------------------------------------------------------------------------
/prompts/meta_prompts_completion/output_schemes.py:
--------------------------------------------------------------------------------
 1 | # A file containing the parser for the output of all the LLM chains
 2 | import re
 3 | 
 4 | def initial_parser(response: dict) -> dict:
 5 |     """
 6 |     Parse the response from the LLM chain
 7 |     :param response: The response from the LLM chain
 8 |     :return: The parsed response
 9 |     """
10 |     pattern = r'(#### Sample \d+:)([\s\S]*?)(?=(#### Sample \d+:|$))'
11 | 
12 |     matches = re.findall(pattern, response['text'])
13 |     results = {'samples' :[]}
14 |     for match in matches:
15 |         header, content = match[0], match[1]
16 |         results['samples'].append(content.strip())
17 |     return results
18 | 
19 | step_samples_parser = initial_parser
20 | 
def step_prompt_parser(response: dict) -> dict:
    """
    Parse the response from the LLM chain
    Expects a '#### prompt:' section followed by a '#### score:' section holding a number.
    :param response: The response from the LLM chain (the raw output is read from response['text'])
    :return: The parsed response: {'prompt': str, 'score': float}. If the text does not contain both
             sections with a numeric score, the fallback {'prompt': '', 'score': 0.0} is returned
    """
    # The score group accepts only a well-formed number ('1', '1.', '0.85', '.5'). A looser '[\d.]+'
    # would also capture text such as '0.9.' or '.', which makes float() raise ValueError.
    pattern = re.compile(r"#### prompt:\n(?P<prompt>.*?)\n#### score:\n(?P<score>\d+(?:\.\d*)?|\.\d+)",
                         re.DOTALL)
    match = pattern.search(response['text'])
    if match is None:
        return {'prompt': '', 'score': 0.0}
    return {'prompt': match.group('prompt'), 'score': float(match.group('score'))}


--------------------------------------------------------------------------------
/prompts/meta_prompts_completion/step_prompt.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide the best prompt for every task.
 2 | Below are a few suggested prompts for the task and their score, for the following task:
 3 | {task_description}
 4 | 
 5 | ## Examples
 6 | {history}
 7 | ######
 8 | This is the error analysis for the last prompt:
 9 | {error_analysis}
10 | ######
11 | Your task is to generate:
12 | 1. A new prompt that is
13 |     -Different from all the prompts above
14 |     -Follows exactly the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
15 |     -Has a higher score than all the prompts above.
16 | 2. The predicted score of this prompt
17 | 
18 | You must adhere to the error analysis instructions, even if they seem to contradict the task! The error analysis is tested on a ground truth, and thus represents the exact intent of the task.
19 | The generated prompt should be phrased as a clear classification instruction! It should not include any instructions or descriptions of the modifications that should be made to the prompt.
20 | Note that the previous prompt contains implicit assumptions about the intent of the task that might be incorrect. You should replace these assumptions with more accurate ones, using the scores of the previous prompts and the error analysis.
21 | The resulting prompt should indicate that the task is a classification task with the following labels {labels}!
22 | 
23 | Answer in the following format:
24 | #### prompt:
25 | <prompt suggestion>
26 | #### score:
27 | <score>
28 | ############
29 | Results:
30 | 


--------------------------------------------------------------------------------
/prompts/meta_prompts_completion/step_samples.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Below are a few prompts and their failure cases, for the following task:
 3 | {task_description}
 4 | 
 5 | ## Examples of common failure
 6 | {history}
 7 | ######
 8 | Your task is to generate {num_samples} challenging and diverse samples that will confuse the model with the following prompt:
 9 | ## Prompt
10 | {prompt}
11 | 
12 | Answer in the following format:
13 | #### Sample 1:
14 | <text>
15 | #### Sample 2:
16 | <text>
17 | ############
18 | Results:
19 | 


--------------------------------------------------------------------------------
/prompts/meta_prompts_generation/error_analysis.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide a high quality analysis for every task.
 2 | You are given the following task description
 3 | {task_description}
 4 | 
 5 | Here are the prompt instructions that were given to the model:
 6 | {prompt}
 7 | 
 8 | An expert ranker evaluated the model's performance on the given task description
 9 | and ranked it according to the following scale: {labels}
10 | 
11 | The mean score for this prompt is: {accuracy}
12 | ##
13 | Here is a list of challenging cases for the given prompt and their rank:
14 | ##Challenging Cases:
15 | {failure_cases}
16 | 
17 | ###
18 | Note that the ranker labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
19 | Your task is to provide a brief analysis of the given prompt performance.
20 | Guidelines:
21 | 1. The analysis should contain only the following information:
22 |     - A summary of the common mistakes of the prompt and the ways its generation can be improved; try to cluster the failure cases into groups and describe each group.
23 | 2. The total length of your analysis should be less than 200 tokens!
24 | ###
25 | Analysis:


--------------------------------------------------------------------------------
/prompts/meta_prompts_generation/initial.prompt:
--------------------------------------------------------------------------------
 1 | As an advanced language model you should create {num_samples} challenging and unique prompts for the task outlined below.
 2 | These samples should be intricately designed to test the limits of the task's instructions, challenging yet relevant to the task description.
 3 | 
 4 | The task description and instruction are phrased as a generative task. The resulting prompt samples should be inputs to the model.
 5 | The model will then be able to generate an example given the instructions and the prompt input.
 6 | 
 7 | ### Task Description:
 8 | {task_description}
 9 | 
10 | ### Task Instructions:
11 | {instruction}
12 | 
13 | ### Requirements for Challenging Samples:
14 | 1. Each prompt must present a unique and intricate challenge.
15 | 2. The prompts should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.
16 | 3. Each prompt should contain only the prompt part, without generating also the results
17 | 4. Each prompt should contain only the prompt part, without any mention of the task description or instructions!!
18 | 
19 | Generate the prompt samples keeping these requirements in mind.
20 | ###


--------------------------------------------------------------------------------
/prompts/meta_prompts_generation/output_schemes.py:
--------------------------------------------------------------------------------
 1 | # A file containing the json schema for the output of all the LLM chains
 2 | 
 3 | initial_schema = step_samples_schema = {  # JSON schema for a sample list; both names are bound to this same dict object
 4 |   "description": "A List of all results",
 5 |   "properties": {
 6 |     "samples": {  # the only (and required) property: an array of strings
 7 |       "description": "Each sample is a string containing only the prompt sample content, without any additional information",
 8 |       "items": {
 9 |         "type": "string"
10 |       },
11 |       "title": "Samples",
12 |       "type": "array"
13 |     }
14 |   },
15 |   "required": [
16 |     "samples"
17 |   ],
18 |   "title": "Sample_List",
19 |   "type": "object"
20 | }
21 | 
22 | 
23 | classification_prediction_schema = {  # JSON schema for batch predictions: {'results': [{'id': integer, 'prediction': string}, ...]}
24 |   "$defs": {
25 |     "Result": {
26 |       "description": "A single result",
27 |       "properties": {
28 |         "id": {
29 |           "description": "The sample id",
30 |           "title": "Id",
31 |           "type": "integer"
32 |         },
33 |         "prediction": {  # update_classification_prediction_schema adds an 'enum' here and extends the description
34 |           "description": "The prediction of the sample.",
35 |           "title": "Prediction",
36 |           "type": "string"
37 |         }
38 |       },
39 |       "required": [
40 |         "id",
41 |         "prediction"
42 |       ],
43 |       "title": "Result",
44 |       "type": "object"
45 |     }
46 |   },
47 |   "description": "A List of task classification results",
48 |   "properties": {
49 |     "results": {
50 |       "description": "Each item contain the id and the prediction of the sample",
51 |       "items": {
52 |         "$ref": "#/$defs/Result"
53 |       },
54 |       "title": "Results",
55 |       "type": "array"
56 |     }
57 |   },
58 |   "required": [
59 |     "results"
60 |   ],
61 |   "title": "Results_List",
62 |   "type": "object"
63 | }
64 | 
65 | 
66 | step_prompt_schema = {  # JSON schema for a suggested prompt: required 'prompt' (string) and 'score' (number)
67 |   "description": "A prompt suggestion which expect to get high score, and the associated score prediction",
68 |   "properties": {
69 |     "prompt": {
70 |       "description": "The prompt prediction",
71 |       "title": "Prompt",
72 |       "type": "string"
73 |     },
74 |     "score": {
75 |       "description": "The score prediction",
76 |       "title": "Score",
77 |       "type": "number"
78 |     }
79 |   },
80 |   "required": [
81 |     "prompt",
82 |     "score"
83 |   ],
84 |   "title": "Suggested_Prompt",
85 |   "type": "object"
86 | }
87 | 
88 | def update_classification_prediction_schema(label_schema:list)->dict:
89 |   """Restrict the 'prediction' field of the module-level classification_prediction_schema to the given labels
90 |   :param label_schema: The allowed labels; set as the field's 'enum' and appended to its description
91 |   :return: The module-level classification_prediction_schema dict (the same object, updated in place)
92 |   """
93 |   # NOTE(review): the shared dict is mutated, so every call appends the options sentence to the description again
94 |   classification_prediction_schema['$defs']['Result']['properties']['prediction']['enum'] = label_schema
95 |   classification_prediction_schema['$defs']['Result']['properties']['prediction'][
96 |     'description'] += 'The answer must be one of the following options: {} !!'.format(label_schema)
97 |   return classification_prediction_schema


--------------------------------------------------------------------------------
/prompts/meta_prompts_generation/step_prompt.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide the best instructions for every task.
 2 | Below are a few suggested instructions for the task and score (mean of the rank), for the following task description:
 3 | {task_description}
 4 | 
 5 | ## Examples
 6 | {history}
 7 | ######
 8 | This is the analysis for the last instruction:
 9 | {error_analysis}
10 | ######
11 | Your task is to generate:
12 | 1. A new instruction that is
13 |     -Different from all the instructions above
14 |     -Follows exactly the error analysis modification suggestions, and fix the instruction to improve the quality of the instruction.
15 |     -Has a higher score than all the instructions above.
16 | 2. The predicted score of this instruction
17 | 
18 | You must adhere to the error analysis instructions, even if they seem to contradict the task! The error analysis was evaluated by an expert ranker, and thus represents the exact intent of the task.
19 | The generated instruction should be phrased as a clear generation instruction! It should not include any instructions or descriptions of the modifications that should be made to the instruction.
20 | Note that the previous instruction contains implicit assumptions about the intent of the task that might be incorrect. You should replace these assumptions with more accurate ones, using the scores of the previous instructions and the error analysis.


--------------------------------------------------------------------------------
/prompts/meta_prompts_generation/step_samples.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Below are a few prompts that were built to answer the given task description, and their failure cases.
 3 | Task description:
 4 | {task_description}
 5 | 
 6 | ## Examples, each sample is followed by the model prediction and the GT (ground truth)
 7 | {history}
 8 | ######
 9 | Here are few unique samples derived from realistic scenarios for the task outlined above.
10 | ## Realistic Samples
11 | {extra_samples}
12 | #####
13 | This was the new proposed prompt:
14 | ## Prompt
15 | {prompt}
16 | 
17 | Your task is to generate {num_samples} samples by following these guidelines:
18 | 1. The generated samples should be diverse
19 | 2. They should preserve the style and the length of the given examples
20 | 3. The samples must be challenging and hard to classify by the model. This can be achieved by:
21 |     1. targeting the same weakness that the model failed on in the given examples
22 |     2. targeting weakness that are different from the existing examples in the failure cases
23 | 4. The number of generated samples from each class should be almost balanced (i.e. the same number of samples for each class)
24 | 5. The generated samples should include only the sample content without additional information! (like the model prediction and the ground truth)


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/error_analysis.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide a high quality analysis for every task.
 2 | You are given the following task description
 3 | {task_description}
 4 | 
 5 | Here are the prompt instructions that were given to the model:
 6 | {prompt}
 7 | 
 8 | The accuracy for this prompt is: {accuracy}
 9 | The confusion matrix for this prompt is: {confusion_matrix}
10 | ##
11 | Here is a list of failure cases for the given prompt:
12 | ##Failure Cases:
13 | {failure_cases}
14 | 
15 | ###
16 | Note that the ground-truth labels are __absolutely correct__, but the prompts (task descriptions) may be incorrect and need modification.
17 | Your task is to provide a brief analysis of the given prompt performance.
18 | Guidelines:
19 | 1. The analysis should contain only the following information:
20 |     - If there exists abnormal behavior in the confusion matrix, describe it.
21 |     - A summary of the common failure cases, try to cluster the failure cases into groups and describe each group.
22 | 2. The total length of your analysis should be less than 200 tokens!
23 | ###
24 | Analysis:


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/initial.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Generate a list of {num_samples} challenging samples for the following task.
 3 | ### Task description:
 4 | {task_description}
 5 | ### Task Instruction:
 6 | {instruction}
 7 | ###
 8 | ### Requirements for Challenging Samples:
 9 | 1. The generated samples must be challenging and diverse such that using the task instruction as a prompt will result in the wrong result.
10 | 2. The generated samples must be only from the top two scores! With equal distribution between the two.
11 | 3. The generated samples should be distinct, realistic, and vary significantly to ensure diversity.
12 | 
13 | If the task depends both on a context, or a user input and a generated content then the sample content must include all the relevant parts.
14 |     -In this case the sample content structure should be as follows:
15 |         1. First write the required context or user input.
16 |         2. Then write the generated content of the model on this context or user input.
17 |      The style of the separation and the indication of the different parts, should be different in each sample.


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/initial_verbose.prompt:
--------------------------------------------------------------------------------
 1 | As an advanced language model you should create {num_samples} challenging and unique samples for the task outlined below.
 2 | These samples should be intricately designed to test the limits of the task's instructions, challenging yet relevant to the task description.
 3 | 
 4 | ### Task Description:
 5 | {task_description}
 6 | 
 7 | ### Task Instructions:
 8 | {instruction}
 9 | 
10 | ### Requirements for Challenging Samples:
11 | 1. Each sample must present a unique and intricate challenge.
12 | 2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results.
13 | 3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.
14 | 4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.
15 | 
16 | Generate the samples keeping these requirements in mind.
17 | ###


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/output_schemes.py:
--------------------------------------------------------------------------------
 1 | # A file containing the json schema for the output of all the LLM chains
 2 | 
 3 | initial_schema = step_samples_schema = {  # JSON schema for a sample list; both names are bound to this same dict object
 4 |   "description": "A List of all results",
 5 |   "properties": {
 6 |     "samples": {  # the only (and required) property: an array of strings
 7 |       "description": "Each sample is a string containing the sample content, without any additional information like the Prediction or GT",
 8 |       "items": {
 9 |         "type": "string"
10 |       },
11 |       "title": "Samples",
12 |       "type": "array"
13 |     }
14 |   },
15 |   "required": [
16 |     "samples"
17 |   ],
18 |   "title": "Sample_List",
19 |   "type": "object"
20 | }
21 | 
22 | 
23 | classification_prediction_schema = {  # JSON schema for batch predictions: {'results': [{'id': integer, 'prediction': string}, ...]}
24 |   "$defs": {
25 |     "Result": {
26 |       "description": "A single result",
27 |       "properties": {
28 |         "id": {
29 |           "description": "The sample id",
30 |           "title": "Id",
31 |           "type": "integer"
32 |         },
33 |         "prediction": {  # update_classification_prediction_schema adds an 'enum' here and extends the description
34 |           "description": "The prediction of the sample.",
35 |           "title": "Prediction",
36 |           "type": "string"
37 |         }
38 |       },
39 |       "required": [
40 |         "id",
41 |         "prediction"
42 |       ],
43 |       "title": "Result",
44 |       "type": "object"
45 |     }
46 |   },
47 |   "description": "A List of task classification results",
48 |   "properties": {
49 |     "results": {
50 |       "description": "Each item contain the id and the prediction of the sample",
51 |       "items": {
52 |         "$ref": "#/$defs/Result"
53 |       },
54 |       "title": "Results",
55 |       "type": "array"
56 |     }
57 |   },
58 |   "required": [
59 |     "results"
60 |   ],
61 |   "title": "Results_List",
62 |   "type": "object"
63 | }
64 | 
65 | 
66 | step_prompt_schema = {  # JSON schema for a suggested prompt: required 'prompt' (string) and 'score' (number)
67 |   "description": "A prompt suggestion which expect to get high score, and the associated score prediction",
68 |   "properties": {
69 |     "prompt": {
70 |       "description": "The prompt prediction",
71 |       "title": "Prompt",
72 |       "type": "string"
73 |     },
74 |     "score": {
75 |       "description": "The score prediction",
76 |       "title": "Score",
77 |       "type": "number"
78 |     }
79 |   },
80 |   "required": [
81 |     "prompt",
82 |     "score"
83 |   ],
84 |   "title": "Suggested_Prompt",
85 |   "type": "object"
86 | }
87 | 
88 | def update_classification_prediction_schema(label_schema:list)->dict:
89 |   """Restrict the 'prediction' field of the module-level classification_prediction_schema to the given labels
90 |   :param label_schema: The allowed labels; set as the field's 'enum' and appended to its description
91 |   :return: The module-level classification_prediction_schema dict (the same object, updated in place)
92 |   """
93 |   # NOTE(review): the shared dict is mutated, so every call appends the options sentence to the description again
94 |   classification_prediction_schema['$defs']['Result']['properties']['prediction']['enum'] = label_schema
95 |   classification_prediction_schema['$defs']['Result']['properties']['prediction'][
96 |     'description'] += 'The answer must be one of the following options: {} !!'.format(label_schema)
97 |   return classification_prediction_schema


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/step_prompt.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide the best prompt for every task.
 2 | Below are a few suggested prompts for the task and their score, for the following task:
 3 | {task_description}
 4 | 
 5 | ## Examples
 6 | {history}
 7 | ######
 8 | This is the error analysis for the last prompt:
 9 | {error_analysis}
10 | ######
11 | Your task is to generate:
12 | 1. A new prompt that is
13 |     -Different from all the prompts above
14 |     -Follows exactly the error analysis modification suggestions, and fix the prompt to prevent the failure cases.
15 |     -Has a higher score than all the prompts above.
16 | 2. The predicted score of this prompt
17 | 
18 | You must adhere to the error analysis instructions, even if they seem to contradict the task! The error analysis is tested on a ground truth, and thus represents the exact intent of the task.
19 | The generated prompt should be phrased as a clear classification instruction! It should not include any instructions or descriptions of the modifications that should be made to the prompt.
20 | Note that the previous prompt contains implicit assumptions about the intent of the task that might be incorrect. You should replace these assumptions with more accurate ones, using the scores of the previous prompts and the error analysis.
21 | The resulting prompt should indicate that the task is a classification task with the following labels {labels}!


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/step_prompt_verbose.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to provide the best prompt for every task.
 2 | Below are a few suggested prompts for the task and their score, for the following task:
 3 | {task_description}. The exact intent of this task description should be determined by an external annotated dataset of challenging cases. The error analysis of this dataset is provided.
 4 | ## Examples
 5 | {history}
 6 | ######
 7 | For the last prompt we got the following error analysis:
 8 | {error_analysis}
 9 | ######
10 | Your task is to generate:
11 | 1. A new prompt that is
12 |     -Different from all the prompts above
13 |     -Follows the error analysis modification suggestions exactly, and fixes the prompt to prevent the failure cases.
14 |     -Has a higher score than all the prompts above.
15 | 2. The predicted score of this prompt
16 | 
17 | Guidelines for the new prompt:
18 | 1. The prompt is given a 'scratchpad'; it can use it to extract relevant information from the sample text to make its prediction, and to reason its way to the correct decision.
19 | 2. The prompt is intended for a shallow LLM, which does not have access to previous failure cases or the analysis! It only has access to the newly generated prompt, which should be independent of the previous prompts.
20 | 3. Lists can organize the information and help the prompt (for example, a list of rules and a list of samples); the lists should be short and accurate.
21 | 4. Note that the prompts and task descriptions may be inaccurate and need modification.
22 | 5. Note that a higher score means a better prompt.
23 | 6. The resulting prompt should indicate that the task is a classification task with the following labels {labels}!
24 | 
25 | Randomly sample a number between 1 and 3. If the result is 1, __completely change__ the generated prompt, including the instruction, the structure and the phrasing!


--------------------------------------------------------------------------------
/prompts/meta_prompts_ranking/step_samples.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging samples for every task.
 2 | Below are a few prompts that were built to answer the given task description, and their failure cases.
 3 | Task description:
 4 | {task_description}
 5 | 
 6 | ## Examples of common failure, each sample is followed by the model prediction and the GT (ground truth)
 7 | {history}
 8 | ######
 9 | Here are few unique samples derived from realistic scenarios for the task outlined above.
10 | ## Realistic Samples
11 | {extra_samples}
12 | #####
13 | This was the new proposed prompt:
14 | ## Prompt
15 | {prompt}
16 | 
17 | Your task is to generate {num_samples} samples by following these guidelines:
18 | 1. The generated samples should be diverse
19 | 2. They should preserve the style and the length of the given examples
20 | 3. The samples must be challenging and hard to classify by the model. This can be achieved by:
21 |     1. targeting the same weakness that the model failed on in the given examples
22 |     2. targeting weakness that are different from the existing examples in the failure cases
23 | 4. The generated samples must be only from the top two scores! With equal distribution between the two!
24 | 
25 | If the task depends both on a context, or a user input and a generated content then the sample content must include all the relevant parts.
26 |     -In this case the sample content structure should be as follows:
27 |         1. First write the required context or user input.
28 |         2. Then write the generated content of the model on this context or user input.
29 |      The style of the separation and the indication of the different parts, should be different in each sample.


--------------------------------------------------------------------------------
/prompts/modifiers/modifiers.yml:
--------------------------------------------------------------------------------
1 | 
2 | ranker:
3 |     prompt_mod: 'prompts/modifiers/ranker_prompt_mod.prompt'
4 |     task_desc_mod: 'prompts/modifiers/ranker_task_desc_mod.prompt'
5 | 


--------------------------------------------------------------------------------
/prompts/modifiers/ranker_prompt_mod.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate instructions for every task.
 2 | You are given an instruction phrased as a text generation task.
 3 | Your task is to write an instruction for a classification ranking task that is supposed to evaluate the quality of a generated sample given a user prompt for this generative instruction.
 4 | Guidelines:
 5 | 1. The classifier labels are {label_schema}. The resulting instruction should indicate explicitly that the task is a classification task with the following labels {label_schema}!
 6 | 2. The generated instruction must also evaluate how well the generated sample adheres to the user prompt.
 7 | #####
 8 | Input generative instruction: {prompt}
 9 | #####
10 | Rephrased classification quality evaluation instruction:


--------------------------------------------------------------------------------
/prompts/modifiers/ranker_task_desc_mod.prompt:
--------------------------------------------------------------------------------
1 | Assistant is a large language model designed to generate a task description.
2 | You are given a task description phrased as a text generation task given some user input. Your task is to rephrase it as a task that is supposed to evaluate the quality of the given generative task and how well it adheres to the user input.
3 | #####
4 | Input task description: {task_description}
5 | #####
6 | Rephrased task description:


--------------------------------------------------------------------------------
/prompts/predictor/output_schemes.py:
--------------------------------------------------------------------------------
 1 | # A file containing the json schema for the output of all the LLM chains
 2 | 
 3 | prediction_schema = {  # JSON schema for batch predictions: {'results': [{'id': integer, 'prediction': string}, ...]}
 4 |   "$defs": {
 5 |     "Result": {
 6 |       "description": "A single result",
 7 |       "properties": {
 8 |         "id": {
 9 |           "description": "The sample id",
10 |           "title": "Id",
11 |           "type": "integer"
12 |         },
13 |         "prediction": {
14 |           "description": "The prediction of the sample.",
15 |           "title": "Prediction",
16 |           "type": "string"
17 |         }
18 |       },
19 |       "required": [
20 |         "id",
21 |         "prediction"
22 |       ],
23 |       "title": "Result",
24 |       "type": "object"
25 |     }
26 |   },
27 |   "description": "A List of task classification results",
28 |   "properties": {
29 |     "results": {
30 |       "description": "Each item contain the id and the prediction of the sample",
31 |       "items": {
32 |         "$ref": "#/$defs/Result"
33 |       },
34 |       "title": "Results",
35 |       "type": "array"
36 |     }
37 |   },
38 |   "required": [
39 |     "results"
40 |   ],
41 |   "title": "Results_List",
42 |   "type": "object"
43 | }
44 | 
45 | 
46 | def update_classification_prediction_schema(schema, label_schema:list)->dict:
47 |   """Restrict the 'prediction' field of the given schema to the given labels and return that same schema object
48 |   :param schema: A dict that must contain ['$defs']['Result']['properties']['prediction'] with a 'description' string
49 |   :param label_schema: The allowed labels; set as the field's 'enum' and appended to its description
50 |   """
51 |   # NOTE(review): 'schema' is mutated in place; calling this again on the same dict appends the options sentence a second time
52 |   schema['$defs']['Result']['properties']['prediction']['enum'] = label_schema
53 |   schema['$defs']['Result']['properties']['prediction'][
54 |     'description'] += 'The answer must be one of the following options: {} !!'.format(label_schema)
55 |   return schema


--------------------------------------------------------------------------------
/prompts/predictor/prediction.prompt:
--------------------------------------------------------------------------------
1 | Assistant is a large language model designed to classify challenging language tasks.
2 | Given a list of {batch_size} samples classify them according to the following task
3 | ### Task Instruction:
4 | {task_instruction}
5 | 
6 | ### list of samples:
7 | {samples}
8 | ##
9 | Remember to carefully follow the exact task instructions!


--------------------------------------------------------------------------------
/prompts/predictor_completion/output_schemes.py:
--------------------------------------------------------------------------------
 1 | # A file containing the parsers for the output of the predictor LLM chains
 2 | # (the plain-text responses are parsed with regular expressions, not a json schema)
 3 | import re
 4 | 
 5 | 
 6 | def prediction_parser(response: dict) -> dict:
 7 |     """
 8 |     Extract every 'Sample <id>: <label>' pair from the LLM chain response
 9 |     :param response: The response from the LLM chain; only response['text'] is read
10 |     :return: {'results': [{'id': int, 'prediction': str}, ...]} in order of appearance (empty list when nothing matches)
11 |     """
12 |     pattern = re.compile(r'Sample (\d+): (\w+)')  # NOTE(review): (\w+) stops at the first non-word character, so multi-word labels are cut
13 |     matches = pattern.findall(response['text'])
14 |     predictions = [{'id': int(match[0]), 'prediction': match[1]} for match in matches]
15 |     return {'results': predictions}
16 | 
17 | def prediction_generation_parser(response: dict) -> dict:
18 |     """
19 |     Extract every 'Sample <id>: <generated text>' entry from the LLM chain response
20 |     :param response: The response from the LLM chain; only response['text'] is read
21 |     :return: {'results': [{'id': int, 'prediction': str}, ...]} where each prediction is the stripped text before '<eos>'
22 |     """
23 |     pattern = re.compile(r'Sample (\d+): (.*?)(?=<eos>|$)', re.DOTALL)  # NOTE(review): without an <eos>, the capture runs on to the next <eos> or the end of the text
24 |     matches = pattern.findall(response['text'])
25 |     predictions = [{'id': int(match[0]), 'prediction': match[1].strip()} for match in matches]
26 |     return {'results': predictions}
27 | 


--------------------------------------------------------------------------------
/prompts/predictor_completion/prediction.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to classify challenging language tasks.
 2 | Given a list of {batch_size} samples classify them according to the following task
 3 | ### Task Instruction:
 4 | {task_instruction}
 5 | 
 6 | ### list of samples:
 7 | {samples}
 8 | ##
 9 | Answer exactly in the following format for each sample:
10 | #### Sample <ID>: <label>
11 | ############


--------------------------------------------------------------------------------
/prompts/predictor_completion/prediction_generation.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to generate challenging language tasks.
 2 | Given a list of {batch_size} prompts, you should generate a response according to the following task
 3 | ### Task Instruction:
 4 | {task_instruction}
 5 | 
 6 | ### list of samples:
 7 | {samples}
 8 | ##
 9 | Each generated text should end with an <eos> token.
10 | Answer exactly in the following format for each sample:
11 | #### Sample <ID>: <text generation> <eos>
12 | ############
13 | #### Sample <ID>: <text generation> <eos>
14 | ############


--------------------------------------------------------------------------------
/prompts/predictor_completion/prediction_verbose.prompt:
--------------------------------------------------------------------------------
 1 | Assistant is a large language model designed to classify challenging language tasks.
 2 | Given a list of {batch_size} samples classify them according to the following task
 3 | ### Task Instruction:
 4 | {task_instruction}
 5 | 
 6 | ### list of samples:
 7 | {samples}
 8 | ##
 9 | Answer exactly in the following format for each sample:
10 | ###
11 | <sample scratchpad> #a scratchpad for writing notes
12 | ###
13 | #### Sample <ID>: <label> #Provide only the final label
14 | ############
15 | 
16 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | argilla==1.25.0
 2 | schedule==1.2.1
 3 | pandas==1.5.3
 4 | tqdm==4.66.1
 5 | prodict==0.8.18
 6 | langchain==0.2.7
 7 | openai==1.35.10
 8 | tiktoken==0.7.0
 9 | easydict==1.11
10 | wandb==0.16.0
11 | transformers==4.35.2
12 | scikit-learn==1.3.2
13 | faiss-cpu==1.7.4
14 | sentence-transformers==2.2.2
15 | langchain-google-genai==1.0.8
16 | pillow==10.2.0
17 | langchain_openai==0.1.20


--------------------------------------------------------------------------------
/run_generation_pipeline.py:
--------------------------------------------------------------------------------
 1 | from optimization_pipeline import OptimizationPipeline
 2 | from utils.config import load_yaml, modify_input_for_ranker, validate_generation_config, override_config
 3 | import argparse
 4 | import os
 5 | from estimator.estimator_llm import LLMEstimator
 6 | # General Training Parameters
 7 | parser = argparse.ArgumentParser()
 8 | 
 9 | parser.add_argument('--generation_config_path', default='config/config_diff/config_generation.yml', type=str, help='Configuration file path')
10 | parser.add_argument('--ranker_config_path', default='config/config_diff/config_ranking.yml', type=str, help='Configuration file path')
11 | 
12 | parser.add_argument('--task_description',
13 |                     default='',
14 |                     required=False, type=str, help='Describing the task')
15 | parser.add_argument('--prompt',
16 |                     default='',
17 |                     required=False, type=str, help='Prompt to use as initial.')
18 | parser.add_argument('--load_dump', default='', required=False, type=str, help='In case of loading from checkpoint')
19 | parser.add_argument('--output_dump', default='dump', required=False, type=str, help='Output to save checkpoints')
20 | parser.add_argument('--num_ranker_steps', default=20, type=int, help='Number of iterations')
21 | parser.add_argument('--num_generation_steps', default=20, type=int, help='Number of iterations')
22 | 
23 | opt = parser.parse_args()
24 | 
25 | 
26 | ranker_config_params = override_config(opt.ranker_config_path)
27 | generation_config_params = override_config(opt.generation_config_path)
28 | validate_generation_config(ranker_config_params, generation_config_params)
29 | 
30 | if opt.task_description == '':
31 |     task_description = input("Describe the task: ")
32 | else:
33 |     task_description = opt.task_description
34 | 
35 | if opt.prompt == '':
36 |     initial_prompt = input("Initial prompt: ")
37 | else:
38 |     initial_prompt = opt.prompt
39 | 
40 | ranker_pipeline = OptimizationPipeline(ranker_config_params, output_path=os.path.join(opt.output_dump, 'ranker'))
41 | if opt.load_dump != '':
42 |     ranker_pipeline.load_state(os.path.join(opt.load_dump, 'ranker'))
43 |     ranker_pipeline.predictor.init_chain(ranker_config_params.dataset.label_schema)
44 | 
45 | if (ranker_pipeline.cur_prompt is None) or (ranker_pipeline.task_description is None):
46 |     ranker_mod_prompt, ranker_mod_task_desc = modify_input_for_ranker(ranker_config_params, task_description,
47 |                                                                       initial_prompt)
48 |     ranker_pipeline.cur_prompt = ranker_mod_prompt
49 |     ranker_pipeline.task_description = ranker_mod_task_desc
50 | 
51 | best_prompt = ranker_pipeline.run_pipeline(opt.num_ranker_steps)
52 | generation_config_params.eval.function_params = ranker_config_params.predictor.config
53 | generation_config_params.eval.function_params.instruction = best_prompt['prompt']
54 | generation_config_params.eval.function_params.label_schema = ranker_config_params.dataset.label_schema
55 | 
56 | 
57 | generation_pipeline = OptimizationPipeline(generation_config_params, task_description, initial_prompt,
58 |                                            output_path=os.path.join(opt.output_dump, 'generator'))
59 | if opt.load_dump != '':
60 |     generation_pipeline.load_state(os.path.join(opt.load_dump, 'generator'))
61 | best_generation_prompt = generation_pipeline.run_pipeline(opt.num_generation_steps)
62 | print('\033[92m' + 'Calibrated prompt score:', str(best_generation_prompt['score']) + '\033[0m')
63 | print('\033[92m' + 'Calibrated prompt:', best_generation_prompt['prompt'] + '\033[0m')
64 | 


--------------------------------------------------------------------------------
/run_pipeline.py:
--------------------------------------------------------------------------------
 1 | from optimization_pipeline import OptimizationPipeline
 2 | from utils.config import load_yaml, override_config
 3 | import argparse
 4 | 
 5 | # General Training Parameters
 6 | parser = argparse.ArgumentParser()
 7 | 
 8 | parser.add_argument('--basic_config_path', default='config/config_default.yml', type=str, help='Configuration file path')
 9 | parser.add_argument('--batch_config_path', default='',
10 |                     type=str, help='Batch classification configuration file path')
11 | parser.add_argument('--prompt',
12 |                     default='',
13 |                     required=False, type=str, help='Prompt to use as initial.')
14 | parser.add_argument('--task_description',
15 |                     default='',
16 |                     required=False, type=str, help='Describing the task')
17 | parser.add_argument('--load_path', default='', required=False, type=str, help='In case of loading from checkpoint')
18 | parser.add_argument('--output_dump', default='dump', required=False, type=str, help='Output to save checkpoints')
19 | parser.add_argument('--num_steps', default=40, type=int, help='Number of iterations')
20 | 
21 | opt = parser.parse_args()
22 | 
23 | if opt.batch_config_path == '':
24 |     # load the basic configuration using load_yaml
25 |     config_params = load_yaml(opt.basic_config_path)
26 | else:
27 |     # override the basic configuration with the batch configuration
28 |     config_params = override_config(opt.batch_config_path, config_file=opt.basic_config_path)
29 | 
30 | if opt.task_description == '':
31 |     task_description = input("Describe the task: ")
32 | else:
33 |     task_description = opt.task_description
34 | 
35 | if opt.prompt == '':
36 |     initial_prompt = input("Initial prompt: ")
37 | else:
38 |     initial_prompt = opt.prompt
39 | 
40 | # Initializing the pipeline
41 | pipeline = OptimizationPipeline(config_params, task_description, initial_prompt, output_path=opt.output_dump)
42 | if (opt.load_path != ''):
43 |     pipeline.load_state(opt.load_path)
44 | best_prompt = pipeline.run_pipeline(opt.num_steps)
45 | print('\033[92m' + 'Calibrated prompt score:', str(best_prompt['score']) + '\033[0m')
46 | print('\033[92m' + 'Calibrated prompt:', best_prompt['prompt'] + '\033[0m')
47 | 
48 | 


--------------------------------------------------------------------------------
/utils/config.py:
--------------------------------------------------------------------------------
  1 | import yaml
  2 | from easydict import EasyDict as edict
  3 | from langchain.prompts import PromptTemplate
  4 | from langchain_openai import ChatOpenAI
  5 | from pathlib import Path
  6 | from langchain.llms.huggingface_pipeline import HuggingFacePipeline
  7 | from langchain_openai.chat_models import AzureChatOpenAI
  8 | from langchain.chains import LLMChain
  9 | import logging
 10 | 
 11 | LLM_ENV = yaml.safe_load(open('config/llm_env.yml', 'r'))
 12 | 
 13 | 
 14 | class Color:
 15 |     RED = '\033[91m'
 16 |     GREEN = '\033[92m'
 17 |     YELLOW = '\033[93m'
 18 |     BLUE = '\033[94m'
 19 |     END = '\033[0m'  # Reset to default color
 20 | 
 21 | 
 22 | def get_llm(config: dict):
 23 |     """
 24 |     Returns the LLM model
 25 |     :param config: dictionary with the configuration
 26 |     :return: The llm model
 27 |     """
 28 |     if 'temperature' not in config:
 29 |         temperature = 0
 30 |     else:
 31 |         temperature = config['temperature']
 32 |     if 'model_kwargs' in config:
 33 |         model_kwargs = config['model_kwargs']
 34 |     else:
 35 |         model_kwargs = {}
 36 | 
 37 |     if config['type'].lower() == 'openai':
 38 |         api_base = LLM_ENV['openai']['OPENAI_API_BASE'] if LLM_ENV['openai']['OPENAI_API_BASE'] != '' else 'https://api.openai.com/v1'
 39 |         if LLM_ENV['openai']['OPENAI_ORGANIZATION'] == '':
 40 |             return ChatOpenAI(temperature=temperature, model_name=config['name'],
 41 |                               openai_api_key=config.get('openai_api_key', LLM_ENV['openai']['OPENAI_API_KEY']),
 42 |                               openai_api_base=config.get('openai_api_base', api_base),
 43 |                               model_kwargs=model_kwargs)
 44 |         else:
 45 |             return ChatOpenAI(temperature=temperature, model_name=config['name'],
 46 |                               openai_api_key=config.get('openai_api_key', LLM_ENV['openai']['OPENAI_API_KEY']),
 47 |                               openai_api_base=config.get('openai_api_base', api_base),
 48 |                               openai_organization=config.get('openai_organization', LLM_ENV['openai']['OPENAI_ORGANIZATION']),
 49 |                               model_kwargs=model_kwargs)
 50 |     elif config['type'].lower() == 'azure':
 51 |         return AzureChatOpenAI(temperature=temperature, azure_deployment=config['name'],
 52 |                         openai_api_key=config.get('openai_api_key', LLM_ENV['azure']['AZURE_OPENAI_API_KEY']),
 53 |                         azure_endpoint=config.get('azure_endpoint', LLM_ENV['azure']['AZURE_OPENAI_ENDPOINT']),
 54 |                         openai_api_version=config.get('openai_api_version', LLM_ENV['azure']['OPENAI_API_VERSION']))
 55 | 
 56 |     elif config['type'].lower() == 'google':
 57 |         from langchain_google_genai import ChatGoogleGenerativeAI
 58 |         return ChatGoogleGenerativeAI(temperature=temperature, model=config['name'],
 59 |                               google_api_key=LLM_ENV['google']['GOOGLE_API_KEY'],
 60 |                               model_kwargs=model_kwargs)
 61 |     
 62 |     elif config['type'].lower() == 'anthropic':
 63 |         from langchain_anthropic import ChatAnthropic
 64 |         return ChatAnthropic(temperature=temperature, model=config['name'],
 65 |                               api_key=LLM_ENV['anthropic']['ANTHROPIC_API_KEY'],
 66 |                               model_kwargs=model_kwargs)
 67 | 
 68 | 
 69 |     elif config['type'].lower() == 'huggingfacepipeline':
 70 |         device = config.get('gpu_device', -1)
 71 |         device_map = config.get('device_map', None)
 72 | 
 73 |         return HuggingFacePipeline.from_model_id(
 74 |             model_id=config['name'],
 75 |             task="text-generation",
 76 |             pipeline_kwargs={"max_new_tokens": config['max_new_tokens']},
 77 |             device=device,
 78 |             device_map=device_map
 79 |         )
 80 |     else:
 81 |         raise NotImplementedError("LLM not implemented")
 82 | 
 83 | 
 84 | def load_yaml(yaml_path: str, as_edict: bool = True) -> edict:
 85 |     """
 86 |     Reads the yaml file and enrich it with more vales.
 87 |     :param yaml_path: The path to the yaml file
 88 |     :param as_edict: If True, returns an EasyDict configuration
 89 |     :return: An EasyDict configuration
 90 |     """
 91 |     with open(yaml_path, 'r') as file:
 92 |         yaml_data = yaml.safe_load(file)
 93 |         if 'meta_prompts' in yaml_data.keys() and 'folder' in yaml_data['meta_prompts']:
 94 |             yaml_data['meta_prompts']['folder'] = Path(yaml_data['meta_prompts']['folder'])
 95 |     if as_edict:
 96 |         yaml_data = edict(yaml_data)
 97 |     return yaml_data
 98 | 
 99 | 
def load_prompt(prompt_path: str) -> PromptTemplate:
    """
    Builds a prompt template from the text stored in a prompt file.
    :param prompt_path: The path to the prompt file
    :return: A PromptTemplate created from the file content, with trailing whitespace removed
    """
    with open(prompt_path, 'r') as prompt_file:
        template_text = prompt_file.read()
    return PromptTemplate.from_template(template_text.rstrip())
108 | 
109 | 
def validate_generation_config(base_config, generation_config):
    """
    Checks that a generation configuration is compatible with the base configuration.
    :param base_config: The base configuration; its dataset.label_schema is the reference
    :param generation_config: The generation configuration to validate
    :raises Exception: If the generation config has no 'annotator' entry, or if its dataset label schema
                       is missing or differs from the one of the base config
    """
    if "annotator" not in generation_config:
        raise Exception("Generation config must contain an empty annotator.")

    generation_dataset = generation_config.dataset
    if "label_schema" not in generation_dataset:
        raise Exception("Generation label schema must match the basic config.")
    if base_config.dataset.label_schema != generation_dataset.label_schema:
        raise Exception("Generation label schema must match the basic config.")
116 | 
117 | 
def modify_input_for_ranker(config, task_description, initial_prompt):
    """
    Rewrites the task description and the initial prompt for the ranking task, using the LLM.
    The paths of the two modifier prompts are read from 'prompts/modifiers/modifiers.yml'.
    :param config: The ranker configuration; 'llm' is passed to get_llm and 'dataset.label_schema'
                   is given to the prompt modifier
    :param task_description: The original task description
    :param initial_prompt: The original initial prompt
    :return: A tuple (modified prompt, modified task description)
    """
    # Close the modifiers file as soon as it is parsed instead of leaking the handle
    with open('prompts/modifiers/modifiers.yml', 'r') as modifiers_file:
        modifiers_config = yaml.safe_load(modifiers_file)
    task_desc_setup = load_prompt(modifiers_config['ranker']['task_desc_mod'])
    init_prompt_setup = load_prompt(modifiers_config['ranker']['prompt_mod'])

    # The same LLM serves both rewriting chains
    llm = get_llm(config.llm)
    task_llm_chain = LLMChain(llm=llm, prompt=task_desc_setup)
    task_result = task_llm_chain(
        {"task_description": task_description})
    mod_task_desc = task_result['text']
    logging.info(f"Task description modified for ranking to: \n{mod_task_desc}")

    prompt_llm_chain = LLMChain(llm=llm, prompt=init_prompt_setup)
    prompt_result = prompt_llm_chain({"prompt": initial_prompt, 'label_schema': config.dataset.label_schema})
    mod_prompt = prompt_result['text']
    logging.info(f"Initial prompt modified for ranking to: \n{mod_prompt}")

    return mod_prompt, mod_task_desc
136 | 
137 | 
def override_config(override_config_file, config_file='config/config_default.yml'):
    """
    Override the default configuration file with the override configuration file
    :param override_config_file: The override configuration file
    :param config_file: The default configuration file
    :return: An EasyDict with the merged configuration
    """

    def override_dict(config_dict, override_config_dict):
        """Recursively merge override_config_dict into config_dict (in place) and return config_dict."""
        for key, value in override_config_dict.items():
            # Merge only when both sides are mappings; otherwise the override value replaces whatever
            # the base holds (a missing key, a scalar, or a key left empty in the yaml file).
            if isinstance(value, dict) and isinstance(config_dict.get(key), dict):
                override_dict(config_dict[key], value)
            else:
                config_dict[key] = value
        return config_dict

    # An empty yaml file may be loaded as None; treat it as an empty configuration
    default_config_dict = load_yaml(config_file, as_edict=False) or {}
    override_config_dict = load_yaml(override_config_file, as_edict=False) or {}
    config_dict = override_dict(default_config_dict, override_config_dict)
    return edict(config_dict)
160 | 


--------------------------------------------------------------------------------
/utils/dedup.py:
--------------------------------------------------------------------------------
 1 | import random
 2 | import faiss
 3 | import pandas as pd
 4 | 
 5 | 
 6 | class Dedup:
 7 | 
 8 |     def __init__(self, config=None):
 9 |         self.index = None
10 |         self.xb = None
11 |         self.clusters = None
12 |         self.th = (config or {}).get("dedup_threshold", 0.5)
13 |         self.model_name = (config or {}).get("embeddings_model", 'all-MiniLM-L6-v2')
14 | 
15 |     def copy(self):
16 |         return Dedup(
17 |             {"dedup_threshold": self.th,
18 |              "embeddings_model": self.model_name}
19 |         )
20 | 
21 |     def generate_embeddings(self, texts):
22 |         """
23 |         Generate embeddings for the given texts using the SentenceTransformer model.
24 |         """
25 |         from sentence_transformers import SentenceTransformer
26 |         model = SentenceTransformer(self.model_name)
27 |         embeddings = model.encode(texts, show_progress_bar=True)
28 |         return embeddings
29 | 
30 |     def build_index(self, records):
31 |         """
32 |         Build the FAISS index for the given dataset.
33 |         input: records - a pandas dataframe with a 'text' column
34 |         output: index - the FAISS index
35 |                 embeddings - the embeddings of the dataset
36 |         """
37 |         # Generate embeddings for the dataset
38 |         embeddings = self.generate_embeddings(records['text'].tolist())
39 | 
40 |         # Build the FAISS index
41 |         embeddings_dim = embeddings.shape[1]
42 |         index = faiss.IndexFlatL2(embeddings_dim)
43 |         index.add(embeddings)
44 |         return index, embeddings
45 | 
46 |     def cluster_data(self, records):
47 |         """
48 |         Cluster the given dataset.
49 |         input: records - a pandas dataframe with a 'text' column
50 |         output: clusters - a list of clusters, where each cluster is a set of indices
51 |         """
52 | 
53 |         if self.index is None:
54 |             self.index, self.xb = self.build_index(records)
55 | 
56 |         distances, indices = self.index.search(self.xb, 30) #TODO: dereive it from the batch size
57 | 
58 |         clusters = []
59 |         visited = set()
60 | 
61 |         for i in range(len(self.xb)):
62 |             if i in visited:
63 |                 continue
64 | 
65 |             # Find neighbors and create a new cluster
66 |             neighbors = [idx for idx, distance in zip(indices[i], distances[i]) if distance <= self.th]
67 |             new_cluster = {i}
68 | 
69 |             # Add all neighbors to the new cluster
70 |             for neighbor in neighbors:
71 |                 if neighbor not in visited:
72 |                     visited.add(neighbor)
73 |                     new_cluster.add(neighbor)
74 | 
75 |             clusters.append(new_cluster)
76 |         return clusters
77 | 
78 |     def sample(self, records: pd.DataFrame, operation_function=random.choice):
79 |         """
80 |         Sample the given dataset.
81 |         input: records - a pandas dataframe with a 'text' column
82 |                operation_function - a function that receives a cluster and returns an index
83 |         output: a pandas dataframe with the sampled records
84 |         """
85 | 
86 |         if not callable(operation_function):
87 |             raise ValueError("The 'operation_function' must be a callable function.")
88 | 
89 |         if self.clusters is None:
90 |             self.clusters = self.cluster_data(records)
91 | 
92 |         samples = [operation_function(list(cluster)) for cluster in self.clusters]
93 |         return records.iloc[sorted(samples)]
94 | 


--------------------------------------------------------------------------------
/utils/llm_chain.py:
--------------------------------------------------------------------------------
  1 | from langchain.chains.openai_functions import (
  2 |     create_structured_output_runnable)
  3 | from utils.config import get_llm, load_prompt
  4 | from langchain_community.callbacks import get_openai_callback
  5 | import asyncio
  6 | from langchain.chains import LLMChain
  7 | import importlib
  8 | from pathlib import Path
  9 | from tqdm import trange, tqdm
 10 | import concurrent.futures
 11 | import logging
 12 | 
 13 | 
 14 | class DummyCallback:
 15 |     """
 16 |     A dummy callback for the LLM.
 17 |     This is a trick to handle an empty callback.
 18 |     """
 19 | 
 20 |     def __enter__(self):
 21 |         self.total_cost = 0
 22 |         return self
 23 | 
 24 |     def __exit__(self, exc_type, exc_value, traceback):
 25 |         pass
 26 | 
 27 | 
 28 | def get_dummy_callback():
 29 |     return DummyCallback()
 30 | 
 31 | 
 32 | class ChainWrapper:
 33 |     """
 34 |     A wrapper for a LLM chain
 35 |     """
 36 | 
 37 |     def __init__(self, llm_config, prompt_path: str, json_schema: dict = None, parser_func=None):
 38 |         """
 39 |         Initialize a new instance of the ChainWrapper class.
 40 |         :param llm_config: The config for the LLM
 41 |         :param prompt_path: A path to the prompt file (text file)
 42 |         :param json_schema: A dict for the json schema, to get a structured output for the LLM
 43 |         :param parser_func: A function to parse the output of the LLM
 44 |         """
 45 |         self.llm_config = llm_config
 46 |         self.llm = get_llm(llm_config)
 47 |         self.json_schema = json_schema
 48 |         self.parser_func = parser_func
 49 |         self.prompt = load_prompt(prompt_path)
 50 |         self.build_chain()
 51 |         self.accumulate_usage = 0
 52 |         if self.llm_config.type.lower() == 'openai':
 53 |             self.callback = get_openai_callback
 54 |         else:
 55 |             self.callback = get_dummy_callback
 56 | 
 57 |     def invoke(self, chain_input: dict) -> dict:
 58 |         """
 59 |         Invoke the chain on a single input
 60 |         :param chain_input: The input for the chain
 61 |         :return: A dict with the defined json schema
 62 |         """
 63 |         with self.callback() as cb:
 64 |             try:
 65 |                 result = self.chain.invoke(chain_input)
 66 |                 if self.parser_func is not None:
 67 |                     result = self.parser_func(result)
 68 |             except Exception as e:
 69 |                 if e.http_status == 401:
 70 |                     raise e
 71 |                 else:
 72 |                     logging.error('Error in chain invoke: {}'.format(e.user_message))
 73 |                     result = None
 74 |             self.accumulate_usage += cb.total_cost
 75 |             return result
 76 | 
 77 |     async def retry_operation(self, tasks):
 78 |         """
 79 |         Retry an async operation
 80 |         :param tasks:
 81 |         :return:
 82 |         """
 83 |         delay = self.llm_config.async_params.retry_interval
 84 |         timeout = delay * self.llm_config.async_params.max_retries
 85 | 
 86 |         # Ensure all tasks are explicitly created
 87 |         tasks = [asyncio.create_task(task) for task in tasks]
 88 | 
 89 |         start_time = asyncio.get_event_loop().time()
 90 |         end_time = start_time + timeout
 91 |         results = []
 92 |         while True:
 93 |             remaining_time = end_time - asyncio.get_event_loop().time()
 94 |             if remaining_time <= 0:
 95 |                 print("Timeout reached. Operation incomplete.")
 96 |                 break
 97 | 
 98 |             done, pending = await asyncio.wait(tasks, timeout=delay)
 99 |             results += list(done)
100 | 
101 |             if len(done) == len(tasks):
102 |                 print("All tasks completed successfully.")
103 |                 break
104 | 
105 |             if not pending:
106 |                 print("No pending tasks. Operation incomplete.")
107 |                 break
108 | 
109 |             tasks = list(pending)  # Retry with the pending tasks
110 |         return results
111 | 
112 |     async def async_batch_invoke(self, inputs: list[dict]) -> list[dict]:
113 |         """
114 |         Invoke the chain on a batch of inputs in async mode
115 |         :param inputs: A batch of inputs
116 |         :return: A list of dicts with the defined json schema
117 |         """
118 |         with self.callback() as cb:
119 |             tasks = [self.chain.ainvoke(chain_input) for chain_input in inputs]
120 |             all_res = await self.retry_operation(tasks)
121 |             self.accumulate_usage += cb.total_cost
122 |             if self.parser_func is not None:
123 |                 return [self.parser_func(t.result()) for t in list(all_res)]
124 |             return [t.result() for t in list(all_res)]
125 | 
126 |     def batch_invoke(self, inputs: list[dict], num_workers: int):
127 |         """
128 |         Invoke the chain on a batch of inputs either async or not
129 |         :param inputs: The list of all inputs
130 |         :param num_workers: The number of workers
131 |         :return: A list of results
132 |         """
133 | 
134 |         def sample_generator():
135 |             for sample in inputs:
136 |                 yield sample
137 | 
138 |         def process_sample_with_progress(sample):
139 |             result = self.invoke(sample)
140 |             if self.llm_config.type == 'google':
141 |                 if isinstance(result, list) and len(result) == 1:
142 |                     result = result[0]['args']
143 |             pbar.update(1)  # Update the progress bar
144 |             return result
145 | 
146 |         if not ('async_params' in self.llm_config.keys()):  # non async mode, use regular workers
147 |             with concurrent.futures.ThreadPoolExecutor(max_workers=num_workers) as executor:
148 |                 with tqdm(total=len(inputs), desc="Processing samples") as pbar:
149 |                     all_results = list(executor.map(process_sample_with_progress, sample_generator()))
150 |         else:
151 |             all_results = []
152 |             for i in trange(0, len(inputs), num_workers, desc='Predicting'):
153 |                 results = asyncio.run(self.async_batch_invoke(inputs[i:i + num_workers]))
154 |                 all_results += results
155 |         all_results = [res for res in all_results if res is not None]
156 |         return all_results
157 | 
158 |     def build_chain(self):
159 |         """
160 |         Build the chain according to the LLM type
161 |         """
162 |         if (self.llm_config.type.lower() in ['openai', 'azure', 'anthropic', 'google']) and self.json_schema is not None:
163 |             self.chain = self.prompt | self.llm.with_structured_output(self.json_schema)
164 |         else:
165 |             self.chain = LLMChain(llm=self.llm, prompt=self.prompt)
166 | 
167 | 
def get_chain_metadata(prompt_fn: Path, retrieve_module: bool = False) -> dict:
    """
    Get the metadata of the chain.
    The metadata is looked up in the 'output_schemes.py' file located next to the prompt file:
    '<prompt name>_schema' is the json schema and '<prompt name>_parser' is the parser function.
    :param prompt_fn: The path to the prompt file
    :param retrieve_module: If True, retrieve the module
    :return: A dict with the metadata: 'json_schema' and 'parser_func' (None when not defined, or when the
             module failed to import) and, if requested, 'module' (None when the import failed)
    """
    # 'import importlib' alone does not guarantee that the 'util' submodule is loaded
    import importlib.util

    prompt_directory = str(prompt_fn.parent)
    prompt_name = str(prompt_fn.stem)
    # Stays None when the import fails, so the lookups below fall back to 'not defined'
    schema_parser = None
    try:
        spec = importlib.util.spec_from_file_location('output_schemes', prompt_directory + '/output_schemes.py')
        loaded_module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(loaded_module)
        schema_parser = loaded_module
    except ImportError as e:
        print(f"Error loading module {prompt_directory + '/output_schemes'}: {e}")

    json_schema = getattr(schema_parser, '{}_schema'.format(prompt_name), None)
    parser_func = getattr(schema_parser, '{}_parser'.format(prompt_name), None)
    result = {'json_schema': json_schema, 'parser_func': parser_func}
    if retrieve_module:
        result['module'] = schema_parser
    return result
196 | 
197 | 
class MetaChain:
    """
    A wrapper for the meta-prompts chain
    """

    def __init__(self, config):
        """
        Initialize a new instance of the MetaChain class. Loading all the meta-prompts
        :param config: An EasyDict configuration
        """
        self.config = config
        self.initial_chain = self.load_chain('initial')
        self.step_prompt_chain = self.load_chain('step_prompt')
        self.step_samples = self.load_chain('step_samples')
        self.error_analysis = self.load_chain('error_analysis')

    def load_chain(self, chain_name: str) -> "ChainWrapper":
        """
        Load a chain according to the chain name
        :param chain_name: The name of the chain; '<chain_name>.prompt' is read from the meta-prompts folder
        :return: A ChainWrapper built with the LLM config and the schema/parser found for this prompt
        """
        prompt_path = self.config.meta_prompts.folder / '{}.prompt'.format(chain_name)
        metadata = get_chain_metadata(prompt_path)
        return ChainWrapper(self.config.llm, prompt_path, metadata['json_schema'], metadata['parser_func'])

    def calc_usage(self) -> float:
        """
        Calculate the usage of all the meta-prompts
        :return: The total usage value, summed over all four chains
        """
        # The error-analysis chain is invoked through the same wrapper, so its cost counts as well
        chains = (self.initial_chain, self.step_prompt_chain, self.step_samples, self.error_analysis)
        return sum(chain.accumulate_usage for chain in chains)
230 | 


--------------------------------------------------------------------------------