├── .flake8 ├── .gitignore ├── .mypy.ini ├── .pre-commit-config.yaml ├── LICENSE ├── README.md ├── docs ├── Makefile ├── publish.sh └── source │ ├── README.md │ ├── api.rst │ ├── cli.rst │ ├── conf.py │ └── index.rst ├── examples ├── __pycache__ │ └── use_cases.cpython-38.pyc ├── python_examples.py ├── readme_examples.py └── readme_hello_world.py ├── promptimize ├── __init__.py ├── cli.py ├── crawler.py ├── evals.py ├── prompt_cases.py ├── reports.py ├── simple_jinja.py ├── suite.py └── utils.py ├── publish.sh ├── pyproject.toml ├── requirements-dev.txt ├── requirements-examples.txt ├── requirements.txt └── setup.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | exclude = tests/* 4 | max-complexity = 10 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *__pycache__* 2 | build/ 3 | dist/ 4 | work/* 5 | promptimize.egg-info/ 6 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | ignore_missing_imports = True 3 | exclude = ['docs/'] 4 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | repos: 18 | #- repo: https://github.com/pre-commit/mirrors-mypy 19 | # rev: v0.941 20 | # hooks: 21 | # - id: mypy 22 | # additional_dependencies: [types-all] 23 | - repo: https://github.com/pre-commit/pre-commit-hooks 24 | rev: v4.4.0 25 | hooks: 26 | - id: check-added-large-files 27 | exclude: \.(geojson)$ 28 | - id: check-yaml 29 | exclude: ^helm/superset/templates/ 30 | - id: debug-statements 31 | - id: end-of-file-fixer 32 | - id: trailing-whitespace 33 | args: ["--markdown-linebreak-ext=md"] 34 | - repo: https://github.com/psf/black 35 | rev: 23.9.1 36 | hooks: 37 | - id: black 38 | language_version: python3 39 | - repo: https://github.com/pre-commit/mirrors-prettier 40 | rev: v3.0.3 # Use the sha or tag you want to point at 41 | hooks: 42 | - id: prettier 43 | args: ["--ignore-path=./superset-frontend/.prettierignore"] 44 | files: "superset-frontend" 45 | # blacklist unsafe functions like make_url (see #19526) 46 | - repo: https://github.com/skorokithakis/blacklist-pre-commit-hook 47 | rev: e2f070289d8eddcaec0b580d3bde29437e7c8221 48 | hooks: 49 | - id: blacklist 50 | args: ["--blacklisted-names=make_url", "--ignore=tests/"] 51 | - repo: https://github.com/pycqa/flake8 52 | rev: '6.1.0' 53 | hooks: 54 | - id: flake8 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. 
For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 203 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 💡 ¡promptimize! 💡 2 | [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) 3 | [![PyPI version](https://badge.fury.io/py/promptimize.svg)](https://badge.fury.io/py/promptimize) 4 | 5 | 6 | 7 | Promptimize is a prompt engineering **evaluation** and **testing** toolkit. 
8 | 
9 | It accelerates and provides structure around prompt engineering at scale
10 | with confidence, bringing some of the ideas behind test-driven
11 | development (TDD) to engineering prompts.
12 | 
13 | With promptimize, you can:
14 | 
15 | - Define your "prompt cases" (think "test cases" but specific to evaluating
16 |   prompts) as code and associate them with evaluation functions
17 | - Generate prompt variations dynamically
18 | - Execute and rank prompt test suites across different
19 |   engines/models/temperature/settings and compare results, bringing
20 |   the hyperparameter tuning mindset to prompt engineering
21 | - Get reports on your prompts' performance as you iterate. Answer questions
22 |   about how different prompt suites are performing against one another.
23 |   Which individual cases or categories of cases improved? Which regressed?
24 | - Minimize API calls! Only re-assess what changed as you change it
25 | - Perform human review if and where needed, introspect failed cases, and override
26 |   false negatives
27 | 
28 | In essence, promptimize provides a programmatic way to execute and fine-tune
29 | your prompts and evaluation functions in Python, allowing you to iterate
30 | quickly and with confidence.
31 | 
32 | ## Hello world - the simplest prompt examples
33 | [more examples on GitHub](https://github.com/preset-io/promptimize/tree/main/examples)
34 | ```python
35 | # Bringing some "prompt generator" classes - note that you can derive and extend those
36 | from promptimize.prompt_cases import PromptCase
37 | 
38 | # Bringing some useful eval functions that help evaluate and score responses
39 | # eval functions have a handle on the prompt object and are expected
40 | # to return a score between 0 and 1
41 | from promptimize import evals
42 | 
43 | # Promptimize will scan the target folder and find all Prompt objects
44 | # and derivatives that are in the python modules
45 | simple_prompts = [
46 | 
47 |     # Prompting "hello there" and making sure there's "hi" or "hello"
48 |     # somewhere in the answer
49 |     PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
50 |     PromptCase(
51 |         "name the top 50 guitar players!", lambda x: evals.all_words(x.response, ["frank zappa"])
52 |     ),
53 | ]
54 | ```
55 | 
56 | ### The CLI's `run` command
57 | ```
58 | $ promptimize run --help
59 | Usage: promptimize run [OPTIONS] PATH
60 | 
61 |   run some prompts
62 | 
63 | Options:
64 |   -v, --verbose             Trigger more verbose output
65 |   -f, --force               Force run, do not skip
66 |   -h, --human               Human review, allowing a human to review and force
67 |                             pass/fail each prompt case
68 |   -r, --repair              Only re-run previously failed
69 |   -x, --dry-run             DRY run, don't call the API
70 |   --shuffle                 Shuffle the prompts in a random order
71 |   -s, --style [json|yaml]   json or yaml formatting
72 |   -m, --max-tokens INTEGER  max_tokens passed to the model
73 |   -l, --limit INTEGER       limit how many prompt cases to run in a single
74 |                             batch
75 |   -t, --temperature FLOAT   the temperature passed to the model
76 |   -e, --engine TEXT         model as accepted by the openai API
77 |   -k, --key TEXT            The keys to run
78 |   -o, --output PATH
79 |   -s, --silent
80 | ```
81 | 
82 | Let's run those examples and produce a report `./report.yaml`
83 | ```
84 | $ promptimize run examples/ --output ./report.yaml
85 | ```
86 | 
87 | ```yaml
88 | 💡 ¡promptimize! 💡
89 | # ----------------------------------------
90 | # (1/2) [RUN] prompt: prompt-115868ef
91 | # ----------------------------------------
92 | key: prompt-115868ef
93 | user_input: hello there!
94 | prompt_hash: 115868ef
95 | response: Hi there! How are you doing today?
96 | execution:
97 |   api_call_duration_ms: 883.8047981262207
98 |   run_at: '2023-04-25T02:21:40.443077'
99 | score: 1.0
100 | 
101 | # ----------------------------------------
102 | # (2/2) [RUN] prompt: prompt-5c085656
103 | # ----------------------------------------
104 | key: prompt-5c085656
105 | user_input: name the top 10 guitar players!
106 | prompt_hash: 5c085656
107 | response: |-
108 |   1. Jimi Hendrix
109 |   2. Eric Clapton
110 |   {{ ... }}
111 |   11. Carlos Santana
112 | weight: 2
113 | execution:
114 |   api_call_duration_ms: 2558.135747909546
115 |   run_at: '2023-04-25T02:21:43.007529'
116 | score: 0.0
117 | 
118 | # ----------------------------------------
119 | # Suite summary
120 | # ----------------------------------------
121 | suite_score: 0.3333333333333333
122 | git_info:
123 |   sha: 2cf28498ba0f
124 |   branch: main
125 |   dirty: true
126 | ```
127 | 
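The report written with `--output` can also be reloaded and summarized after the fact. Here's a minimal sketch of doing that in Python; it mirrors what the CLI's `report` subcommand does (see `promptimize/cli.py`), with `"category"` being the default grouping column:

```python
# Minimal sketch, mirroring the `promptimize report` CLI command
from promptimize.reports import Report

report = Report.from_path("./report.yaml")
report.print_summary("category")  # aggregate scores, grouped by prompt category
```
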
128 | ## Problem + POV
129 | 
130 | Thousands of product builders are currently trying to figure out how to
131 | bring the power of AI into the products and experiences they are building.
132 | The probabilistic (often semi-random, sometimes hectic) nature of LLMs
133 | makes this a challenge.
134 | 
135 | Prompt engineering is a huge piece of the puzzle in terms of how to do this
136 | right, especially given the complexity, risks, and drawbacks around
137 | model tuning.
138 | 
139 | We believe product builders need to tame AI through proper, rigorous
140 | **prompt engineering**. This allows making the probabilistic nature of
141 | AI more deterministic, or somewhat predictable, and allows builders to apply
142 | a hyperparameter tuning-type mindset and approach to prompt engineering.
143 | 
144 | Any prompt-generator logic that's going to be let loose in the wild inside
145 | a product should be thoroughly tested and evaluated with "prompt cases" that
146 | cover the breadth of what people may do in a product.
147 | 
148 | In short, Promptimize allows you to test prompts at industrial scale,
149 | so that you can confidently use them in the products you are building.
150 | 
151 | ## Information Architecture
152 | 
153 | - **Prompt:** A Prompt instance is a certain test case, a single prompt
154 |   with an associated set of evaluation functions to rate its success.
155 | - **Evaluation:** An evaluation function that reads the response and returns
156 |   a success rate between `0` and `1`.
157 | - **Suite:** A Suite is a collection of Prompts; it's able to run things,
158 |   accumulate results, and print reports about its collection of use cases.
159 | - **Report**: a report is the compiled results of running a certain prompt
160 |   `Suite` or set of suites. Reports can be consumed, compared, and expanded.
161 | 
162 | ## Principles
163 | 
164 | - **Configuration as code:** All prompt cases, suites, and evaluations are
165 |   defined as code, which makes it easy to dynamically generate all sorts
166 |   of use cases and suites.
167 | - **Expressive**: a clean DSL that's to-the-point -> user prompt + assertions.
168 |   The actual prompt creation logic lives in the derivative class of `PromptCase`,
169 |   so that we can have clean, dense files that contain nice `Suite`s
170 | - **Support the iteration mindset:** making it easy for people to try things,
171 |   get suggestions from the AI, adapt, compare, and push forward
172 | - **Extensibility:** the toolkit is designed to be extremely hackable and
173 |   extensible. Hooks, extensions, high API surface.
174 | - **AI-powered:** the framework offers ways to expand your suites based
175 |   on the examples that exist. Use AI to generate more prompt cases!
176 | 
177 | 
178 | ## Interesting features / facts
179 | 
180 | Listing out a few features you should know about that you can start using as your
181 | suites of prompts become larger / more complex. A short sketch follows the list.
182 | 
183 | * evaluation functions are assumed to return a value between 0 and 1;
184 |   unlike unit tests, prompt cases aren't boolean
185 | * prompts can be assigned a `weight` (default 1); this enables you to define
186 |   which prompts are more important than others for reporting purposes and suite evaluation
187 | * prompts can be assigned a `category`, which can be used in the reporting.
188 |   That helps in understanding which categories are performing better than
189 |   others, or are most affected by iterations
190 | * The `Prompt` class offers `pre_run` and `post_run` hooks if you want to do
191 |   post-processing, for instance. An example of that would be a prompt
192 |   that expects GPT to generate code, where you'd like to actually run that code
193 |   and test it. In our SQL implementation, for instance, we run the SQL against the database,
194 |   get a pandas dataframe back, and allow doing assertions
195 |   on the dataframe itself
196 | 
197 | 
198 | 
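Here is a rough sketch putting the features above together. The `weight` argument mirrors the bundled `examples/readme_hello_world.py`; treating `category` as a constructor keyword and the exact `post_run` contract are assumptions based on the descriptions above, so check `promptimize/prompt_cases.py` for the real signatures:

```python
from promptimize.prompt_cases import PromptCase
from promptimize import evals


class CodeGeneratingPrompt(PromptCase):
    """Hypothetical derivative using the post_run hook for post-processing."""

    def post_run(self):
        # e.g. extract the generated code from self.response, execute it in a
        # sandbox, and stash the result on `self` so evaluators can assert on it
        self.result = None


prompts = [
    PromptCase(
        "say hello in French!",
        lambda x: evals.any_word(x.response, ["bonjour", "salut"]),
        weight=2,  # counts double in suite-level scoring
        category="greetings",  # assumed kwarg; lets reports break scores down per category
    ),
]
```
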
199 | ## Getting started
200 | 
201 | To install the Promptimize package, use the following command:
202 | ```bash
203 | pip install promptimize
204 | ```
205 | 
206 | First you'll need an openai API key; let's set it as an env var
207 | ```bash
208 | export OPENAI_API_KEY=sk-{{ REDACTED }}
209 | ```
210 | 
211 | Find the examples executed below
212 | [here](https://github.com/preset-io/promptimize/blob/main/examples/readme_examples.py)
213 | 
214 | ```bash
215 | # Clone the repo
216 | git clone git@github.com:preset-io/promptimize.git
217 | cd promptimize
218 | 
219 | # NOTE: the CLI is `promptimize`, but `p9e` is a shorter synonym; they can be used interchangeably
220 | # First let's run some of the examples
221 | p9e run ./examples
222 | 
223 | # Now the same but with verbose output
224 | p9e run ./examples --verbose --output ./report.yaml
225 | 
226 | ```
227 | ## Langchain
228 | 
229 | How does promptimize relate to `langchain`?
230 | 
231 | We think langchain is amazing. Promptimize uses langchain under the
232 | hood to interact with openai, and has integrations with langchain
233 | (see `LangchainPromptCase`, and the upcoming `LangchainChainPromptCase`
234 | and `LangchainAgntPromptCase`).
235 | That said, you don't have to use
236 | langchain: you can use promptimize on top of any Python prompt generation,
237 | whether it's another library or some homegrown solution.
238 | 
239 | 
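For instance, here's a minimal sketch of wrapping a langchain `PromptTemplate` in a `LangchainPromptCase`, following the pattern used in `examples/python_examples.py`; the template and evaluator below are made up for illustration:

```python
from langchain import PromptTemplate

from promptimize.prompt_cases import LangchainPromptCase
from promptimize import evals

lc_template = PromptTemplate(
    input_variables=["user_input"],
    template="Translate the following to French: {user_input}",
)

prompt_case = LangchainPromptCase(
    lc_template,
    user_input="good morning!",
    evaluators=[lambda x: evals.any_word(x.response, ["bonjour", "matin"])],
)
```
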
240 | ## Context
241 | 
242 | 
243 | 
244 | Where is `promptimize` coming from!? I'm (Maxime Beauchemin) a startup
245 | founder at Preset working on bringing AI to BI
246 | (data exploration
247 | and visualization). At Preset, we use `promptimize` to generate
248 | complex SQL based on natural language, and to suggest charts to users. We
249 | derive the `SimpleQuery` class to fit our specific use
250 | cases in our own prompt engineering repo. It's not my first open source project
251 | as the creator of
252 | [Apache Superset](https://github.com/apache/superset/) and
253 | [Apache Airflow](https://github.com/apache/airflow/)
254 | 
255 | 
256 | ## Contribute
257 | 
258 | This project is in its super early stages as of `0.2.0`, and contributions,
259 | contributors, and maintainers are highly encouraged. While it's a great time
260 | to onboard and influence the direction of the project, things are still
261 | evolving quickly. To get involved, open a GitHub issue
262 | or submit a pull request!
263 | 
264 | ## Links
265 | * [Blog - Mastering AI-Powered Product Development: Introducing Promptimize for Test-Driven Prompt Engineering](https://preset.io/blog/introducing-promptimize/)
266 | * [Preset Blog](https://preset.io/blog/)
267 | * [Promptimize DOCS](https://preset-io.github.io/promptimize/)
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 | 
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 | 
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 | 
15 | .PHONY: help Makefile
16 | 
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | 	cp $(SOURCEDIR)/../../README.md $(SOURCEDIR)/
21 | 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 | 
--------------------------------------------------------------------------------
/docs/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | 
3 | # Clean up the build/html directory
4 | rm -rf build/html
5 | 
6 | # Build the documentation
7 | make html
8 | 
9 | # Change to the build/html directory
10 | cd build/html
11 | touch .nojekyll
12 | 
13 | # Initialize a new Git repository
14 | git init
15 | 
16 | # Add the generated files to the repository
17 | git add .
18 | 
19 | # Commit the changes
20 | git commit -a -m "Initial commit"
21 | 
22 | # Add the GitHub Pages remote repository
23 | git remote add origin https://github.com/preset-io/promptimize.git
24 | 
25 | # Push the changes to the GitHub Pages repository
26 | git push -f origin main:gh-pages
27 | 
28 | # Return to the original directory
29 | cd ../..
30 | 
--------------------------------------------------------------------------------
/docs/source/README.md:
--------------------------------------------------------------------------------
1 | # 💡 ¡promptimize! 💡
2 | 
3 | 
4 | Promptimize is a prompt engineering evaluation and testing toolkit.
5 | 
6 | It accelerates and provides structure around prompt engineering at scale
7 | with confidence, bringing some of the ideas behind test-driven
8 | development (TDD) to engineering prompts.
9 | 
10 | With promptimize, you can:
11 | 
12 | - Define your "prompt cases" (think "test cases" but specific to evaluating
13 |   prompts) as code and associate them with evaluation functions
14 | - Generate prompt variations dynamically
15 | - Execute and rank prompt test suites across different
16 |   engines/models/temperature/settings and compare results, bringing
17 |   the hyperparameter tuning mindset to prompt engineering
18 | - Get reports on your prompts' performance as you iterate. Answer questions
19 |   about how different prompt suites are performing against one another.
20 |   Which individual cases or categories of cases improved? Which regressed?
21 | - Minimize API calls! Only re-assess what changed as you change it
22 | - Perform human review if and where needed, introspect failed cases, and override
23 |   false negatives
24 | 
25 | In essence, promptimize provides a programmatic way to execute and fine-tune
26 | your prompts and evaluation functions in Python, allowing you to iterate
27 | quickly and with confidence.
28 | 
29 | ## Hello world - the simplest prompt examples
30 | [more examples on GitHub](https://github.com/preset-io/promptimize/tree/master/examples)
31 | ```python
32 | # Bringing some "prompt generator" classes - note that you can derive and extend those
33 | from promptimize.prompt_cases import PromptCase
34 | 
35 | # Bringing some useful eval functions that help evaluate and score responses
36 | # eval functions have a handle on the prompt object and are expected
37 | # to return a score between 0 and 1
38 | from promptimize import evals
39 | 
40 | # Promptimize will scan the target folder and find all Prompt objects
41 | # and derivatives that are in the python modules
42 | simple_prompts = [
43 | 
44 |     # Prompting "hello there" and making sure there's "hi" or "hello"
45 |     # somewhere in the answer
46 |     PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
47 |     PromptCase(
48 |         "name the top 50 guitar players!", lambda x: evals.all_words(x.response, ["frank zappa"])
49 |     ),
50 | ]
51 | ```
52 | 
53 | ### The CLI
54 | ```bash
55 | $ promptimize -h
56 | ```
57 | 
58 | 
59 | ## Problem + POV
60 | 
61 | Thousands of product builders are currently trying to figure out how to
62 | bring the power of AI into the products and experiences they are building.
63 | The probabilistic (often semi-random, sometimes hectic) nature of LLMs
64 | makes this a challenge.
65 | 
66 | Prompt engineering is a huge piece of the puzzle in terms of how to do this
67 | right, especially given the complexity, risks, and drawbacks around
68 | model tuning.
69 | 
70 | We believe product builders need to tame AI through proper, rigorous
71 | **prompt engineering**. This allows making the probabilistic nature of
72 | AI more deterministic, or somewhat predictable, and allows builders to apply
73 | a hyperparameter tuning-type mindset and approach to prompt engineering.
74 | 
75 | Any prompt-generator logic that's going to be let loose in the wild inside
76 | a product should be thoroughly tested and evaluated with "prompt cases" that
77 | cover the breadth of what people may do in a product.
78 | 
79 | In short, Promptimize allows you to test prompts at industrial scale,
80 | so that you can confidently use them in the products you are building.
81 | 
82 | ## Information Architecture
83 | 
84 | - **Prompt:** A Prompt instance is a certain test case, a single prompt
85 |   with an associated set of evaluation functions to rate its success.
86 | - **Evaluation:** An evaluation function that reads the response and returns
87 |   a success rate between `0` and `1`.
88 | - **Suite:** A Suite is a collection of Prompts; it's able to run things,
89 |   accumulate results, and print reports about its collection of use cases.
90 | - **Report**: a report is the compiled results of running a certain prompt
91 |   `Suite` or set of suites. Reports can be consumed, compared, and expanded.
92 | 
93 | ## Principles
94 | 
95 | - **Configuration as code:** All prompt cases, suites, and evaluations are
96 |   defined as code, which makes it easy to dynamically generate all sorts
97 |   of use cases and suites.
98 | - **Expressive**: a clean DSL that's to-the-point -> user prompt + assertions.
99 |   The actual prompt creation logic lives in the derivative class of `PromptCase`,
100 |   so that we can have clean, dense files that contain nice `Suite`s
101 | - **Support the iteration mindset:** making it easy for people to try things,
102 |   get suggestions from the AI, adapt, compare, and push forward
103 | - **Extensibility:** the toolkit is designed to be extremely hackable and
104 |   extensible. Hooks, extensions, high API surface.
105 | - **AI-powered:** the framework offers ways to expand your suites based
106 |   on the examples that exist. Use AI to generate more prompt cases!
107 | 
108 | 
109 | ## Interesting features / facts
110 | 
111 | Listing out a few features you should know about that you can start using as your
112 | suites of prompts become larger / more complex
113 | 
114 | * evaluation functions are assumed to return a value between 0 and 1;
115 |   unlike unit tests, prompt cases aren't boolean
116 | * prompts can be assigned a `weight` (default 1); this enables you to define
117 |   which prompts are more important than others for reporting purposes and suite evaluation
118 | * prompts can be assigned a `category`, which can be used in the reporting.
119 |   That helps in understanding which categories are performing better than
120 |   others, or are most affected by iterations
121 | * The `Prompt` class offers `pre_run` and `post_run` hooks if you want to do
122 |   post-processing, for instance. An example of that would be a prompt
123 |   that expects GPT to generate code, where you'd like to actually run that code
124 |   and test it. In our SQL implementation, for instance, we run the SQL against the database,
125 |   get a pandas dataframe back, and allow doing assertions
126 |   on the dataframe itself
127 | 
128 | 
129 | ## Setup
130 | 
131 | To install the Promptimize package, use the following command:
132 | ```bash
133 | pip install promptimize
134 | ```
135 | 
136 | ## Getting started
137 | 
138 | First you'll need an openai API key; let's set it as an env var
139 | ```bash
140 | export OPENAI_API_KEY=sk-{REDACTED}
141 | ```
142 | 
143 | Find the examples below [here](https://github.com/preset-io/promptimize/blob/master/examples/readme_examples.py)
144 | 
145 | ```python
146 | ```
147 | ```bash
148 | # NOTE: the CLI is `promptimize`, but `p9e` is a shorter synonym; they can be used interchangeably
149 | 
150 | # First let's run some of the examples
151 | p9e run ./examples
152 | 
153 | # Now the same but with verbose output
154 | p9e run ./examples --verbose
155 | 
156 | ```
157 | ## Langchain?
158 | 
159 | How does promptimize relate to `langchain`?
160 | 
161 | We think langchain is amazing. Promptimize uses langchain under the
162 | hood to interact with openai, and has integrations with langchain
163 | (see `LangchainPromptCase`, and the upcoming `LangchainChainPromptCase`
164 | and `LangchainAgntPromptCase`).
165 | That said, you don't have to use
166 | langchain: you can use promptimize on top of any Python prompt generation,
167 | whether it's another library or some homegrown solution.
168 | 
169 | 
170 | ## Context
171 | 
172 | 
173 | 
174 | Where is `promptimize` coming from!? I'm (Maxime Beauchemin) a startup
175 | founder at Preset working on bringing AI to BI
176 | (data exploration
177 | and visualization). At Preset, we use `promptimize` to generate
178 | complex SQL based on natural language, and to suggest charts to users. We
179 | derive the `SimpleQuery` class to fit our specific use
180 | cases in our own prompt engineering repo.
Not my first open source project 181 | as the creator of 182 | [Apache Superset](https://github.com/apache/superset/) and 183 | [Apache Airflow](https://github.com/apache/airflow/) 184 | 185 | 186 | ## Contribute 187 | 188 | This project is in its super early stages as of `0.1.0`, and contributions, 189 | contributors, and maintainers are highly encouraged. While it's a great time 190 | to onboard and influence the direction of the project, things are still 191 | evolving quickly. To get involved, open a GitHub issue 192 | or submit a pull request! 193 | 194 | ## Links 195 | * [Blog - Mastering AI-Powered Product Development: Introducing Promptimize for Test-Driven Prompt Engineering](https://preset.io/blog/) 196 | * [Preset Blog](https://preset.io/blog/) 197 | -------------------------------------------------------------------------------- /docs/source/api.rst: -------------------------------------------------------------------------------- 1 | API 2 | ============= 3 | 4 | PromptCase 5 | ---------- 6 | 7 | .. automodule:: promptimize.prompt_cases 8 | :members: 9 | 10 | Suite 11 | ----- 12 | 13 | .. automodule:: promptimize.suite 14 | :members: 15 | 16 | Report 17 | ------ 18 | 19 | .. automodule:: promptimize.reports 20 | :members: 21 | 22 | Evaluators 23 | ---------- 24 | 25 | .. automodule:: promptimize.evals 26 | :members: 27 | 28 | Utils 29 | ----- 30 | 31 | .. automodule:: promptimize.utils 32 | :members: 33 | -------------------------------------------------------------------------------- /docs/source/cli.rst: -------------------------------------------------------------------------------- 1 | The CLI 2 | ======= 3 | 4 | .. click:: promptimize:cli 5 | :prog: promptimize 6 | :show-nested: 7 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | from typing import List 9 | 10 | project = "promptimize" 11 | copyright = "2023, Maxime Beauchemin" 12 | author = "Maxime Beauchemin" 13 | 14 | # -- General configuration --------------------------------------------------- 15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 16 | 17 | extensions: List = [ 18 | "sphinx.ext.autodoc", 19 | "sphinx.ext.napoleon", # Optional, for Google and NumPy-style docstrings 20 | "recommonmark", # If you're using the recommonmark extension 21 | "sphinx_click.ext", 22 | ] 23 | 24 | 25 | templates_path = ["_templates"] 26 | exclude_patterns: List = [] 27 | 28 | 29 | # -- Options for HTML output ------------------------------------------------- 30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 31 | 32 | html_theme = "alabaster" 33 | html_logo = ( 34 | "https://user-images.githubusercontent.com/" 35 | "487433/229948453-36cbc2d1-e71f-4e87-9111-ab428bc96f4c.png" 36 | ) 37 | html_static_path = ["_static"] 38 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. 
include:: README.md
2 |    :parser: markdown
3 | 
4 | .. toctree::
5 |    :maxdepth: 2
6 |    :caption: Content
7 | 
8 |    api
9 |    cli
10 | 
--------------------------------------------------------------------------------
/examples/__pycache__/use_cases.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preset-io/promptimize/9c1ffb32444ea69eae158d2cff72786ee07db1bd/examples/__pycache__/use_cases.cpython-38.pyc
--------------------------------------------------------------------------------
/examples/python_examples.py:
--------------------------------------------------------------------------------
1 | """An example of how to test Python code generating prompts"""
2 | import re
3 | 
4 | # Bringing some "prompt generator" classes
5 | from promptimize.prompt_cases import LangchainPromptCase
6 | 
7 | # Bringing some useful eval functions that help evaluate and score responses
8 | # eval functions have a handle on the prompt object and are expected
9 | # to return a score between 0 and 1
10 | from langchain import PromptTemplate
11 | from langchain.output_parsers import StructuredOutputParser, ResponseSchema
12 | import demjson
13 | 
14 | from RestrictedPython import compile_restricted, safe_globals, safe_builtins
15 | from RestrictedPython.Guards import guarded_unpack_sequence
16 | from RestrictedPython.Eval import default_guarded_getiter
17 | 
18 | 
19 | response_schemas = [
20 |     ResponseSchema(
21 |         name="python_function",
22 |         description="the python function itself",
23 |     ),
24 |     ResponseSchema(
25 |         name="function_name",
26 |         description="the name of the function",
27 |     ),
28 |     ResponseSchema(name="test_cases", description="test cases"),
29 |     ResponseSchema(
30 |         name="hints",
31 |         description="if any, any recommendations to the users about clarifying their prompt",
32 |     ),
33 | ]
34 | 
35 | output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
36 | format_instructions = output_parser.get_format_instructions().replace("\t", " ")
37 | """
38 | * you include great useful docstrings and doctests that follow the Google conventions
39 | """
40 | 
41 | template = """\
42 | System: you are an AI that writes python functions that accomplish specific tasks
43 | 
44 | Python guidelines:
45 | * you follow the PEP8 conventions
46 | * use 4 spaces indent, no tabs!
47 | * use snake case (using underscores) 48 | 49 | The output should be a VALID JSON blob with the following keys: 50 | * "python_function" as a string with the python function code 51 | * "function_name" as the name of the function 52 | * "hints": as some hints about how to use the function 53 | 54 | User: write a function that multipllies a number by 2 and returns the result 55 | 56 | System: 57 | { 58 | "python_function": "def multiply_by_two(number):\\n return number * 2\\n" 59 | "function_name": "multiply_by_two", 60 | "hints": "This function is not that helpful as you can simply mulitply by two\\ninstead of calling this function" 61 | } 62 | 63 | User: {{ user_input }} 64 | 65 | System: 66 | """ # noqa 67 | 68 | lc_template = PromptTemplate( 69 | input_variables=["user_input"], 70 | partial_variables={"format_instructions": format_instructions}, 71 | template=template, 72 | template_format="jinja2", 73 | ) 74 | 75 | 76 | def function_from_string(function_as_string, function_name): 77 | restricted_code = compile_restricted(function_as_string, "", "exec") 78 | 79 | # Define a separate environment for the code to run in 80 | execution_globals = safe_globals.copy() 81 | execution_globals.update( 82 | { 83 | "__builtins__": safe_builtins, 84 | "_unpack_sequence_": guarded_unpack_sequence, 85 | "_getiter_": default_guarded_getiter, 86 | } 87 | ) 88 | 89 | # Execute the code in the restricted environment 90 | exec(restricted_code, execution_globals) 91 | 92 | # Access the function from the restricted environment 93 | return execution_globals[function_name] 94 | 95 | 96 | def test(func, args, expected_result): 97 | if func: 98 | if not isinstance(args, (list, tuple)): 99 | args = [args] 100 | try: 101 | result = func(*args) 102 | if expected_result == result: 103 | return 1 104 | except Exception: 105 | return 0 106 | return 0 107 | 108 | 109 | def decode_shitty_json(s): 110 | json_match = re.search(r"\{[\s\S]*\}", s) 111 | 112 | if json_match: 113 | json_string = json_match.group() 114 | 115 | # Parse the JSON string using demjson 116 | json_data = demjson.decode(json_string) 117 | 118 | return json_data 119 | return None 120 | 121 | 122 | def test_is_prime(prompt_case, val, exp): 123 | return test(prompt_case.python_function, val, exp) 124 | 125 | 126 | class PythonGeneratorPrompt(LangchainPromptCase): 127 | def post_run(self): 128 | success = False 129 | self.python_function = None 130 | self.f = None 131 | try: 132 | self.response = decode_shitty_json(self.response) 133 | success = True 134 | except Exception as e: 135 | self.error = str(e) 136 | 137 | if success: 138 | # try: 139 | f = function_from_string( 140 | self.response.get("python_function"), self.response.get("function_name") 141 | ) 142 | self.python_function = f 143 | self.f = f 144 | # except Exception as e: 145 | # self.error = str(e) 146 | 147 | 148 | prompts = [ 149 | PythonGeneratorPrompt( 150 | lc_template, 151 | key="is_prime", 152 | user_input="write a function that tests if an number is a prime number, returns a boolean", 153 | evaluators=[ 154 | lambda x: test(x.f, 2, True), 155 | lambda x: test(x.f, 4, False), 156 | lambda x: test(x.f, 7, True), 157 | lambda x: test(x.f, 10, False), 158 | lambda x: test(x.f, 11, True), 159 | lambda x: test(x.f, 113, True), 160 | ], 161 | ), 162 | PythonGeneratorPrompt( 163 | lc_template, 164 | key="gcd", 165 | user_input="write a function that finds the greatest common divisor (GCD) of two numbers?", 166 | evaluators=[ 167 | lambda x: test(x.f, [14, 28], 14), 168 | lambda x: 
test(x.f, [56, 98], 14), 169 | lambda x: test(x.f, [81, 153], 9), 170 | ], 171 | ), 172 | PythonGeneratorPrompt( 173 | lc_template, 174 | key="factorial", 175 | user_input="write a function that calculates the factorial of a given number", 176 | evaluators=[ 177 | lambda x: test(x.f, 0, 1), 178 | lambda x: test(x.f, 1, 1), 179 | lambda x: test(x.f, 5, 120), 180 | lambda x: test(x.f, 7, 5040), 181 | lambda x: test(x.f, 10, 3628800), 182 | ], 183 | ), 184 | PythonGeneratorPrompt( 185 | lc_template, 186 | key="is_palindrome", 187 | user_input="write a function that determines if a given string is a palindrome", 188 | evaluators=[ 189 | lambda x: test(x.f, "racecar", True), 190 | lambda x: test(x.f, "hello", False), 191 | lambda x: test(x.f, "madam", True), 192 | lambda x: test(x.f, "python", False), 193 | lambda x: test(x.f, "Aibohphobia", True), 194 | ], 195 | ), 196 | PythonGeneratorPrompt( 197 | lc_template, 198 | key="fibonacci", 199 | user_input=( 200 | "write a function that generates the Fibonacci sequence ", 201 | "up to a specified number of terms", 202 | ), 203 | evaluators=[ 204 | lambda x: test(x.f, 1, [0]), 205 | lambda x: test(x.f, 2, [0, 1]), 206 | lambda x: test(x.f, 5, [0, 1, 1, 2, 3]), 207 | lambda x: test(x.f, 10, [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]), 208 | lambda x: test(x.f, 7, [0, 1, 1, 2, 3, 5, 8]), 209 | ], 210 | ), 211 | PythonGeneratorPrompt( 212 | lc_template, 213 | key="sum_of_multiples", 214 | user_input=( 215 | "write a function that calculates the sum of all multiples ", 216 | "of 3 and 5 below a given number", 217 | ), 218 | evaluators=[ 219 | lambda x: test(x.f, 10, 23), 220 | lambda x: test(x.f, 20, 78), 221 | lambda x: test(x.f, 30, 195), 222 | lambda x: test(x.f, 50, 543), 223 | lambda x: test(x.f, 100, 2418), 224 | ], 225 | ), 226 | PythonGeneratorPrompt( 227 | lc_template, 228 | key="is_leap_year", 229 | user_input="write a function that checks whether a given year is a leap year", 230 | evaluators=[ 231 | lambda x: test(x.f, 2000, True), 232 | lambda x: test(x.f, 1900, False), 233 | lambda x: test(x.f, 2020, True), 234 | lambda x: test(x.f, 2021, False), 235 | lambda x: test(x.f, 2400, True), 236 | ], 237 | ), 238 | PythonGeneratorPrompt( 239 | lc_template, 240 | key="longest_substring_without_repeating_chars", 241 | user_input=( 242 | "write a function that finds the longest substring of a ", 243 | "given string without repeating characters", 244 | ), 245 | evaluators=[ 246 | lambda x: test(x.f, "abcabcbb", "abc"), 247 | lambda x: test(x.f, "bbbbbb", "b"), 248 | lambda x: test(x.f, "pwwkew", "wke"), 249 | lambda x: test(x.f, "abcdefgh", "abcdefgh"), 250 | lambda x: test(x.f, "abcbdacf", "bdacf"), 251 | ], 252 | ), 253 | PythonGeneratorPrompt( 254 | lc_template, 255 | key="longest_common_prefix", 256 | user_input="write a function that finds the longest common prefix of a list of strings", 257 | evaluators=[ 258 | lambda x: test(x.f, ["flower", "flow", "flight"], "fl"), 259 | lambda x: test(x.f, ["dog", "racecar", "car"], ""), 260 | lambda x: test(x.f, ["interspecies", "interstellar", "interstate"], "inter"), 261 | lambda x: test(x.f, ["prefix", "suffix", "infix"], ""), 262 | lambda x: test(x.f, ["geeksforgeeks", "geeks", "geek"], "geek"), 263 | ], 264 | ), 265 | PythonGeneratorPrompt( 266 | lc_template, 267 | key="sum_of_digits", 268 | user_input="write a function that calculates the sum of the digits of a given number", 269 | evaluators=[ 270 | lambda x: test(x.f, 123, 6), 271 | lambda x: test(x.f, 456, 15), 272 | lambda x: test(x.f, 789, 24), 273 | 
lambda x: test(x.f, 1001, 2), 274 | lambda x: test(x.f, 54321, 15), 275 | ], 276 | ), 277 | PythonGeneratorPrompt( 278 | lc_template, 279 | key="decimal_to_binary", 280 | user_input=( 281 | "write a function that converts a given decimal number to " "its binary representation" 282 | ), 283 | evaluators=[ 284 | lambda x: test(x.f, 2, "10"), 285 | lambda x: test(x.f, 7, "111"), 286 | lambda x: test(x.f, 10, "1010"), 287 | lambda x: test(x.f, 16, "10000"), 288 | lambda x: test(x.f, 31, "11111"), 289 | ], 290 | ), 291 | ] 292 | -------------------------------------------------------------------------------- /examples/readme_examples.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some basic examples for promptimize. 3 | 4 | to run, simply execute `p9e ./examples/readme_examples.py` 5 | """ 6 | # Brining some "prompt generator" classes 7 | from promptimize.prompt_cases import PromptCase, TemplatedPromptCase 8 | 9 | # Bringing some useful eval function that help evaluating and scoring responses 10 | # eval functions have a handle on the prompt object and are expected 11 | # to return a score between 0 and 1 12 | from promptimize import evals 13 | 14 | # Promptimize will scan the target folder and find all Prompt objects 15 | # and derivatives that are in the python modules 16 | simple_prompts = [ 17 | # Prompting "hello there" and making sure there's "hi" or "hello" 18 | # somewhere in the answer 19 | PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])), 20 | # Making sure 3 specific guitar players are in the top 10 21 | # the score here is a percentage of the words found 22 | PromptCase( 23 | "who are the top 10 best guitar players of all time?", 24 | lambda x: evals.percentage_of_words( 25 | x.response, ["frank zappa", "david gilmore", "carlos santana"] 26 | ), 27 | ), 28 | # GPT-ing myself and making sure the response mentions Superset and Airflow 29 | PromptCase( 30 | "who is Maxime Beauchemin, (the data guy...)?", 31 | lambda x: evals.percentage_of_words( 32 | x.response, ["superset", "airflow"], case_sensitive=False 33 | ), 34 | ), 35 | ] 36 | 37 | # deriving TemplatedPromptCase to generate prompts that ask GPT to generate SQL 38 | # based on table schemas. The point here is you can derive the `Prompt` 39 | # class to create more specialized Prompt generators 40 | # For instance, the SqlPropt class defined bellow could be extended to fetch 41 | # schema definitions dynamically, acutally run the SQL, and allow 42 | # doing evals against the resultset. 43 | 44 | 45 | class SqlPrompt(TemplatedPromptCase): 46 | # the TemplatedPromptCase expects a dict of defaults that can be overriden in the constructor 47 | template_defaults = {"dialect": "Postgres"} 48 | # The actual Jinja2 template 49 | template = """\ 50 | given these SQL table schemas: 51 | CREATE TABLE world_population ( 52 | country_name STRING, 53 | year DATE, 54 | population_total INT, 55 | ); 56 | 57 | So, can you write a SQL query for {{ dialect }} that answers this user prompt: 58 | {{ input }} 59 | """ 60 | 61 | 62 | # Generating a few SQL prompts 63 | sql_prompts = [ 64 | SqlPrompt( 65 | # you can pass a unique key that can we used to reference a prompt 66 | key="sql-top-10-inc", 67 | # the user input that'll be added in place of {{ input }} in the template above 68 | user_input=( 69 | "give me the top 10 countries with the highest net increase " 70 | "of population over the past 25 years?" 
71 | ), 72 | # the dialect template parameter, overriding the default set above 73 | dialect="BigQuery", 74 | # a simple validation function making sure the SQL starts with SELECT 75 | evaluators=lambda x: 1 if x.response.strip().startswith("SELECT") else 0, 76 | ), 77 | ] 78 | -------------------------------------------------------------------------------- /examples/readme_hello_world.py: -------------------------------------------------------------------------------- 1 | """ 2 | Some basic examples for promptimize. 3 | 4 | to run, simply execute `p9e ./examples/readme_examples.py` 5 | """ 6 | # Brining some "prompt generator" classes 7 | from promptimize.prompt_cases import PromptCase 8 | 9 | # Bringing some useful eval function that help evaluating and scoring responses 10 | # eval functions have a handle on the prompt object and are expected 11 | # to return a score between 0 and 1 12 | from promptimize import evals 13 | 14 | # Promptimize will scan the target folder and find all Prompt objects 15 | # and derivatives that are in the python modules 16 | simple_prompts = [ 17 | # Prompting "hello there" and making sure there's "hi" or "hello" 18 | # somewhere in the answer 19 | PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])), 20 | PromptCase( 21 | "name the top 10 guitar players!", 22 | lambda x: evals.all_words(x.response, ["zappa", "hendrix"]), 23 | weight=2, 24 | ), 25 | ] 26 | -------------------------------------------------------------------------------- /promptimize/__init__.py: -------------------------------------------------------------------------------- 1 | from promptimize.cli import cli # noqa 2 | -------------------------------------------------------------------------------- /promptimize/cli.py: -------------------------------------------------------------------------------- 1 | import click 2 | 3 | from promptimize.crawler import discover_objects 4 | from promptimize.prompt_cases import BasePromptCase 5 | from promptimize.reports import Report 6 | from promptimize.suite import Suite 7 | 8 | 9 | @click.group(help="💡¡promptimize!💡 CLI. 
`p9e` works too!") 10 | def cli(): 11 | pass 12 | 13 | 14 | @click.command(help="run some prompts") 15 | @click.argument( 16 | "path", 17 | required=True, 18 | type=click.Path(exists=True), 19 | ) 20 | @click.option("--verbose", "-v", is_flag=True, help="Trigger more verbose output") 21 | @click.option("--force", "-f", is_flag=True, help="Force run, do not skip") 22 | @click.option( 23 | "--human", 24 | "-h", 25 | is_flag=True, 26 | help="Human review, allowing a human to review and force pass/fail each prompt case", 27 | ) 28 | @click.option("--repair", "-r", is_flag=True, help="Only re-run previously failed") 29 | @click.option("--dry-run", "-x", is_flag=True, help="DRY run, don't call the API") 30 | @click.option("--shuffle", is_flag=True, help="Shuffle the prompts in a random order") 31 | @click.option( 32 | "--style", 33 | "-s", 34 | type=click.Choice(["json", "yaml"], case_sensitive=False), 35 | default="yaml", 36 | help="json or yaml formatting", 37 | ) 38 | @click.option( 39 | "--max-tokens", 40 | "-m", 41 | type=click.INT, 42 | default=1000, 43 | help="max_tokens passed to the model", 44 | ) 45 | @click.option( 46 | "--limit", 47 | "-l", 48 | type=click.INT, 49 | default=0, 50 | help="limit how many prompt cases to run in a single batch", 51 | ) 52 | @click.option( 53 | "--temperature", 54 | "-t", 55 | type=click.FLOAT, 56 | default=0.5, 57 | help="the temperature passed to the model", 58 | ) 59 | @click.option( 60 | "--engine", 61 | "-e", 62 | type=click.STRING, 63 | default="text-davinci-003", 64 | help="model as accepted by the openai API", 65 | ) 66 | @click.option("--key", "-k", multiple=True, help="The keys to run") 67 | @click.option( 68 | "--output", 69 | "-o", 70 | type=click.Path(), 71 | ) 72 | @click.option("--silent", "-s", is_flag=True) 73 | def run( 74 | path, 75 | verbose, 76 | force, 77 | dry_run, 78 | style, 79 | temperature, 80 | max_tokens, 81 | engine, 82 | key, 83 | output, 84 | silent, 85 | repair, 86 | human, 87 | shuffle, 88 | limit, 89 | ): 90 | """Run some prompts/suites!""" 91 | click.secho("💡 ¡promptimize! 
💡", fg="cyan") 92 | if dry_run: 93 | click.secho("# DRY RUN MODE ACTIVATED!", fg="red") 94 | uses_cases = discover_objects(path, BasePromptCase) 95 | completion_create_kwargs = { 96 | "engine": engine, 97 | "max_tokens": max_tokens, 98 | "temperature": temperature, 99 | } 100 | 101 | report = None 102 | if output: 103 | report = Report.from_path(output) 104 | 105 | suite = Suite(uses_cases, completion_create_kwargs) 106 | suite.execute( 107 | verbose=verbose, 108 | style=style, 109 | silent=silent, 110 | report=report, 111 | dry_run=dry_run, 112 | keys=key, 113 | force=force, 114 | repair=repair, 115 | human=human, 116 | shuffle=shuffle, 117 | limit=limit, 118 | ) 119 | 120 | if output: 121 | output_report = Report.from_suite(suite) 122 | if report: 123 | output_report.merge(report) 124 | click.secho(f"# Writing file output to {output}", fg="yellow") 125 | output_report.write(output, style=style) 126 | 127 | 128 | cli.add_command(run) 129 | 130 | 131 | @click.command(help="report on how your suites of prompts are performing") 132 | @click.argument( 133 | "path", 134 | required=True, 135 | type=click.Path(exists=True), 136 | ) 137 | @click.option("--groupby", "-g", help="GROUPBY", default="category") 138 | def report(path, groupby): 139 | """Get some summary of how your prompt suites are performing""" 140 | click.secho(f"# Reading report @ {path}", fg="yellow") 141 | report = Report.from_path(path) 142 | report.print_summary(groupby) 143 | 144 | 145 | cli.add_command(report) 146 | -------------------------------------------------------------------------------- /promptimize/crawler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import importlib 3 | import pkgutil 4 | from pathlib import Path 5 | from typing import List, Type, Any 6 | 7 | 8 | def is_instance_or_derivative(obj: Any, object_type: Type) -> bool: 9 | return isinstance(obj, object_type) 10 | 11 | 12 | def discover_objects(path: str, object_type: Type) -> List[Any]: # noqa 13 | objects = [] 14 | folder_path = Path(path).resolve() 15 | 16 | def process_module(module): 17 | # Iterate over the objects in the module 18 | for name, obj in module.__dict__.items(): 19 | # Check if the object is an instance or derivative of the specified type 20 | if is_instance_or_derivative(obj, object_type): 21 | objects.append(obj) 22 | # Check if the object is a list or tuple containing instances or 23 | # derivatives of the specified type 24 | elif isinstance(obj, (list, tuple)): 25 | for item in obj: 26 | if is_instance_or_derivative(item, object_type): 27 | objects.append(item) 28 | 29 | # If the path points to a file, import the module and process it directly 30 | if folder_path.is_file() and folder_path.suffix == ".py": 31 | sys.path.insert(0, str(folder_path.parent)) 32 | module_name = folder_path.stem 33 | module = importlib.import_module(module_name) 34 | process_module(module) 35 | 36 | # If the path points to a directory, proceed with the existing logic 37 | elif folder_path.is_dir(): 38 | # Add the folder to the Python path to enable importing modules from it 39 | if folder_path not in sys.path: 40 | sys.path.insert(0, str(folder_path)) 41 | 42 | # Iterate over all the modules in the folder 43 | for _, module_name, _ in pkgutil.iter_modules([str(folder_path)]): 44 | # Import the module 45 | module = importlib.import_module(module_name) 46 | process_module(module) 47 | 48 | return objects 49 | -------------------------------------------------------------------------------- 
/promptimize/evals.py: -------------------------------------------------------------------------------- 1 | """ 2 | Eval functions that be used/reused with Prompts 3 | 4 | All functions here are expected to: 5 | * receive a response string 6 | * [optional] receive arbitrary extra context 7 | * return a value from 0 to 1, 0 representing failing at the task, 1 full 8 | success, and a range in-between 9 | """ 10 | 11 | from typing import List 12 | 13 | 14 | def percentage_of_words(response: str, words: List[str], case_sensitive: bool = False) -> float: 15 | """ 16 | Calculate the percentage of words from a list that are present in the given response. 17 | 18 | Args: 19 | response (str): The string to search for words. 20 | words (List[str]): A list of words to check for their presence in the response. 21 | case_sensitive (bool, optional): If True, the search will be case-sensitive; 22 | otherwise, it will be case-insensitive. Defaults to False. 23 | 24 | Returns: 25 | float: The percentage of words found in the response (0.0 to 1.0). 26 | 27 | Examples: 28 | >>> percentage_of_words_in_response("This is an Example string.", ["example", "test"]) 29 | 0.5 30 | >>> percentage_of_words_in_response( 31 | "This is an Example string.", ["Example"], case_sensitive=True) 32 | 1.0 33 | >>> percentage_of_words_in_response( 34 | "This is an Example string.", ["example"], case_sensitive=True) 35 | 0.0 36 | >>> percentage_of_words_in_response("This is an Example string.", ["notfound"]) 37 | 0.0 38 | """ 39 | if not case_sensitive: 40 | response = response.lower() 41 | words = [w.lower() for w in words] 42 | 43 | return sum(1 if w in response else 0 for w in words) / len(words) 44 | 45 | 46 | def _common_word_search( 47 | response: str, words: List[str], case_sensitive: bool, match_type: str 48 | ) -> int: 49 | if not case_sensitive: 50 | response = response.lower() 51 | words = [w.lower() for w in words] 52 | 53 | if match_type == "any": 54 | return 1 if any(w in response for w in words) else 0 55 | elif match_type == "all": 56 | return 1 if all(w in response for w in words) else 0 57 | else: 58 | raise ValueError("Invalid match_type. Accepted values are 'any' or 'all'.") 59 | 60 | 61 | def any_word(response: str, words: List[str], case_sensitive: bool = False) -> int: 62 | """ 63 | Check if any word from a list of words is present in the given response and return 1 or 0. 64 | 65 | Args: 66 | response (str): The string to search for words. 67 | words (List[str]): A list of words to check for their presence in the response. 68 | case_sensitive (bool, optional): If True, the search will be case-sensitive; 69 | otherwise, it will be case-insensitive. Defaults to False. 70 | 71 | Returns: 72 | int: 1 if any word from the list is found in the response; otherwise, 0. 73 | 74 | Examples: 75 | >>> any_word("This is an Example string.", ["example", "test"]) 76 | 1 77 | >>> any_word("This is an Example string.", ["Example"], case_sensitive=True) 78 | 1 79 | >>> any_word("This is an Example string.", ["example"], case_sensitive=True) 80 | 0 81 | >>> any_word("This is an Example string.", ["notfound"]) 82 | 0 83 | """ 84 | return _common_word_search(response, words, case_sensitive, match_type="any") 85 | 86 | 87 | def all_words(response: str, words: List[str], case_sensitive: bool = False) -> int: 88 | """ 89 | Check if all words from a list of words are present in the given response and return 1 or 0. 90 | 91 | Args: 92 | response (str): The string to search for words. 
93 | words (List[str]): A list of words to check for their presence in the response. 94 | case_sensitive (bool, optional): If True, the search will be case-sensitive; 95 | otherwise, it will be case-insensitive. Defaults to False. 96 | 97 | Returns: 98 | int: 1 if all words from the list are found in the response; otherwise, 0. 99 | 100 | Examples: 101 | >>> all_words("This is an Example string with test.", ["example", "test"]) 102 | 1 103 | >>> all_words("This is an Example string with Test.", ["Example", "Test"], case_sensitive=True) 104 | 1 105 | >>> all_words("This is an Example string.", ["example", "test"], case_sensitive=True) 106 | 0 107 | >>> all_words("This is an Example string.", ["example", "notfound"]) 108 | 0 109 | """ 110 | return _common_word_search(response, words, case_sensitive, match_type="all") 111 | 112 | 113 | base_all = all 114 | base_any = any 115 | 116 | 117 | def all(iterable): 118 | return 1 if base_all([i == 1 for i in iterable]) else 0 119 | 120 | 121 | def any(iterable): 122 | return 1 if base_any([i == 1 for i in iterable]) else 0 123 | -------------------------------------------------------------------------------- /promptimize/prompt_cases.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Any, Callable, List, Optional, Union 3 | 4 | from langchain.llms import OpenAI 5 | from langchain.callbacks import get_openai_callback 6 | 7 | from box import Box 8 | 9 | from promptimize import utils 10 | from promptimize.simple_jinja import process_template 11 | 12 | 13 | class BasePromptCase: 14 | """Abstract base prompt case""" 15 | 16 | attributes_used_for_hash = set() 17 | verbose_attrs = {"prompt"} 18 | 19 | def __init__( 20 | self, 21 | evaluators: Optional[Union[Callable, List[Callable]]] = None, 22 | key: Optional[str] = None, 23 | weight=1, 24 | category: str = None, # used for info/reporting purposes only 25 | prompt_executor: Any = None, 26 | prompt_executor_kwargs: dict = None, 27 | prompt_hash=None, 28 | *args, 29 | **kwargs, 30 | ) -> None: 31 | """ 32 | Initialize a BasePromptCase instance. 33 | 34 | Args: 35 | user_input (str): Raw user_input for the prompt. 36 | evaluators (Optional[Union[Callable, List[Callable]]]): Optional 37 | callable or list of callables used for evaluation. 38 | key (Optional[str]): Optional unique key for the prompt. 39 | weight (int, optional): Optional weight for the prompt (default: 1). 40 | category (Optional[str], optional): Optional category for 41 | the prompt (used for info/reporting purposes only). 
42 | """ 43 | self.extra_args = args 44 | self.extra_kwargs = kwargs 45 | self.response = None 46 | self.has_run = False 47 | self.was_tested = False 48 | self.test_results = None 49 | self.evaluators = evaluators or [] 50 | self.weight = weight or 1 51 | self.category = category 52 | self.pre_run_output = None 53 | self.post_run_output = None 54 | self.prompt_executor = prompt_executor or self.get_prompt_executor() 55 | self.prompt_executor_kwargs = prompt_executor_kwargs or {} 56 | 57 | self._prompt_hash = prompt_hash 58 | 59 | self.execution = Box() 60 | 61 | self.prompt = utils.literal_str(self.render()).strip() 62 | 63 | self.key = key or "prompt-" + self.prompt_hash 64 | 65 | if not utils.is_iterable(self.evaluators): 66 | self.evaluators = [self.evaluators] # type: ignore 67 | 68 | def get_prompt_executor(self): 69 | model_name = os.environ.get("OPENAI_MODEL") or "text-davinci-003" 70 | openai_api_key = os.environ.get("OPENAI_API_KEY") 71 | self.prompt_executor_kwargs = {"model_name": model_name} 72 | return OpenAI(model_name=model_name, openai_api_key=openai_api_key) 73 | 74 | def execute_prompt(self, prompt_str): 75 | with get_openai_callback() as cb: 76 | self.response = self.prompt_executor(prompt_str) 77 | self.execution.openai = Box() 78 | oai = self.execution.openai 79 | oai.total_tokens = cb.total_tokens 80 | oai.prompt_tokens = cb.prompt_tokens 81 | oai.completion_tokens = cb.completion_tokens 82 | oai.total_cost = cb.total_cost 83 | 84 | return self.response 85 | 86 | def pre_run(self): 87 | pass 88 | 89 | def post_run(self): 90 | pass 91 | 92 | @property 93 | def hash(self): 94 | return utils.short_hash(hash(self)) 95 | 96 | def __hash__(self): 97 | attrs = self.attributes_used_for_hash 98 | s = "|".join([utils.short_hash(utils.hashable_repr(getattr(self, attr))) for attr in attrs]) 99 | return utils.int_hash(s) 100 | 101 | def render(self): 102 | raise NotImplementedError() 103 | 104 | def get_unique_hash(self, extra_context=None): 105 | """Returns a unique identifier, determined by the run 106 | 107 | Generally, the actual call sent to GPT (prompt, execution params) 108 | represent something unique. 
109 | """ 110 | return utils.short_hash(str(self.extra_kwargs)) 111 | 112 | def to_dict(self, verbose=False): 113 | d = { 114 | "key": self.key, 115 | "prompt_hash": self.prompt_hash, 116 | "prompt": self.prompt, 117 | "category": self.category, 118 | "response": self.response, 119 | "weight": self.weight, 120 | "execution": self.execution.to_dict(), 121 | } 122 | if hasattr(self, "error"): 123 | d["error"] = self.error 124 | return d 125 | 126 | def print(self, verbose=False, style="yaml"): 127 | style = style or "yaml" 128 | output = self.to_dict(verbose) 129 | if not verbose: 130 | for attr in self.verbose_attrs: 131 | del output[attr] 132 | if self.weight == 1: 133 | del output["weight"] 134 | highlighted = utils.serialize_object(output, style) 135 | print(highlighted) 136 | 137 | def test(self): 138 | test_results = [] 139 | for evaluator in self.evaluators: 140 | result = evaluator(self) 141 | if not (utils.is_numeric(result) and 0 <= result <= 1): 142 | raise Exception("Value should be between 0 and 1") 143 | test_results.append(result) 144 | 145 | if len(test_results): 146 | self.execution.score = sum(test_results) / len(test_results) 147 | self.execution.results = test_results 148 | self.was_tested = True 149 | 150 | @property 151 | def prompt_hash(self): 152 | if self._prompt_hash: 153 | return self._prompt_hash 154 | return utils.short_hash(hash(self)) 155 | 156 | def _run(self, dry_run): 157 | pre_run_output = self.pre_run() 158 | if pre_run_output: 159 | self.execution.pre_run_output = pre_run_output 160 | 161 | if not dry_run: 162 | with utils.MeasureDuration() as md: 163 | self.response = self.execute_prompt(self.prompt).strip() 164 | 165 | self.execution.api_call_duration_ms = md.duration 166 | 167 | post_run_output = self.post_run() 168 | if post_run_output: 169 | self.execution.post_run_output = post_run_output 170 | self.has_run = True 171 | self.execution.run_at = utils.current_iso_timestamp() 172 | return self.response 173 | 174 | 175 | class PromptCase(BasePromptCase): 176 | """A simple prompt case""" 177 | 178 | attributes_used_for_hash = BasePromptCase.attributes_used_for_hash | {"user_input"} 179 | 180 | def __init__( 181 | self, 182 | user_input, 183 | *args, 184 | **kwargs, 185 | ) -> None: 186 | self.user_input = user_input 187 | super().__init__(*args, **kwargs) 188 | 189 | def to_dict(self, verbose=False, *args, **kwargs): 190 | d = super().to_dict(*args, **kwargs) 191 | d = utils.insert_in_dict(d, "user_input", self.user_input, after_key="key") 192 | return d 193 | 194 | def render(self): 195 | return self.user_input 196 | 197 | 198 | class TemplatedPromptCase(BasePromptCase): 199 | template_defaults: dict = {} 200 | attributes_used_for_hash = BasePromptCase.attributes_used_for_hash | { 201 | "user_input", 202 | "extra_kwargs", 203 | } 204 | template = "{{ user_input }}" 205 | 206 | def __init__( 207 | self, 208 | user_input=None, 209 | *args, 210 | **kwargs, 211 | ) -> None: 212 | self.user_input = user_input 213 | return super().__init__(*args, **kwargs) 214 | 215 | def to_dict(self, verbose=False, *args, **kwargs): 216 | d = super().to_dict(*args, **kwargs) 217 | d = utils.insert_in_dict(d, "user_input", self.user_input, after_key="key") 218 | return d 219 | 220 | def get_extra_template_context(self): 221 | """meant to be overriden in derived classes to add logic/context""" 222 | return {} 223 | 224 | @property 225 | def jinja_context(self): 226 | context_kwargs = self.template_defaults.copy() 227 | context_kwargs.update(self.get_extra_template_context()) 
228 | context_kwargs.update(self.extra_kwargs) 229 | context_kwargs.update({"user_input": self.user_input}) 230 | return context_kwargs 231 | 232 | def render(self, **kwargs): 233 | return process_template(self.template, **self.jinja_context) 234 | 235 | 236 | class LangchainPromptCase(BasePromptCase): 237 | attributes_used_for_hash = BasePromptCase.attributes_used_for_hash | { 238 | "extra_kwargs", 239 | "langchain_prompt", 240 | } 241 | 242 | def __init__( 243 | self, 244 | langchain_prompt, 245 | *args, 246 | **kwargs, 247 | ) -> None: 248 | self.langchain_prompt = langchain_prompt 249 | return super().__init__(*args, **kwargs) 250 | 251 | def to_dict(self, verbose=False, *args, **kwargs): 252 | d = super().to_dict(*args, **kwargs) 253 | d = utils.insert_in_dict(d, "prompt_kwargs", self.extra_kwargs, after_key="key") 254 | return d 255 | 256 | def render(self): 257 | return self.langchain_prompt.format(**self.extra_kwargs) 258 | -------------------------------------------------------------------------------- /promptimize/reports.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | from box import Box 3 | 4 | import pandas as pd 5 | 6 | from promptimize import utils 7 | 8 | 9 | class Report: 10 | """Report objects interacting with the filesystem / databases and data structures""" 11 | 12 | version = "0.1.0" 13 | 14 | def __init__(self, path=None, data=None): 15 | self.data = Box() 16 | if data: 17 | self.data = Box(data) 18 | self.path = path 19 | 20 | def write(self, path=None, style="yaml"): 21 | """write the report to the filesystem""" 22 | path = path or self.path 23 | with open(path, "w") as f: 24 | f.write(utils.serialize_object(self.data.to_dict(), highlighted=False, style=style)) 25 | 26 | def merge(self, report): 27 | """merge in another report into this one""" 28 | all_keys = set(report.prompts.keys()) | set(self.prompts.keys()) 29 | for k in all_keys: 30 | a = report.prompts.get(k) 31 | b = self.prompts.get(k) 32 | if a and b: 33 | if a.execution.get("run_at", "") > b.execution.get("run_at", ""): 34 | self.prompts[k] = a 35 | else: 36 | self.prompts[k] = b 37 | 38 | if not a: 39 | self.prompts[k] = b 40 | elif not b: 41 | self.prompts[k] = a 42 | 43 | @property 44 | def prompts(self): 45 | """list the prompts in this report""" 46 | if self.data: 47 | return self.data.prompts 48 | return {} 49 | 50 | @property 51 | def failed_keys(self): 52 | """return the list of prompt keys that have not suceeded""" 53 | keys = set() 54 | for p in self.prompts.values(): 55 | if p.execution.get("score", 0) < 1: 56 | keys.add(p.key) 57 | return keys 58 | 59 | @classmethod 60 | def from_path(cls, path): 61 | """load a report object from a path in the filesystem""" 62 | try: 63 | with open(path, "r") as f: 64 | report = cls(path, yaml.safe_load(f)) 65 | return report 66 | except FileNotFoundError: 67 | return None 68 | 69 | @classmethod 70 | def from_suite(cls, suite): 71 | """load a report object from a suite instance""" 72 | report = cls(data=suite.to_dict()) 73 | return report 74 | 75 | def get_prompt(self, prompt_key): 76 | """get a specific prompt data structure from the report""" 77 | return self.prompts.get(prompt_key) 78 | 79 | def prompt_df(self): 80 | """make a flat pandas dataframe out of the prompts in the reports""" 81 | prompts = [p for p in self.prompts.values() if p.execution] 82 | return pd.json_normalize(prompts) 83 | 84 | def print_summary(self, groupby="category"): 85 | """print the summary from the report""" 86 | if groupby: 
87 | self.print_summary(groupby=None) 88 | 89 | df = self.prompt_df() 90 | 91 | df["score"] = df["weight"] * df["execution.score"] 92 | 93 | if groupby: 94 | df = df[[groupby, "weight", "score"]].groupby(groupby).sum() 95 | else: 96 | df = df.agg({"weight": "sum", "score": "sum"}).to_frame().T 97 | df["perc"] = (df["score"] / df["weight"]) * 100 98 | df = df.sort_values(by="weight", ascending=False) 99 | headers = [] 100 | if groupby: 101 | headers = "keys" 102 | else: 103 | df = df.T 104 | print(utils.trabulate(df, headers=headers)) 105 | -------------------------------------------------------------------------------- /promptimize/simple_jinja.py: -------------------------------------------------------------------------------- 1 | import jinja2 2 | 3 | environment = jinja2.Environment() 4 | 5 | 6 | def process_template(template_as_string, **kwargs): 7 | template = environment.from_string(template_as_string) 8 | return template.render(**kwargs) 9 | -------------------------------------------------------------------------------- /promptimize/suite.py: -------------------------------------------------------------------------------- 1 | """ 2 | This module provides a Suite class to manage and execute a collection of 3 | use cases (prompts) to be tested. It allows running the tests, displaying 4 | results, and serializing the summary of the suite. 5 | """ 6 | import random 7 | from typing import Any, Dict, List, Optional, Union 8 | 9 | import click 10 | 11 | from promptimize import utils 12 | from promptimize.prompt_cases import BasePromptCase 13 | 14 | 15 | def separator(fg=None) -> None: 16 | """Print a separator line.""" 17 | click.secho("# " + "-" * 40, fg=fg) 18 | 19 | 20 | def separated_section(s, fg=None): 21 | separator(fg) 22 | click.secho(s, fg=fg) 23 | separator(fg) 24 | 25 | 26 | class Suite: 27 | """A collection of use cases to be tested. 28 | 29 | Attributes: 30 | name (Optional[str]): The name of the suite. 31 | prompts (Dict[str, Prompt]): Dictionary of prompts to be tested, 32 | keyed by the prompt key. 33 | last_run_completion_create_kwargs (Dict[str, Any]): Keyword arguments 34 | used in the last run for completion creation. 35 | efective_prompts (List): List of prompts values that finally will be tested. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | prompts: List["BasePromptCase"], 41 | name: Optional[str] = None, 42 | ) -> None: 43 | """ 44 | Args: 45 | prompts (List[Prompt]): List of prompts to be tested. 46 | name (Optional[str]): The name of the suite. Defaults to None. 47 | """ 48 | self.name = name 49 | self.prompts = {o.key: o for o in prompts} 50 | self.last_run_completion_create_kwargs: dict = {} 51 | self.effective_prompts = list(self.prompts.values()) 52 | 53 | def execute( # noqa 54 | self, 55 | verbose: bool = False, 56 | style: str = "yaml", 57 | silent: bool = False, 58 | report=None, 59 | dry_run: bool = False, 60 | keys: list = None, 61 | force: bool = False, 62 | repair: bool = False, 63 | human: bool = False, 64 | shuffle: bool = False, 65 | limit: int = 0, 66 | ) -> None: 67 | """ 68 | Execute the suite with the given settings. 69 | 70 | Args: 71 | verbose (bool): If True, print verbose output. Defaults to False. 72 | style (str): Output style for serialization. Defaults to "yaml". 73 | silent (bool): If True, suppress output. Defaults to False. 
74 | """ 75 | self.reload_effective_prompts( 76 | report=report, 77 | keys=keys, 78 | repair=repair, 79 | shuffle=shuffle, 80 | limit=limit, 81 | ) 82 | prompts = self.effective_prompts 83 | 84 | for i, prompt in enumerate(prompts): 85 | should_run = force or self.should_prompt_execute(prompt, report) 86 | progress = f"({i+1}/{len(prompts)})" 87 | if not silent: 88 | if should_run: 89 | separated_section(f"# {progress} [RUN] prompt: {prompt.key}", fg="cyan") 90 | else: 91 | separated_section(f"# {progress} [SKIP] prompt: {prompt.key}", fg="yellow") 92 | 93 | if should_run: 94 | prompt._run(dry_run) 95 | if not dry_run: 96 | prompt.test() 97 | 98 | if not silent and should_run: 99 | prompt.print(verbose=verbose, style=style) 100 | 101 | if should_run and human: 102 | v = click.prompt( 103 | 'Press Enter to continue, "Y" to force success, "N" to force fail, "X" to exit', 104 | default="", 105 | show_default=False, 106 | ) 107 | v = v.lower() 108 | if v == "": 109 | click.secho("Leaving result unaltered", fg="yellow") 110 | elif v == "y": 111 | prompt.execution.score = 1 112 | prompt.execution.human_override = True 113 | click.secho("Forcing SUCCESS", fg="green") 114 | elif v == "n": 115 | prompt.execution.score = 0 116 | prompt.execution.human_override = True 117 | click.secho("Forcing FAILURE", fg="red") 118 | elif v == "x": 119 | break 120 | 121 | # `self.last_run_completion_create_kwargs = completion_create_kwargs 122 | if not silent: 123 | separated_section("# Suite summary", fg="cyan") 124 | click.echo(utils.serialize_object(self._serialize_run_summary(), style)) 125 | 126 | def reload_effective_prompts( 127 | self, 128 | report=None, 129 | keys: list = None, 130 | repair: bool = False, 131 | shuffle: bool = False, 132 | limit: int = 0, 133 | ): 134 | self.effective_prompts = list(self.prompts.values()) 135 | if keys: 136 | self.effective_prompts = [p for p in self.effective_prompts if p.key in keys] 137 | if repair and report: 138 | failed_keys = report.failed_keys 139 | self.effective_prompts = [p for p in self.effective_prompts if p.key in failed_keys] 140 | 141 | if shuffle: 142 | random.shuffle(self.effective_prompts) 143 | 144 | if limit: 145 | self.effective_prompts = self.effective_prompts[:limit] 146 | 147 | def should_prompt_execute(self, prompt, report): 148 | if not report or not report.prompts: 149 | return True 150 | report_prompt = report.prompts.get(prompt.key) 151 | if not report_prompt: 152 | return True 153 | else: 154 | if not report_prompt.execution: 155 | return True 156 | 157 | if report_prompt.prompt_hash == prompt.prompt_hash: 158 | return False 159 | 160 | return True 161 | 162 | def _serialize_run_summary( 163 | self, verbose: bool = False 164 | ) -> Dict[str, Union[Optional[float], Dict[str, Any]]]: 165 | """ 166 | Serialize the run summary of the suite. 167 | 168 | Args: 169 | verbose (bool): If True, include verbose output. Defaults to False. 170 | 171 | Returns: 172 | Dict[str, Union[Optional[float], Dict[str, Any]]]: Serialized run summary of the suite. 
173 | """ 174 | prompts = self.prompts.values() 175 | tested = [p for p in prompts if p.was_tested and p.execution.score is not None] 176 | suite_score = None 177 | if len(tested) > 0: 178 | total_weight = sum([p.weight for p in tested]) 179 | suite_score = sum([p.execution.score * p.weight for p in tested]) / total_weight 180 | d = { 181 | "suite_score": suite_score, 182 | "git_info": utils.get_git_info(), 183 | } 184 | 185 | return d 186 | 187 | def to_dict(self) -> Dict[str, Any]: 188 | """ 189 | Convert the suite to a dictionary. 190 | 191 | Returns: 192 | Dict[str, Any]: Dictionary representation of the suite. 193 | """ 194 | return { 195 | "name": self.name, 196 | "prompts": {p.key: p.to_dict() for p in self.prompts.values()}, 197 | "run_summary": self._serialize_run_summary(), 198 | } 199 | -------------------------------------------------------------------------------- /promptimize/utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | from typing import Any, Dict, List, Optional 3 | import subprocess 4 | import hashlib 5 | from datetime import datetime 6 | import re 7 | import time 8 | from pygments import highlight 9 | from pygments.lexers import YamlLexer, JsonLexer 10 | from pygments.formatters import TerminalFormatter 11 | from tabulate import tabulate 12 | import yaml 13 | from yaml.representer import SafeRepresenter 14 | 15 | 16 | def is_numeric(value): 17 | """that'd be nice if we had this in the std lib...""" 18 | return isinstance(value, (int, float, complex)) 19 | 20 | 21 | def is_iterable(obj): 22 | """that'd be nice if we had this in the std lib...""" 23 | try: 24 | iter(obj) 25 | return True 26 | except TypeError: 27 | return False 28 | 29 | 30 | # Yaml-related section, hacking out of some issues 31 | 32 | 33 | class folded_str(str): 34 | pass 35 | 36 | 37 | class literal_str(str): 38 | pass 39 | 40 | 41 | def change_style(style, representer): 42 | def new_representer(dumper, data): 43 | scalar = representer(dumper, data) 44 | scalar.style = style 45 | return scalar 46 | 47 | return new_representer 48 | 49 | 50 | # represent_str does handle some corner cases, so use that 51 | # instead of calling represent_scalar directly 52 | represent_folded_str = change_style(">", SafeRepresenter.represent_str) 53 | represent_literal_str = change_style("|", SafeRepresenter.represent_str) 54 | 55 | 56 | def str_presenter(dumper, data): 57 | """ 58 | Some hack to get yaml output to use look good for multiline, 59 | which is common in this package 60 | 61 | from: https://shorturl.at/klvC1 62 | """ 63 | if len(data.splitlines()) > 1 or "\n" in data or "\\n" in data: 64 | text_list = [line.rstrip() for line in data.splitlines()] 65 | fixed_data = "\n".join(text_list) 66 | return dumper.represent_scalar("tag:yaml.org,2002:str", fixed_data, style="|") 67 | return dumper.represent_scalar("tag:yaml.org,2002:str", data) 68 | 69 | 70 | yaml.add_representer(folded_str, represent_folded_str) 71 | yaml.add_representer(literal_str, represent_literal_str) 72 | yaml.add_representer(str, str_presenter) 73 | 74 | 75 | def extract_json_objects(text: str, get_first: bool = True) -> List[Dict[str, Any]]: 76 | """ 77 | Extract JSON objects from a given string by looking for matching curly brackets. 78 | 79 | Args: 80 | text (str): The input string containing JSON objects. 81 | 82 | Returns: 83 | List[Dict[str, Any]]: A list of JSON objects found in the input string. 
84 | 85 | Example: 86 | 87 | >>> extract_json_objects('Some text: {"key1": "value1", "key2": "value2"} and more text.') 88 | [{'key1': 'value1', 'key2': 'value2'}] 89 | 90 | >>> extract_json_objects('No JSON objects in this text.') 91 | [] 92 | 93 | >>> extract_json_objects('{"a": 1, "b": 2} and {"c": 3, "d": 4}') 94 | [{'a': 1, 'b': 2}, {'c': 3, 'd': 4}] 95 | """ 96 | json_objects = [] 97 | # man that regex is something... generated by GPT4... 98 | regex = r"(\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\}))*\})" 99 | for match in re.finditer(regex, text): 100 | try: 101 | json_object = json.loads(match.group()) 102 | json_objects.append(json_object) 103 | except json.JSONDecodeError: 104 | pass 105 | return json_objects 106 | 107 | 108 | def int_hash(text): 109 | # Create a SHA-256 hash of the input string 110 | hash_object = hashlib.sha256(str(text).encode()) 111 | 112 | # Convert the hash to a hexadecimal string 113 | hex_hash = hash_object.hexdigest() 114 | return int(hex_hash, 16) 115 | 116 | 117 | def short_hash(obj, length=8): 118 | hash_object = hashlib.sha256(str(obj).encode()) 119 | hex_hash = hash_object.hexdigest() 120 | return hex_hash[:length] 121 | 122 | 123 | def to_yaml(data, highlighted=True): 124 | data = yaml.dump(data, sort_keys=False) 125 | if highlighted: 126 | data = highlight(data, YamlLexer(), TerminalFormatter()) 127 | return data 128 | 129 | 130 | def to_json(data, highlighted=True): 131 | data = json.dumps(data, indent=2) 132 | highlighted = highlight(data, JsonLexer(), TerminalFormatter()) 133 | return highlighted 134 | 135 | 136 | def serialize_object(data, style="yaml", highlighted=True): 137 | if style == "yaml": 138 | return to_yaml(data, highlighted) 139 | return to_json(data, highlighted) 140 | 141 | 142 | def transform_strings(obj, transformation): 143 | """ 144 | Recursively iterates through nested iterables (lists and tuples) and dictionaries, 145 | applying a given transformation function to all strings found. 146 | 147 | :param obj: The input object, which can be a string, dictionary, list, or tuple. 148 | Other types will be returned unchanged. 149 | :param transformation: A function that takes a single string argument and returns 150 | a transformed string. 151 | :return: A new object with the same structure as the input object, but with all 152 | strings transformed by the given transformation function. 
153 | """ 154 | if isinstance(obj, str): 155 | return transformation(obj) 156 | elif isinstance(obj, dict): 157 | return {key: transform_strings(value, transformation) for key, value in obj.items()} 158 | elif isinstance(obj, list): 159 | return [transform_strings(item, transformation) for item in obj] 160 | elif isinstance(obj, tuple): 161 | return tuple(transform_strings(item, transformation) for item in obj) 162 | else: 163 | return obj 164 | 165 | 166 | def get_git_info(sha_length: int = 12): 167 | try: 168 | sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip() 169 | if sha_length: 170 | sha = sha[:sha_length] 171 | branch = ( 172 | subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"]) 173 | .decode("utf-8") 174 | .strip() 175 | ) 176 | dirty = subprocess.call(["git", "diff-index", "--quiet", "HEAD"]) != 0 177 | 178 | return {"sha": sha, "branch": branch, "dirty": dirty} 179 | except subprocess.CalledProcessError: 180 | return None 181 | 182 | 183 | class MeasureDuration: 184 | def __init__(self): 185 | self.duration = None 186 | 187 | def __enter__(self): 188 | self.start_time = time.time() 189 | return self 190 | 191 | def __exit__(self, exc_type, exc_val, exc_tb): 192 | end_time = time.time() 193 | self.duration = (end_time - self.start_time) * 1000 194 | 195 | 196 | def insert_in_dict( 197 | dictionary: Dict[Any, Any], 198 | key: Any, 199 | value: Any, 200 | position: Optional[int] = None, 201 | before_key: Optional[Any] = None, 202 | after_key: Optional[Any] = None, 203 | ) -> Dict[Any, Any]: 204 | """ 205 | Insert a key/value pair in a dictionary at a specific position, before a 206 | specified key, or after a specified key. 207 | 208 | Args: 209 | dictionary (Dict[Any, Any]): The original dictionary. 210 | key (Any): The key to be inserted. 211 | value (Any): The value associated with the key. 212 | position (Optional[int], optional): The position at which the key/value pair 213 | should be inserted. Defaults to None. 214 | before_key (Optional[Any], optional): The key before which the new 215 | key/value pair should be inserted. Defaults to None. 216 | after_key (Optional[Any], optional): The key after which the new 217 | key/value pair should be inserted. Defaults to None. 218 | 219 | Raises: 220 | ValueError: If more than one of 'position', 'before_key', or 'after_key' is specified. 221 | ValueError: If the specified position is out of range. 222 | KeyError: If 'before_key' or 'after_key' is not found in the dictionary. 223 | 224 | Returns: 225 | Dict[Any, Any]: A new dictionary with the inserted key/value pair. 
226 | """ 227 | if sum([bool(position is not None), bool(before_key), bool(after_key)]) > 1: 228 | raise ValueError("Only one of 'position', 'before_key', or 'after_key' can be specified") 229 | 230 | if position is not None and (position > len(dictionary) or position < 0): 231 | raise ValueError("Position is out of range") 232 | 233 | if before_key is not None and before_key not in dictionary: 234 | raise KeyError(f"'before_key': {before_key} not found in the dictionary") 235 | 236 | if after_key is not None and after_key not in dictionary: 237 | raise KeyError(f"'after_key': {after_key} not found in the dictionary") 238 | 239 | new_dict = {} 240 | inserted = False 241 | 242 | for index, (dict_key, dict_value) in enumerate(dictionary.items()): 243 | if position == index or dict_key == before_key: 244 | new_dict[key] = value 245 | inserted = True 246 | elif after_key == dict_key: 247 | new_dict[dict_key] = dict_value 248 | new_dict[key] = value 249 | inserted = True 250 | continue 251 | 252 | new_dict[dict_key] = dict_value 253 | 254 | if not inserted or position == len(dictionary): 255 | new_dict[key] = value 256 | 257 | return new_dict 258 | 259 | 260 | def current_iso_timestamp(): 261 | now = datetime.utcnow() 262 | return now.isoformat() 263 | 264 | 265 | def hashable_repr(obj): 266 | if isinstance(obj, (list, tuple, set)): 267 | return "".join(hashable_repr(item) for item in obj) 268 | elif isinstance(obj, dict): 269 | return "".join( 270 | hashable_repr(key) + hashable_repr(value) for key, value in sorted(obj.items()) 271 | ) 272 | elif callable(obj): 273 | return str(obj.__code__.co_code) 274 | else: 275 | return str(obj) 276 | 277 | 278 | def trabulate(df, showindex=True, headers="keys"): 279 | headers = headers if headers else [] 280 | for column in df.columns: 281 | if df[column].dtype == "int64": 282 | df[column] = df[column].astype(str) 283 | return tabulate(df, headers=headers, showindex=showindex, tablefmt="psql", floatfmt=".2f") 284 | -------------------------------------------------------------------------------- /publish.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # FIRST! 
manually update the setup.py with the version number 4 | # export PROMPTIMIZE_VERSION=0.2.3 5 | 6 | git checkout main 7 | git commit -a -m "Version $PROMPTIMIZE_VERSION" 8 | git tag $PROMPTIMIZE_VERSION 9 | git push origin main $PROMPTIMIZE_VERSION 10 | python setup.py sdist bdist_wheel 11 | twine upload dist/promptimize-$PROMPTIMIZE_VERSION* 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.black] 2 | line-length = 100 3 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | flake8 2 | pre-commit 3 | recommonmark 4 | setuptools 5 | sphinx 6 | sphinx-click 7 | sphinxcontrib-napoleon 8 | twine 9 | wheel 10 | -------------------------------------------------------------------------------- /requirements-examples.txt: -------------------------------------------------------------------------------- 1 | RestrictedPython 2 | demjson 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | jinja2 2 | python-box 3 | black 4 | click 5 | pygments 6 | pyyaml 7 | langchain 8 | tabulate 9 | pandas 10 | openai 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | # Read the contents of the requirements.txt file 4 | with open("requirements.txt") as f: 5 | requirements = f.read().splitlines() 6 | 7 | # Read the contents of the README.md file 8 | with open("README.md", "r", encoding="utf-8") as f: 9 | readme = f.read() 10 | 11 | setup( 12 | name="promptimize", 13 | version="0.2.3", 14 | packages=find_packages(), 15 | include_package_data=True, 16 | install_requires=requirements, 17 | entry_points={ 18 | "console_scripts": [ 19 | "promptimize=promptimize:cli", 20 | "p9e=promptimize:cli", 21 | ], 22 | }, 23 | author="Maxime Beauchemin", 24 | author_email="maximebeauchemin@gmail.com", 25 | description="A python toolkit to generate and evaluate prompts for GPT at scale", 26 | long_description=readme, 27 | long_description_content_type="text/markdown", 28 | license="Apache License, Version 2.0", 29 | license_file="LICENSE", 30 | ) 31 | --------------------------------------------------------------------------------
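For reference, a minimal sketch of how the pieces above typically fit together: a templated prompt case scored by an eval function. This is an assumption-laden illustration, not a file in the repository — the module name, class name, template text, and expected words are invented for demonstration only:

    # illustrative_translation_case.py -- hypothetical example module, not in the repo
    from promptimize import evals
    from promptimize.prompt_cases import TemplatedPromptCase


    class TranslationPromptCase(TemplatedPromptCase):
        # class-level jinja template; `template_defaults` and constructor kwargs
        # become template variables at render time
        template = "Translate the following sentence to {{ language }}: {{ user_input }}"
        template_defaults = {"language": "French"}


    prompts = [
        TranslationPromptCase(
            "where is the library?",
            evaluators=lambda x: evals.any_word(x.response, ["bibliothèque", "bibliotheque"]),
            category="translation",
        ),
    ]

Running `promptimize run ./illustrative_translation_case.py` (or the shorter `p9e` entry point declared in setup.py) would discover the `prompts` list via the crawler, call the model once per case, score each response with the attached evaluators, and print per-case results plus a suite-level summary.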