├── .flake8
├── .gitignore
├── .mypy.ini
├── .pre-commit-config.yaml
├── LICENSE
├── README.md
├── docs
│   ├── Makefile
│   ├── publish.sh
│   └── source
│       ├── README.md
│       ├── api.rst
│       ├── cli.rst
│       ├── conf.py
│       └── index.rst
├── examples
│   ├── __pycache__
│   │   └── use_cases.cpython-38.pyc
│   ├── python_examples.py
│   ├── readme_examples.py
│   └── readme_hello_world.py
├── promptimize
│   ├── __init__.py
│   ├── cli.py
│   ├── crawler.py
│   ├── evals.py
│   ├── prompt_cases.py
│   ├── reports.py
│   ├── simple_jinja.py
│   ├── suite.py
│   └── utils.py
├── publish.sh
├── pyproject.toml
├── requirements-dev.txt
├── requirements-examples.txt
├── requirements.txt
└── setup.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | max-line-length = 100
3 | exclude = tests/*
4 | max-complexity = 10
5 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *__pycache__*
2 | build/
3 | dist/
4 | work/*
5 | promptimize.egg-info/
6 |
--------------------------------------------------------------------------------
/.mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | ignore_missing_imports = True
3 | exclude = ['docs/']
4 |
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | #
2 | # Licensed to the Apache Software Foundation (ASF) under one or more
3 | # contributor license agreements. See the NOTICE file distributed with
4 | # this work for additional information regarding copyright ownership.
5 | # The ASF licenses this file to You under the Apache License, Version 2.0
6 | # (the "License"); you may not use this file except in compliance with
7 | # the License. You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | #
17 | repos:
18 | #- repo: https://github.com/pre-commit/mirrors-mypy
19 | # rev: v0.941
20 | # hooks:
21 | # - id: mypy
22 | # additional_dependencies: [types-all]
23 | - repo: https://github.com/pre-commit/pre-commit-hooks
24 | rev: v4.4.0
25 | hooks:
26 | - id: check-added-large-files
27 | exclude: \.(geojson)$
28 | - id: check-yaml
29 | exclude: ^helm/superset/templates/
30 | - id: debug-statements
31 | - id: end-of-file-fixer
32 | - id: trailing-whitespace
33 | args: ["--markdown-linebreak-ext=md"]
34 | - repo: https://github.com/psf/black
35 | rev: 23.9.1
36 | hooks:
37 | - id: black
38 | language_version: python3
39 | - repo: https://github.com/pre-commit/mirrors-prettier
40 | rev: v3.0.3 # Use the sha or tag you want to point at
41 | hooks:
42 | - id: prettier
43 | args: ["--ignore-path=./superset-frontend/.prettierignore"]
44 | files: "superset-frontend"
45 | # blacklist unsafe functions like make_url (see #19526)
46 | - repo: https://github.com/skorokithakis/blacklist-pre-commit-hook
47 | rev: e2f070289d8eddcaec0b580d3bde29437e7c8221
48 | hooks:
49 | - id: blacklist
50 | args: ["--blacklisted-names=make_url", "--ignore=tests/"]
51 | - repo: https://github.com/pycqa/flake8
52 | rev: '6.1.0'
53 | hooks:
54 | - id: flake8
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
203 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # 💡 ¡promptimize! 💡
2 | [](https://opensource.org/licenses/Apache-2.0)
3 | [](https://badge.fury.io/py/promptimize)
4 |
5 |
6 |
7 | Promptimize is a prompt engineering **evaluation** and **testing** toolkit.
8 |
9 | It accelerates and provides structure around prompt engineering at scale
10 | with confidence, bringing some of the ideas behind test-driven
11 | development (TDD) to engineering prompts.
12 |
13 | With promptimize, you can:
14 |
15 | - Define your "prompt cases" (think "test cases" but specific to evaluating
16 | prompts) as code and associate them with evaluation functions
17 | - Generate prompt variations dynamically
18 | - Execute and rank prompt test suites across different
19 |   engines/models/temperatures/settings and compare results, bringing
20 |   the hyperparameter tuning mindset to prompt engineering
21 | - Get reports on your prompts' performance as you iterate. Answer questions
22 |   around how different prompt suites are performing against one another.
23 |   Which individual cases or categories of cases improved? Which regressed?
24 | - Minimize API calls! Only re-assess what changed as you change it
25 | - Perform human review if and where needed, introspect failed cases, and
26 |   override false negatives
27 |
28 | In essence, promptimize provides a programmatic way to execute and fine-tune
29 | your prompts and evaluation functions in Python, allowing you to iterate
30 | quickly and with confidence.
31 |
32 | ## Hello world - the simplest prompt examples
33 | [more examples on GitHub](https://github.com/preset-io/promptimize/tree/main/examples)
34 | ```python
35 | # Bringing some "prompt generator" classes - note that you can derive and extend those
36 | from promptimize.prompt_cases import PromptCase
37 |
38 | # Bringing some useful eval functions that help evaluate and score responses
39 | # eval functions have a handle on the prompt object and are expected
40 | # to return a score between 0 and 1
41 | from promptimize import evals
42 |
43 | # Promptimize will scan the target folder and find all Prompt objects
44 | # and derivatives that are in the python modules
45 | simple_prompts = [
46 |
47 | # Prompting "hello there" and making sure there's "hi" or "hello"
48 | # somewhere in the answer
49 |     PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
50 |     PromptCase(
51 |         "name the top 50 guitar players!", lambda x: evals.all_words(x.response, ["frank zappa"])
52 |     ),
53 | ]
54 | ```
55 |
56 | ### The CLI's `run` command
57 | ```
58 | $ promptimize run --help
59 | Usage: promptimize run [OPTIONS] PATH
60 |
61 | run some prompts
62 |
63 | Options:
64 | -v, --verbose Trigger more verbose output
65 | -f, --force Force run, do not skip
66 | -h, --human Human review, allowing a human to review and force
67 | pass/fail each prompt case
68 | -r, --repair Only re-run previously failed
69 | -x, --dry-run DRY run, don't call the API
70 | --shuffle Shuffle the prompts in a random order
71 | -s, --style [json|yaml] json or yaml formatting
72 | -m, --max-tokens INTEGER max_tokens passed to the model
73 | -l, --limit INTEGER limit how many prompt cases to run in a single
74 | batch
75 |   -t, --temperature FLOAT     the temperature passed to the model
76 | -e, --engine TEXT model as accepted by the openai API
77 | -k, --key TEXT The keys to run
78 | -o, --output PATH
79 | -s, --silent
80 | ```
81 |
82 | Let's run those examples and produce a report `./report.yaml`
83 | ```
84 | $ promptimize run examples/ --output ./report.yaml
85 | ```
86 |
87 | ```yaml
88 | 💡 ¡promptimize! 💡
89 | # ----------------------------------------
90 | # (1/2) [RUN] prompt: prompt-115868ef
91 | # ----------------------------------------
92 | key: prompt-115868ef
93 | user_input: hello there!
94 | prompt_hash: 115868ef
95 | response: Hi there! How are you doing today?
96 | execution:
97 | api_call_duration_ms: 883.8047981262207
98 | run_at: '2023-04-25T02:21:40.443077'
99 | score: 1.0
100 |
101 | # ----------------------------------------
102 | # (2/2) [RUN] prompt: prompt-5c085656
103 | # ----------------------------------------
104 | key: prompt-5c085656
105 | user_input: name the top 10 guitar players!
106 | prompt_hash: 5c085656
107 | response: |-
108 | 1. Jimi Hendrix
109 | 2. Eric Clapton
110 | {{ ... }}
111 | 11. Carlos Santana
112 | weight: 2
113 | execution:
114 | api_call_duration_ms: 2558.135747909546
115 | run_at: '2023-04-25T02:21:43.007529'
116 | score: 0.0
117 |
118 | # ----------------------------------------
119 | # Suite summary
120 | # ----------------------------------------
121 | suite_score: 0.3333333333333333
122 | git_info:
123 | sha: 2cf28498ba0f
124 | branch: main
125 | dirty: true
126 | ```
127 |
128 | ## Problem + POV
129 |
130 | Thousands of product builders are currently trying to figure out how to
131 | bring the power of AI into the products and experiences they are building.
132 | The probabilistic (often semi-random, sometimes hectic) nature of LLMs
133 | makes this a challenge.
134 |
135 | Prompt engineering is a huge piece of the puzzle in terms of how to do this
136 | right, especially given the complexity, risks, and drawbacks around
137 | model tuning.
138 |
139 | We believe product builders need to tame AI through proper, rigorous
140 | **prompt engineering**. This allows making the probabilistic nature of
141 | AI more deterministic, or somewhat predictable, and allows builders to apply
142 | a hyperparameter tuning-type mindset and approach to prompt engineering.
143 |
144 | Any prompt-generator logic that's going to be let loose in the wild inside
145 | a product should be thoroughly tested and evaluated with "prompt cases" that
146 | cover the breadth of what people may do in a product.
147 |
148 | In short, Promptimize allows you to test prompts at industrial scale,
149 | so that you can confidently use them in the products you are building.
150 |
151 | ## Information Architecture
152 |
153 | - **Prompt:** A Prompt instance is a certain test case, a single prompt
154 | with an associated set of evaluation functions to rate its success.
155 | - **Evaluation:** An evaluation function that reads the response and returns
156 | a success rate between `0` and `1`.
157 | - **Suite:** A Suite is a collection of Prompts; it's able to run things,
158 |   accumulate results, and print reports about its collection of use cases (see the sketch after this list).
159 | - **Report**: a report is the compiled results of running a certain prompt
160 | `Suite` or set of suites. Reports can be consumed, compared, and expanded.
161 |
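If you'd rather drive these pieces from Python than from the CLI, here is a
minimal sketch that mirrors what the CLI's `run` command does internally (see
`promptimize/cli.py`); the completion settings passed to the `Suite` are just
illustrative values.

```python
# Sketch of programmatic usage, mirroring what promptimize/cli.py's `run` command does.
from promptimize.crawler import discover_objects
from promptimize.prompt_cases import BasePromptCase
from promptimize.reports import Report
from promptimize.suite import Suite

# Find every prompt case (and derivative) defined under ./examples
prompt_cases = discover_objects("./examples", BasePromptCase)

# Run them as a suite, passing the completion settings through to the model
suite = Suite(prompt_cases, {"engine": "text-davinci-003", "max_tokens": 1000, "temperature": 0.5})
suite.execute(verbose=True, style="yaml")

# Compile the results into a report and persist it for later comparison
report = Report.from_suite(suite)
report.write("./report.yaml", style="yaml")
```

The `report` subcommand gives you the same kind of summary from the CLI, e.g.
`p9e report ./report.yaml --groupby category`.
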
162 | ## Principles
163 |
164 | - **Configuration as code:** All prompt cases, suites, and evaluations are
165 | defined as code, which makes it easy to dynamically generate all sorts
166 | of use cases and suites.
167 | - **Expressive**: a clean DSL that's to-the-point -> user prompt + assertions.
168 |   The actual prompt creation logic lives in the derivative class of `PromptCase`,
169 |   so that we can have clean, dense files that contain nice `Suite`s.
170 | - **Support the iteration mindset:** making it easy for people to try things,
171 | get suggestions from the AI, adapt, compare, and push forward
172 | - **Extensibility:** the toolkit is designed to be extremely hackable and
173 | extensible. Hooks, extensions, high API surface.
174 | - **AI-powered:** the framework offers ways to expand your suites based
175 |   on the examples that exist. Use AI to generate more prompt cases!
176 |
177 |
178 | ## Interesting features / facts
179 |
180 | Listing out a few features you should know about that you can start using as your
181 | suites of prompts become larger / more complex (see the sketch after this list)
182 |
183 | * evaluation functions are assumed to return a value between 0 and 1.
184 |   Contrary to unit tests, prompt cases aren't boolean
185 | * prompts can be assigned a `weight` (default 1); this enables you to define
186 |   which prompts are more important than others for reporting purposes and suite evaluation
187 | * prompts can be assigned a `category`, which can be used in the reporting.
188 |   That helps you understand which categories are performing better than
189 |   others, or are most affected by iterations
190 | * The `Prompt` class offers `pre_run` and `post_run` hooks if you want to do
191 |   pre- or post-processing. An example of that would be a prompt
192 |   that expects GPT to generate code, where you'd like to actually run that code
193 |   and test it. In our SQL implementation, for instance, we run the SQL against the
194 |   database, get a pandas dataframe back, and allow doing assertions
195 |   on the dataframe itself
196 |
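Below is a short sketch of the `weight`, `category`, and `post_run` features
described above. It assumes `weight` and `category` are accepted as constructor
arguments (as `weight=2` is in `examples/readme_hello_world.py`); the
`post_run` body is purely illustrative.

```python
from promptimize.prompt_cases import PromptCase
from promptimize import evals


class TrimmedPromptCase(PromptCase):
    """Illustrative derivative that post-processes the raw response after the API call."""

    def post_run(self):
        # e.g. strip whitespace; a SQL-oriented subclass could run the generated query here instead
        self.response = self.response.strip()


prompts = [
    TrimmedPromptCase(
        "hello there!",
        lambda x: evals.any_word(x.response, ["hi", "hello"]),
        weight=2,             # counts double in the suite score
        category="greeting",  # used to group results in reports
    ),
]
```
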
197 |
198 |
199 | ## Getting started
200 |
201 | To install the Promptimize package, use the following command:
202 | ```bash
203 | pip install promptimize
204 | ```
205 |
206 | First, you'll need an OpenAI API key; let's set it as an env var
207 | ```bash
208 | export OPENAI_API_KEY=sk-{{ REDACTED }}
209 | ```
210 |
211 | Find the examples executed below
212 | [here](https://github.com/preset-io/promptimize/blob/main/examples/readme_examples.py)
213 |
214 | ```bash
215 | # Clone the repo
216 | git clone git@github.com:preset-io/promptimize.git
217 | cd promptimize
218 |
219 | # NOTE: the CLI is `promptimize`, but `p9e` is a shorter synonym that can be used interchangeably
220 | # First let's run some of the examples
221 | p9e run ./examples
222 |
223 | # Now the same but with verbose output
224 | p9e run ./examples --verbose --output ./report.yaml
225 |
226 | ```
227 | ## Langchain
228 |
229 | How does promptimize relate to `langchain`?
230 |
231 | We think langchain is amazing. Promptimize uses langchain under the
232 | hood to interact with openai, and has integrations with langchain
233 | (see `LangchainPromptCase`, and the upcoming `LangchainChainPromptCase`
234 | and `LangchainAgntPromptCase`).
235 | That said, you don't have to use
236 | langchain: you can use promptimize on top of any python prompt generation,
237 | whether it's another library or some home-grown thing.
238 |
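For a concrete picture, here is a trimmed-down sketch in the spirit of
`examples/python_examples.py`, wrapping a langchain `PromptTemplate` in a
`LangchainPromptCase`; the template text and the evaluator are made up for
illustration.

```python
from langchain import PromptTemplate

from promptimize.prompt_cases import LangchainPromptCase
from promptimize import evals

# A jinja2-formatted langchain template, as used in examples/python_examples.py
lc_template = PromptTemplate(
    input_variables=["user_input"],
    template="Answer briefly and politely: {{ user_input }}",
    template_format="jinja2",
)

prompts = [
    LangchainPromptCase(
        lc_template,
        key="greeting",
        user_input="hello there!",
        evaluators=[lambda x: evals.any_word(x.response, ["hi", "hello"])],
    ),
]
```
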
239 |
240 | ## Context
241 |
242 |
243 |
244 | Where is `promptimize` coming from!? I (Maxime Beauchemin) am a startup
245 | founder at Preset working on bringing AI to BI
246 | (data exploration
247 | and visualization). At Preset, we use `promptimize` to generate
248 | complex SQL based on natural language, and to suggest charts to users. We
249 | derive the `SimpleQuery` class to fit our specific use
250 | cases in our own prompt engineering repo. This isn't my first open source project:
251 | I'm also the creator of
252 | [Apache Superset](https://github.com/apache/superset/) and
253 | [Apache Airflow](https://github.com/apache/airflow/)
254 |
255 |
256 | ## Contribute
257 |
258 | This project is in its super early stages as of `0.2.0`, and contributions,
259 | contributors, and maintainers are highly encouraged. While it's a great time
260 | to onboard and influence the direction of the project, things are still
261 | evolving quickly. To get involved, open a GitHub issue
262 | or submit a pull request!
263 |
264 | ## Links
265 | * [Blog - Mastering AI-Powered Product Development: Introducing Promptimize for Test-Driven Prompt Engineering](https://preset.io/blog/introducing-promptimize/)
266 | * [Preset Blog](https://preset.io/blog/)
267 | * [Promptimize DOCS](https://preset-io.github.io/promptimize/)
268 |
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | cp $(SOURCEDIR)/../../README.md $(SOURCEDIR)/
21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 |
--------------------------------------------------------------------------------
/docs/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Clean up the build/html directory
4 | rm -rf build/html
5 |
6 | # Build the documentation
7 | make html
8 |
9 | # Change to the build/html directory
10 | cd build/html
11 | touch .nojekyll
12 |
13 | # Initialize a new Git repository
14 | git init
15 |
16 | # Add the generated files to the repository
17 | git add .
18 |
19 | # Commit the changes
20 | git commit -a -m "Initial commit"
21 |
22 | # Add the GitHub Pages remote repository
23 | git remote add origin https://github.com/preset-io/promptimize.git
24 |
25 | # Push the changes to the GitHub Pages repository
26 | git push -f origin main:gh-pages
27 |
28 | # Return to the original directory
29 | cd ../..
30 |
--------------------------------------------------------------------------------
/docs/source/README.md:
--------------------------------------------------------------------------------
1 | # 💡 ¡promptimize! 💡
2 |
3 |
4 | Promptimize is a prompt engineering evaluation and testing toolkit.
5 |
6 | It accelerates and provides structure around prompt engineering at scale
7 | with confidence, bringing some of the ideas behind test-driven
8 | development (TDD) to engineering prompts.
9 |
10 | With promptimize, you can:
11 |
12 | - Define your "prompt cases" (think "test cases" but specific to evaluating
13 | prompts) as code and associate them with evaluation functions
14 | - Generate prompt variations dynamically
15 | - Execute and rank prompt test suites across different
16 |   engines/models/temperatures/settings and compare results, bringing
17 |   the hyperparameter tuning mindset to prompt engineering
18 | - Get reports on your prompts' performance as you iterate. Answer questions
19 |   around how different prompt suites are performing against one another.
20 |   Which individual cases or categories of cases improved? Which regressed?
21 | - Minimize API calls! Only re-assess what changed as you change it
22 | - Perform human review if and where needed, introspect failed cases, and
23 |   override false negatives
24 |
25 | In essence, promptimize provides a programmatic way to execute and fine-tune
26 | your prompts and evaluation functions in Python, allowing you to iterate
27 | quickly and with confidence.
28 |
29 | ## Hello world - the simplest prompt examples
30 | [more examples on GitHub](https://github.com/preset-io/promptimize/tree/master/examples)
31 | ```python
32 | # Bringing some "prompt generator" classes - note that you can derive and extend those
33 | from promptimize.prompt_cases import PromptCase
34 |
35 | # Bringing some useful eval functions that help evaluate and score responses
36 | # eval functions have a handle on the prompt object and are expected
37 | # to return a score between 0 and 1
38 | from promptimize import evals
39 |
40 | # Promptimize will scan the target folder and find all Prompt objects
41 | # and derivatives that are in the python modules
42 | simple_prompts = [
43 |
44 | # Prompting "hello there" and making sure there's "hi" or "hello"
45 | # somewhere in the answer
46 |     PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
47 |     PromptCase(
48 |         "name the top 50 guitar players!", lambda x: evals.all_words(x.response, ["frank zappa"])
49 | ),
50 | ]
51 | ```
52 |
53 | ### The CLI
54 | ```bash
55 | $ promptimize -h
56 | ```
57 |
58 |
59 | ## Problem + POV
60 |
61 | Thousands of product builders are currently trying to figure out how to
62 | bring the power of AI into the products and experiences they are building.
63 | The probabilistic (often semi-random, sometimes hectic) nature of LLMs
64 | makes this a challenge.
65 |
66 | Prompt engineering is a huge piece of the puzzle in terms of how to do this
67 | right, especially given the complexity, risks, and drawbacks around
68 | model tuning.
69 |
70 | We believe product builders need to tame AI through proper, rigorous
71 | **prompt engineering**. This allows making the probabilistic nature of
72 | AI more deterministic, or somewhat predictable, and allows builders to apply
73 | a hyperparameter tuning-type mindset and approach to prompt engineering.
74 |
75 | Any prompt-generator logic that's going to be let loose in the wild inside
76 | a product should be thoroughly tested and evaluated with "prompt cases" that
77 | cover the breadth of what people may do in a product.
78 |
79 | In short, Promptimize allows you to test prompts at industrial scale,
80 | so that you can confidently use them in the products you are building.
81 |
82 | ## Information Architecture
83 |
84 | - **Prompt:** A Prompt instance is a certain test case, a single prompt
85 | with an associated set of evaluation functions to rate its success.
86 | - **Evaluation:** An evaluation function that reads the response and returns
87 | a success rate between `0` and `1`.
88 | - **Suite:** A Suite is a collection of Prompts; it's able to run things,
89 | accumulate results, and print reports about its collection of use cases.
90 | - **Report**: a report is the compiled results of running a certain prompt
91 | `Suite` or set of suites. Reports can be consumed, compared, and expanded.
92 |
93 | ## Principles
94 |
95 | - **Configuration as code:** All prompt cases, suites, and evaluations are
96 | defined as code, which makes it easy to dynamically generate all sorts
97 | of use cases and suites.
98 | - **Expressive**: a clean DSL that's to-the-point -> user prompt + assertions.
99 |   The actual prompt creation logic lives in the derivative class of `PromptCase`,
100 |   so that we can have clean, dense files that contain nice `Suite`s.
101 | - **Support the iteration mindset:** making it easy for people to try things,
102 | get suggestions from the AI, adapt, compare, and push forward
103 | - **Extensibility:** the toolkit is designed to be extremely hackable and
104 | extensible. Hooks, extensions, high API surface.
105 | - **AI-powered:** the framework offers ways to expand your suites based
106 |   on the examples that exist. Use AI to generate more prompt cases!
107 |
108 |
109 | ## Interesting features / facts
110 |
111 | Listing out a few features you should know about that you can start using as your
112 | suites of prompts become larger / more complex
113 |
114 | * evaluation functions are assumed to return a value between 0 and 1.
115 |   Contrary to unit tests, prompt cases aren't boolean
116 | * prompts can be assigned a `weight` (default 1); this enables you to define
117 |   which prompts are more important than others for reporting purposes and suite evaluation
118 | * prompts can be assigned a `category`, which can be used in the reporting.
119 |   That helps you understand which categories are performing better than
120 |   others, or are most affected by iterations
121 | * The `Prompt` class offers `pre_run` and `post_run` hooks if you want to do
122 |   pre- or post-processing. An example of that would be a prompt
123 |   that expects GPT to generate code, where you'd like to actually run that code
124 |   and test it. In our SQL implementation, for instance, we run the SQL against the
125 |   database, get a pandas dataframe back, and allow doing assertions
126 |   on the dataframe itself
127 |
128 |
129 | ## Setup
130 |
131 | To install the Promptimize package, use the following command:
132 | ```bash
133 | pip install promptimize
134 | ```
135 |
136 | ## Getting started
137 |
138 | First, you'll need an OpenAI API key; let's set it as an env var
139 | ```bash
140 | export OPENAI_API_KEY=sk-{REDACTED}
141 | ```
142 |
143 | Find the examples below [here](https://github.com/preset-io/promptimize/blob/master/examples/readme_examples.py)
144 |
147 | ```bash
148 | # NOTE: the CLI is `promptimize`, but `p9e` is a shorter synonym that can be used interchangeably
149 |
150 | # First let's run some of the examples
151 | p9e run ./examples
152 |
153 | # Now the same but with verbose output
154 | p9e run ./examples --verbose
155 |
156 | ```
157 | ## Langchain?
158 |
159 | How does promptimize relate to `langchain`?
160 |
161 | We think langchain is amazing. Promptimize uses langchain under the
162 | hood to interact with openai, and has integrations with langchain
163 | (see `LangchainPromptCase`, and the upcoming `LangchainChainPromptCase`
164 | and `LangchainAgntPromptCase`).
165 | That said, you don't have to use
166 | langchain: you can use promptimize on top of any python prompt generation,
167 | whether it's another library or some home-grown thing.
168 |
169 |
170 | ## Context
171 |
172 |
173 |
174 | Where is `promptimize` coming from!? I (Maxime Beauchemin) am a startup
175 | founder at Preset working on bringing AI to BI
176 | (data exploration
177 | and visualization). At Preset, we use `promptimize` to generate
178 | complex SQL based on natural language, and to suggest charts to users. We
179 | derive the `SimpleQuery` class to fit our specific use
180 | cases in our own prompt engineering repo. This isn't my first open source project:
181 | I'm also the creator of
182 | [Apache Superset](https://github.com/apache/superset/) and
183 | [Apache Airflow](https://github.com/apache/airflow/)
184 |
185 |
186 | ## Contribute
187 |
188 | This project is in its super early stages as of `0.1.0`, and contributions,
189 | contributors, and maintainers are highly encouraged. While it's a great time
190 | to onboard and influence the direction of the project, things are still
191 | evolving quickly. To get involved, open a GitHub issue
192 | or submit a pull request!
193 |
194 | ## Links
195 | * [Blog - Mastering AI-Powered Product Development: Introducing Promptimize for Test-Driven Prompt Engineering](https://preset.io/blog/)
196 | * [Preset Blog](https://preset.io/blog/)
197 |
--------------------------------------------------------------------------------
/docs/source/api.rst:
--------------------------------------------------------------------------------
1 | API
2 | =============
3 |
4 | PromptCase
5 | ----------
6 |
7 | .. automodule:: promptimize.prompt_cases
8 | :members:
9 |
10 | Suite
11 | -----
12 |
13 | .. automodule:: promptimize.suite
14 | :members:
15 |
16 | Report
17 | ------
18 |
19 | .. automodule:: promptimize.reports
20 | :members:
21 |
22 | Evaluators
23 | ----------
24 |
25 | .. automodule:: promptimize.evals
26 | :members:
27 |
28 | Utils
29 | -----
30 |
31 | .. automodule:: promptimize.utils
32 | :members:
33 |
--------------------------------------------------------------------------------
/docs/source/cli.rst:
--------------------------------------------------------------------------------
1 | The CLI
2 | =======
3 |
4 | .. click:: promptimize:cli
5 | :prog: promptimize
6 | :show-nested:
7 |
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # For the full list of built-in configuration values, see the documentation:
4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
5 |
6 | # -- Project information -----------------------------------------------------
7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
8 | from typing import List
9 |
10 | project = "promptimize"
11 | copyright = "2023, Maxime Beauchemin"
12 | author = "Maxime Beauchemin"
13 |
14 | # -- General configuration ---------------------------------------------------
15 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
16 |
17 | extensions: List = [
18 | "sphinx.ext.autodoc",
19 | "sphinx.ext.napoleon", # Optional, for Google and NumPy-style docstrings
20 | "recommonmark", # If you're using the recommonmark extension
21 | "sphinx_click.ext",
22 | ]
23 |
24 |
25 | templates_path = ["_templates"]
26 | exclude_patterns: List = []
27 |
28 |
29 | # -- Options for HTML output -------------------------------------------------
30 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
31 |
32 | html_theme = "alabaster"
33 | html_logo = (
34 | "https://user-images.githubusercontent.com/"
35 | "487433/229948453-36cbc2d1-e71f-4e87-9111-ab428bc96f4c.png"
36 | )
37 | html_static_path = ["_static"]
38 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | .. include:: README.md
2 | :parser: markdown
3 |
4 | .. toctree::
5 | :maxdepth: 2
6 | :caption: Content
7 |
8 | api
9 | cli
10 |
--------------------------------------------------------------------------------
/examples/__pycache__/use_cases.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/preset-io/promptimize/9c1ffb32444ea69eae158d2cff72786ee07db1bd/examples/__pycache__/use_cases.cpython-38.pyc
--------------------------------------------------------------------------------
/examples/python_examples.py:
--------------------------------------------------------------------------------
1 | """An example of how to test Python code generating prompts"""
2 | import re
3 |
4 | # Bringing some "prompt generator" classes
5 | from promptimize.prompt_cases import LangchainPromptCase
6 |
7 | # Bringing some useful eval functions that help evaluate and score responses
8 | # eval functions have a handle on the prompt object and are expected
9 | # to return a score between 0 and 1
10 | from langchain import PromptTemplate
11 | from langchain.output_parsers import StructuredOutputParser, ResponseSchema
12 | import demjson
13 |
14 | from RestrictedPython import compile_restricted, safe_globals, safe_builtins
15 | from RestrictedPython.Guards import guarded_unpack_sequence
16 | from RestrictedPython.Eval import default_guarded_getiter
17 |
18 |
19 | response_schemas = [
20 | ResponseSchema(
21 | name="python_function",
22 | description="the python function itself",
23 | ),
24 | ResponseSchema(
25 |         name="function_name",
26 | description="the name of the function",
27 | ),
28 | ResponseSchema(name="test_cases", description="test cases"),
29 | ResponseSchema(
30 | name="hints",
31 | description="if any, any recommendations to the users about clarifying their prompt",
32 | ),
33 | ]
34 |
35 | output_parser = StructuredOutputParser.from_response_schemas(response_schemas)
36 | format_instructions = output_parser.get_format_instructions().replace("\t", " ")
37 | """
38 | * you include great useful docstrings and doctests that follow the Google conventions
39 | """
40 |
41 | template = """\
42 | System: you are an AI that writes python function that accomplish specific tasks
43 |
44 | Python guidelines:
45 | * you follow the PEP8 conventions
46 | * use 4 spaces indent, no tabs!
47 | * use snake case (using underscores)
48 |
49 | The output should be a VALID JSON blob with the following keys:
50 | * "python_function" as a string with the python function code
51 | * "function_name" as the name of the function
52 | * "hints": as some hints about how to use the function
53 |
54 | User: write a function that multiplies a number by 2 and returns the result
55 |
56 | System:
57 | {
58 |     "python_function": "def multiply_by_two(number):\\n    return number * 2\\n",
59 |     "function_name": "multiply_by_two",
60 |     "hints": "This function is not that helpful as you can simply multiply by two\\ninstead of calling this function"
61 | }
62 |
63 | User: {{ user_input }}
64 |
65 | System:
66 | """ # noqa
67 |
68 | lc_template = PromptTemplate(
69 | input_variables=["user_input"],
70 | partial_variables={"format_instructions": format_instructions},
71 | template=template,
72 | template_format="jinja2",
73 | )
74 |
75 |
76 | def function_from_string(function_as_string, function_name):
77 | restricted_code = compile_restricted(function_as_string, "", "exec")
78 |
79 | # Define a separate environment for the code to run in
80 | execution_globals = safe_globals.copy()
81 | execution_globals.update(
82 | {
83 | "__builtins__": safe_builtins,
84 | "_unpack_sequence_": guarded_unpack_sequence,
85 | "_getiter_": default_guarded_getiter,
86 | }
87 | )
88 |
89 | # Execute the code in the restricted environment
90 | exec(restricted_code, execution_globals)
91 |
92 | # Access the function from the restricted environment
93 | return execution_globals[function_name]
94 |
95 |
96 | def test(func, args, expected_result):
97 | if func:
98 | if not isinstance(args, (list, tuple)):
99 | args = [args]
100 | try:
101 | result = func(*args)
102 | if expected_result == result:
103 | return 1
104 | except Exception:
105 | return 0
106 | return 0
107 |
108 |
109 | def decode_shitty_json(s):
110 | json_match = re.search(r"\{[\s\S]*\}", s)
111 |
112 | if json_match:
113 | json_string = json_match.group()
114 |
115 | # Parse the JSON string using demjson
116 | json_data = demjson.decode(json_string)
117 |
118 | return json_data
119 | return None
120 |
121 |
122 | def test_is_prime(prompt_case, val, exp):
123 | return test(prompt_case.python_function, val, exp)
124 |
125 |
126 | class PythonGeneratorPrompt(LangchainPromptCase):
127 | def post_run(self):
128 | success = False
129 | self.python_function = None
130 | self.f = None
131 | try:
132 | self.response = decode_shitty_json(self.response)
133 | success = True
134 | except Exception as e:
135 | self.error = str(e)
136 |
137 | if success:
138 | # try:
139 | f = function_from_string(
140 | self.response.get("python_function"), self.response.get("function_name")
141 | )
142 | self.python_function = f
143 | self.f = f
144 | # except Exception as e:
145 | # self.error = str(e)
146 |
147 |
148 | prompts = [
149 | PythonGeneratorPrompt(
150 | lc_template,
151 | key="is_prime",
152 |         user_input="write a function that tests if a number is a prime number, returns a boolean",
153 | evaluators=[
154 | lambda x: test(x.f, 2, True),
155 | lambda x: test(x.f, 4, False),
156 | lambda x: test(x.f, 7, True),
157 | lambda x: test(x.f, 10, False),
158 | lambda x: test(x.f, 11, True),
159 | lambda x: test(x.f, 113, True),
160 | ],
161 | ),
162 | PythonGeneratorPrompt(
163 | lc_template,
164 | key="gcd",
165 | user_input="write a function that finds the greatest common divisor (GCD) of two numbers?",
166 | evaluators=[
167 | lambda x: test(x.f, [14, 28], 14),
168 | lambda x: test(x.f, [56, 98], 14),
169 | lambda x: test(x.f, [81, 153], 9),
170 | ],
171 | ),
172 | PythonGeneratorPrompt(
173 | lc_template,
174 | key="factorial",
175 | user_input="write a function that calculates the factorial of a given number",
176 | evaluators=[
177 | lambda x: test(x.f, 0, 1),
178 | lambda x: test(x.f, 1, 1),
179 | lambda x: test(x.f, 5, 120),
180 | lambda x: test(x.f, 7, 5040),
181 | lambda x: test(x.f, 10, 3628800),
182 | ],
183 | ),
184 | PythonGeneratorPrompt(
185 | lc_template,
186 | key="is_palindrome",
187 | user_input="write a function that determines if a given string is a palindrome",
188 | evaluators=[
189 | lambda x: test(x.f, "racecar", True),
190 | lambda x: test(x.f, "hello", False),
191 | lambda x: test(x.f, "madam", True),
192 | lambda x: test(x.f, "python", False),
193 | lambda x: test(x.f, "Aibohphobia", True),
194 | ],
195 | ),
196 | PythonGeneratorPrompt(
197 | lc_template,
198 | key="fibonacci",
199 |         user_input=(
200 |             "write a function that generates the Fibonacci sequence "
201 |             "up to a specified number of terms"
202 |         ),
203 | evaluators=[
204 | lambda x: test(x.f, 1, [0]),
205 | lambda x: test(x.f, 2, [0, 1]),
206 | lambda x: test(x.f, 5, [0, 1, 1, 2, 3]),
207 | lambda x: test(x.f, 10, [0, 1, 1, 2, 3, 5, 8, 13, 21, 34]),
208 | lambda x: test(x.f, 7, [0, 1, 1, 2, 3, 5, 8]),
209 | ],
210 | ),
211 | PythonGeneratorPrompt(
212 | lc_template,
213 | key="sum_of_multiples",
214 |         user_input=(
215 |             "write a function that calculates the sum of all multiples "
216 |             "of 3 and 5 below a given number"
217 |         ),
218 | evaluators=[
219 | lambda x: test(x.f, 10, 23),
220 | lambda x: test(x.f, 20, 78),
221 | lambda x: test(x.f, 30, 195),
222 | lambda x: test(x.f, 50, 543),
223 |             lambda x: test(x.f, 100, 2318),
224 | ],
225 | ),
226 | PythonGeneratorPrompt(
227 | lc_template,
228 | key="is_leap_year",
229 | user_input="write a function that checks whether a given year is a leap year",
230 | evaluators=[
231 | lambda x: test(x.f, 2000, True),
232 | lambda x: test(x.f, 1900, False),
233 | lambda x: test(x.f, 2020, True),
234 | lambda x: test(x.f, 2021, False),
235 | lambda x: test(x.f, 2400, True),
236 | ],
237 | ),
238 | PythonGeneratorPrompt(
239 | lc_template,
240 | key="longest_substring_without_repeating_chars",
241 |         user_input=(
242 |             "write a function that finds the longest substring of a "
243 |             "given string without repeating characters"
244 |         ),
245 | evaluators=[
246 | lambda x: test(x.f, "abcabcbb", "abc"),
247 | lambda x: test(x.f, "bbbbbb", "b"),
248 | lambda x: test(x.f, "pwwkew", "wke"),
249 | lambda x: test(x.f, "abcdefgh", "abcdefgh"),
250 | lambda x: test(x.f, "abcbdacf", "bdacf"),
251 | ],
252 | ),
253 | PythonGeneratorPrompt(
254 | lc_template,
255 | key="longest_common_prefix",
256 | user_input="write a function that finds the longest common prefix of a list of strings",
257 | evaluators=[
258 | lambda x: test(x.f, ["flower", "flow", "flight"], "fl"),
259 | lambda x: test(x.f, ["dog", "racecar", "car"], ""),
260 | lambda x: test(x.f, ["interspecies", "interstellar", "interstate"], "inter"),
261 | lambda x: test(x.f, ["prefix", "suffix", "infix"], ""),
262 | lambda x: test(x.f, ["geeksforgeeks", "geeks", "geek"], "geek"),
263 | ],
264 | ),
265 | PythonGeneratorPrompt(
266 | lc_template,
267 | key="sum_of_digits",
268 | user_input="write a function that calculates the sum of the digits of a given number",
269 | evaluators=[
270 | lambda x: test(x.f, 123, 6),
271 | lambda x: test(x.f, 456, 15),
272 | lambda x: test(x.f, 789, 24),
273 | lambda x: test(x.f, 1001, 2),
274 | lambda x: test(x.f, 54321, 15),
275 | ],
276 | ),
277 | PythonGeneratorPrompt(
278 | lc_template,
279 | key="decimal_to_binary",
280 | user_input=(
281 | "write a function that converts a given decimal number to " "its binary representation"
282 | ),
283 | evaluators=[
284 | lambda x: test(x.f, 2, "10"),
285 | lambda x: test(x.f, 7, "111"),
286 | lambda x: test(x.f, 10, "1010"),
287 | lambda x: test(x.f, 16, "10000"),
288 | lambda x: test(x.f, 31, "11111"),
289 | ],
290 | ),
291 | ]
292 |
--------------------------------------------------------------------------------
/examples/readme_examples.py:
--------------------------------------------------------------------------------
1 | """
2 | Some basic examples for promptimize.
3 |
4 | to run, simply execute `p9e run ./examples/readme_examples.py`
5 | """
6 | # Bringing some "prompt generator" classes
7 | from promptimize.prompt_cases import PromptCase, TemplatedPromptCase
8 |
9 | # Bringing some useful eval functions that help evaluate and score responses
10 | # eval functions have a handle on the prompt object and are expected
11 | # to return a score between 0 and 1
12 | from promptimize import evals
13 |
14 | # Promptimize will scan the target folder and find all Prompt objects
15 | # and derivatives that are in the python modules
16 | simple_prompts = [
17 | # Prompting "hello there" and making sure there's "hi" or "hello"
18 | # somewhere in the answer
19 | PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
20 | # Making sure 3 specific guitar players are in the top 10
21 | # the score here is a percentage of the words found
22 | PromptCase(
23 | "who are the top 10 best guitar players of all time?",
24 | lambda x: evals.percentage_of_words(
25 |             x.response, ["frank zappa", "david gilmour", "carlos santana"]
26 | ),
27 | ),
28 | # GPT-ing myself and making sure the response mentions Superset and Airflow
29 | PromptCase(
30 | "who is Maxime Beauchemin, (the data guy...)?",
31 | lambda x: evals.percentage_of_words(
32 | x.response, ["superset", "airflow"], case_sensitive=False
33 | ),
34 | ),
35 | ]
36 |
37 | # deriving TemplatedPromptCase to generate prompts that ask GPT to generate SQL
38 | # based on table schemas. The point here is you can derive the `Prompt`
39 | # class to create more specialized Prompt generators
40 | # For instance, the SqlPrompt class defined below could be extended to fetch
41 | # schema definitions dynamically, actually run the SQL, and allow
42 | # doing evals against the resultset.
43 |
44 |
45 | class SqlPrompt(TemplatedPromptCase):
46 |     # the TemplatedPromptCase expects a dict of defaults that can be overridden in the constructor
47 | template_defaults = {"dialect": "Postgres"}
48 | # The actual Jinja2 template
49 | template = """\
50 | given these SQL table schemas:
51 | CREATE TABLE world_population (
52 | country_name STRING,
53 | year DATE,
54 | population_total INT,
55 | );
56 |
57 | So, can you write a SQL query for {{ dialect }} that answers this user prompt:
58 | {{ input }}
59 | """
60 |
61 |
62 | # Generating a few SQL prompts
63 | sql_prompts = [
64 | SqlPrompt(
65 |         # you can pass a unique key that can be used to reference a prompt
66 | key="sql-top-10-inc",
67 | # the user input that'll be added in place of {{ input }} in the template above
68 | user_input=(
69 | "give me the top 10 countries with the highest net increase "
70 | "of population over the past 25 years?"
71 | ),
72 | # the dialect template parameter, overriding the default set above
73 | dialect="BigQuery",
74 | # a simple validation function making sure the SQL starts with SELECT
75 | evaluators=lambda x: 1 if x.response.strip().startswith("SELECT") else 0,
76 | ),
77 | ]
78 |
--------------------------------------------------------------------------------
/examples/readme_hello_world.py:
--------------------------------------------------------------------------------
1 | """
2 | Some basic examples for promptimize.
3 |
4 | to run, simply execute `p9e run ./examples/readme_hello_world.py`
5 | """
6 | # Bringing some "prompt generator" classes
7 | from promptimize.prompt_cases import PromptCase
8 |
9 | # Bringing some useful eval functions that help evaluate and score responses
10 | # eval functions have a handle on the prompt object and are expected
11 | # to return a score between 0 and 1
12 | from promptimize import evals
13 |
14 | # Promptimize will scan the target folder and find all Prompt objects
15 | # and derivatives that are in the python modules
16 | simple_prompts = [
17 | # Prompting "hello there" and making sure there's "hi" or "hello"
18 | # somewhere in the answer
19 | PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
20 | PromptCase(
21 | "name the top 10 guitar players!",
22 | lambda x: evals.all_words(x.response, ["zappa", "hendrix"]),
23 | weight=2,
24 | ),
25 | ]
26 |
--------------------------------------------------------------------------------
/promptimize/__init__.py:
--------------------------------------------------------------------------------
1 | from promptimize.cli import cli # noqa
2 |
--------------------------------------------------------------------------------
/promptimize/cli.py:
--------------------------------------------------------------------------------
1 | import click
2 |
3 | from promptimize.crawler import discover_objects
4 | from promptimize.prompt_cases import BasePromptCase
5 | from promptimize.reports import Report
6 | from promptimize.suite import Suite
7 |
8 |
9 | @click.group(help="💡¡promptimize!💡 CLI. `p9e` works too!")
10 | def cli():
11 | pass
12 |
13 |
14 | @click.command(help="run some prompts")
15 | @click.argument(
16 | "path",
17 | required=True,
18 | type=click.Path(exists=True),
19 | )
20 | @click.option("--verbose", "-v", is_flag=True, help="Trigger more verbose output")
21 | @click.option("--force", "-f", is_flag=True, help="Force run, do not skip")
22 | @click.option(
23 | "--human",
24 | "-h",
25 | is_flag=True,
26 | help="Human review, allowing a human to review and force pass/fail each prompt case",
27 | )
28 | @click.option("--repair", "-r", is_flag=True, help="Only re-run previously failed")
29 | @click.option("--dry-run", "-x", is_flag=True, help="DRY run, don't call the API")
30 | @click.option("--shuffle", is_flag=True, help="Shuffle the prompts in a random order")
31 | @click.option(
32 | "--style",
33 | "-s",
34 | type=click.Choice(["json", "yaml"], case_sensitive=False),
35 | default="yaml",
36 | help="json or yaml formatting",
37 | )
38 | @click.option(
39 | "--max-tokens",
40 | "-m",
41 | type=click.INT,
42 | default=1000,
43 | help="max_tokens passed to the model",
44 | )
45 | @click.option(
46 | "--limit",
47 | "-l",
48 | type=click.INT,
49 | default=0,
50 | help="limit how many prompt cases to run in a single batch",
51 | )
52 | @click.option(
53 | "--temperature",
54 | "-t",
55 | type=click.FLOAT,
56 | default=0.5,
57 | help="the temperature passed to the model",
58 | )
59 | @click.option(
60 | "--engine",
61 | "-e",
62 | type=click.STRING,
63 | default="text-davinci-003",
64 | help="model as accepted by the openai API",
65 | )
66 | @click.option("--key", "-k", multiple=True, help="The keys to run")
67 | @click.option(
68 | "--output",
69 | "-o",
70 | type=click.Path(),
71 | )
72 | @click.option("--silent", "-s", is_flag=True)
73 | def run(
74 | path,
75 | verbose,
76 | force,
77 | dry_run,
78 | style,
79 | temperature,
80 | max_tokens,
81 | engine,
82 | key,
83 | output,
84 | silent,
85 | repair,
86 | human,
87 | shuffle,
88 | limit,
89 | ):
90 | """Run some prompts/suites!"""
91 | click.secho("💡 ¡promptimize! 💡", fg="cyan")
92 | if dry_run:
93 | click.secho("# DRY RUN MODE ACTIVATED!", fg="red")
94 |     use_cases = discover_objects(path, BasePromptCase)
95 | completion_create_kwargs = {
96 | "engine": engine,
97 | "max_tokens": max_tokens,
98 | "temperature": temperature,
99 | }
100 |
101 | report = None
102 | if output:
103 | report = Report.from_path(output)
104 |
105 |     suite = Suite(use_cases, completion_create_kwargs)
106 | suite.execute(
107 | verbose=verbose,
108 | style=style,
109 | silent=silent,
110 | report=report,
111 | dry_run=dry_run,
112 | keys=key,
113 | force=force,
114 | repair=repair,
115 | human=human,
116 | shuffle=shuffle,
117 | limit=limit,
118 | )
119 |
120 | if output:
121 | output_report = Report.from_suite(suite)
122 | if report:
123 | output_report.merge(report)
124 | click.secho(f"# Writing file output to {output}", fg="yellow")
125 | output_report.write(output, style=style)
126 |
127 |
128 | cli.add_command(run)
129 |
130 |
131 | @click.command(help="report on how your suites of prompts are performing")
132 | @click.argument(
133 | "path",
134 | required=True,
135 | type=click.Path(exists=True),
136 | )
137 | @click.option("--groupby", "-g", help="GROUPBY", default="category")
138 | def report(path, groupby):
139 | """Get some summary of how your prompt suites are performing"""
140 | click.secho(f"# Reading report @ {path}", fg="yellow")
141 | report = Report.from_path(path)
142 | report.print_summary(groupby)
143 |
144 |
145 | cli.add_command(report)
146 |
--------------------------------------------------------------------------------
/promptimize/crawler.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import importlib
3 | import pkgutil
4 | from pathlib import Path
5 | from typing import List, Type, Any
6 |
7 |
8 | def is_instance_or_derivative(obj: Any, object_type: Type) -> bool:
9 | return isinstance(obj, object_type)
10 |
11 |
12 | def discover_objects(path: str, object_type: Type) -> List[Any]: # noqa
13 | objects = []
14 | folder_path = Path(path).resolve()
15 |
16 | def process_module(module):
17 | # Iterate over the objects in the module
18 | for name, obj in module.__dict__.items():
19 | # Check if the object is an instance or derivative of the specified type
20 | if is_instance_or_derivative(obj, object_type):
21 | objects.append(obj)
22 | # Check if the object is a list or tuple containing instances or
23 | # derivatives of the specified type
24 | elif isinstance(obj, (list, tuple)):
25 | for item in obj:
26 | if is_instance_or_derivative(item, object_type):
27 | objects.append(item)
28 |
29 | # If the path points to a file, import the module and process it directly
30 | if folder_path.is_file() and folder_path.suffix == ".py":
31 | sys.path.insert(0, str(folder_path.parent))
32 | module_name = folder_path.stem
33 | module = importlib.import_module(module_name)
34 | process_module(module)
35 |
36 | # If the path points to a directory, proceed with the existing logic
37 | elif folder_path.is_dir():
38 | # Add the folder to the Python path to enable importing modules from it
39 | if folder_path not in sys.path:
40 | sys.path.insert(0, str(folder_path))
41 |
42 | # Iterate over all the modules in the folder
43 | for _, module_name, _ in pkgutil.iter_modules([str(folder_path)]):
44 | # Import the module
45 | module = importlib.import_module(module_name)
46 | process_module(module)
47 |
48 | return objects
49 |
--------------------------------------------------------------------------------
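
A quick usage sketch (annotation, not part of crawler.py): `discover_objects` is how the CLI
collects prompt cases from a file or folder (see cli.py). Importing an example module
instantiates its PromptCase objects, which build an OpenAI executor, so OPENAI_API_KEY is
assumed to be set here.

from promptimize.crawler import discover_objects
from promptimize.prompt_cases import BasePromptCase

# Collect every BasePromptCase instance (or list/tuple of instances)
# defined at module level in the target file or folder.
cases = discover_objects("./examples/readme_hello_world.py", BasePromptCase)
print([case.key for case in cases])

--------------------------------------------------------------------------------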
/promptimize/evals.py:
--------------------------------------------------------------------------------
1 | """
2 | Eval functions that can be used/reused with Prompts
3 |
4 | All functions here are expected to:
5 | * receive a response string
6 | * [optional] receive arbitrary extra context
7 | * return a value from 0 to 1, 0 representing failing at the task, 1 full
8 | success, and a range in-between
9 | """
10 |
11 | from typing import List
12 |
13 |
14 | def percentage_of_words(response: str, words: List[str], case_sensitive: bool = False) -> float:
15 | """
16 | Calculate the percentage of words from a list that are present in the given response.
17 |
18 | Args:
19 | response (str): The string to search for words.
20 | words (List[str]): A list of words to check for their presence in the response.
21 | case_sensitive (bool, optional): If True, the search will be case-sensitive;
22 | otherwise, it will be case-insensitive. Defaults to False.
23 |
24 | Returns:
25 | float: The percentage of words found in the response (0.0 to 1.0).
26 |
27 | Examples:
28 |         >>> percentage_of_words("This is an Example string.", ["example", "test"])
29 |         0.5
30 |         >>> percentage_of_words(
31 |         ...     "This is an Example string.", ["Example"], case_sensitive=True)
32 |         1.0
33 |         >>> percentage_of_words(
34 |         ...     "This is an Example string.", ["example"], case_sensitive=True)
35 |         0.0
36 |         >>> percentage_of_words("This is an Example string.", ["notfound"])
37 |         0.0
38 | """
39 | if not case_sensitive:
40 | response = response.lower()
41 | words = [w.lower() for w in words]
42 |
43 | return sum(1 if w in response else 0 for w in words) / len(words)
44 |
45 |
46 | def _common_word_search(
47 | response: str, words: List[str], case_sensitive: bool, match_type: str
48 | ) -> int:
49 | if not case_sensitive:
50 | response = response.lower()
51 | words = [w.lower() for w in words]
52 |
53 | if match_type == "any":
54 | return 1 if any(w in response for w in words) else 0
55 | elif match_type == "all":
56 | return 1 if all(w in response for w in words) else 0
57 | else:
58 | raise ValueError("Invalid match_type. Accepted values are 'any' or 'all'.")
59 |
60 |
61 | def any_word(response: str, words: List[str], case_sensitive: bool = False) -> int:
62 | """
63 | Check if any word from a list of words is present in the given response and return 1 or 0.
64 |
65 | Args:
66 | response (str): The string to search for words.
67 | words (List[str]): A list of words to check for their presence in the response.
68 | case_sensitive (bool, optional): If True, the search will be case-sensitive;
69 | otherwise, it will be case-insensitive. Defaults to False.
70 |
71 | Returns:
72 | int: 1 if any word from the list is found in the response; otherwise, 0.
73 |
74 | Examples:
75 | >>> any_word("This is an Example string.", ["example", "test"])
76 | 1
77 | >>> any_word("This is an Example string.", ["Example"], case_sensitive=True)
78 | 1
79 | >>> any_word("This is an Example string.", ["example"], case_sensitive=True)
80 | 0
81 | >>> any_word("This is an Example string.", ["notfound"])
82 | 0
83 | """
84 | return _common_word_search(response, words, case_sensitive, match_type="any")
85 |
86 |
87 | def all_words(response: str, words: List[str], case_sensitive: bool = False) -> int:
88 | """
89 | Check if all words from a list of words are present in the given response and return 1 or 0.
90 |
91 | Args:
92 | response (str): The string to search for words.
93 | words (List[str]): A list of words to check for their presence in the response.
94 | case_sensitive (bool, optional): If True, the search will be case-sensitive;
95 | otherwise, it will be case-insensitive. Defaults to False.
96 |
97 | Returns:
98 | int: 1 if all words from the list are found in the response; otherwise, 0.
99 |
100 | Examples:
101 | >>> all_words("This is an Example string with test.", ["example", "test"])
102 | 1
103 | >>> all_words("This is an Example string with Test.", ["Example", "Test"], case_sensitive=True)
104 | 1
105 | >>> all_words("This is an Example string.", ["example", "test"], case_sensitive=True)
106 | 0
107 | >>> all_words("This is an Example string.", ["example", "notfound"])
108 | 0
109 | """
110 | return _common_word_search(response, words, case_sensitive, match_type="all")
111 |
112 |
113 | base_all = all
114 | base_any = any
115 |
116 |
117 | def all(iterable):
118 |     return 1 if base_all([i == 1 for i in iterable]) else 0
119 |
120 |
121 | def any(iterable):
122 |     return 1 if base_any([i == 1 for i in iterable]) else 0
123 |
--------------------------------------------------------------------------------
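
A short illustration (annotation, not part of evals.py) of how these helpers compose inside
evaluators, mirroring the lambdas in examples/readme_hello_world.py:

from promptimize import evals

response = "Hi there! Hendrix and Zappa come to mind."
print(evals.any_word(response, ["hi", "hello"]))               # -> 1
print(evals.all_words(response, ["zappa", "hendrix"]))         # -> 1
print(evals.percentage_of_words(response, ["zappa", "page"]))  # -> 0.5
# `evals.all` / `evals.any` fold a list of 0/1 scores into a single 0/1 value
print(evals.all([1, 1, 0]))                                    # -> 0

--------------------------------------------------------------------------------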
/promptimize/prompt_cases.py:
--------------------------------------------------------------------------------
1 | import os
2 | from typing import Any, Callable, List, Optional, Union
3 |
4 | from langchain.llms import OpenAI
5 | from langchain.callbacks import get_openai_callback
6 |
7 | from box import Box
8 |
9 | from promptimize import utils
10 | from promptimize.simple_jinja import process_template
11 |
12 |
13 | class BasePromptCase:
14 | """Abstract base prompt case"""
15 |
16 | attributes_used_for_hash = set()
17 | verbose_attrs = {"prompt"}
18 |
19 | def __init__(
20 | self,
21 | evaluators: Optional[Union[Callable, List[Callable]]] = None,
22 | key: Optional[str] = None,
23 | weight=1,
24 |         category: Optional[str] = None,  # used for info/reporting purposes only
25 |         prompt_executor: Any = None,
26 |         prompt_executor_kwargs: Optional[dict] = None,
27 | prompt_hash=None,
28 | *args,
29 | **kwargs,
30 | ) -> None:
31 | """
32 |         Initialize a BasePromptCase instance.
33 |
34 | Args:
35 | user_input (str): Raw user_input for the prompt.
36 | evaluators (Optional[Union[Callable, List[Callable]]]): Optional
37 | callable or list of callables used for evaluation.
38 | key (Optional[str]): Optional unique key for the prompt.
39 | weight (int, optional): Optional weight for the prompt (default: 1).
40 | category (Optional[str], optional): Optional category for
41 | the prompt (used for info/reporting purposes only).
42 | """
43 | self.extra_args = args
44 | self.extra_kwargs = kwargs
45 | self.response = None
46 | self.has_run = False
47 | self.was_tested = False
48 | self.test_results = None
49 | self.evaluators = evaluators or []
50 | self.weight = weight or 1
51 | self.category = category
52 | self.pre_run_output = None
53 | self.post_run_output = None
54 | self.prompt_executor = prompt_executor or self.get_prompt_executor()
55 | self.prompt_executor_kwargs = prompt_executor_kwargs or {}
56 |
57 | self._prompt_hash = prompt_hash
58 |
59 | self.execution = Box()
60 |
61 | self.prompt = utils.literal_str(self.render()).strip()
62 |
63 | self.key = key or "prompt-" + self.prompt_hash
64 |
65 | if not utils.is_iterable(self.evaluators):
66 | self.evaluators = [self.evaluators] # type: ignore
67 |
68 | def get_prompt_executor(self):
69 | model_name = os.environ.get("OPENAI_MODEL") or "text-davinci-003"
70 | openai_api_key = os.environ.get("OPENAI_API_KEY")
71 | self.prompt_executor_kwargs = {"model_name": model_name}
72 | return OpenAI(model_name=model_name, openai_api_key=openai_api_key)
73 |
74 | def execute_prompt(self, prompt_str):
75 | with get_openai_callback() as cb:
76 | self.response = self.prompt_executor(prompt_str)
77 | self.execution.openai = Box()
78 | oai = self.execution.openai
79 | oai.total_tokens = cb.total_tokens
80 | oai.prompt_tokens = cb.prompt_tokens
81 | oai.completion_tokens = cb.completion_tokens
82 | oai.total_cost = cb.total_cost
83 |
84 | return self.response
85 |
86 | def pre_run(self):
87 | pass
88 |
89 | def post_run(self):
90 | pass
91 |
92 | @property
93 | def hash(self):
94 | return utils.short_hash(hash(self))
95 |
96 | def __hash__(self):
97 | attrs = self.attributes_used_for_hash
98 | s = "|".join([utils.short_hash(utils.hashable_repr(getattr(self, attr))) for attr in attrs])
99 | return utils.int_hash(s)
100 |
101 | def render(self):
102 | raise NotImplementedError()
103 |
104 | def get_unique_hash(self, extra_context=None):
105 | """Returns a unique identifier, determined by the run
106 |
107 | Generally, the actual call sent to GPT (prompt, execution params)
108 | represent something unique.
109 | """
110 | return utils.short_hash(str(self.extra_kwargs))
111 |
112 | def to_dict(self, verbose=False):
113 | d = {
114 | "key": self.key,
115 | "prompt_hash": self.prompt_hash,
116 | "prompt": self.prompt,
117 | "category": self.category,
118 | "response": self.response,
119 | "weight": self.weight,
120 | "execution": self.execution.to_dict(),
121 | }
122 | if hasattr(self, "error"):
123 | d["error"] = self.error
124 | return d
125 |
126 | def print(self, verbose=False, style="yaml"):
127 | style = style or "yaml"
128 | output = self.to_dict(verbose)
129 | if not verbose:
130 | for attr in self.verbose_attrs:
131 | del output[attr]
132 | if self.weight == 1:
133 | del output["weight"]
134 | highlighted = utils.serialize_object(output, style)
135 | print(highlighted)
136 |
137 | def test(self):
138 | test_results = []
139 | for evaluator in self.evaluators:
140 | result = evaluator(self)
141 | if not (utils.is_numeric(result) and 0 <= result <= 1):
142 | raise Exception("Value should be between 0 and 1")
143 | test_results.append(result)
144 |
145 | if len(test_results):
146 | self.execution.score = sum(test_results) / len(test_results)
147 | self.execution.results = test_results
148 | self.was_tested = True
149 |
150 | @property
151 | def prompt_hash(self):
152 | if self._prompt_hash:
153 | return self._prompt_hash
154 | return utils.short_hash(hash(self))
155 |
156 | def _run(self, dry_run):
157 | pre_run_output = self.pre_run()
158 | if pre_run_output:
159 | self.execution.pre_run_output = pre_run_output
160 |
161 | if not dry_run:
162 | with utils.MeasureDuration() as md:
163 | self.response = self.execute_prompt(self.prompt).strip()
164 |
165 | self.execution.api_call_duration_ms = md.duration
166 |
167 | post_run_output = self.post_run()
168 | if post_run_output:
169 | self.execution.post_run_output = post_run_output
170 | self.has_run = True
171 | self.execution.run_at = utils.current_iso_timestamp()
172 | return self.response
173 |
174 |
175 | class PromptCase(BasePromptCase):
176 | """A simple prompt case"""
177 |
178 | attributes_used_for_hash = BasePromptCase.attributes_used_for_hash | {"user_input"}
179 |
180 | def __init__(
181 | self,
182 | user_input,
183 | *args,
184 | **kwargs,
185 | ) -> None:
186 | self.user_input = user_input
187 | super().__init__(*args, **kwargs)
188 |
189 | def to_dict(self, verbose=False, *args, **kwargs):
190 | d = super().to_dict(*args, **kwargs)
191 | d = utils.insert_in_dict(d, "user_input", self.user_input, after_key="key")
192 | return d
193 |
194 | def render(self):
195 | return self.user_input
196 |
197 |
198 | class TemplatedPromptCase(BasePromptCase):
199 | template_defaults: dict = {}
200 | attributes_used_for_hash = BasePromptCase.attributes_used_for_hash | {
201 | "user_input",
202 | "extra_kwargs",
203 | }
204 | template = "{{ user_input }}"
205 |
206 | def __init__(
207 | self,
208 | user_input=None,
209 | *args,
210 | **kwargs,
211 | ) -> None:
212 | self.user_input = user_input
213 |         super().__init__(*args, **kwargs)
214 |
215 | def to_dict(self, verbose=False, *args, **kwargs):
216 | d = super().to_dict(*args, **kwargs)
217 | d = utils.insert_in_dict(d, "user_input", self.user_input, after_key="key")
218 | return d
219 |
220 | def get_extra_template_context(self):
221 | """meant to be overriden in derived classes to add logic/context"""
222 | return {}
223 |
224 | @property
225 | def jinja_context(self):
226 | context_kwargs = self.template_defaults.copy()
227 | context_kwargs.update(self.get_extra_template_context())
228 | context_kwargs.update(self.extra_kwargs)
229 | context_kwargs.update({"user_input": self.user_input})
230 | return context_kwargs
231 |
232 | def render(self, **kwargs):
233 | return process_template(self.template, **self.jinja_context)
234 |
235 |
236 | class LangchainPromptCase(BasePromptCase):
237 | attributes_used_for_hash = BasePromptCase.attributes_used_for_hash | {
238 | "extra_kwargs",
239 | "langchain_prompt",
240 | }
241 |
242 | def __init__(
243 | self,
244 | langchain_prompt,
245 | *args,
246 | **kwargs,
247 | ) -> None:
248 | self.langchain_prompt = langchain_prompt
249 |         super().__init__(*args, **kwargs)
250 |
251 | def to_dict(self, verbose=False, *args, **kwargs):
252 | d = super().to_dict(*args, **kwargs)
253 | d = utils.insert_in_dict(d, "prompt_kwargs", self.extra_kwargs, after_key="key")
254 | return d
255 |
256 | def render(self):
257 | return self.langchain_prompt.format(**self.extra_kwargs)
258 |
--------------------------------------------------------------------------------
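
A hedged sketch (annotation, not shipped with the package) of how TemplatedPromptCase is meant
to be specialized: the class-level jinja template is rendered from `template_defaults`,
`get_extra_template_context()`, any extra kwargs, and `user_input`. The names below
(HaikuPromptCase and its template variables) are hypothetical, and OPENAI_API_KEY is assumed
to be set since an executor is created in __init__ unless `prompt_executor` is passed.

from promptimize import evals
from promptimize.prompt_cases import TemplatedPromptCase


class HaikuPromptCase(TemplatedPromptCase):
    # rendered by simple_jinja.process_template with the jinja_context assembled above
    template = "Write a haiku about {{ topic }} for {{ audience }}."
    template_defaults = {"audience": "engineers"}

    def get_extra_template_context(self):
        return {"topic": "prompt testing"}


case = HaikuPromptCase(evaluators=lambda x: evals.any_word(x.response, ["prompt"]))
print(case.prompt)  # -> "Write a haiku about prompt testing for engineers."

--------------------------------------------------------------------------------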
/promptimize/reports.py:
--------------------------------------------------------------------------------
1 | import yaml
2 | from box import Box
3 |
4 | import pandas as pd
5 |
6 | from promptimize import utils
7 |
8 |
9 | class Report:
10 | """Report objects interacting with the filesystem / databases and data structures"""
11 |
12 | version = "0.1.0"
13 |
14 | def __init__(self, path=None, data=None):
15 | self.data = Box()
16 | if data:
17 | self.data = Box(data)
18 | self.path = path
19 |
20 | def write(self, path=None, style="yaml"):
21 | """write the report to the filesystem"""
22 | path = path or self.path
23 | with open(path, "w") as f:
24 | f.write(utils.serialize_object(self.data.to_dict(), highlighted=False, style=style))
25 |
26 | def merge(self, report):
27 | """merge in another report into this one"""
28 | all_keys = set(report.prompts.keys()) | set(self.prompts.keys())
29 | for k in all_keys:
30 | a = report.prompts.get(k)
31 | b = self.prompts.get(k)
32 | if a and b:
33 | if a.execution.get("run_at", "") > b.execution.get("run_at", ""):
34 | self.prompts[k] = a
35 | else:
36 | self.prompts[k] = b
37 |
38 | if not a:
39 | self.prompts[k] = b
40 | elif not b:
41 | self.prompts[k] = a
42 |
43 | @property
44 | def prompts(self):
45 | """list the prompts in this report"""
46 | if self.data:
47 | return self.data.prompts
48 | return {}
49 |
50 | @property
51 | def failed_keys(self):
52 | """return the list of prompt keys that have not suceeded"""
53 | keys = set()
54 | for p in self.prompts.values():
55 | if p.execution.get("score", 0) < 1:
56 | keys.add(p.key)
57 | return keys
58 |
59 | @classmethod
60 | def from_path(cls, path):
61 | """load a report object from a path in the filesystem"""
62 | try:
63 | with open(path, "r") as f:
64 | report = cls(path, yaml.safe_load(f))
65 | return report
66 | except FileNotFoundError:
67 | return None
68 |
69 | @classmethod
70 | def from_suite(cls, suite):
71 | """load a report object from a suite instance"""
72 | report = cls(data=suite.to_dict())
73 | return report
74 |
75 | def get_prompt(self, prompt_key):
76 | """get a specific prompt data structure from the report"""
77 | return self.prompts.get(prompt_key)
78 |
79 | def prompt_df(self):
80 | """make a flat pandas dataframe out of the prompts in the reports"""
81 | prompts = [p for p in self.prompts.values() if p.execution]
82 | return pd.json_normalize(prompts)
83 |
84 | def print_summary(self, groupby="category"):
85 | """print the summary from the report"""
86 | if groupby:
87 | self.print_summary(groupby=None)
88 |
89 | df = self.prompt_df()
90 |
91 | df["score"] = df["weight"] * df["execution.score"]
92 |
93 | if groupby:
94 | df = df[[groupby, "weight", "score"]].groupby(groupby).sum()
95 | else:
96 | df = df.agg({"weight": "sum", "score": "sum"}).to_frame().T
97 | df["perc"] = (df["score"] / df["weight"]) * 100
98 | df = df.sort_values(by="weight", ascending=False)
99 | headers = []
100 | if groupby:
101 | headers = "keys"
102 | else:
103 | df = df.T
104 | print(utils.trabulate(df, headers=headers))
105 |
--------------------------------------------------------------------------------
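
A small sketch (annotation, not part of reports.py) of reading back a run's output file; the
path is hypothetical and is whatever was passed to `promptimize run --output ...`:

from promptimize.reports import Report

report = Report.from_path("./report.yaml")  # returns None if the file doesn't exist
if report:
    print(report.failed_keys)               # keys of prompts scoring below 1
    report.print_summary(groupby="category")

--------------------------------------------------------------------------------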
/promptimize/simple_jinja.py:
--------------------------------------------------------------------------------
1 | import jinja2
2 |
3 | environment = jinja2.Environment()
4 |
5 |
6 | def process_template(template_as_string, **kwargs):
7 | template = environment.from_string(template_as_string)
8 | return template.render(**kwargs)
9 |
--------------------------------------------------------------------------------
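
For illustration (annotation, not part of simple_jinja.py), the wrapper is just plain jinja2
string rendering:

from promptimize.simple_jinja import process_template

print(process_template("Hello {{ name }}!", name="promptimize"))  # -> Hello promptimize!

--------------------------------------------------------------------------------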
/promptimize/suite.py:
--------------------------------------------------------------------------------
1 | """
2 | This module provides a Suite class to manage and execute a collection of
3 | use cases (prompts) to be tested. It allows running the tests, displaying
4 | results, and serializing the summary of the suite.
5 | """
6 | import random
7 | from typing import Any, Dict, List, Optional, Union
8 |
9 | import click
10 |
11 | from promptimize import utils
12 | from promptimize.prompt_cases import BasePromptCase
13 |
14 |
15 | def separator(fg=None) -> None:
16 | """Print a separator line."""
17 | click.secho("# " + "-" * 40, fg=fg)
18 |
19 |
20 | def separated_section(s, fg=None):
21 | separator(fg)
22 | click.secho(s, fg=fg)
23 | separator(fg)
24 |
25 |
26 | class Suite:
27 | """A collection of use cases to be tested.
28 |
29 | Attributes:
30 | name (Optional[str]): The name of the suite.
31 | prompts (Dict[str, Prompt]): Dictionary of prompts to be tested,
32 | keyed by the prompt key.
33 | last_run_completion_create_kwargs (Dict[str, Any]): Keyword arguments
34 | used in the last run for completion creation.
35 |         effective_prompts (List): The list of prompt cases that will actually be executed.
36 | """
37 |
38 | def __init__(
39 | self,
40 | prompts: List["BasePromptCase"],
41 | name: Optional[str] = None,
42 | ) -> None:
43 | """
44 | Args:
45 | prompts (List[Prompt]): List of prompts to be tested.
46 | name (Optional[str]): The name of the suite. Defaults to None.
47 | """
48 | self.name = name
49 | self.prompts = {o.key: o for o in prompts}
50 | self.last_run_completion_create_kwargs: dict = {}
51 | self.effective_prompts = list(self.prompts.values())
52 |
53 | def execute( # noqa
54 | self,
55 | verbose: bool = False,
56 | style: str = "yaml",
57 | silent: bool = False,
58 | report=None,
59 | dry_run: bool = False,
60 |         keys: Optional[list] = None,
61 | force: bool = False,
62 | repair: bool = False,
63 | human: bool = False,
64 | shuffle: bool = False,
65 | limit: int = 0,
66 | ) -> None:
67 | """
68 | Execute the suite with the given settings.
69 |
70 | Args:
71 | verbose (bool): If True, print verbose output. Defaults to False.
72 | style (str): Output style for serialization. Defaults to "yaml".
73 | silent (bool): If True, suppress output. Defaults to False.
74 | """
75 | self.reload_effective_prompts(
76 | report=report,
77 | keys=keys,
78 | repair=repair,
79 | shuffle=shuffle,
80 | limit=limit,
81 | )
82 | prompts = self.effective_prompts
83 |
84 | for i, prompt in enumerate(prompts):
85 | should_run = force or self.should_prompt_execute(prompt, report)
86 | progress = f"({i+1}/{len(prompts)})"
87 | if not silent:
88 | if should_run:
89 | separated_section(f"# {progress} [RUN] prompt: {prompt.key}", fg="cyan")
90 | else:
91 | separated_section(f"# {progress} [SKIP] prompt: {prompt.key}", fg="yellow")
92 |
93 | if should_run:
94 | prompt._run(dry_run)
95 | if not dry_run:
96 | prompt.test()
97 |
98 | if not silent and should_run:
99 | prompt.print(verbose=verbose, style=style)
100 |
101 | if should_run and human:
102 | v = click.prompt(
103 | 'Press Enter to continue, "Y" to force success, "N" to force fail, "X" to exit',
104 | default="",
105 | show_default=False,
106 | )
107 | v = v.lower()
108 | if v == "":
109 | click.secho("Leaving result unaltered", fg="yellow")
110 | elif v == "y":
111 | prompt.execution.score = 1
112 | prompt.execution.human_override = True
113 | click.secho("Forcing SUCCESS", fg="green")
114 | elif v == "n":
115 | prompt.execution.score = 0
116 | prompt.execution.human_override = True
117 | click.secho("Forcing FAILURE", fg="red")
118 | elif v == "x":
119 | break
120 |
121 | # `self.last_run_completion_create_kwargs = completion_create_kwargs
122 | if not silent:
123 | separated_section("# Suite summary", fg="cyan")
124 | click.echo(utils.serialize_object(self._serialize_run_summary(), style))
125 |
126 | def reload_effective_prompts(
127 | self,
128 | report=None,
129 |         keys: Optional[list] = None,
130 | repair: bool = False,
131 | shuffle: bool = False,
132 | limit: int = 0,
133 | ):
134 | self.effective_prompts = list(self.prompts.values())
135 | if keys:
136 | self.effective_prompts = [p for p in self.effective_prompts if p.key in keys]
137 | if repair and report:
138 | failed_keys = report.failed_keys
139 | self.effective_prompts = [p for p in self.effective_prompts if p.key in failed_keys]
140 |
141 | if shuffle:
142 | random.shuffle(self.effective_prompts)
143 |
144 | if limit:
145 | self.effective_prompts = self.effective_prompts[:limit]
146 |
147 | def should_prompt_execute(self, prompt, report):
148 | if not report or not report.prompts:
149 | return True
150 | report_prompt = report.prompts.get(prompt.key)
151 | if not report_prompt:
152 | return True
153 | else:
154 | if not report_prompt.execution:
155 | return True
156 |
157 | if report_prompt.prompt_hash == prompt.prompt_hash:
158 | return False
159 |
160 | return True
161 |
162 | def _serialize_run_summary(
163 | self, verbose: bool = False
164 | ) -> Dict[str, Union[Optional[float], Dict[str, Any]]]:
165 | """
166 | Serialize the run summary of the suite.
167 |
168 | Args:
169 | verbose (bool): If True, include verbose output. Defaults to False.
170 |
171 | Returns:
172 | Dict[str, Union[Optional[float], Dict[str, Any]]]: Serialized run summary of the suite.
173 | """
174 | prompts = self.prompts.values()
175 | tested = [p for p in prompts if p.was_tested and p.execution.score is not None]
176 | suite_score = None
177 | if len(tested) > 0:
178 | total_weight = sum([p.weight for p in tested])
179 | suite_score = sum([p.execution.score * p.weight for p in tested]) / total_weight
180 | d = {
181 | "suite_score": suite_score,
182 | "git_info": utils.get_git_info(),
183 | }
184 |
185 | return d
186 |
187 | def to_dict(self) -> Dict[str, Any]:
188 | """
189 | Convert the suite to a dictionary.
190 |
191 | Returns:
192 | Dict[str, Any]: Dictionary representation of the suite.
193 | """
194 | return {
195 | "name": self.name,
196 | "prompts": {p.key: p.to_dict() for p in self.prompts.values()},
197 | "run_summary": self._serialize_run_summary(),
198 | }
199 |
--------------------------------------------------------------------------------
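
A minimal sketch (annotation, not part of suite.py) of driving a Suite programmatically rather
than through the CLI; it makes a real OpenAI call, so OPENAI_API_KEY is assumed to be set:

from promptimize import evals
from promptimize.prompt_cases import PromptCase
from promptimize.suite import Suite

prompts = [
    PromptCase("hello there!", lambda x: evals.any_word(x.response, ["hi", "hello"])),
]
suite = Suite(prompts, name="hello-suite")
suite.execute(verbose=True, style="yaml")
print(suite.to_dict()["run_summary"])

--------------------------------------------------------------------------------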
/promptimize/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | from typing import Any, Dict, List, Optional
3 | import subprocess
4 | import hashlib
5 | from datetime import datetime
6 | import re
7 | import time
8 | from pygments import highlight
9 | from pygments.lexers import YamlLexer, JsonLexer
10 | from pygments.formatters import TerminalFormatter
11 | from tabulate import tabulate
12 | import yaml
13 | from yaml.representer import SafeRepresenter
14 |
15 |
16 | def is_numeric(value):
17 | """that'd be nice if we had this in the std lib..."""
18 | return isinstance(value, (int, float, complex))
19 |
20 |
21 | def is_iterable(obj):
22 | """that'd be nice if we had this in the std lib..."""
23 | try:
24 | iter(obj)
25 | return True
26 | except TypeError:
27 | return False
28 |
29 |
30 | # Yaml-related section, hacking out of some issues
31 |
32 |
33 | class folded_str(str):
34 | pass
35 |
36 |
37 | class literal_str(str):
38 | pass
39 |
40 |
41 | def change_style(style, representer):
42 | def new_representer(dumper, data):
43 | scalar = representer(dumper, data)
44 | scalar.style = style
45 | return scalar
46 |
47 | return new_representer
48 |
49 |
50 | # represent_str does handle some corner cases, so use that
51 | # instead of calling represent_scalar directly
52 | represent_folded_str = change_style(">", SafeRepresenter.represent_str)
53 | represent_literal_str = change_style("|", SafeRepresenter.represent_str)
54 |
55 |
56 | def str_presenter(dumper, data):
57 | """
58 |     A small hack to get the yaml output to look good for multiline strings,
59 |     which are common in this package
60 |
61 | from: https://shorturl.at/klvC1
62 | """
63 | if len(data.splitlines()) > 1 or "\n" in data or "\\n" in data:
64 | text_list = [line.rstrip() for line in data.splitlines()]
65 | fixed_data = "\n".join(text_list)
66 | return dumper.represent_scalar("tag:yaml.org,2002:str", fixed_data, style="|")
67 | return dumper.represent_scalar("tag:yaml.org,2002:str", data)
68 |
69 |
70 | yaml.add_representer(folded_str, represent_folded_str)
71 | yaml.add_representer(literal_str, represent_literal_str)
72 | yaml.add_representer(str, str_presenter)
73 |
74 |
75 | def extract_json_objects(text: str, get_first: bool = True) -> List[Dict[str, Any]]:
76 | """
77 | Extract JSON objects from a given string by looking for matching curly brackets.
78 |
79 | Args:
80 | text (str): The input string containing JSON objects.
81 |
82 | Returns:
83 | List[Dict[str, Any]]: A list of JSON objects found in the input string.
84 |
85 | Example:
86 |
87 | >>> extract_json_objects('Some text: {"key1": "value1", "key2": "value2"} and more text.')
88 | [{'key1': 'value1', 'key2': 'value2'}]
89 |
90 | >>> extract_json_objects('No JSON objects in this text.')
91 | []
92 |
93 | >>> extract_json_objects('{"a": 1, "b": 2} and {"c": 3, "d": 4}')
94 | [{'a': 1, 'b': 2}, {'c': 3, 'd': 4}]
95 | """
96 | json_objects = []
97 | # man that regex is something... generated by GPT4...
98 | regex = r"(\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\}))*\})"
99 | for match in re.finditer(regex, text):
100 | try:
101 | json_object = json.loads(match.group())
102 | json_objects.append(json_object)
103 | except json.JSONDecodeError:
104 | pass
105 | return json_objects
106 |
107 |
108 | def int_hash(text):
109 | # Create a SHA-256 hash of the input string
110 | hash_object = hashlib.sha256(str(text).encode())
111 |
112 | # Convert the hash to a hexadecimal string
113 | hex_hash = hash_object.hexdigest()
114 | return int(hex_hash, 16)
115 |
116 |
117 | def short_hash(obj, length=8):
118 | hash_object = hashlib.sha256(str(obj).encode())
119 | hex_hash = hash_object.hexdigest()
120 | return hex_hash[:length]
121 |
122 |
123 | def to_yaml(data, highlighted=True):
124 | data = yaml.dump(data, sort_keys=False)
125 | if highlighted:
126 | data = highlight(data, YamlLexer(), TerminalFormatter())
127 | return data
128 |
129 |
130 | def to_json(data, highlighted=True):
131 | data = json.dumps(data, indent=2)
132 |     data = highlight(data, JsonLexer(), TerminalFormatter()) if highlighted else data
133 |     return data
134 |
135 |
136 | def serialize_object(data, style="yaml", highlighted=True):
137 | if style == "yaml":
138 | return to_yaml(data, highlighted)
139 | return to_json(data, highlighted)
140 |
141 |
142 | def transform_strings(obj, transformation):
143 | """
144 | Recursively iterates through nested iterables (lists and tuples) and dictionaries,
145 | applying a given transformation function to all strings found.
146 |
147 | :param obj: The input object, which can be a string, dictionary, list, or tuple.
148 | Other types will be returned unchanged.
149 | :param transformation: A function that takes a single string argument and returns
150 | a transformed string.
151 | :return: A new object with the same structure as the input object, but with all
152 | strings transformed by the given transformation function.
153 | """
154 | if isinstance(obj, str):
155 | return transformation(obj)
156 | elif isinstance(obj, dict):
157 | return {key: transform_strings(value, transformation) for key, value in obj.items()}
158 | elif isinstance(obj, list):
159 | return [transform_strings(item, transformation) for item in obj]
160 | elif isinstance(obj, tuple):
161 | return tuple(transform_strings(item, transformation) for item in obj)
162 | else:
163 | return obj
164 |
165 |
166 | def get_git_info(sha_length: int = 12):
167 | try:
168 | sha = subprocess.check_output(["git", "rev-parse", "HEAD"]).decode("utf-8").strip()
169 | if sha_length:
170 | sha = sha[:sha_length]
171 | branch = (
172 | subprocess.check_output(["git", "rev-parse", "--abbrev-ref", "HEAD"])
173 | .decode("utf-8")
174 | .strip()
175 | )
176 | dirty = subprocess.call(["git", "diff-index", "--quiet", "HEAD"]) != 0
177 |
178 | return {"sha": sha, "branch": branch, "dirty": dirty}
179 | except subprocess.CalledProcessError:
180 | return None
181 |
182 |
183 | class MeasureDuration:
184 | def __init__(self):
185 | self.duration = None
186 |
187 | def __enter__(self):
188 | self.start_time = time.time()
189 | return self
190 |
191 | def __exit__(self, exc_type, exc_val, exc_tb):
192 | end_time = time.time()
193 | self.duration = (end_time - self.start_time) * 1000
194 |
195 |
196 | def insert_in_dict(
197 | dictionary: Dict[Any, Any],
198 | key: Any,
199 | value: Any,
200 | position: Optional[int] = None,
201 | before_key: Optional[Any] = None,
202 | after_key: Optional[Any] = None,
203 | ) -> Dict[Any, Any]:
204 | """
205 | Insert a key/value pair in a dictionary at a specific position, before a
206 | specified key, or after a specified key.
207 |
208 | Args:
209 | dictionary (Dict[Any, Any]): The original dictionary.
210 | key (Any): The key to be inserted.
211 | value (Any): The value associated with the key.
212 | position (Optional[int], optional): The position at which the key/value pair
213 | should be inserted. Defaults to None.
214 | before_key (Optional[Any], optional): The key before which the new
215 | key/value pair should be inserted. Defaults to None.
216 | after_key (Optional[Any], optional): The key after which the new
217 | key/value pair should be inserted. Defaults to None.
218 |
219 | Raises:
220 | ValueError: If more than one of 'position', 'before_key', or 'after_key' is specified.
221 | ValueError: If the specified position is out of range.
222 | KeyError: If 'before_key' or 'after_key' is not found in the dictionary.
223 |
224 | Returns:
225 | Dict[Any, Any]: A new dictionary with the inserted key/value pair.
226 | """
227 | if sum([bool(position is not None), bool(before_key), bool(after_key)]) > 1:
228 | raise ValueError("Only one of 'position', 'before_key', or 'after_key' can be specified")
229 |
230 | if position is not None and (position > len(dictionary) or position < 0):
231 | raise ValueError("Position is out of range")
232 |
233 | if before_key is not None and before_key not in dictionary:
234 | raise KeyError(f"'before_key': {before_key} not found in the dictionary")
235 |
236 | if after_key is not None and after_key not in dictionary:
237 | raise KeyError(f"'after_key': {after_key} not found in the dictionary")
238 |
239 | new_dict = {}
240 | inserted = False
241 |
242 | for index, (dict_key, dict_value) in enumerate(dictionary.items()):
243 | if position == index or dict_key == before_key:
244 | new_dict[key] = value
245 | inserted = True
246 | elif after_key == dict_key:
247 | new_dict[dict_key] = dict_value
248 | new_dict[key] = value
249 | inserted = True
250 | continue
251 |
252 | new_dict[dict_key] = dict_value
253 |
254 | if not inserted or position == len(dictionary):
255 | new_dict[key] = value
256 |
257 | return new_dict
258 |
259 |
260 | def current_iso_timestamp():
261 | now = datetime.utcnow()
262 | return now.isoformat()
263 |
264 |
265 | def hashable_repr(obj):
266 | if isinstance(obj, (list, tuple, set)):
267 | return "".join(hashable_repr(item) for item in obj)
268 | elif isinstance(obj, dict):
269 | return "".join(
270 | hashable_repr(key) + hashable_repr(value) for key, value in sorted(obj.items())
271 | )
272 | elif callable(obj):
273 | return str(obj.__code__.co_code)
274 | else:
275 | return str(obj)
276 |
277 |
278 | def trabulate(df, showindex=True, headers="keys"):
279 | headers = headers if headers else []
280 | for column in df.columns:
281 | if df[column].dtype == "int64":
282 | df[column] = df[column].astype(str)
283 | return tabulate(df, headers=headers, showindex=showindex, tablefmt="psql", floatfmt=".2f")
284 |
--------------------------------------------------------------------------------
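
A few of the helpers above in action (annotation, not part of utils.py):

from promptimize import utils

print(utils.extract_json_objects('score: {"a": 1, "b": 2}'))          # -> [{'a': 1, 'b': 2}]
print(utils.short_hash("some prompt"))                                # 8-char sha256 prefix
print(utils.insert_in_dict({"a": 1, "c": 3}, "b", 2, after_key="a"))  # -> {'a': 1, 'b': 2, 'c': 3}
print(utils.serialize_object({"hello": "world"}, style="yaml", highlighted=False))

--------------------------------------------------------------------------------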
/publish.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # FIRST! manually update the setup.py with the version number
4 | # export PROMPTIMIZE_VERSION=0.2.3
5 |
6 | git checkout main
7 | git commit -a -m "Version $PROMPTIMIZE_VERSION"
8 | git tag $PROMPTIMIZE_VERSION
9 | git push origin main $PROMPTIMIZE_VERSION
10 | python setup.py sdist bdist_wheel
11 | twine upload dist/promptimize-$PROMPTIMIZE_VERSION*
12 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.black]
2 | line-length = 100
3 |
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | flake8
2 | pre-commit
3 | recommonmark
4 | setuptools
5 | sphinx
6 | sphinx-click
7 | sphinxcontrib-napoleon
8 | twine
9 | wheel
10 |
--------------------------------------------------------------------------------
/requirements-examples.txt:
--------------------------------------------------------------------------------
1 | RestrictedPython
2 | demjson
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | jinja2
2 | python-box
3 | black
4 | click
5 | pygments
6 | pyyaml
7 | langchain
8 | tabulate
9 | pandas
10 | openai
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, find_packages
2 |
3 | # Read the contents of the requirements.txt file
4 | with open("requirements.txt") as f:
5 | requirements = f.read().splitlines()
6 |
7 | # Read the contents of the README.md file
8 | with open("README.md", "r", encoding="utf-8") as f:
9 | readme = f.read()
10 |
11 | setup(
12 | name="promptimize",
13 | version="0.2.3",
14 | packages=find_packages(),
15 | include_package_data=True,
16 | install_requires=requirements,
17 | entry_points={
18 | "console_scripts": [
19 | "promptimize=promptimize:cli",
20 | "p9e=promptimize:cli",
21 | ],
22 | },
23 | author="Maxime Beauchemin",
24 | author_email="maximebeauchemin@gmail.com",
25 | description="A python toolkit to generate and evaluate prompts for GPT at scale",
26 | long_description=readme,
27 | long_description_content_type="text/markdown",
28 | license="Apache License, Version 2.0",
29 | license_file="LICENSE",
30 | )
31 |
--------------------------------------------------------------------------------