├── .env.example ├── .flake8 ├── .github ├── ISSUE_TEMPLATE │ ├── bug-report.yml │ ├── config.yml │ ├── documentation.yml │ ├── feature-request.yml │ └── question.yml └── workflows │ ├── ci.yml │ └── comment.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .readthedocs.yaml ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── docs ├── Makefile ├── README.md ├── img │ ├── feedback.png │ └── table.png ├── requirements.txt └── source │ ├── conf.py │ ├── experiment.rst │ ├── harness.rst │ ├── index.rst │ ├── modules.rst │ ├── playground.rst │ ├── prompttools.experiment.experiments.rst │ ├── prompttools.experiment.rst │ ├── prompttools.experiment.widgets.rst │ ├── prompttools.harness.rst │ ├── prompttools.mock.rst │ ├── prompttools.prompttest.error.rst │ ├── prompttools.prompttest.rst │ ├── prompttools.prompttest.runner.rst │ ├── prompttools.requests.rst │ ├── prompttools.rst │ ├── prompttools.utils.rst │ ├── quickstart.rst │ ├── setup.rst │ ├── testing.rst │ ├── usage.rst │ └── utils.rst ├── examples ├── notebooks │ ├── AnthropicExperiment.ipynb │ ├── AutoEval.ipynb │ ├── AzureOpenAIServiceExperiment.ipynb │ ├── FineTuningExperiment.ipynb │ ├── GPT4RegressionTesting.ipynb │ ├── GPT4vsLlama2.ipynb │ ├── GoogleGeminiChatExperiment.ipynb │ ├── GoogleVertexChatExperiment.ipynb │ ├── HuggingFaceHub.ipynb │ ├── HumanFeedback.ipynb │ ├── LlamaCppExperiment.ipynb │ ├── LlamaHeadToHead.ipynb │ ├── MistralChatExperiment.ipynb │ ├── ModelComparison.ipynb │ ├── Moderation Evaluation.ipynb │ ├── OpenAIChatExperiment.ipynb │ ├── OpenAIChatFunctionExperiment.ipynb │ ├── PaLM2Experiment.ipynb │ ├── README.md │ ├── ReplicateLlama2.ipynb │ ├── SemanticSimilarity.ipynb │ ├── StructuredOutput.ipynb │ ├── audio_experiments │ │ ├── MusicGenExperiment.ipynb │ │ └── sample_audio_files │ │ │ └── 80s_billy_joel.wav │ ├── benchmark │ │ └── Benchmarking.ipynb │ ├── frameworks │ │ ├── LangChainRouterChainExperiment.ipynb │ │ ├── LangChainSequentialChainExperiment.ipynb │ │ └── MindsDBExperiment.ipynb │ ├── image_experiments │ │ ├── ReplicateStableDiffusion.ipynb │ │ ├── StableDiffusion.ipynb │ │ ├── compare_images_folder │ │ │ ├── A panda writing code in the Swiss Alps3.png │ │ │ └── A_fruit_basket_on_the_moon.png │ │ └── stablediffusion_images │ │ │ ├── An_apple_orchard.png │ │ │ └── Just_a_fruit_basket.png │ ├── remote │ │ ├── Logging.ipynb │ │ ├── Saving_and_Loading_ExperimentHarness_to_Remote.ipynb │ │ └── Saving_and_Loading_Experiment_to_Remote.ipynb │ └── vectordb_experiments │ │ ├── ChromaDBExperiment.ipynb │ │ ├── LanceDBExperiment.ipynb │ │ ├── PineconeExperiment.ipynb │ │ ├── QdrantExperiment.ipynb │ │ ├── RetrievalAugmentedGeneration.ipynb │ │ └── WeaviateExperiment.ipynb └── prompttests │ ├── test_chromadb.py │ ├── test_huggingface_hub.py │ ├── test_openai_chat.py │ └── test_qdrant.py ├── img ├── demo.gif ├── feedback.png ├── hegel_ai_logo.svg ├── hegel_ai_logo_dark.svg ├── playground.gif ├── prompttest.png └── table.png ├── prompttools ├── __init__.py ├── benchmarks │ ├── __init__.py │ └── benchmark.py ├── common.py ├── data │ └── benchmarking │ │ └── hellaswag │ │ ├── hellaswag_dataset.jsonl │ │ └── hellaswag_labels.lst ├── experiment │ ├── __init__.py │ ├── experiments │ │ ├── __init__.py │ │ ├── _utils.py │ │ ├── anthropic_completion_experiment.py │ │ ├── chromadb_experiment.py │ │ ├── error.py │ │ ├── experiment.py │ │ ├── google_gemini_chat_experiment.py │ │ ├── google_palm_experiment.py │ │ ├── google_vertex_chat_experiment.py │ │ ├── huggingface_endpoint_experiment.py │ │ ├── 
huggingface_hub_experiment.py │ │ ├── lancedb_experiment.py │ │ ├── langchain_experiment.py │ │ ├── llama_cpp_experiment.py │ │ ├── mindsdb_experiment.py │ │ ├── mistral_experiment.py │ │ ├── musicgen_experiment.py │ │ ├── openai_chat_experiment.py │ │ ├── openai_completion_experiment.py │ │ ├── pinecone_experiment.py │ │ ├── qdrant_experiment.py │ │ ├── replicate_experiment.py │ │ ├── stablediffusion_experiment.py │ │ ├── style.mplstyle │ │ └── weaviate_experiment.py │ └── widgets │ │ ├── __init__.py │ │ ├── comparison.py │ │ ├── feedback.py │ │ └── utility.py ├── harness │ ├── __init__.py │ ├── chat_history_harness.py │ ├── chat_model_comparison_harness.py │ ├── chat_prompt_template_harness.py │ ├── document_retrieval_harness.py │ ├── function_call_harness.py │ ├── harness.py │ ├── model_comparison_harness.py │ ├── multi_experiment_harness.py │ ├── prompt_template_harness.py │ ├── rag_harness.py │ ├── system_prompt_harness.py │ └── utility.py ├── logger │ ├── __init__.py │ └── logger.py ├── mock │ ├── __init__.py │ ├── mock.py │ └── mock_data │ │ └── images │ │ ├── 19th_century_wombat_gentleman.png │ │ └── Just_a_fruit_basket.png ├── playground │ ├── README.md │ ├── __init__.py │ ├── constants.py │ ├── data_loader.py │ ├── packages.txt │ ├── playground.py │ └── requirements.txt ├── prompttest │ ├── __init__.py │ ├── error │ │ ├── __init__.py │ │ └── failure.py │ ├── prompttest.py │ ├── runner │ │ ├── __init__.py │ │ └── runner.py │ └── threshold_type.py ├── requests │ ├── __init__.py │ ├── request_queue.py │ └── retries.py ├── selector │ ├── __init__.py │ └── prompt_selector.py ├── sentry.py ├── utils │ ├── __init__.py │ ├── autoeval.py │ ├── autoeval_from_expected.py │ ├── autoeval_scoring.py │ ├── autoeval_with_docs.py │ ├── chunk_text.py │ ├── error.py │ ├── expected.py │ ├── moderation.py │ ├── ranking_correlation.py │ ├── similarity.py │ ├── validate_json.py │ └── validate_python.py └── version.py ├── pyproject.toml ├── requirements.txt ├── scripts └── create_comment.py ├── setup.py ├── test ├── app.py ├── requirements.txt ├── test_experiment.py ├── test_harness.py └── test_logger.py └── version.txt /.env.example: -------------------------------------------------------------------------------- 1 | # To access remote service, make a copy of this file and save it as `.env` in the same directory 2 | # Then, paste your Hegel API key below between the quotation marks. 3 | ENV="prod" 4 | HEGELAI_API_KEY="" 5 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug-report.yml: -------------------------------------------------------------------------------- 1 | name: 🐛 Bug Report 2 | description: Create a report to help us reproduce and fix the bug 3 | 4 | body: 5 | - type: markdown 6 | attributes: 7 | value: > 8 | #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the 9 | existing and past issues](https://github.com/hegelai/prompttools/issues?q=is%3Aissue+sort%3Acreated-desc+). 10 | - type: textarea 11 | attributes: 12 | label: 🐛 Describe the bug 13 | description: | 14 | Please provide a clear and concise description of what the bug is. 15 | 16 | If relevant, add a minimal example so that we can reproduce the error by running the code. 
For example 17 | 18 | ```python 19 | # Include all the necessary imports at the beginning 20 | from prompttools.harness import PromptTemplateExperimentationHarness 21 | 22 | # A succinct reproducible example trimmed down to the essential parts: 23 | prompt_templates = ["Answer the following question: {{input}}", "Respond the following query: {{input}}"] 24 | user_inputs = [{"input": "Who was the first president?"}, {"input": "Who was the first president of India?"}] 25 | harness = PromptTemplateExperimentationHarness("text-davinci-003", prompt_templates, user_inputs) 26 | 27 | harness.run() 28 | harness.visualize() # Note: the bug is here, the visualization "X" is different from expected "Y" 29 | ``` 30 | 31 | If the code is too long, feel free to put it in a public gist and link it here: https://gist.github.com. 32 | 33 | Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````. 34 | placeholder: | 35 | A clear and concise description of what the bug is. 36 | 37 | ```python 38 | Sample code to reproduce the problem 39 | ``` 40 | 41 | ``` 42 | The error message you got, with the full traceback. 43 | ``` 44 | validations: 45 | required: true 46 | - type: markdown 47 | attributes: 48 | value: > 49 | Thanks for contributing 🎉! 50 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: true 2 | contact_links: 3 | - name: Contact Us 4 | url: team@hegel-ai.com 5 | about: If for any reason you do not wish to open a public issue, feel free to contact us with the linked email. We would love to hear about your experience and pain points, and we'll offer bespoke advice and solution whenever possible. 6 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation.yml: -------------------------------------------------------------------------------- 1 | name: 📚 Documentation 2 | description: Report an issue related to our documentation 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 📚 The doc issue 8 | description: > 9 | A clear and concise description of what content in our documentation is an issue. 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Suggest a potential fix 15 | description: > 16 | Tell us how we could improve the documentation in this regard. 17 | - type: markdown 18 | attributes: 19 | value: > 20 | Thanks for contributing 🎉! 21 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.yml: -------------------------------------------------------------------------------- 1 | name: 🚀 Feature request 2 | description: Submit a proposal or request for a new feature in prompttools! 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: 🚀 The feature 8 | description: > 9 | A clear and concise description of the feature proposal 10 | validations: 11 | required: true 12 | - type: textarea 13 | attributes: 14 | label: Motivation, pitch 15 | description: > 16 | Please outline the motivation for the proposal. Is your feature request related to a specific problem? e.g., 17 | *"I'm working on X and would like Y to be possible"*. 
If this is related to another GitHub issue, please link 18 | here too. 19 | validations: 20 | required: true 21 | - type: textarea 22 | attributes: 23 | label: Alternatives 24 | description: > 25 | A description of any alternative solutions or features you've considered, if any. 26 | - type: textarea 27 | attributes: 28 | label: Additional context 29 | description: > 30 | Add any other context or screenshots about the feature request. 31 | - type: markdown 32 | attributes: 33 | value: > 34 | Thanks for contributing 🎉! 35 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/question.yml: -------------------------------------------------------------------------------- 1 | name: ⁉️ Question and Discussion 2 | description: Discuss or ask question about how to use prompttools and what the best practice may be. 3 | 4 | body: 5 | - type: textarea 6 | attributes: 7 | label: ⁉️ Discussion/Question 8 | description: | 9 | Provide context on how you are using prompttools, what you have tried, and what question you may have. 10 | validations: 11 | required: true 12 | - type: markdown 13 | attributes: 14 | value: > 15 | Thanks for contributing 🎉! -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Run prompttools tests 2 | 3 | on: 4 | push: 5 | branches: [ "main" ] 6 | pull_request: 7 | branches: [ "main" ] 8 | 9 | jobs: 10 | build: 11 | 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | python-version: ["3.11"] 17 | 18 | steps: 19 | - name: Check out source repository 20 | uses: actions/checkout@v3 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v3 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | python -m pip install flake8 pytest 29 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 30 | if [ -f test/requirements.txt ]; then pip install -r test/requirements.txt; fi 31 | - name: Lint with flake8 32 | run: | 33 | # exit-zero treats all errors as warnings. The GitHub editor is 120 chars wide 34 | flake8 . --count --exit-zero --max-complexity=10 --max-line-length=120 --statistics 35 | - name: Build prompttools 36 | run: | 37 | pip3 install . 
38 | - name: Test with pytest 39 | run: | 40 | pytest -v test 41 | - name: Run examples 42 | run: | 43 | DEBUG=1 python examples/prompttests/test_openai_chat.py 44 | -------------------------------------------------------------------------------- /.github/workflows/comment.yml: -------------------------------------------------------------------------------- 1 | name: PromptTools 2 | 3 | on: 4 | # Trigger the workflow on push or pull request 5 | pull_request: 6 | branches: 7 | - steventkrawczyk-test 8 | 9 | jobs: 10 | comment: 11 | permissions: write-all 12 | runs-on: ubuntu-latest 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | python-version: ["3.11"] 17 | steps: 18 | - name: Check out source repository 19 | uses: actions/checkout@v3 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v3 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | python -m pip install flake8 pytest 28 | if [ -f requirements.txt ]; then pip install -r requirements.txt; fi 29 | - name: Build prompttools 30 | run: | 31 | pip3 install . 32 | - name: Create markdown 33 | run: | 34 | DEBUG=1 python scripts/create_comment.py 35 | - name: Write comment 36 | uses: actions/github-script@v6 37 | with: 38 | script: | 39 | const fs = require('fs') 40 | fs.readFile('./markdown.md', 'utf8', (err, data) => { 41 | github.rest.issues.createComment({ 42 | issue_number: context.issue.number, 43 | owner: context.repo.owner, 44 | repo: context.repo.repo, 45 | body: data 46 | }) 47 | }); 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | prompttools/version.py 7 | 8 | # C extensions 9 | *.so 10 | 11 | macOS 12 | */.DS_Store 13 | 14 | # PyCharm 15 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 16 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 17 | # and can be added to the global gitignore or merged into this file. For a more nuclear 18 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 19 | .idea/ 20 | 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | share/python-wheels/ 37 | *.egg-info/ 38 | .installed.cfg 39 | *.egg 40 | MANIFEST 41 | 42 | # PyInstaller 43 | # Usually these files are written by a python script from a template 44 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
45 | *.manifest 46 | *.spec 47 | 48 | # Installer logs 49 | pip-log.txt 50 | pip-delete-this-directory.txt 51 | 52 | # Unit test / coverage reports 53 | htmlcov/ 54 | .tox/ 55 | .nox/ 56 | .coverage 57 | .coverage.* 58 | .cache 59 | nosetests.xml 60 | coverage.xml 61 | *.cover 62 | *.py,cover 63 | .hypothesis/ 64 | .pytest_cache/ 65 | cover/ 66 | 67 | # Translations 68 | *.mo 69 | *.pot 70 | 71 | # Django stuff: 72 | *.log 73 | local_settings.py 74 | db.sqlite3 75 | db.sqlite3-journal 76 | 77 | # Flask stuff: 78 | instance/ 79 | .webassets-cache 80 | 81 | # Scrapy stuff: 82 | .scrapy 83 | 84 | # Sphinx documentation 85 | docs/_build/ 86 | 87 | # PyBuilder 88 | .pybuilder/ 89 | target/ 90 | 91 | # Jupyter Notebook 92 | .ipynb_checkpoints 93 | 94 | # Chroma 95 | .chroma 96 | 97 | # IPython 98 | profile_default/ 99 | ipython_config.py 100 | 101 | # pyenv 102 | # For a library or package, you might want to ignore these files since the code is 103 | # intended to run in multiple environments; otherwise, check them in: 104 | # .python-version 105 | 106 | # pipenv 107 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 108 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 109 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 110 | # install all needed dependencies. 111 | #Pipfile.lock 112 | 113 | # poetry 114 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 115 | # This is especially recommended for binary packages to ensure reproducibility, and is more 116 | # commonly ignored for libraries. 117 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 118 | #poetry.lock 119 | 120 | # pdm 121 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 122 | #pdm.lock 123 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 124 | # in version control. 125 | # https://pdm.fming.dev/#use-with-ide 126 | .pdm.toml 127 | 128 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 129 | __pypackages__/ 130 | 131 | # Celery stuff 132 | celerybeat-schedule 133 | celerybeat.pid 134 | 135 | # SageMath parsed files 136 | *.sage.py 137 | 138 | # Environments 139 | .env 140 | .venv 141 | env/ 142 | venv/ 143 | ptenv/ 144 | ENV/ 145 | env.bak/ 146 | venv.bak/ 147 | 148 | # Spyder project settings 149 | .spyderproject 150 | .spyproject 151 | 152 | # Rope project settings 153 | .ropeproject 154 | 155 | # mkdocs documentation 156 | /site 157 | 158 | # mypy 159 | .mypy_cache/ 160 | .dmypy.json 161 | dmypy.json 162 | 163 | # Pyre type checker 164 | .pyre/ 165 | 166 | # pytype static type analyzer 167 | .pytype/ 168 | 169 | # Cython debug symbols 170 | cython_debug/ 171 | 172 | # PyCharm 173 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 174 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 175 | # and can be added to the global gitignore or merged into this file. For a more nuclear 176 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
177 | #.idea/ 178 | 179 | **/.chroma/** 180 | /ptvenv 181 | 182 | **/lancedb/** 183 | -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pre-commit/pre-commit-hooks 3 | rev: v2.3.0 4 | hooks: 5 | - id: end-of-file-fixer 6 | - id: trailing-whitespace 7 | - repo: https://github.com/psf/black 8 | rev: 22.10.0 9 | hooks: 10 | - id: black 11 | - repo: https://github.com/PyCQA/flake8 12 | rev: 6.0.0 13 | hooks: 14 | - id: flake8 15 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # .readthedocs.yaml 2 | # Read the Docs configuration file 3 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 4 | 5 | # Required 6 | version: 2 7 | 8 | # Set the OS, Python version and other tools you might need 9 | build: 10 | os: ubuntu-22.04 11 | tools: 12 | python: "3.11" 13 | 14 | # Build documentation in the "docs/" directory with Sphinx 15 | sphinx: 16 | configuration: docs/source/conf.py 17 | 18 | # Optionally build your docs in additional formats such as PDF and ePub 19 | # formats: 20 | # - pdf 21 | # - epub 22 | 23 | # Optional but recommended, declare the Python requirements required 24 | # to build your documentation 25 | # See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html 26 | python: 27 | install: 28 | - requirements: docs/requirements.txt 29 | 30 | 31 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | 2 | # Contributor Covenant Code of Conduct 3 | 4 | ## Our Pledge 5 | 6 | We as members, contributors, and leaders pledge to make participation in our 7 | community a harassment-free experience for everyone, regardless of age, body 8 | size, visible or invisible disability, ethnicity, sex characteristics, gender 9 | identity and expression, level of experience, education, socio-economic status, 10 | nationality, personal appearance, race, caste, color, religion, or sexual 11 | identity and orientation. 12 | 13 | We pledge to act and interact in ways that contribute to an open, welcoming, 14 | diverse, inclusive, and healthy community. 
15 | 16 | ## Our Standards 17 | 18 | Examples of behavior that contributes to a positive environment for our 19 | community include: 20 | 21 | * Demonstrating empathy and kindness toward other people 22 | * Being respectful of differing opinions, viewpoints, and experiences 23 | * Giving and gracefully accepting constructive feedback 24 | * Accepting responsibility and apologizing to those affected by our mistakes, 25 | and learning from the experience 26 | * Focusing on what is best not just for us as individuals, but for the overall 27 | community 28 | 29 | Examples of unacceptable behavior include: 30 | 31 | * The use of sexualized language or imagery, and sexual attention or advances of 32 | any kind 33 | * Trolling, insulting or derogatory comments, and personal or political attacks 34 | * Public or private harassment 35 | * Publishing others' private information, such as a physical or email address, 36 | without their explicit permission 37 | * Other conduct which could reasonably be considered inappropriate in a 38 | professional setting 39 | 40 | ## Enforcement Responsibilities 41 | 42 | Community leaders are responsible for clarifying and enforcing our standards of 43 | acceptable behavior and will take appropriate and fair corrective action in 44 | response to any behavior that they deem inappropriate, threatening, offensive, 45 | or harmful. 46 | 47 | Community leaders have the right and responsibility to remove, edit, or reject 48 | comments, commits, code, wiki edits, issues, and other contributions that are 49 | not aligned to this Code of Conduct, and will communicate reasons for moderation 50 | decisions when appropriate. 51 | 52 | ## Scope 53 | 54 | This Code of Conduct applies within all community spaces, and also applies when 55 | an individual is officially representing the community in public spaces. 56 | Examples of representing our community include using an official e-mail address, 57 | posting via an official social media account, or acting as an appointed 58 | representative at an online or offline event. 59 | 60 | ## Enforcement 61 | 62 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 63 | reported to the community leaders responsible for enforcement at 64 | [team@hegel-ai.com](team@hegel-ai.com). 65 | All complaints will be reviewed and investigated promptly and fairly. 66 | 67 | All community leaders are obligated to respect the privacy and security of the 68 | reporter of any incident. 69 | 70 | ## Attribution 71 | 72 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 73 | version 2.1, available at 74 | [https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. 75 | 76 | Community Impact Guidelines were inspired by 77 | [Mozilla's code of conduct enforcement ladder][Mozilla CoC]. 78 | 79 | For answers to common questions about this code of conduct, see the FAQ at 80 | [https://www.contributor-covenant.org/faq][FAQ]. Translations are available at 81 | [https://www.contributor-covenant.org/translations][translations]. 
82 | 83 | [homepage]: https://www.contributor-covenant.org 84 | [v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html 85 | [Mozilla CoC]: https://github.com/mozilla/diversity 86 | [FAQ]: https://www.contributor-covenant.org/faq 87 | [translations]: https://www.contributor-covenant.org/translations 88 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `prompttools` 2 | 3 | ## TL;DR 4 | 5 | We appreciate all contributions to our project! If you are interested in contributing to `prompttools`, there are many ways to help out. 6 | Your contributions may fall into the following categories: 7 | 8 | It will greatly help our project if you: 9 | 10 | - Star ⭐ our project and share it with your network! 11 | 12 | - Report issues that you see, or upvote issues that others have reported and are relevant to you 13 | 14 | - Look through existing issues for new feature ideas 15 | (["Help Wanted" issues](https://github.com/hegelai/prompttools/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22)) and open PRs to implement them. 16 | 17 | - Answer questions on the issue tracker; investigating and fixing bugs are very valuable contributions to the project. 18 | 19 | - Improve the documentation. If you find a typo in the documentation, 20 | do not hesitate to submit a GitHub issue or pull request. 21 | 22 | - Feature a usage example in our documentation; that is welcomed as well. 23 | 24 | ## Issues 25 | 26 | We use GitHub issues to track bugs. Please follow the existing templates if possible and ensure that the 27 | description is clear and has sufficient instructions to reproduce the issue. 28 | 29 | You can also open an issue to seek advice or discuss best practices for using our tool or prompting in general. 30 | 31 | ## Development installation 32 | 33 | ### Install `prompttools` from source 34 | 35 | ```bash 36 | git clone https://github.com/hegelai/prompttools.git 37 | cd prompttools 38 | pip install -e . 39 | pip install flake8 40 | ``` 41 | 42 | ## Pull Requests 43 | 44 | We actively welcome your pull requests. 45 | 46 | 1. Fork the repo and create your branch from `main`. 47 | - Optionally, you can create a new branch locally and push the branch to `origin`. 48 | 2. If you've added code that should be tested, add tests. 49 | 3. If you've changed APIs, update the inline documentation and examples. 50 | 4. Ensure all unit tests pass. 51 | 5. If you haven't already, complete the Contributor License Agreement ("CLA"). More details below. 52 | 53 | ### Code style 54 | 55 | `prompttools` enforces a consistent code format through [`pre-commit`](https://pre-commit.com). You can install it with 56 | 57 | ```shell 58 | pip install pre-commit 59 | ``` 60 | 61 | To check and in most cases fix the code format, stage all your changes (`git add`) and run `pre-commit run`. 62 | 63 | We recommend running the checks automatically before every `git commit`; you can set that up by executing 64 | this in the directory: 65 | 66 | ```shell 67 | pre-commit install 68 | ``` 69 | 70 | 71 | ## Contributor License Agreement ("CLA") 72 | 73 | In order to accept your pull request, we need you to sign a CLA. You only need to do this once to work on our project.
74 | 75 | Please sign the CLA here: 76 | 77 | ## License 78 | 79 | By contributing to `prompttools`, you agree that your contributions will be licensed under the LICENSE file in the root 80 | directory of this source tree. 81 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | ## Building the Documentation 2 | 3 | To build the documentation, you will need [Sphinx](http://www.sphinx-doc.org) and various dependencies. You can install them via: 4 | 5 | ```bash 6 | cd docs/ 7 | pip install -r requirements.txt 8 | ``` 9 | 10 | You can then build the documentation by running `make ` from the `docs/` folder. Run `make` to get a list of all 11 | available output formats. The most common case is `html`: 12 | 13 | ```bash 14 | make html 15 | ``` 16 | 17 | ## Improving the Documentation 18 | 19 | Feel free to open an issue or pull request regarding any inaccuracy or potential improvement for 20 | our documentation. Thank you! 21 | -------------------------------------------------------------------------------- /docs/img/feedback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/docs/img/feedback.png -------------------------------------------------------------------------------- /docs/img/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/docs/img/table.png -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | # Defining the exact version will make sure things don't break 2 | sphinx==5.3.0 3 | furo 4 | readthedocs-sphinx-search==0.1.1 5 | prompttools 6 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 
2 | # 3 | # For the full list of built-in configuration values, see the documentation: 4 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 5 | 6 | # -- Project information ----------------------------------------------------- 7 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information 8 | 9 | from sphinx.builders.html import StandaloneHTMLBuilder 10 | 11 | project = "prompttools" 12 | copyright = "2023, Hegel AI" 13 | author = "Hegel AI" 14 | release = "0.0.46" 15 | 16 | # -- General configuration --------------------------------------------------- 17 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration 18 | 19 | extensions = [ 20 | "sphinx.ext.duration", 21 | "sphinx.ext.doctest", 22 | "sphinx.ext.autodoc", 23 | "sphinx.ext.autosummary", 24 | "sphinx.ext.intersphinx", 25 | "sphinx.ext.napoleon", 26 | ] 27 | 28 | intersphinx_mapping = { 29 | "python": ("https://docs.python.org/3/", None), 30 | "sphinx": ("https://www.sphinx-doc.org/en/master/", None), 31 | } 32 | intersphinx_disabled_domains = ["std"] 33 | 34 | templates_path = ["_templates"] 35 | exclude_patterns = [] 36 | 37 | # -- Options for HTML output ------------------------------------------------- 38 | # https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output 39 | 40 | html_theme = "furo" 41 | html_static_path = ["_static"] 42 | 43 | 44 | # We can enable these options individually if appropriate 45 | html_theme_options = { 46 | # 'analytics_id': 'G-XXXXXXXXXX', # Provided by Google in your dashboard 47 | # 'analytics_anonymize_ip': False, 48 | # 'logo_only': False, 49 | # 'display_version': True, 50 | # 'prev_next_buttons_location': 'bottom', 51 | # 'style_external_links': False, 52 | # 'vcs_pageview_mode': '', 53 | # 'style_nav_header_background': 'white', 54 | # Toc options 55 | # 'collapse_navigation': False, 56 | # 'sticky_navigation': True, 57 | # 'navigation_depth': 4, 58 | # 'includehidden': True, 59 | # 'titles_only': False 60 | } 61 | 62 | autodoc_typehints = "description" 63 | 64 | # -- Options for EPUB output 65 | epub_show_urls = "footnote" 66 | 67 | StandaloneHTMLBuilder.supported_image_types = ["image/svg+xml", "image/gif", "image/png", "image/jpeg"] 68 | -------------------------------------------------------------------------------- /docs/source/experiment.rst: -------------------------------------------------------------------------------- 1 | Experiment 2 | ========================== 3 | 4 | .. currentmodule:: prompttools.experiment 5 | 6 | There are two main abstractions used in the ``prompttools`` library: Experiments and Harnesses. 7 | Occasionally, you may want to use a harness, because it abstracts away more details. 8 | 9 | An experiment is a low level abstraction that takes the Cartesian product of possible inputs to 10 | an LLM API. For example, the ``OpenAIChatExperiment`` accepts lists of inputs for each parameter 11 | of the OpenAI Chat Completion API. Then, it constructs and asynchronously executes requests 12 | using those potential inputs. An example of using experiment is `here `_. 13 | 14 | There are two ways to initialize an experiment: 15 | 16 | 1. Wrap your parameters in ``list``\ s and pass them into the ``__init__`` method. See each class's 17 | method signature in the "Integrated Experiment APIs" section for details. 18 | 2. Define which parameters should be tested and which ones should be frozen in two dictionaries. Pass the 19 | dictionaries to the ``initialize`` method. 
See the ``classmethod initialize`` below for details. 20 | 21 | The ``Experiment`` superclass's shared API is below. 22 | 23 | .. autoclass:: Experiment 24 | :members: 25 | 26 | Integrated Experiment APIs 27 | ----------------------------- 28 | 29 | LLMs 30 | +++++++++++++++++++++++++++++++++++++++++ 31 | 32 | .. autoclass:: OpenAIChatExperiment 33 | 34 | .. autoclass:: OpenAICompletionExperiment 35 | 36 | .. autoclass:: AnthropicCompletionExperiment 37 | 38 | .. autoclass:: HuggingFaceHubExperiment 39 | 40 | .. autoclass:: GoogleGeminiChatCompletionExperiment 41 | 42 | .. autoclass:: GooglePaLMCompletionExperiment 43 | 44 | .. autoclass:: GoogleVertexChatCompletionExperiment 45 | 46 | .. autoclass:: MistralChatCompletionExperiment 47 | 48 | .. autoclass:: LlamaCppExperiment 49 | 50 | .. autoclass:: ReplicateExperiment 51 | 52 | Frameworks 53 | +++++++++++++++++++++++++++++++++++++++++ 54 | 55 | .. autoclass:: SequentialChainExperiment 56 | 57 | .. autoclass:: RouterChainExperiment 58 | 59 | .. autoclass:: MindsDBExperiment 60 | 61 | Vector DBs 62 | +++++++++++++++++++++++++++++++++++++++++ 63 | 64 | .. autoclass:: ChromaDBExperiment 65 | 66 | .. autoclass:: WeaviateExperiment 67 | 68 | .. autoclass:: LanceDBExperiment 69 | 70 | .. autoclass:: QdrantExperiment 71 | 72 | .. autoclass:: PineconeExperiment 73 | 74 | Computer Vision 75 | +++++++++++++++++++++++++++++++++++++++++ 76 | 77 | .. autoclass:: StableDiffusionExperiment 78 | 79 | .. autoclass:: ReplicateExperiment 80 | -------------------------------------------------------------------------------- /docs/source/harness.rst: -------------------------------------------------------------------------------- 1 | Harness 2 | =========== 3 | 4 | .. currentmodule:: prompttools.harness 5 | 6 | There are two main abstractions used in the ``prompttools`` library: Experiments and Harnesses. 7 | Occasionally, you may want to use a harness, because it abstracts away more details. 8 | 9 | A harness is built on top of an experiment, and manages abstractions over inputs. 10 | For example, the ``PromptTemplateExperimentationHarness`` freezes one set of model arguments 11 | and varies the prompt input based on prompt templates and user inputs. It then constructs 12 | a corresponding experiment, and keeps track of the templates and inputs used for each prompt. 13 | 14 | .. autoclass:: ExperimentationHarness 15 | :members: 16 | 17 | .. autoclass:: ChatHistoryExperimentationHarness 18 | 19 | .. autoclass:: ChatModelComparisonHarness 20 | 21 | .. autoclass:: ChatPromptTemplateExperimentationHarness 22 | 23 | .. autoclass:: ModelComparisonHarness 24 | 25 | .. autoclass:: MultiExperimentHarness 26 | 27 | .. autoclass:: PromptTemplateExperimentationHarness 28 | 29 | .. autoclass:: RetrievalAugmentedGenerationExperimentationHarness 30 | 31 | .. autoclass:: SystemPromptExperimentationHarness 32 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | .. prompttools documentation master file, created by 2 | sphinx-quickstart on Sun Jul 16 15:34:13 2023. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | PromptTools 7 | =========== 8 | 9 | Welcome to ``prompttools`` created by `Hegel 10 | AI `__! This 11 | `repository `_ 12 | offers a set of free, open-source tools for testing and experimenting with prompts. 
13 | The core idea is to enable developers to evaluate prompts using familiar 14 | interfaces like *code* and *notebooks*. 15 | 16 | .. image:: ../../img/demo.gif 17 | :alt: The prompttools notebook demo. 18 | :align: center 19 | 20 | There are primarily two ways you can use ``prompttools`` in your LLM workflow: 21 | 22 | 1. Run experiments in `notebooks `_ and evaluate the outputs. 23 | 2. Turn evaluations into 24 | `unit tests `_ and 25 | integrate them into your CI/CD workflow 26 | `via Github Actions `_. 27 | 28 | Please don't hesitate to star our repo, reach out, and provide feedback on GitHub! 29 | 30 | To stay in touch with us about issues and future updates, join the 31 | `Discord `__. 32 | 33 | Installation 34 | ------------ 35 | 36 | To install ``prompttools`` using pip: 37 | 38 | .. code:: bash 39 | 40 | pip install prompttools 41 | 42 | To install from source, first clone this GitHub repo to your local 43 | machine, then, from the repo, run: 44 | 45 | .. code:: bash 46 | 47 | pip install . 48 | 49 | You can then proceed to run `our examples `__. 50 | 51 | Frequently Asked Questions (FAQs) 52 | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 53 | 54 | 1. Will this library forward my LLM calls to a server before sending it 55 | to OpenAI/Anthropic/etc? 56 | 57 | - No, the source code will be executed on your machine. Any call to 58 | LLM APIs will be directly executed from your machine without any 59 | forwarding. 60 | 61 | Contributing 62 | ------------ 63 | 64 | We welcome PRs and suggestions! Don’t hesitate to open a PR/issue or to 65 | reach out to us `via email `__. Please have a 66 | look at our `contribution guide `__ and `“Help Wanted” 67 | issues `__ 68 | to get started! 69 | 70 | Usage and Feedback 71 | ------------------ 72 | 73 | We will be delighted to work with early adopters to shape our designs. 74 | Please reach out to us `via email `__ if 75 | you’re interested in using this tooling for your project or have any 76 | feedback. 77 | 78 | License 79 | ------- 80 | 81 | We will be gradually releasing more components to the open-source 82 | community. The current license can be found in the `LICENSE `__ 83 | file. If there is any concern, please `contact 84 | us `__ and we will be happy to work with you. 85 | 86 | Module Index 87 | ------- 88 | 89 | * :ref:`modindex` 90 | 91 | .. Hidden TOCs 92 | 93 | .. toctree:: 94 | :caption: Getting Started 95 | :maxdepth: 2 96 | :hidden: 97 | 98 | quickstart 99 | usage 100 | playground 101 | 102 | .. toctree:: 103 | :caption: Concepts 104 | :maxdepth: 2 105 | :hidden: 106 | 107 | experiment 108 | harness 109 | utils 110 | testing 111 | -------------------------------------------------------------------------------- /docs/source/modules.rst: -------------------------------------------------------------------------------- 1 | prompttools 2 | =========== 3 | 4 | .. toctree:: 5 | :maxdepth: 4 6 | 7 | prompttools 8 | -------------------------------------------------------------------------------- /docs/source/playground.rst: -------------------------------------------------------------------------------- 1 | Playground 2 | =========== 3 | 4 | If you want to interact with `prompttools` using our playground interface, you can launch it with the following commands. 5 | 6 | .. image:: ../../img/playground.gif 7 | :alt: The prompttools playground. 8 | :align: center 9 | 10 | First, install prompttools: 11 | 12 | .. code:: bash 13 | 14 | pip install prompttools 15 | 16 | Then, clone the git repo and launch the streamlit app: 17 | 18 | .. 
code:: bash 19 | 20 | git clone https://github.com/hegelai/prompttools.git 21 | cd prompttools && streamlit run prompttools/playground/playground.py 22 | -------------------------------------------------------------------------------- /docs/source/prompttools.experiment.experiments.rst: -------------------------------------------------------------------------------- 1 | prompttools.experiment.experiments package 2 | ========================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.experiment.experiments.anthropic\_claude\_experiment module 8 | ----------------------------------------------------------------------- 9 | 10 | .. automodule:: prompttools.experiment.experiments.anthropic_claude_experiment 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | prompttools.experiment.experiments.chromadb\_experiment module 16 | -------------------------------------------------------------- 17 | 18 | .. automodule:: prompttools.experiment.experiments.chromadb_experiment 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | prompttools.experiment.experiments.error module 24 | ----------------------------------------------- 25 | 26 | .. automodule:: prompttools.experiment.experiments.error 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | prompttools.experiment.experiments.experiment module 32 | ---------------------------------------------------- 33 | 34 | .. automodule:: prompttools.experiment.experiments.experiment 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | prompttools.experiment.experiments.google\_palm\_experiment module 40 | ------------------------------------------------------------------ 41 | 42 | .. automodule:: prompttools.experiment.experiments.google_palm_experiment 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | prompttools.experiment.experiments.huggingface\_endpoint\_experiment module 48 | --------------------------------------------------------------------------- 49 | 50 | .. automodule:: prompttools.experiment.experiments.huggingface_endpoint_experiment 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | prompttools.experiment.experiments.huggingface\_hub\_experiment module 56 | ---------------------------------------------------------------------- 57 | 58 | .. automodule:: prompttools.experiment.experiments.huggingface_hub_experiment 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | prompttools.experiment.experiments.llama\_cpp\_experiment module 64 | ---------------------------------------------------------------- 65 | 66 | .. automodule:: prompttools.experiment.experiments.llama_cpp_experiment 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | prompttools.experiment.experiments.openai\_chat\_experiment module 72 | ------------------------------------------------------------------ 73 | 74 | .. automodule:: prompttools.experiment.experiments.openai_chat_experiment 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | 79 | prompttools.experiment.experiments.openai\_completion\_experiment module 80 | ------------------------------------------------------------------------ 81 | 82 | .. automodule:: prompttools.experiment.experiments.openai_completion_experiment 83 | :members: 84 | :undoc-members: 85 | :show-inheritance: 86 | 87 | prompttools.experiment.experiments.openai\_function\_experiment module 88 | ---------------------------------------------------------------------- 89 | 90 | .. 
automodule:: prompttools.experiment.experiments.openai_function_experiment 91 | :members: 92 | :undoc-members: 93 | :show-inheritance: 94 | 95 | prompttools.experiment.experiments.vector\_database\_experiment module 96 | ---------------------------------------------------------------------- 97 | 98 | .. automodule:: prompttools.experiment.experiments.vector_database_experiment 99 | :members: 100 | :undoc-members: 101 | :show-inheritance: 102 | 103 | Module contents 104 | --------------- 105 | 106 | .. automodule:: prompttools.experiment.experiments 107 | :members: 108 | :undoc-members: 109 | :show-inheritance: 110 | -------------------------------------------------------------------------------- /docs/source/prompttools.experiment.rst: -------------------------------------------------------------------------------- 1 | prompttools.experiment package 2 | ============================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | prompttools.experiment.experiments 11 | prompttools.experiment.widgets 12 | 13 | Module contents 14 | --------------- 15 | 16 | .. automodule:: prompttools.experiment 17 | :members: 18 | :undoc-members: 19 | :show-inheritance: 20 | -------------------------------------------------------------------------------- /docs/source/prompttools.experiment.widgets.rst: -------------------------------------------------------------------------------- 1 | prompttools.experiment.widgets package 2 | ====================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.experiment.widgets.comparison module 8 | ------------------------------------------------ 9 | 10 | .. automodule:: prompttools.experiment.widgets.comparison 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | prompttools.experiment.widgets.feedback module 16 | ---------------------------------------------- 17 | 18 | .. automodule:: prompttools.experiment.widgets.feedback 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | prompttools.experiment.widgets.utility module 24 | --------------------------------------------- 25 | 26 | .. automodule:: prompttools.experiment.widgets.utility 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: prompttools.experiment.widgets 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/prompttools.harness.rst: -------------------------------------------------------------------------------- 1 | prompttools.harness package 2 | =========================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.harness.chat\_history\_harness module 8 | ------------------------------------------------- 9 | 10 | .. automodule:: prompttools.harness.chat_history_harness 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | prompttools.harness.chat\_model\_comparison\_harness module 16 | ----------------------------------------------------------- 17 | 18 | .. automodule:: prompttools.harness.chat_model_comparison_harness 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | prompttools.harness.document\_retrieval\_harness module 24 | ------------------------------------------------------- 25 | 26 | .. 
automodule:: prompttools.harness.document_retrieval_harness 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | prompttools.harness.function\_call\_harness module 32 | -------------------------------------------------- 33 | 34 | .. automodule:: prompttools.harness.function_call_harness 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | prompttools.harness.harness module 40 | ---------------------------------- 41 | 42 | .. automodule:: prompttools.harness.harness 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | prompttools.harness.multi\_experiment\_harness module 48 | ----------------------------------------------------- 49 | 50 | .. automodule:: prompttools.harness.multi_experiment_harness 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | prompttools.harness.prompt\_template\_harness module 56 | ---------------------------------------------------- 57 | 58 | .. automodule:: prompttools.harness.prompt_template_harness 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | 63 | prompttools.harness.system\_prompt\_harness module 64 | -------------------------------------------------- 65 | 66 | .. automodule:: prompttools.harness.system_prompt_harness 67 | :members: 68 | :undoc-members: 69 | :show-inheritance: 70 | 71 | Module contents 72 | --------------- 73 | 74 | .. automodule:: prompttools.harness 75 | :members: 76 | :undoc-members: 77 | :show-inheritance: 78 | -------------------------------------------------------------------------------- /docs/source/prompttools.mock.rst: -------------------------------------------------------------------------------- 1 | prompttools.mock package 2 | ======================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.mock.mock module 8 | ---------------------------- 9 | 10 | .. automodule:: prompttools.mock.mock 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: prompttools.mock 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/prompttools.prompttest.error.rst: -------------------------------------------------------------------------------- 1 | prompttools.prompttest.error package 2 | ==================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.prompttest.error.failure module 8 | ------------------------------------------- 9 | 10 | .. automodule:: prompttools.prompttest.error.failure 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | Module contents 16 | --------------- 17 | 18 | .. automodule:: prompttools.prompttest.error 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | -------------------------------------------------------------------------------- /docs/source/prompttools.prompttest.rst: -------------------------------------------------------------------------------- 1 | prompttools.prompttest package 2 | ============================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | prompttools.prompttest.error 11 | prompttools.prompttest.runner 12 | 13 | Submodules 14 | ---------- 15 | 16 | prompttools.prompttest.prompttest module 17 | ---------------------------------------- 18 | 19 | .. 
automodule:: prompttools.prompttest.prompttest 20 | :members: 21 | :undoc-members: 22 | :show-inheritance: 23 | 24 | prompttools.prompttest.threshold\_type module 25 | --------------------------------------------- 26 | 27 | .. automodule:: prompttools.prompttest.threshold_type 28 | :members: 29 | :undoc-members: 30 | :show-inheritance: 31 | 32 | Module contents 33 | --------------- 34 | 35 | .. automodule:: prompttools.prompttest 36 | :members: 37 | :undoc-members: 38 | :show-inheritance: 39 | -------------------------------------------------------------------------------- /docs/source/prompttools.prompttest.runner.rst: -------------------------------------------------------------------------------- 1 | prompttools.prompttest.runner package 2 | ===================================== 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.prompttest.runner.prompt\_template\_runner module 8 | ------------------------------------------------------------- 9 | 10 | .. automodule:: prompttools.prompttest.runner.prompt_template_runner 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | prompttools.prompttest.runner.runner module 16 | ------------------------------------------- 17 | 18 | .. automodule:: prompttools.prompttest.runner.runner 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | prompttools.prompttest.runner.system\_prompt\_runner module 24 | ----------------------------------------------------------- 25 | 26 | .. automodule:: prompttools.prompttest.runner.system_prompt_runner 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | Module contents 32 | --------------- 33 | 34 | .. automodule:: prompttools.prompttest.runner 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | -------------------------------------------------------------------------------- /docs/source/prompttools.requests.rst: -------------------------------------------------------------------------------- 1 | prompttools.requests package 2 | ============================ 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.requests.request\_queue module 8 | ------------------------------------------ 9 | 10 | .. automodule:: prompttools.requests.request_queue 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | prompttools.requests.retries module 16 | ----------------------------------- 17 | 18 | .. automodule:: prompttools.requests.retries 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | Module contents 24 | --------------- 25 | 26 | .. automodule:: prompttools.requests 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | -------------------------------------------------------------------------------- /docs/source/prompttools.rst: -------------------------------------------------------------------------------- 1 | prompttools package 2 | =================== 3 | 4 | Subpackages 5 | ----------- 6 | 7 | .. toctree:: 8 | :maxdepth: 4 9 | 10 | prompttools.experiment 11 | prompttools.harness 12 | prompttools.mock 13 | prompttools.prompttest 14 | prompttools.requests 15 | prompttools.utils 16 | 17 | Submodules 18 | ---------- 19 | 20 | prompttools.version module 21 | -------------------------- 22 | 23 | .. automodule:: prompttools.version 24 | :members: 25 | :undoc-members: 26 | :show-inheritance: 27 | 28 | Module contents 29 | --------------- 30 | 31 | .. 
automodule:: prompttools 32 | :members: 33 | :undoc-members: 34 | :show-inheritance: 35 | -------------------------------------------------------------------------------- /docs/source/prompttools.utils.rst: -------------------------------------------------------------------------------- 1 | prompttools.utils package 2 | ========================= 3 | 4 | Submodules 5 | ---------- 6 | 7 | prompttools.utils.autoeval module 8 | --------------------------------- 9 | 10 | .. automodule:: prompttools.utils.autoeval 11 | :members: 12 | :undoc-members: 13 | :show-inheritance: 14 | 15 | prompttools.utils.error module 16 | ------------------------------ 17 | 18 | .. automodule:: prompttools.utils.error 19 | :members: 20 | :undoc-members: 21 | :show-inheritance: 22 | 23 | prompttools.utils.expected module 24 | --------------------------------- 25 | 26 | .. automodule:: prompttools.utils.expected 27 | :members: 28 | :undoc-members: 29 | :show-inheritance: 30 | 31 | prompttools.utils.json module 32 | ----------------------------- 33 | 34 | .. automodule:: prompttools.utils.json 35 | :members: 36 | :undoc-members: 37 | :show-inheritance: 38 | 39 | prompttools.utils.python module 40 | ------------------------------- 41 | 42 | .. automodule:: prompttools.utils.python 43 | :members: 44 | :undoc-members: 45 | :show-inheritance: 46 | 47 | prompttools.utils.similarity module 48 | ----------------------------------- 49 | 50 | .. automodule:: prompttools.utils.similarity 51 | :members: 52 | :undoc-members: 53 | :show-inheritance: 54 | 55 | Module contents 56 | --------------- 57 | 58 | .. automodule:: prompttools.utils 59 | :members: 60 | :undoc-members: 61 | :show-inheritance: 62 | -------------------------------------------------------------------------------- /docs/source/quickstart.rst: -------------------------------------------------------------------------------- 1 | Quickstart 2 | =========== 3 | 4 | To install ``prompttools``, you can use ``pip``: 5 | 6 | :: 7 | 8 | pip install prompttools 9 | 10 | You can run a simple ``prompttools`` example locally with the following 11 | 12 | :: 13 | 14 | git clone https://github.com/hegelai/prompttools.git 15 | cd prompttools && jupyter notebook examples/notebooks/OpenAIChatExperiment.ipynb 16 | 17 | There are many `notebook examples `_ that 18 | you can explore. The README on that page gives you a brief description of the examples. 19 | 20 | If you prefer to have a UI instead of using Jupyter notebooks, have a look at 21 | our `Playground <./playground.html>`_. 22 | -------------------------------------------------------------------------------- /docs/source/setup.rst: -------------------------------------------------------------------------------- 1 | setup module 2 | ============ 3 | 4 | .. automodule:: setup 5 | :members: 6 | :undoc-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/source/testing.rst: -------------------------------------------------------------------------------- 1 | Testing and CI/CD 2 | ================= 3 | 4 | .. currentmodule:: prompttools.prompttest 5 | 6 | After identifying the right evaluation/validation function for the outputs, you 7 | can easily create unit tests and add them to your CI/CD workflow. 8 | 9 | Unit tests in ``prompttools`` are called ``prompttests``. They use the ``@prompttest`` annotation to transform a 10 | completion function into an efficient unit test.
The ``prompttest`` framework executes and evaluates experiments 11 | so you can test prompts over time. For example: 12 | 13 | .. code-block:: python 14 | 15 | import prompttools.prompttest as prompttest 16 | 17 | @prompttest.prompttest( 18 | metric_name="is_valid_json", 19 | eval_fn=validate_json.evaluate, 20 | prompts=[create_json_prompt()], 21 | ) 22 | def json_completion_fn(prompt: str): 23 | response = None 24 | if os.getenv("DEBUG", default=False): 25 | response = mock_openai_completion_fn(**{"prompt": prompt}) 26 | else: 27 | response = openai.completions.create(model="babbage-002", prompt=prompt) 28 | return response.choices[0].text 29 | 30 | 31 | In the file, be sure to call the ``main()`` method of ``prompttest`` like you would for ``unittest``. 32 | 33 | .. code-block:: python 34 | 35 | if __name__ == "__main__": 36 | prompttest.main() 37 | -------------------------------------------------------------------------------- /docs/source/usage.rst: -------------------------------------------------------------------------------- 1 | Using the library 2 | ================= 3 | 4 | There are primarily two ways you can use ``prompttools`` in your LLM 5 | workflow: 6 | 7 | 1. Run experiments in `notebooks `__. 8 | 2. Write `unit tests `__ and 9 | integrate them into your CI/CD workflow `via GitHub 10 | Actions `__. 11 | 12 | Notebooks 13 | ------------ 14 | 15 | There are a few different ways to run an experiment in a notebook. 16 | 17 | The simplest way is to define an experimentation harness and an 18 | evaluation function: 19 | 20 | .. code:: python 21 | 22 | from prompttools.harness import PromptTemplateExperimentationHarness 23 | 24 | 25 | def eval_fn(prompt: str, results: Dict, metadata: Dict) -> float: 26 | # Your logic here, or use a built-in one such as `prompttools.utils.similarity`. 27 | pass 28 | 29 | prompt_templates = [ 30 | "Answer the following question: {{input}}", 31 | "Respond to the following query: {{input}}" 32 | ] 33 | 34 | user_inputs = [ 35 | {"input": "Who was the first president?"}, 36 | {"input": "Who was the first president of India?"} 37 | ] 38 | 39 | harness = PromptTemplateExperimentationHarness("text-davinci-003", 40 | prompt_templates, 41 | user_inputs) 42 | 43 | 44 | harness.run() 45 | harness.evaluate("metric_name", eval_fn) 46 | harness.visualize() # The results will be displayed as a table in your notebook 47 | 48 | .. figure:: ../img/table.png 49 | :alt: The visualized table in your notebook. 50 | 51 | If you are interested in comparing different models, the `ModelComparison 52 | example `__ may be of 53 | interest. 54 | 55 | For an example of a built-in evaluation function, please see this example 56 | of `semantic similarity 57 | comparison `__ for 58 | details. 59 | 60 | You can also manually enter feedback to evaluate prompts; see 61 | `HumanFeedback.ipynb `__. 62 | 63 | .. figure:: ../img/feedback.png 64 | :alt: You can annotate feedback directly within the notebook. 65 | 66 | .. 67 | 68 | Note: Above we used an ``ExperimentationHarness``. Under the hood, 69 | that harness uses an ``Experiment`` to construct and make API calls 70 | to LLMs. The harness is responsible for managing higher level 71 | abstractions, like prompt templates or system prompts. To see how 72 | experiments work at a low level, `see this 73 | example `__. 74 | 75 | Unit Tests 76 | ------------ 77 | 78 | Unit tests in ``prompttools`` are called ``prompttests``. They use the ``@prompttest`` annotation to transform a 79 | completion function into an efficient unit test.
The ``prompttest`` framework executes and evaluates experiments 80 | so you can test prompts over time. For example: 81 | 82 | .. code-block:: python 83 | 84 | import prompttools.prompttest as prompttest 85 | 86 | @prompttest.prompttest( 87 | metric_name="is_valid_json", 88 | eval_fn=validate_json.evaluate, 89 | prompts=[create_json_prompt()], 90 | ) 91 | def json_completion_fn(prompt: str): 92 | response = None 93 | if os.getenv("DEBUG", default=False): 94 | response = mock_openai_completion_fn(**{"prompt": prompt}) 95 | else: 96 | response = openai.completions.create(prompt) 97 | return response.choices[0].text 98 | 99 | 100 | The evaluation functions should accept one of the following as it's parameters: 101 | 102 | * ``input_pair: Tuple[str, Dict[str, str]], results: Dict, metadata: Dict`` 103 | * ``prompt: str, results: Dict, metadata: Dict`` 104 | * ``messages: List[Dict[str,str], results: Dict, metadata: Dict`` 105 | 106 | 107 | You can see an example 108 | test `here `__ and an example 109 | of that test being used as a Github Action 110 | `here `__. 111 | -------------------------------------------------------------------------------- /docs/source/utils.rst: -------------------------------------------------------------------------------- 1 | Evaluation and Validation 2 | =========== 3 | 4 | .. currentmodule:: prompttools.utils 5 | 6 | These built-in functions help you to evaluate the outputs of your experiments. 7 | They can also be used with ``prompttest`` for be part of your CI/CD system. 8 | 9 | .. autofunction:: prompttools.utils.autoeval_binary_scoring 10 | 11 | .. autofunction:: prompttools.utils.autoeval_scoring 12 | 13 | .. autofunction:: prompttools.utils.autoeval_with_documents 14 | 15 | .. autofunction:: prompttools.utils.chunk_text 16 | 17 | .. autofunction:: prompttools.utils.compute_similarity_against_model 18 | 19 | .. autofunction:: prompttools.utils.apply_moderation 20 | 21 | .. autofunction:: prompttools.utils.ranking_correlation 22 | 23 | .. autofunction:: prompttools.utils.validate_json_response 24 | 25 | .. autofunction:: prompttools.utils.validate_json.validate_keys 26 | 27 | .. autofunction:: prompttools.utils.validate_python_response 28 | 29 | .. autofunction:: prompttools.utils.semantic_similarity 30 | 31 | .. autofunction:: prompttools.utils.structural_similarity 32 | 33 | .. autofunction:: prompttools.utils.similarity.compute 34 | -------------------------------------------------------------------------------- /examples/notebooks/GoogleGeminiChatExperiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Google Gemini Chat Experiment Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Installations" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !pip install --quiet --force-reinstall prompttools" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Setup imports and API keys" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In order for the Google GenAI API to work, you must set up your Google AI Studio credentials (one example in the following cell) or execute this experiment on Google Colab.\n", 38 | "\n", 39 | "Executing on Google Colab may require the least amount of set-up." 
40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "import google.generativeai as genai\n", 49 | "\n", 50 | "from google.colab import userdata\n", 51 | "\n", 52 | "GOOGLE_API_KEY = \"\" # You can manually set your key\n", 53 | "# GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY') # Or, you can read it from your account\n", 54 | "\n", 55 | "genai.configure(api_key=GOOGLE_API_KEY)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "Once you succeed in setting up your credential, you should be able to execute the following cell without error and see the list of models you have access to." 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": null, 68 | "metadata": {}, 69 | "outputs": [], 70 | "source": [ 71 | "for m in genai.list_models():\n", 72 | " if 'generateContent' in m.supported_generation_methods:\n", 73 | " print(m.name)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Then we'll import the relevant `prompttools` modules to setup our experiment." 81 | ] 82 | }, 83 | { 84 | "cell_type": "markdown", 85 | "metadata": { 86 | "ExecuteTime": { 87 | "end_time": "2023-07-28T21:15:15.360723Z", 88 | "start_time": "2023-07-28T21:15:15.230441Z" 89 | }, 90 | "collapsed": true, 91 | "jupyter": { 92 | "outputs_hidden": true 93 | } 94 | }, 95 | "source": [ 96 | "## Run an experiment" 97 | ] 98 | }, 99 | { 100 | "cell_type": "markdown", 101 | "metadata": {}, 102 | "source": [ 103 | "Next, we create our test inputs. We can iterate over models (`\"gemini-pro\"` in this case, you can also use the ultra model if you have access to it), contents (equivalent of prompt). You can also experiment with configurations like temperature using `generation_config` or `safety_settings`.\n" 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 31, 109 | "metadata": {}, 110 | "outputs": [], 111 | "source": [ 112 | "from prompttools.experiment import GoogleVertexChatCompletionExperiment\n", 113 | "\n", 114 | "model = ['gemini-pro']\n", 115 | "contents = [\"What is the meaning of life?\", \"Who was the first president?\"]\n", 116 | "\n", 117 | "experiment = GoogleVertexChatCompletionExperiment(model=model, contents=contents)" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "experiment.run()\n", 127 | "experiment.visualize()" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## Evaluate the model response" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "Please reference other notebooks (such as Google PaLM 2, Anthropic) for detailed evaluation of the model's response." 
142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": null, 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [] 150 | } 151 | ], 152 | "metadata": { 153 | "kernelspec": { 154 | "display_name": "Python 3 (ipykernel)", 155 | "language": "python", 156 | "name": "python3" 157 | }, 158 | "language_info": { 159 | "codemirror_mode": { 160 | "name": "ipython", 161 | "version": 3 162 | }, 163 | "file_extension": ".py", 164 | "mimetype": "text/x-python", 165 | "name": "python", 166 | "nbconvert_exporter": "python", 167 | "pygments_lexer": "ipython3", 168 | "version": "3.11.7" 169 | } 170 | }, 171 | "nbformat": 4, 172 | "nbformat_minor": 4 173 | } 174 | -------------------------------------------------------------------------------- /examples/notebooks/GoogleVertexChatExperiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Google Vertex Chat Experiment Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "## Installations" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# !pip install --quiet --force-reinstall prompttools" 24 | ] 25 | }, 26 | { 27 | "cell_type": "markdown", 28 | "metadata": {}, 29 | "source": [ 30 | "## Setup imports and API keys" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "In order for the Google Vertex AI API to work, you must set up your Google aiplatform credentials (one example in the following cell) or execute this experiment on https://console.cloud.google.com/vertex-ai/." 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 5, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "# from google.cloud import aiplatform\n", 47 | "\n", 48 | "# aiplatform.init(\n", 49 | "# project=project,\n", 50 | "# location=location,\n", 51 | "# experiment=experiment,\n", 52 | "# staging_bucket=staging_bucket,\n", 53 | "# credentials=credentials,\n", 54 | "# encryption_spec_key_name=encryption_spec_key_name,\n", 55 | "# service_account=service_account,\n", 56 | "# )" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Once you succeed in setting up your credential, you should be able to execute the following cell without error." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": null, 69 | "metadata": {}, 70 | "outputs": [], 71 | "source": [ 72 | "from vertexai.preview.language_models import ChatModel, InputOutputTextPair\n", 73 | "\n", 74 | "chat_model = ChatModel.from_pretrained(\"chat-bison\")" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "Then we'll import the relevant `prompttools` modules to setup our experiment." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "ExecuteTime": { 88 | "end_time": "2023-07-28T21:15:15.360723Z", 89 | "start_time": "2023-07-28T21:15:15.230441Z" 90 | }, 91 | "collapsed": true 92 | }, 93 | "source": [ 94 | "## Run an experiment" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Next, we create our test inputs. We can iterate over models (`\"chat-bison\"` in this case), context (equivalent of system message), message (equivalent of prompt). 
You can also experiment with configurations like temperature.\n" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 31, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "from prompttools.experiment import GoogleVertexChatCompletionExperiment\n", 111 | "\n", 112 | "\n", 113 | "model = [\"chat-bison\"]\n", 114 | "\n", 115 | "context = [\"You are a helpful assistant.\",\n", 116 | " \"Answer the following question only if you know the answer or can make a well-informed guess; otherwise tell me you don't know it. In addition, explain your reasoning of your final answer.\"]\n", 117 | "\n", 118 | "message = [\n", 119 | " \"Is 97 a prime number?\",\n", 120 | " \"Is 17077 a prime number?\",\n", 121 | "]\n", 122 | "\n", 123 | "experiment = GoogleVertexChatCompletionExperiment(model=model, message=message, context=context)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [ 132 | "experiment.run()\n", 133 | "experiment.visualize()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Evaluate the model response" 141 | ] 142 | }, 143 | { 144 | "cell_type": "markdown", 145 | "metadata": {}, 146 | "source": [ 147 | "Please reference other notebooks (such as Google PaLM 2, Anthropic) for detailed evaluation of the model's response." 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": "Python 3 (ipykernel)", 154 | "language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.11.4" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 1 172 | } 173 | -------------------------------------------------------------------------------- /examples/notebooks/README.md: -------------------------------------------------------------------------------- 1 | ## Notebook Examples 2 | 3 | In this folder, you will find various examples of how you can use `prompttools` for 4 | various experimentation and testing. Often, you can simply change a few parameters 5 | and put in your own test data to make `prompttools` suitable for your use case. 6 | 7 | If you have additional use case in mind or spot an issue, please open an issue 8 | and we will be happy to discuss. 9 | 10 | We also welcome community contribution of usage examples! Please open a PR if you 11 | have something to share. 12 | 13 | ### LLM 14 | 15 | #### Single Model Examples 16 | - [OpenAI Chat Experiment](OpenAIChatExperiment.ipynb) shows how you can experiment with OpenAI with different models and parameters. 17 | - [OpenAI Chat Function Experiment](OpenAIChatFunctionExperiment.ipynb) shows how you can experiment with OpenAI's function calling API. 18 | - [Anthropic Experiment](AnthropicExperiment.ipynb) shows how you can experiment with Anthropic Claude with different models and parameters. 19 | - [Google PaLM 2 Text Completion](PaLM2Experiment.ipynb) 20 | and [Google Vertex AI Chat Completion](GoogleVertexChatExperiment.ipynb) utilizes Google's LLM models. 21 | - [LLaMA Cpp Experiment](LlamaCppExperiment.ipynb) executes LLaMA locally with various parameters and see how it does. 
23 | - [HuggingFace Hub](HuggingFaceHub.ipynb) compares different OSS models hosted on HuggingFace. 24 | - [GPT-4 Regression](GPT4RegressionTesting.ipynb) examines how the current GPT-4 model compares with older, frozen versions. 25 | 26 | #### Head To Head Model Comparison 27 | 28 | - [Model Comparison](ModelComparison.ipynb) shows how you can compare two OpenAI models. 29 | - [GPT4 vs LLaMA2](GPT4vsLlama2.ipynb) allows you to understand whether LLaMA might be enough for your use case. 30 | - [LLaMA Head To Head](LlamaHeadToHead.ipynb) presents a match-up between LLaMA 1 and LLaMA 2! 31 | 32 | #### Evaluation 33 | - [Auto Evaluation](AutoEval.ipynb) presents an example of how you can use another LLM to evaluate responses. 34 | - [Structured Output](StructuredOutput.ipynb) validates that the model outputs adhere to your desired structured format. 35 | - [Semantic Similarity](SemanticSimilarity.ipynb) evaluates your model outputs compared to ideal outputs. 36 | - [Human Feedback](HumanFeedback.ipynb) allows you to provide human feedback to your outputs. 37 | 38 | 39 | ### Vector Databases 40 | 41 | - [Retrieval Augmented Generation](vectordb_experiments/RetrievalAugmentedGeneration.ipynb) combines a vector database 42 | experiment with an LLM to evaluate the whole RAG process. 43 | - [ChromaDB Experiment](vectordb_experiments/ChromaDBExperiment.ipynb) demonstrates how to experiment with different 44 | embedding functions and query parameters of `Chroma`. The example evaluates the results by computing the 45 | ranking correlation against an expected output. 46 | - [Weaviate Experiment](vectordb_experiments/WeaviateExperiment.ipynb) shows how you can easily try different vectorizers, configurations, 47 | and query functions, and compare the final results. 48 | - [LanceDB Experiment](vectordb_experiments/LanceDBExperiment.ipynb) allows you to try different embedding functions and query methods. 49 | - [Qdrant Experiment](vectordb_experiments/QdrantExperiment.ipynb) explores different ways to query Qdrant, including with vectors. 50 | - [Pinecone Experiment](vectordb_experiments/PineconeExperiment.ipynb) looks into different ways to add data into and query from Pinecone.
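The notebooks listed above share the same basic flow: construct an experiment with *lists* of argument values, call `run()` to execute every combination of those arguments, optionally call `evaluate()` with a metric, and then `visualize()` the results. A minimal sketch of that flow is below; the model name, message, and temperature values are illustrative placeholders rather than settings from any particular notebook.

```python
from prompttools.experiment import OpenAIChatExperiment

# Every argument is a list; the experiment runs the cartesian product of all combinations.
messages = [[{"role": "user", "content": "Who was the first president?"}]]

experiment = OpenAIChatExperiment(
    model=["gpt-3.5-turbo"],   # one or more model names to compare
    messages=messages,         # one or more chat message lists
    temperature=[0.0, 1.0],    # illustrative values to compare
)

experiment.run()        # execute each argument combination
experiment.visualize()  # display the results table in the notebook
```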
51 | 52 | ### Frameworks 53 | 54 | - [LangChain Sequential Chain Experiment](frameworks/LangChainSequentialChainExperiment.ipynb) 55 | - [LangChain Router Chain Experiment](frameworks/LangChainRouterChainExperiment.ipynb) 56 | - [MindsDB Experiment](frameworks/MindsDBExperiment.ipynb) 57 | 58 | ### Computer Vision 59 | - [Stable Diffusion](image_experiments/StableDiffusion.ipynb) 60 | - [Replicate's hosted Stable Diffusion](image_experiments/ReplicateStableDiffusion.ipynb) 61 | -------------------------------------------------------------------------------- /examples/notebooks/audio_experiments/sample_audio_files/80s_billy_joel.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/examples/notebooks/audio_experiments/sample_audio_files/80s_billy_joel.wav -------------------------------------------------------------------------------- /examples/notebooks/frameworks/LangChainRouterChainExperiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Lang Chain Router Chain Experiment Example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [ 15 | { 16 | "ename": "ImportError", 17 | "evalue": "cannot import name 'RouterChainExperiment' from 'prompttools.experiment' (/home/hashem/.local/lib/python3.10/site-packages/prompttools/experiment/__init__.py)", 18 | "output_type": "error", 19 | "traceback": [ 20 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 21 | "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)", 22 | "Input \u001b[0;32mIn [1]\u001b[0m, in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mprompttools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mexperiment\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m RouterChainExperiment\n\u001b[1;32m 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mlangchain\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mllms\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m OpenAI\n\u001b[1;32m 4\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mprompttools\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mutils\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m semantic_similarity\n", 23 | "\u001b[0;31mImportError\u001b[0m: cannot import name 'RouterChainExperiment' from 'prompttools.experiment' (/home/hashem/.local/lib/python3.10/site-packages/prompttools/experiment/__init__.py)" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "from prompttools.experiment import RouterChainExperiment\n", 29 | "from langchain.llms import OpenAI\n", 30 | "\n", 31 | "from prompttools.utils import semantic_similarity\n", 32 | "\n", 33 | "\n", 34 | "cooking_template = \"\"\"You are a well versed chef. \\\n", 35 | "You enjoy baking bread, \\\n", 36 | "explain how to make good sushi.\n", 37 | "\n", 38 | "Here is a question:\n", 39 | "{input}\"\"\"\n", 40 | "\n", 41 | "\n", 42 | "restaurant_template = \"\"\"You are a fantastic restaurateur. 
\\\n", 43 | "You specialize in restaurant operations in New York City.\n", 44 | "\n", 45 | "Here is a question:\n", 46 | "{input}\"\"\"\n", 47 | "\n", 48 | "prompt_infos = [\n", 49 | " [\n", 50 | " {\n", 51 | " \"name\": \"cooking\",\n", 52 | " \"description\": \"Good for answering questions about cooking\",\n", 53 | " \"prompt_template\": cooking_template,\n", 54 | " },\n", 55 | " {\n", 56 | " \"name\": \"restaurant\",\n", 57 | " \"description\": \"Good for building a restaurant\",\n", 58 | " \"prompt_template\": restaurant_template,\n", 59 | " },\n", 60 | " ],\n", 61 | "]\n", 62 | "\n", 63 | "\n", 64 | "experiment = RouterChainExperiment(\n", 65 | " llm=[OpenAI],\n", 66 | " prompt=[\"How can I make a delicious smoothie?\"],\n", 67 | " prompt_infos=prompt_infos,\n", 68 | " **{\n", 69 | " \"temperature\": [0.1, 0.9],\n", 70 | " },\n", 71 | ")\n", 72 | "\n", 73 | "expected = \"\"\"You will need fruit of your choice, a blender, and some ice. Place about one cup of fruit in the blender.\n", 74 | "Place about a quarter cup of ice in the blender. Blend to your desired consistency.\"\"\"\n", 75 | "\n", 76 | "experiment.run()\n", 77 | "\n", 78 | "experiment.evaluate(\"similar_to_expected\", semantic_similarity, expected=[expected] * 2)\n", 79 | "\n", 80 | "experiment.visualize()" 81 | ] 82 | } 83 | ], 84 | "metadata": { 85 | "kernelspec": { 86 | "display_name": "Python 3 (ipykernel)", 87 | "language": "python", 88 | "name": "python3" 89 | }, 90 | "language_info": { 91 | "codemirror_mode": { 92 | "name": "ipython", 93 | "version": 3 94 | }, 95 | "file_extension": ".py", 96 | "mimetype": "text/x-python", 97 | "name": "python", 98 | "nbconvert_exporter": "python", 99 | "pygments_lexer": "ipython3", 100 | "version": "3.10.12" 101 | }, 102 | "vscode": { 103 | "interpreter": { 104 | "hash": "eec05f12730ef3ef66f433616fcd3cfdacd3dcf1f1c49c706eaa0465be8f325b" 105 | } 106 | } 107 | }, 108 | "nbformat": 4, 109 | "nbformat_minor": 2 110 | } 111 | -------------------------------------------------------------------------------- /examples/notebooks/image_experiments/compare_images_folder/A panda writing code in the Swiss Alps3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/examples/notebooks/image_experiments/compare_images_folder/A panda writing code in the Swiss Alps3.png -------------------------------------------------------------------------------- /examples/notebooks/image_experiments/compare_images_folder/A_fruit_basket_on_the_moon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/examples/notebooks/image_experiments/compare_images_folder/A_fruit_basket_on_the_moon.png -------------------------------------------------------------------------------- /examples/notebooks/image_experiments/stablediffusion_images/An_apple_orchard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/examples/notebooks/image_experiments/stablediffusion_images/An_apple_orchard.png -------------------------------------------------------------------------------- /examples/notebooks/image_experiments/stablediffusion_images/Just_a_fruit_basket.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/examples/notebooks/image_experiments/stablediffusion_images/Just_a_fruit_basket.png -------------------------------------------------------------------------------- /examples/prompttests/test_chromadb.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from typing import Dict, Tuple 9 | import prompttools.prompttest as prompttest 10 | from prompttools.utils import similarity 11 | from prompttools.experiment import ChromaDBExperiment 12 | 13 | EXPECTED = {"Who was the first president of the USA?": "George Washington"} 14 | 15 | if not (("CHROMADB_API_TOKEN" in os.environ) or ("DEBUG" in os.environ)): # placeholder api naming 16 | print("Error: This example requires you to set either your CHROMADB_API_TOKEN or DEBUG=1") 17 | exit(1) 18 | 19 | 20 | def extract_chromadb_dists(output: Dict[str, object]) -> list[str]: 21 | return output 22 | 23 | 24 | def measure_fn(): # TODO: Do we want to build a separate framework from prompttest that handles vectors? 25 | pass 26 | -------------------------------------------------------------------------------- /examples/prompttests/test_huggingface_hub.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import jinja2 9 | import prompttools.prompttest as prompttest 10 | from prompttools.utils import similarity 11 | from prompttools.prompttest.threshold_type import ThresholdType 12 | from prompttools.mock.mock import mock_hf_completion_fn 13 | from huggingface_hub.inference_api import InferenceApi 14 | 15 | if not (("HUGGINGFACEHUB_API_TOKEN" in os.environ) or ("DEBUG" in os.environ)): 16 | print("Error: This example requires you to set either your HUGGINGFACEHUB_API_TOKEN or DEBUG=1") 17 | exit(1) 18 | 19 | 20 | client = InferenceApi( 21 | repo_id="google/flan-t5-xxl", 22 | token=os.environ.get("HUGGINGFACEHUB_API_TOKEN"), 23 | task="text2text-generation", 24 | ) 25 | 26 | 27 | def create_prompt(): 28 | prompt_template = "Answer the following question: {{ input }}" 29 | user_input = {"input": "Who was the first president of the USA?"} 30 | environment = jinja2.Environment() 31 | template = environment.from_string(prompt_template) 32 | return template.render(**user_input) 33 | 34 | 35 | @prompttest.prompttest( 36 | metric_name="similar_to_expected", 37 | eval_fn=similarity.evaluate, 38 | prompts=[create_prompt()], 39 | expected=["George Washington"], 40 | threshold=1.0, 41 | threshold_type=ThresholdType.MAXIMUM, 42 | ) 43 | def completion_fn(prompt: str): 44 | response = None 45 | if os.getenv("DEBUG", default=False): 46 | response = mock_hf_completion_fn(**{"inputs": prompt}) 47 | else: 48 | response = client(inputs=prompt) 49 | return response[0]["generated_text"] 50 | 51 | 52 | if __name__ == "__main__": 53 | prompttest.main() 54 | -------------------------------------------------------------------------------- /examples/prompttests/test_openai_chat.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 
3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import openai 9 | import jinja2 10 | from prompttools import prompttest 11 | from prompttools.prompttest.threshold_type import ThresholdType 12 | from prompttools.utils import similarity 13 | from prompttools.utils import validate_json 14 | from prompttools.mock.mock import mock_openai_completion_fn 15 | 16 | 17 | if not (("OPENAI_API_KEY" in os.environ) or ("DEBUG" in os.environ)): 18 | print("Error: This example requires you to set either your OPENAI_API_KEY or DEBUG=1") 19 | exit(1) 20 | 21 | 22 | def create_json_prompt(): 23 | prompt_template = "Answer the following question using a valid JSON format: {{ input }}" 24 | user_input = {"input": "Who was the first president?"} 25 | environment = jinja2.Environment() 26 | template = environment.from_string(prompt_template) 27 | return template.render(**user_input) 28 | 29 | 30 | def create_prompt(): 31 | prompt_template = "Answer the following question: {{ input }}" 32 | user_input = {"input": "Who was the first president of the USA?"} 33 | environment = jinja2.Environment() 34 | template = environment.from_string(prompt_template) 35 | return template.render(**user_input) 36 | 37 | 38 | @prompttest.prompttest( 39 | metric_name="is_valid_json", 40 | eval_fn=validate_json.evaluate, 41 | prompts=[create_json_prompt()], 42 | ) 43 | def json_completion_fn(prompt: str): 44 | if os.getenv("DEBUG", default=False): 45 | response = mock_openai_completion_fn(**{"prompt": prompt}) 46 | else: 47 | response = openai.completions.create(model="babbage-002", prompt=prompt) 48 | return response.choices[0].text 49 | 50 | 51 | @prompttest.prompttest( 52 | metric_name="similar_to_expected", 53 | eval_fn=similarity.evaluate, 54 | prompts=[create_prompt()], 55 | expected=["George Washington"], 56 | threshold=1.0, 57 | threshold_type=ThresholdType.MAXIMUM, 58 | ) 59 | def completion_fn(prompt: str): 60 | if os.getenv("DEBUG", default=False): 61 | response = mock_openai_completion_fn(**{"prompt": prompt}) 62 | else: 63 | response = openai.completions.create(model="babbage-002", prompt=prompt) 64 | return response.choices[0].text 65 | 66 | 67 | if __name__ == "__main__": 68 | prompttest.main() 69 | -------------------------------------------------------------------------------- /examples/prompttests/test_qdrant.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | from typing import List 3 | 4 | from qdrant_client import QdrantClient 5 | 6 | from prompttools.experiment import QdrantExperiment 7 | 8 | 9 | def embedding_function(text: str) -> List[float]: 10 | r""" 11 | Create vector embedding from text. This is a dummy function for testing purposes 12 | and returns a vector of 16 floats. 
13 | 14 | Args: 15 | text (str): Text to be vectorized 16 | Returns: 17 | List[float]: Vector embedding of the text 18 | """ 19 | import numpy as np 20 | import struct 21 | 22 | vectorized_text = np.abs( 23 | np.array(struct.unpack(">ffffffffffffffff", hashlib.sha512(text.encode("utf-8")).digest())) 24 | ) 25 | normalized_vector = vectorized_text / np.linalg.norm(vectorized_text) 26 | return normalized_vector.tolist() 27 | 28 | 29 | test_parameters = { 30 | "collection_params": { 31 | "vectors_config__distance": ["Cosine", "Euclid", "Dot"], 32 | "hnsw_config__m": [16, 32, 64, 128], 33 | }, 34 | "query_params": { 35 | "search_params__hnsw_ef": [1, 16, 32, 64, 128], 36 | "search_params__exact": [True, False], 37 | }, 38 | } 39 | frozen_parameters = { 40 | # Run Qdrant server locally with: 41 | # docker run -p "6333:6333" -p "6334:6334" qdrant/qdrant:v1.4.0 42 | "client": QdrantClient("http://localhost:6333"), 43 | "collection_name": "test_collection", 44 | "embedding_fn": embedding_function, 45 | "vector_size": 16, 46 | "documents": ["test document 1", "test document 2"], 47 | "queries": ["test query 1", "test query 2"], 48 | } 49 | experiment = QdrantExperiment.initialize(test_parameters=test_parameters, frozen_parameters=frozen_parameters) 50 | experiment.run() 51 | 52 | print(experiment.get_table(True)) 53 | -------------------------------------------------------------------------------- /img/demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/img/demo.gif -------------------------------------------------------------------------------- /img/feedback.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/img/feedback.png -------------------------------------------------------------------------------- /img/hegel_ai_logo.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /img/hegel_ai_logo_dark.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | -------------------------------------------------------------------------------- /img/playground.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/img/playground.gif -------------------------------------------------------------------------------- /img/prompttest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/img/prompttest.png -------------------------------------------------------------------------------- /img/table.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/img/table.png -------------------------------------------------------------------------------- /prompttools/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 
2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .prompttest import prompttest 8 | from .sentry import init_sentry 9 | 10 | 11 | init_sentry() 12 | 13 | __all__ = ["prompttest"] 14 | -------------------------------------------------------------------------------- /prompttools/benchmarks/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from .benchmark import Benchmark 8 | 9 | 10 | __all__ = [ 11 | "Benchmark", 12 | ] 13 | -------------------------------------------------------------------------------- /prompttools/benchmarks/benchmark.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Any, Callable, List, Optional 8 | import pandas as pd 9 | import warnings 10 | 11 | 12 | class Benchmark: 13 | r""" 14 | Benchmark models using defined data sets. 15 | Find example under benchmarks/examples/benchmarking.ipynb. 16 | 17 | Args: 18 | ---- 19 | experiment (experiment type): experiment to use 20 | eval_methods (Callable): list of evaluation methods to measure response similarity 21 | prompts (list(str)): list of queries, questions, prompts for LLMs to respond to 22 | response_options (list(str)): possible responses to measure against 23 | correct_response_indices (list(int)): list of index of correct response in response_options 24 | """ 25 | 26 | def __init__( 27 | self, 28 | experiment: Any, 29 | eval_method: Callable, 30 | prompts: List[str], 31 | response_options: List[Any], 32 | correct_response_indices: Optional[List[int]] = None, 33 | ): 34 | self.experiment = experiment 35 | self.eval_method = eval_method 36 | self.prompts = prompts 37 | self.response_options = response_options 38 | self.correct_response_indices = correct_response_indices 39 | 40 | def _get_precision( 41 | self, 42 | dataframe: pd.DataFrame, 43 | pred_col: str, 44 | label_col: str, 45 | ) -> float: 46 | r""" 47 | Calculate precision. 48 | """ 49 | # TODO: coming soon 50 | pass 51 | 52 | def multiple_choice_accuracy( 53 | self, 54 | dataframe: pd.DataFrame, 55 | col1: str, 56 | col2: str, 57 | ) -> float: 58 | r""" 59 | Benchmark LLM accuracy on multiple choice 60 | prompt endings. 61 | """ 62 | correct = 0 63 | for _, row in dataframe.iterrows(): 64 | if row[col1] == row[col2]: 65 | correct += 1 66 | return correct / len(dataframe) 67 | 68 | def multiple_choice_benchmark( 69 | self, 70 | ) -> Any: 71 | r""" 72 | Run model experiments to measure response quality. 73 | """ 74 | self.experiment.run() 75 | 76 | if "prompt" not in self.experiment.full_df.columns: 77 | # Assume messages column is in place of prompt 78 | self.experiment.full_df["prompt"] = self.experiment.full_df["messages"].map(lambda x: str(x)) 79 | warnings.warn("Column 'prompt' does not exist. 
Using column 'messages' instead.", UserWarning, stacklevel=2) 80 | # Get option with highest similarity to LLM response 81 | benchmark_df = self.experiment.full_df[["prompt", "response"]] 82 | benchmark_df["response_options"] = self.response_options 83 | benchmark_df = benchmark_df.explode(column="response_options").reset_index() 84 | scores = [] 85 | for _, row in benchmark_df.iterrows(): 86 | scores.append(self.eval_method(row=row, expected=row["response_options"])) 87 | benchmark_df["scores"] = scores 88 | benchmark_df["max_value"] = benchmark_df.groupby("prompt")["scores"].transform("max") 89 | benchmark_df = benchmark_df[benchmark_df["scores"] == benchmark_df["max_value"]] 90 | benchmark_df = benchmark_df.sort_index() 91 | # Colect model choices 92 | model_choice = [] 93 | for i, choice in enumerate(benchmark_df["response_options"].values): 94 | model_choice.append(self.response_options[i].index(choice)) 95 | benchmark_df["model_choice"] = model_choice 96 | benchmark_df["labels"] = self.correct_response_indices 97 | return self.multiple_choice_accuracy(benchmark_df, "model_choice", "labels") 98 | -------------------------------------------------------------------------------- /prompttools/common.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | from os.path import join, dirname 10 | 11 | try: 12 | from dotenv import load_dotenv 13 | except ImportError: 14 | load_dotenv = None 15 | 16 | if load_dotenv is not None: 17 | dotenv_path = join(dirname(dirname(__file__)), ".env") 18 | load_dotenv(dotenv_path) 19 | 20 | 21 | ENV = os.environ.get("ENV", "prod") 22 | if ENV == "development": 23 | HEGEL_BACKEND_URL = """http://127.0.0.1:5000""" 24 | else: 25 | HEGEL_BACKEND_URL = """https://api.hegel-ai.com""" 26 | -------------------------------------------------------------------------------- /prompttools/experiment/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 
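#
# Most LLM experiment classes exported from this package accept *lists* for each
# argument, run the cartesian product of those arguments, and expose the results via
# ``run()`` / ``evaluate()`` / ``visualize()``. A minimal sketch, mirroring the
# GoogleVertexChatExperiment example notebook (adjust the values for your own setup):
#
#   from prompttools.experiment import GoogleVertexChatCompletionExperiment
#
#   experiment = GoogleVertexChatCompletionExperiment(
#       model=["chat-bison"],
#       message=["Is 97 a prime number?"],
#       context=["You are a helpful assistant."],
#   )
#   experiment.run()
#   experiment.visualize()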
6 | 7 | 8 | from .experiments.experiment import Experiment 9 | from .experiments.openai_chat_experiment import OpenAIChatExperiment 10 | from .experiments.openai_completion_experiment import OpenAICompletionExperiment 11 | from .experiments.anthropic_completion_experiment import AnthropicCompletionExperiment 12 | from .experiments.huggingface_hub_experiment import HuggingFaceHubExperiment 13 | from .experiments.google_gemini_chat_experiment import GoogleGeminiChatCompletionExperiment 14 | from .experiments.google_palm_experiment import GooglePaLMCompletionExperiment 15 | from .experiments.google_vertex_chat_experiment import GoogleVertexChatCompletionExperiment 16 | from .experiments.llama_cpp_experiment import LlamaCppExperiment 17 | from .experiments.chromadb_experiment import ChromaDBExperiment 18 | from .experiments.weaviate_experiment import WeaviateExperiment 19 | from .experiments.lancedb_experiment import LanceDBExperiment 20 | from .experiments.mistral_experiment import MistralChatCompletionExperiment 21 | from .experiments.mindsdb_experiment import MindsDBExperiment 22 | from .experiments.langchain_experiment import SequentialChainExperiment, RouterChainExperiment 23 | from .experiments.stablediffusion_experiment import StableDiffusionExperiment 24 | from .experiments.replicate_experiment import ReplicateExperiment 25 | from .experiments.qdrant_experiment import QdrantExperiment 26 | from .experiments.pinecone_experiment import PineconeExperiment 27 | from .experiments.musicgen_experiment import MusicGenExperiment 28 | 29 | __all__ = [ 30 | "AnthropicCompletionExperiment", 31 | "ChromaDBExperiment", 32 | "Experiment", 33 | "GoogleGeminiChatCompletionExperiment", 34 | "GooglePaLMCompletionExperiment", 35 | "GoogleVertexChatCompletionExperiment", 36 | "LanceDBExperiment", 37 | "LlamaCppExperiment", 38 | "HuggingFaceHubExperiment", 39 | "MistralChatCompletionExperiment", 40 | "MindsDBExperiment", 41 | "MusicGenExperiment", 42 | "OpenAIChatExperiment", 43 | "OpenAICompletionExperiment", 44 | "PineconeExperiment", 45 | "QdrantExperiment", 46 | "ReplicateExperiment", 47 | "RouterChainExperiment", 48 | "SequentialChainExperiment", 49 | "StableDiffusionExperiment", 50 | "WeaviateExperiment", 51 | ] 52 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/experiment/experiments/__init__.py -------------------------------------------------------------------------------- /prompttools/experiment/experiments/_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import pandas as pd 8 | 9 | 10 | def _check_column_uniqueness(column: "pd.core.series.Series") -> bool: 11 | r""" 12 | Check if all elements are equal in the column. 13 | 14 | Arg: 15 | column (pandas.core.series.Series): Column to check 16 | """ 17 | first_ele = column[0] 18 | for ele in column: 19 | if first_ele != ele: 20 | return True 21 | return False 22 | 23 | 24 | def _get_dynamic_columns(df: pd.DataFrame) -> pd.DataFrame: 25 | r""" 26 | Given a ``pd.DataFrame``, return a DataFrame where columns have more than 1 unique value. 
27 | 28 | Args: 29 | df (pd.DataFrame): DataFrame to examine 30 | """ 31 | hashable_columns = [] 32 | unhashable_columns = [] 33 | for col in df.columns: 34 | try: 35 | hash(df[col][0]) 36 | hashable_columns.append(col) 37 | except TypeError: 38 | # If a column is not hashable, check if there exists value differ from the 39 | if _check_column_uniqueness(df[col]): 40 | unhashable_columns.append(col) 41 | 42 | unique_counts = df[hashable_columns].nunique() 43 | columns_with_multiple_unique_values = unique_counts[unique_counts > 1].index 44 | dfs_to_concat = [df[columns_with_multiple_unique_values], df[unhashable_columns]] 45 | if ( 46 | "prompt" in df 47 | and "prompt" not in df[columns_with_multiple_unique_values] 48 | and "prompt" not in df[unhashable_columns] 49 | ): 50 | dfs_to_concat.append(df["prompt"]) 51 | elif ( 52 | "messages" in df 53 | and "messages" not in df[columns_with_multiple_unique_values] 54 | and "messages" not in df[unhashable_columns] 55 | ): 56 | dfs_to_concat.append(df["messages"]) 57 | return pd.concat(dfs_to_concat, axis=1) 58 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/error.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | class PromptExperimentException(Exception): 9 | r""" 10 | An exception to throw when something goes wrong with the prompt test setup 11 | """ 12 | 13 | pass 14 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/google_gemini_chat_experiment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | try: 8 | import google.generativeai as genai 9 | from google.generativeai.types import content_types 10 | from google.generativeai.types import generation_types 11 | from google.generativeai.types import safety_types 12 | except ImportError: 13 | genai = None 14 | content_types, generation_types, safety_types = None, None, None 15 | 16 | 17 | from .experiment import Experiment 18 | from typing import Optional 19 | import copy 20 | 21 | 22 | class GoogleGeminiChatCompletionExperiment(Experiment): 23 | r""" 24 | This class defines an experiment for Google GenAI's chat API. It accepts lists for each argument 25 | passed into Vertex AI's API, then creates a cartesian product of those arguments, and gets results for each. 26 | 27 | Note: 28 | - All arguments here should be a ``list``, even if you want to keep the argument frozen 29 | (i.e. ``temperature=[1.0]``), because the experiment will try all possible combination 30 | of the input arguments. 31 | - You need to set up your Google Vertex AI credentials properly before executing this experiment. One option 32 | is to execute on Google Cloud's Colab. 33 | 34 | Args: 35 | model (list[str]): Which model to call, as a string or a ``types.Model`` (e.g. ``'models/text-bison-001'``). 36 | 37 | contents (list[content_types]): Message for the chat model to respond. 38 | 39 | generation_config (list[generation_types]): Configurations for the generation of the model. 
40 | 41 | safety_settings (list[safety_types]): Configurations for the safety features of the model. 42 | """ 43 | 44 | def __init__( 45 | self, 46 | model: list[str], 47 | contents: list["content_types.ContentsType"], 48 | generation_config: list[Optional["generation_types.GenerationConfigType"]] = [None], 49 | safety_settings: list[Optional["safety_types.SafetySettingOptions"]] = [None], 50 | ): 51 | if genai is None: 52 | raise ModuleNotFoundError( 53 | "Package `google-generativeai` is required to be installed to use Google GenAI API in this experiment." 54 | "Please use `pip install google-generativeai` to install the package or run this in Google Colab." 55 | ) 56 | 57 | self.completion_fn = self.google_text_completion_fn 58 | 59 | self.all_args = dict( 60 | model=model, 61 | contents=contents, 62 | generation_config=generation_config, 63 | safety_settings=safety_settings, 64 | ) 65 | super().__init__() 66 | 67 | def google_text_completion_fn(self, **input_args): 68 | params = copy.deepcopy(input_args) 69 | model = genai.GenerativeModel(input_args["model"]) 70 | del params["model"] 71 | response = model.generate_content(**params) 72 | return response 73 | 74 | @staticmethod 75 | def _extract_responses(response) -> list[str]: 76 | # `response.text` will return the top response 77 | return response.text 78 | 79 | def _get_model_names(self): 80 | return [combo["model"] for combo in self.argument_combos] 81 | 82 | def _get_prompts(self): 83 | return [combo["message"] for combo in self.argument_combos] 84 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/google_palm_experiment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | try: 8 | import google.generativeai as palm 9 | except ImportError: 10 | palm = None 11 | 12 | from prompttools.selector.prompt_selector import PromptSelector 13 | from prompttools.mock.mock import mock_palm_completion_fn 14 | from .experiment import Experiment 15 | from typing import Optional, Union, Iterable 16 | import os 17 | 18 | 19 | class GooglePaLMCompletionExperiment(Experiment): 20 | r""" 21 | This class defines an experiment for Google PaLM's generate text API. It accepts lists for each argument 22 | passed into PaLM's API, then creates a cartesian product of those arguments, and gets results for each. 23 | 24 | Note: 25 | - All arguments here should be a ``list``, even if you want to keep the argument frozen 26 | (i.e. ``temperature=[1.0]``), because the experiment will try all possible combination 27 | of the input arguments. 28 | - You should set ``os.environ["GOOGLE_PALM_API_KEY"] = YOUR_KEY`` in order to connect with PaLM's API. 29 | 30 | Args: 31 | model (list[str]): Which model to call, as a string or a ``types.Model`` (e.g. ``'models/text-bison-001'``). 32 | 33 | prompt (list[str]): Free-form input text given to the model. Given a prompt, the model will 34 | generate text that completes the input text. 35 | 36 | temperature (list[float]): Controls the randomness of the output. Must be positive. 37 | Typical values are in the range: ``[0.0, 1.0]``. Higher values produce a 38 | more random and varied response. A temperature of zero will be deterministic. 39 | 40 | candidate_count (list[int]): The **maximum** number of generated response messages to return. 
41 | This value must be between ``[1, 8]``, inclusive. If unset, this will default to ``1``. 42 | 43 | max_output_tokens (list[int]): Maximum number of tokens to include in a candidate. Must be greater 44 | than zero. If unset, will default to ``64``. 45 | 46 | top_k (list[float]): The API uses combined nucleus and top-k sampling. 47 | ``top_k`` sets the maximum number of tokens to sample from on each step. 48 | 49 | top_p (list[float]): The API uses combined nucleus and top-k sampling. ``top_p`` configures the nucleus 50 | sampling. It sets the maximum cumulative probability of tokens to sample from. 51 | 52 | safety_settings (list[Iterable[palm.types.SafetySettingDict]]): A list of unique ``types.SafetySetting`` 53 | instances for blocking unsafe content. 54 | 55 | stop_sequences (list[Union[str, Iterable[str]]]): A set of up to 5 character sequences that will stop output 56 | generation. If specified, the API will stop at the first appearance of a stop sequence. 57 | """ 58 | 59 | def __init__( 60 | self, 61 | model: list[str], 62 | prompt: list[str], 63 | temperature: list[Optional[float]] = [None], 64 | candidate_count: list[Optional[int]] = [None], 65 | max_output_tokens: list[Optional[int]] = [None], 66 | top_p: list[Optional[float]] = [None], 67 | top_k: list[Optional[float]] = [None], 68 | safety_settings: list[Optional[Iterable["palm.types.SafetySettingDict"]]] = [None], 69 | stop_sequences: list[Union[str, Iterable[str]]] = [None], 70 | ): 71 | if palm is None: 72 | raise ModuleNotFoundError( 73 | "Package `google.generativeai` is required to be installed to use PaLM API in this experiment." 74 | "Please use `pip install google.generativeai` to install the package" 75 | ) 76 | if os.getenv("DEBUG", default=False): 77 | self.completion_fn = mock_palm_completion_fn() 78 | else: 79 | self.completion_fn = self.palm_completion_fn 80 | palm.configure(api_key=os.environ["GOOGLE_PALM_API_KEY"]) 81 | 82 | # If we are using a prompt selector, we need to 83 | # render the prompts from the selector 84 | if isinstance(prompt[0], PromptSelector): 85 | prompt = [selector.for_palm() for selector in prompt] 86 | 87 | self.all_args = dict( 88 | model=model, 89 | prompt=prompt, 90 | temperature=temperature, 91 | candidate_count=candidate_count, 92 | max_output_tokens=max_output_tokens, 93 | top_p=top_p, 94 | top_k=top_k, 95 | safety_settings=safety_settings, 96 | stop_sequences=stop_sequences, 97 | ) 98 | super().__init__() 99 | 100 | def palm_completion_fn(self, **input_args): 101 | return palm.generate_text(**input_args) 102 | 103 | @staticmethod 104 | def _extract_responses(completion_response: "palm.text.text_types.Completion") -> list[str]: 105 | # `# completion_response.result` will return the top response 106 | return [candidate["output"] for candidate in completion_response.candidates][0] 107 | 108 | def _get_model_names(self): 109 | return [combo["model"] for combo in self.argument_combos] 110 | 111 | def _get_prompts(self): 112 | return [combo["prompt"] for combo in self.argument_combos] 113 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/google_vertex_chat_experiment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | try: 8 | from vertexai.preview.language_models import ChatModel, InputOutputTextPair 9 | except ImportError: 10 | ChatModel = None 11 | InputOutputTextPair = None 12 | 13 | from .experiment import Experiment 14 | from typing import Optional 15 | import copy 16 | 17 | 18 | class GoogleVertexChatCompletionExperiment(Experiment): 19 | r""" 20 | This class defines an experiment for Google Vertex AI's chat API. It accepts lists for each argument 21 | passed into Vertex AI's API, then creates a cartesian product of those arguments, and gets results for each. 22 | 23 | Note: 24 | - All arguments here should be a ``list``, even if you want to keep the argument frozen 25 | (i.e. ``temperature=[1.0]``), because the experiment will try all possible combination 26 | of the input arguments. 27 | - You need to set up your Google Vertex AI credentials properly before executing this experiment. One option 28 | is to execute on Google Cloud's Colab. 29 | 30 | Args: 31 | model (list[str]): Which model to call, as a string or a ``types.Model`` (e.g. ``'models/text-bison-001'``). 32 | 33 | message (list[str]): Message for the chat model to respond. 34 | 35 | context (list[str]): Context shapes how the model responds throughout the conversation. For example, 36 | you can use context to specify words the model can or cannot use, 37 | topics to focus on or avoid, or the response format or style. 38 | 39 | examples (list[list['InputOutputTextPair']]): Examples for the model to learn how to 40 | respond to the conversation. 41 | 42 | temperature (list[float]): Controls the randomness of the output. Must be positive. 43 | Typical values are in the range: ``[0.0, 1.0]``. Higher values produce a 44 | more random and varied response. A temperature of zero will be deterministic. 45 | 46 | max_output_tokens (list[int]): Maximum number of tokens to include in a candidate. Must be greater 47 | than zero. If unset, will default to ``64``. 48 | 49 | top_k (list[float]): The API uses combined nucleus and top-k sampling. 50 | ``top_k`` sets the maximum number of tokens to sample from on each step. 51 | 52 | top_p (list[float]): The API uses combined nucleus and top-k sampling. ``top_p`` configures the nucleus 53 | sampling. It sets the maximum cumulative probability of tokens to sample from. 54 | 55 | stop_sequences (list[Union[str, Iterable[str]]]): A set of up to 5 character sequences that will stop output 56 | generation. If specified, the API will stop at the first appearance of a stop sequence. 57 | """ 58 | 59 | def __init__( 60 | self, 61 | model: list[str], 62 | message: list[str], 63 | context: list[Optional[str]] = [None], 64 | examples: list[Optional[list[InputOutputTextPair]]] = [None], 65 | temperature: list[Optional[float]] = [None], 66 | max_output_tokens: list[Optional[int]] = [None], 67 | top_p: list[Optional[float]] = [None], 68 | top_k: list[Optional[int]] = [None], 69 | stop_sequences: list[list[str]] = [None], 70 | ): 71 | if ChatModel is None: 72 | raise ModuleNotFoundError( 73 | "Package `vertexai` is required to be installed to use Google Vertex API in this experiment." 
74 | "Please use `pip install google-cloud-aiplatform` to install the package" 75 | ) 76 | 77 | self.completion_fn = self.vertex_chat_completion_fn 78 | 79 | self.all_args = dict( 80 | model=model, 81 | message=message, 82 | context=context, 83 | examples=examples, 84 | temperature=temperature, 85 | max_output_tokens=max_output_tokens, 86 | top_p=top_p, 87 | top_k=top_k, 88 | stop_sequences=stop_sequences, 89 | ) 90 | super().__init__() 91 | 92 | def vertex_chat_completion_fn(self, **input_args): 93 | chat_model = ChatModel.from_pretrained(model_name=input_args["model"]) 94 | message = input_args["message"] 95 | params = copy.deepcopy(input_args) 96 | del params["model"], params["message"] 97 | chat = chat_model.start_chat(**params) 98 | return chat.send_message(message) 99 | 100 | @staticmethod 101 | def _extract_responses(response) -> list[str]: 102 | # `response.text` will return the top response 103 | return response.text 104 | 105 | def _get_model_names(self): 106 | return [combo["model"] for combo in self.argument_combos] 107 | 108 | def _get_prompts(self): 109 | return [combo["message"] for combo in self.argument_combos] 110 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/huggingface_endpoint_experiment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # TODO: Coming soon 8 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/mindsdb_experiment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from typing import Any, Dict, List, Tuple 9 | import itertools 10 | from time import perf_counter 11 | import logging 12 | 13 | try: 14 | from mysql.connector.connection_cext import CMySQLConnection 15 | except ImportError: 16 | CMySQLConnection = None 17 | 18 | from prompttools.mock.mock import mock_mindsdb_completion_fn 19 | 20 | from .experiment import Experiment 21 | from .error import PromptExperimentException 22 | 23 | 24 | class MindsDBExperiment(Experiment): 25 | r""" 26 | An experiment class for MindsDB. 27 | This accepts combinations of MindsDB inputs to form SQL queries, returning a list of responses. 28 | 29 | Args: 30 | db_connector (CMySQLConnection): Connector MindsDB 31 | kwargs (dict): keyword arguments for the model 32 | """ 33 | 34 | def __init__( 35 | self, 36 | db_connector: "CMySQLConnection", 37 | **kwargs: Dict[str, object], 38 | ): 39 | self.cursor = db_connector.cursor() 40 | self.completion_fn = self.mindsdb_completion_fn 41 | if os.getenv("DEBUG", default=False): 42 | self.completion_fn = mock_mindsdb_completion_fn 43 | 44 | self.call_params = dict(prompt=kwargs["prompt"]) 45 | self.model_params = dict({k: kwargs[k] for k in kwargs if k != "prompt"}) 46 | 47 | self.all_args = self.model_params | self.call_params 48 | super().__init__() 49 | 50 | def prepare(self) -> None: 51 | r""" 52 | Creates argument combinations by taking the cartesian product of all inputs. 
53 | """ 54 | self.model_argument_combos = [ 55 | dict(zip(self.model_params, val)) for val in itertools.product(*self.model_params.values()) 56 | ] 57 | self.call_argument_combos = [ 58 | dict(zip(self.call_params, val)) for val in itertools.product(*self.call_params.values()) 59 | ] 60 | 61 | def mindsdb_completion_fn( 62 | self, 63 | **params: Dict[str, Any], 64 | ) -> List[Any]: 65 | r""" 66 | MindsDB helper function to make request. 67 | """ 68 | prompt = params["prompt"] 69 | 70 | self.cursor.execute(prompt) 71 | return [x for x in self.cursor] 72 | 73 | def run( 74 | self, 75 | runs: int = 1, 76 | ) -> None: 77 | r""" 78 | Create tuples of input and output for every possible combination of arguments. 79 | For each combination, it will execute `runs` times, default to 1. 80 | # TODO This can be done with an async queue 81 | """ 82 | if not self.argument_combos: 83 | logging.info("Preparing first...") 84 | self.prepare() 85 | results = [] 86 | latencies = [] 87 | for model_combo in self.model_argument_combos: 88 | for call_combo in self.call_argument_combos: 89 | call_combo["prompt"] = call_combo["prompt"].format( 90 | table=model_combo["table"], 91 | author_username=model_combo["author_username"], 92 | text=model_combo["text"], 93 | ) 94 | for _ in range(runs): 95 | call_combo["client"] = self.cursor 96 | start = perf_counter() 97 | res = self.completion_fn(**call_combo) 98 | latencies.append(perf_counter() - start) 99 | results.append(res) 100 | self.argument_combos.append(model_combo | call_combo) 101 | if len(results) == 0: 102 | logging.error("No results. Something went wrong.") 103 | raise PromptExperimentException 104 | self._construct_result_dfs(self.argument_combos, results, latencies, extract_response_equal_full_result=True) 105 | 106 | @staticmethod 107 | def _extract_responses(output: List[Dict[str, object]]) -> Tuple[str]: 108 | return output[0] 109 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/mistral_experiment.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | 9 | from typing import Optional 10 | 11 | 12 | from .experiment import Experiment 13 | 14 | 15 | try: 16 | import mistralai 17 | from mistralai.client import MistralClient 18 | from mistralai.models.chat_completion import ChatMessage 19 | except ImportError: 20 | mistralai = None 21 | MistralClient = None 22 | ChatMessage = None 23 | 24 | 25 | class MistralChatCompletionExperiment(Experiment): 26 | r""" 27 | This class defines an experiment for Mistral's chatcompletion API. It accepts lists for each argument 28 | passed into the API, then creates a cartesian product of those arguments, and gets results for each. 29 | 30 | Note: 31 | - All arguments here should be a ``list``, even if you want to keep the argument frozen 32 | (i.e. ``temperature=[1.0]``), because the experiment will try all possible combination 33 | of the input arguments. 34 | - You should set ``os.environ["MISTRAL_API_KEY"] = YOUR_KEY`` in order to connect with Mistral's API. 35 | 36 | Args: 37 | model (list[str]): 38 | the model(s) that will complete your prompt (e.g. "mistral-tiny") 39 | 40 | messages (list[ChatMessage]): 41 | Input prompts (using Mistral's Python library). The first prompt role should be `user` or `system`. 
42 | 43 | temperature (list[float], optional): 44 | The amount of randomness injected into the response 45 | 46 | top_p (list[float], optional): 47 | use nucleus sampling. 48 | 49 | max_tokens (list[int]): 50 | The maximum number of tokens to generate in the completion. 51 | 52 | safe_prompt (list[bool]): 53 | Whether to inject a safety prompt before all conversations. 54 | 55 | random_seed (list[int], optional): 56 | The seed to use for random sampling. If set, different calls will generate deterministic results. 57 | """ 58 | 59 | url = "https://api.mistral.ai/v1/chat/completions" 60 | 61 | def __init__( 62 | self, 63 | model: list[str], 64 | messages: list[str], 65 | temperature: list[float] = [None], 66 | top_p: list[float] = [None], 67 | max_tokens: list[Optional[int]] = [None], 68 | safe_prompt: list[bool] = [False], 69 | random_seed: list[Optional[int]] = [None], 70 | ): 71 | if mistralai is None: 72 | raise ModuleNotFoundError( 73 | "Package `mistralai` is required to be installed to use this experiment." 74 | "Please use `pip install mistralai` to install the package" 75 | ) 76 | self.client = MistralClient(api_key=os.environ["MISTRAL_API_KEY"]) 77 | self.completion_fn = self.mistral_completion_fn 78 | 79 | self.all_args = dict( 80 | model=model, 81 | messages=messages, 82 | temperature=temperature, 83 | top_p=top_p, 84 | max_tokens=max_tokens, 85 | safe_prompt=safe_prompt, 86 | random_seed=random_seed, 87 | ) 88 | super().__init__() 89 | 90 | def mistral_completion_fn(self, **input_args): 91 | response = self.client.chat(**input_args) 92 | return response 93 | 94 | @staticmethod 95 | def _extract_responses(response) -> list[str]: 96 | return response.choices[0].message.content 97 | 98 | def _get_model_names(self): 99 | return [combo["model"] for combo in self.argument_combos] 100 | 101 | def _get_prompts(self): 102 | return [combo["messages"] for combo in self.argument_combos] 103 | -------------------------------------------------------------------------------- /prompttools/experiment/experiments/style.mplstyle: -------------------------------------------------------------------------------- 1 | 2 | figure.figsize: 12,8 3 | figure.dpi : 100 4 | 5 | lines.linewidth : 3.0 6 | axes.linewidth: 1.8 7 | font.size : 22 8 | axes.labelsize : 22 9 | xtick.direction : in 10 | ytick.direction : in 11 | xtick.top : True 12 | ytick.right : True 13 | 14 | xtick.major.size: 10 # major tick size in points 15 | xtick.minor.size: 5 # minor tick size in points 16 | xtick.major.width: 1.8 # major tick width in points 17 | xtick.minor.width: 1.2 # minor tick width in points 18 | 19 | ytick.major.size: 10 # major tick size in points 20 | ytick.minor.size: 5 # minor tick size in points 21 | ytick.major.width: 1.8 # major tick width in points 22 | ytick.minor.width: 1.2 # minor tick width in points 23 | 24 | xtick.major.pad : 6 25 | xtick.minor.pad : 6 26 | ytick.major.pad : 6 27 | ytick.minor.pad : 6 28 | 29 | axes.labelpad: 6 30 | 31 | xtick.labelsize : 16 32 | ytick.labelsize : 16 33 | legend.fontsize : 16 34 | legend.frameon : False 35 | 36 | #axes.edgecolor : "333333" # Color of the figure axis. 37 | #axes.edgecolor : "red" # Color of the figure axis. 
38 | 39 | 40 | savefig.bbox : tight 41 | savefig.dpi : 100 42 | 43 | # Hegel AI color cycle 44 | axes.prop_cycle: cycler('color', ["black", "771541", "EB8F4C","594F3B","A8B7AB","9C92A3"]) 45 | 46 | #font.family : serif 47 | #text.usetex : True 48 | #font.serif : Palatino 49 | -------------------------------------------------------------------------------- /prompttools/experiment/widgets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/experiment/widgets/__init__.py -------------------------------------------------------------------------------- /prompttools/experiment/widgets/comparison.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Callable, List 8 | import pandas as pd 9 | from IPython import display 10 | import ipywidgets as widgets 11 | 12 | 13 | class ComparisonWidgetProvider: 14 | r""" 15 | Provides functionality for widgets to compare models. This includes 16 | displaying widgets, and recording evaluations in the experiment. 17 | """ 18 | 19 | def __init__(self, completion_fn, agg_fn, eval_listener_fn): 20 | self.completion_fn = completion_fn 21 | self.agg_fn = agg_fn 22 | self.eval_listener_fn = eval_listener_fn 23 | 24 | def _get_comparison_submission_listener(self, table: pd.DataFrame, models: List[str]) -> Callable: 25 | def on_click(b): 26 | sorted_scores = self.agg_fn(table, 0) 27 | data = { 28 | models[0]: sorted_scores.keys(), 29 | "feedback": sorted_scores.values(), 30 | } 31 | df = pd.DataFrame(data) 32 | display.display(df) 33 | 34 | return on_click 35 | 36 | def set_models(self, models: List[str]) -> None: 37 | self.models = models 38 | self.row_len = 2 + len(self.models) 39 | 40 | def get_header_widgets(self) -> List[object]: 41 | return [widgets.Label("Input")] + [widgets.Label(model) for model in self.models] + [widgets.Label("Feedback")] 42 | 43 | def get_row_widgets(self, index, row): 44 | items = [widgets.HTML(value="
" + row.name + "
")] 45 | items += [ 46 | widgets.HTML(value="
" + row[model] + "
") 47 | for model in self.models 48 | ] 49 | feedback_dropdown = widgets.Dropdown( 50 | options=[("\U0001F44D", 1), ("\U0001F44E", 0)], 51 | value=1, 52 | layout={"width": "50px"}, 53 | ) 54 | feedback_dropdown.observe(self.eval_listener_fn(index), names="value") 55 | items += [feedback_dropdown] 56 | return items 57 | 58 | def get_footer_widgets(self, table): 59 | submit_button = widgets.Button( 60 | description="Submit", 61 | disabled=False, 62 | button_style="success", 63 | tooltip="Submit", 64 | ) 65 | submit_button.on_click(self._get_comparison_submission_listener(table, self.models)) 66 | return [widgets.Label("")] * (self.row_len - 1) + [submit_button] 67 | 68 | def display(self, items): 69 | row_len = 2 + len(self.models) 70 | grid = widgets.GridBox( 71 | items, 72 | layout=widgets.Layout(grid_template_columns="repeat(" + str(row_len) + ", 230px)"), 73 | ) 74 | display.display(grid) 75 | -------------------------------------------------------------------------------- /prompttools/experiment/widgets/feedback.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Callable, List 8 | import pandas as pd 9 | from IPython import display 10 | import ipywidgets as widgets 11 | 12 | 13 | class FeedbackWidgetProvider: 14 | r""" 15 | Provides functionality for widgets to evaluate models. This includes 16 | displaying widgets, and recording evaluations in the experiment. 17 | """ 18 | 19 | def __init__(self, completion_fn, agg_fn, eval_listener_fn): 20 | self.completion_fn = completion_fn 21 | self.agg_fn = agg_fn 22 | self.eval_listener_fn = eval_listener_fn 23 | 24 | def _get_feedback_submission_listener(self, table: pd.DataFrame, pivot_columns: List[str]) -> Callable: 25 | def on_click(b): 26 | sorted_scores = self.agg_fn(table, "feedback", pivot_columns[0]) 27 | data = { 28 | pivot_columns[0]: sorted_scores.keys(), 29 | "feedback": sorted_scores.values(), 30 | } 31 | df = pd.DataFrame(data) 32 | display.display(df) 33 | 34 | return on_click 35 | 36 | def set_pivot_columns(self, pivot_columns: List[str]) -> None: 37 | self.pivot_columns = pivot_columns 38 | 39 | def get_header_widgets(self) -> List[object]: 40 | return [ 41 | widgets.Label(self.pivot_columns[0]), 42 | widgets.Label(self.pivot_columns[1]), 43 | widgets.Label("response(s)"), 44 | widgets.Label("Feedback"), 45 | ] 46 | 47 | def get_row_widgets(self, index, row): 48 | items = [ 49 | widgets.HTML(value="
" + row[self.pivot_columns[0]] + "
"), 50 | widgets.HTML(value="
" + row[self.pivot_columns[1]] + "
"), 51 | widgets.HTML(value="
" + row["response(s)"] + "
"), 52 | ] 53 | feedback_dropdown = widgets.Dropdown( 54 | options=[("\U0001F44D", 1), ("\U0001F44E", 0)], 55 | value=1, 56 | layout={"width": "50px"}, 57 | ) 58 | feedback_dropdown.observe(self.eval_listener_fn(index), names="value") 59 | items += [feedback_dropdown] 60 | return items 61 | 62 | def get_footer_widgets(self, table): 63 | submit_button = widgets.Button( 64 | description="Submit", 65 | disabled=False, 66 | button_style="success", 67 | tooltip="Submit", 68 | ) 69 | submit_button.on_click(self._get_feedback_submission_listener(table, self.pivot_columns)) 70 | return [ 71 | widgets.Label(""), 72 | widgets.Label(""), 73 | widgets.Label(""), 74 | submit_button, 75 | ] 76 | 77 | def display(self, items): 78 | grid = widgets.GridBox( 79 | items, 80 | layout=widgets.Layout(grid_template_columns="repeat(4, 230px)"), 81 | ) 82 | display.display(grid) 83 | -------------------------------------------------------------------------------- /prompttools/experiment/widgets/utility.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | def is_interactive() -> bool: 9 | r""" 10 | Used to determine if we are in a jupyter notebook, which 11 | determines how we present the visualizations. 12 | """ 13 | import __main__ as main 14 | 15 | return not hasattr(main, "__file__") 16 | -------------------------------------------------------------------------------- /prompttools/harness/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from .harness import ExperimentationHarness 9 | from .chat_history_harness import ChatHistoryExperimentationHarness 10 | from .chat_model_comparison_harness import ChatModelComparisonHarness 11 | from .chat_prompt_template_harness import ChatPromptTemplateExperimentationHarness 12 | from .model_comparison_harness import ModelComparisonHarness 13 | from .multi_experiment_harness import MultiExperimentHarness 14 | from .prompt_template_harness import PromptTemplateExperimentationHarness 15 | from .rag_harness import RetrievalAugmentedGenerationExperimentationHarness 16 | from .system_prompt_harness import SystemPromptExperimentationHarness 17 | 18 | 19 | __all__ = [ 20 | "ChatHistoryExperimentationHarness", 21 | "ChatModelComparisonHarness", 22 | "ChatPromptTemplateExperimentationHarness", 23 | "ExperimentationHarness", 24 | "ModelComparisonHarness", 25 | "MultiExperimentHarness", 26 | "PromptTemplateExperimentationHarness", 27 | "RetrievalAugmentedGenerationExperimentationHarness", 28 | "SystemPromptExperimentationHarness", 29 | ] 30 | -------------------------------------------------------------------------------- /prompttools/harness/chat_history_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | from typing import Dict, List, Optional 8 | from .harness import ExperimentationHarness 9 | from prompttools.experiment import OpenAIChatExperiment 10 | 11 | 12 | class ChatHistoryExperimentationHarness(ExperimentationHarness): 13 | r""" 14 | An experimentation harness used for compare multiple chat histories. 15 | 16 | Args: 17 | model_name (str): The name of the model. 18 | chat_histories (List[List[Dict[str, str]]]): A list of chat histories that will be fed into the model. 19 | model_arguments (Optional[Dict[str, object]], optional): Additional arguments for the model. 20 | Defaults to ``None``. 21 | """ 22 | 23 | def __init__( 24 | self, 25 | model_name: str, 26 | chat_histories: List[List[Dict[str, str]]], 27 | model_arguments: Optional[Dict[str, object]] = None, 28 | ): 29 | self.experiment_cls_constructor = OpenAIChatExperiment 30 | self.model_name = model_name 31 | self.chat_histories = chat_histories 32 | self.model_arguments = {} if model_arguments is None else model_arguments 33 | super().__init__() 34 | 35 | def prepare(self) -> None: 36 | r""" 37 | Initializes and prepares the experiment. 38 | """ 39 | self.experiment = self.experiment_cls_constructor( 40 | [self.model_name], 41 | self.chat_histories, 42 | **self._prepare_arguments(self.model_arguments), 43 | ) 44 | super().prepare() 45 | 46 | def run(self): 47 | if not self.experiment: 48 | self.prepare() 49 | super().run() 50 | -------------------------------------------------------------------------------- /prompttools/harness/chat_model_comparison_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Dict, List, Optional 8 | from .harness import ExperimentationHarness 9 | from prompttools.experiment import OpenAIChatExperiment 10 | 11 | 12 | class ChatModelComparisonHarness(ExperimentationHarness): 13 | r""" 14 | An experimentation harness used for comparing chat models. 15 | Multi-model version of ``ChatHistoryExperimentationHarness``. 16 | 17 | Args: 18 | model_names (List[str]): The names of the models that you would like to compare 19 | chat_histories (List[List[Dict[str, str]]]): A list of chat histories that will be fed into the models. 20 | runs (int): Number of runs to execute. Defaults to ``1``. 21 | model_arguments (Optional[Dict[str, object]], optional): Additional arguments for the model. 22 | Defaults to ``None``. 23 | """ 24 | 25 | PIVOT_COLUMNS = ["model", "messages"] 26 | 27 | def __init__( 28 | self, 29 | model_names: List[str], 30 | chat_histories: List[List[Dict[str, str]]], 31 | runs: int = 1, 32 | model_arguments: Optional[Dict[str, object]] = None, 33 | ): 34 | self.experiment_cls_constructor = OpenAIChatExperiment 35 | self.model_names = model_names 36 | self.chat_histories = chat_histories 37 | self.runs = runs 38 | self.model_arguments = {} if model_arguments is None else model_arguments 39 | super().__init__() 40 | 41 | def prepare(self) -> None: 42 | """ 43 | Initializes and prepares the experiment. 
44 | """ 45 | self.experiment = self.experiment_cls_constructor( 46 | self.model_names, 47 | self.chat_histories, 48 | **self._prepare_arguments(self.model_arguments), 49 | ) 50 | super().prepare() 51 | 52 | def run(self): 53 | if not self.experiment: 54 | self.prepare() 55 | super().run() 56 | 57 | def compare(self): 58 | self.experiment.compare(self.model_names[0], self.PIVOT_COLUMNS) 59 | -------------------------------------------------------------------------------- /prompttools/harness/document_retrieval_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # TODO: Coming soon 8 | -------------------------------------------------------------------------------- /prompttools/harness/function_call_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # TODO: Coming soon. 8 | -------------------------------------------------------------------------------- /prompttools/harness/multi_experiment_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from typing import Callable, Dict, List 9 | from collections import defaultdict 10 | from prompttools.experiment import Experiment 11 | import pandas as pd 12 | 13 | 14 | class MultiExperimentHarness: 15 | r""" 16 | This is designed to run experiments across multiple model providers. The underlying APIs for different models 17 | (e.g. LlamaCpp and OpenAI) are different, this provides a way to manage that complexity. 18 | This will run experiments for different providers, and combine the results into a single table. 19 | 20 | The notebook "examples/notebooks/GPT4vsLlama2.ipynb" provides a good example how this can used 21 | to test prompts across different models. 22 | 23 | Args: 24 | experiments (list[Experiment]): The list of experiments that you would like to execute (e.g. 
25 | ``prompttools.experiment.OpenAICompletionExperiment``) 26 | """ 27 | 28 | def __init__(self, experiments: List[Experiment]): 29 | self.experiments = experiments 30 | 31 | def prepare(self): 32 | for experiment in self.experiments: 33 | experiment.prepare() 34 | 35 | def run(self): 36 | for experiment in self.experiments: 37 | experiment.run() 38 | 39 | def evaluate(self, metric_name: str, eval_fn: Callable) -> None: 40 | for experiment in self.experiments: 41 | experiment.evaluate(metric_name, eval_fn) 42 | 43 | def gather_feedback(self) -> None: 44 | pass 45 | 46 | def _get_argument_combos(self): 47 | tmp = [combo for experiment in self.experiments for combo in experiment.argument_combos] 48 | return tmp 49 | 50 | def _get_prompts(self): 51 | tmp = [combo for experiment in self.experiments for combo in experiment._get_prompts()] 52 | return tmp 53 | 54 | def _get_results(self): 55 | tmp = [ 56 | experiment._extract_responses(result) for experiment in self.experiments for result in experiment.results 57 | ] 58 | return tmp 59 | 60 | def _get_scores(self): 61 | scores = defaultdict(list) 62 | for experiment in self.experiments: 63 | for name, score in experiment.scores.items(): 64 | scores[name].extend(score) 65 | return scores 66 | 67 | def _get_experiment_names(self): 68 | tmp = [name for experiment in self.experiments for name in experiment._get_model_names()] 69 | return tmp 70 | 71 | def visualize(self, colname: str = None) -> None: 72 | scores = self._get_scores() 73 | data = { 74 | "prompt": self._get_prompts(), 75 | "response(s)": self._get_results(), 76 | "latency": scores["latency"], 77 | "model": self._get_experiment_names(), 78 | } 79 | # Add scores for each eval fn, including feedback 80 | for metric_name, evals in scores.items(): 81 | if metric_name != "comparison": 82 | data[metric_name] = evals 83 | df = pd.DataFrame(data) 84 | if colname: 85 | df = pd.pivot_table( 86 | df, 87 | values=colname, 88 | index=["prompt"], 89 | columns=["model"], 90 | aggfunc=lambda x: x.iloc[0], 91 | ) 92 | return df 93 | 94 | def rank(self, metric_name: str, is_average: bool = False) -> Dict[str, float]: 95 | pass 96 | -------------------------------------------------------------------------------- /prompttools/harness/prompt_template_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Dict, List, Optional, Type 8 | import jinja2 9 | from .harness import ExperimentationHarness, Experiment 10 | import logging 11 | 12 | 13 | class PromptTemplateExperimentationHarness(ExperimentationHarness): 14 | r""" 15 | An experimentation harness used to test various prompt templates. 16 | We use `jinja` templates, e.g. "Answer the following question: {{input}}". 17 | 18 | Args: 19 | experiment (Type[Experiment]): The experiment constructor that you would like to execute within the harness 20 | (e.g. ``prompttools.experiment.OpenAICompletionExperiment``) 21 | model_name (str): The name of the model. 22 | prompt_templates (List[str]): A list of prompt ``jinja``-styled templates. 23 | user_inputs (List[Dict[str, str]]): A list of dictionaries representing user inputs. 24 | model_arguments (Optional[Dict[str, object]], optional): Additional arguments for the model. 25 | Defaults to ``None``. 
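    Example:
        A minimal sketch, assuming ``OpenAICompletionExperiment`` is used as the underlying
        experiment and an OpenAI API key is configured; the template and user input values
        are illustrative.

        from prompttools.experiment import OpenAICompletionExperiment

        harness = PromptTemplateExperimentationHarness(
            OpenAICompletionExperiment,
            "text-davinci-003",
            prompt_templates=["Answer the following question: {{input}}"],
            user_inputs=[{"input": "Who was the first president?"}],
        )
        harness.run()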
26 | """ 27 | 28 | PIVOT_COLUMNS = ["prompt_template", "user_input"] 29 | 30 | def __init__( 31 | self, 32 | experiment: Type[Experiment], 33 | model_name: str, 34 | prompt_templates: List[str], 35 | user_inputs: List[Dict[str, str]], 36 | model_arguments: Optional[Dict[str, object]] = None, 37 | ): 38 | self.environment = jinja2.Environment() 39 | self.experiment_cls_constructor = experiment 40 | self.model_name = model_name 41 | self.prompt_templates = prompt_templates 42 | self.user_inputs = user_inputs 43 | self.model_arguments = {} if model_arguments is None else model_arguments 44 | super().__init__() 45 | 46 | def prepare(self) -> None: 47 | r""" 48 | Creates prompts from templates to use for the experiment, and then initializes and prepares the experiment. 49 | """ 50 | self.input_pairs_dict = {} 51 | rendered_inputs = [] 52 | for pt in self.prompt_templates: 53 | for user_input in self.user_inputs: 54 | template = self.environment.from_string(pt) 55 | prompt = template.render(**user_input) 56 | rendered_inputs.append(prompt) 57 | self.input_pairs_dict[prompt] = (pt, user_input) 58 | self.experiment = self.experiment_cls_constructor( 59 | [self.model_name], 60 | rendered_inputs, 61 | **self._prepare_arguments(self.model_arguments), 62 | ) 63 | super().prepare() 64 | 65 | def run(self): 66 | if not self.experiment: 67 | self.prepare() 68 | super().run() 69 | -------------------------------------------------------------------------------- /prompttools/harness/rag_harness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from typing import Optional, Type, Callable, Union 8 | import jinja2 9 | from .harness import ExperimentationHarness, Experiment 10 | import copy 11 | 12 | 13 | DOC_PROMPT_TEMPLATE = r"""Given these documents:{{documents}} 14 | 15 | {{prompt}} 16 | """ 17 | 18 | 19 | def _doc_list_to_str(documents: list[str]) -> str: 20 | res = "" 21 | for d in documents: 22 | res += "\n" 23 | res += d 24 | return res 25 | 26 | 27 | def _generate_doc_prompt(documents: list[str], prompt_or_msg: Union[str, list[dict[str, str]]], is_chat: bool): 28 | if not is_chat: 29 | prompt = prompt_or_msg 30 | else: # You have a chat message object 31 | prompt = prompt_or_msg[-1]["content"] 32 | environment = jinja2.Environment() 33 | template = environment.from_string(DOC_PROMPT_TEMPLATE) 34 | doc_str = _doc_list_to_str(documents) 35 | 36 | doc_prompt = template.render( 37 | { 38 | "documents": doc_str, 39 | "prompt": prompt, 40 | } 41 | ) 42 | if not is_chat: 43 | return doc_prompt 44 | else: 45 | new_msg = copy.deepcopy(prompt_or_msg) 46 | new_msg[-1]["content"] = doc_prompt 47 | return new_msg 48 | 49 | 50 | class RetrievalAugmentedGenerationExperimentationHarness(ExperimentationHarness): 51 | r""" 52 | An experimentation harness used to test the Retrieval-Augmented Generation process, which 53 | involves a vector DB and a LLM at the same time. 54 | 55 | Args: 56 | vector_db_experiment (Experiment): An initialized vector DB experiment. 57 | llm_experiment_cls (Type[Experiment]): The experiment constructor that you would like to execute 58 | within the harness (e.g. ``prompttools.experiment.OpenAICompletionExperiment``) 59 | llm_arguments (dict[str, list]): Dictionary of arguments for the LLM. 
60 | extract_document_fn (Callable): A function, when given a row of results from the vector DB experiment, 61 | extract the relevant documents (``list[str]``) that will be inserted into the template. 62 | extract_query_metadata_fn (Callable): A function, when given a row of results from the vector DB experiment, 63 | extract the relevant metadata and return a ``str`` that will be shown for visualization in the final 64 | result table 65 | prompt_template (str): A ``jinja``-styled templates, where documents and prompt will be inserted. 66 | """ 67 | 68 | def __init__( 69 | self, 70 | vector_db_experiment: Experiment, 71 | llm_experiment_cls: Type[Experiment], 72 | llm_arguments: dict, 73 | extract_document_fn: Callable, 74 | extract_query_metadata_fn: Callable, 75 | prompt_template: str = DOC_PROMPT_TEMPLATE, 76 | ): 77 | self.vector_db_experiment = vector_db_experiment 78 | self.llm_experiment_cls: Type[Experiment] = llm_experiment_cls 79 | self.experiment: Optional[Experiment] = None 80 | self.llm_arguments = copy.copy(llm_arguments) 81 | self.extract_document_fn = extract_document_fn 82 | self.extract_query_metadata_fn = extract_query_metadata_fn 83 | self.prompt_templates = prompt_template 84 | 85 | def run(self) -> None: 86 | self.vector_db_experiment.run() 87 | document_lists: list[list[str]] = [] 88 | # latencies = [] # TODO: Include latency results 89 | # Extract documents from the result of 90 | for i, row in self.vector_db_experiment.full_df.iterrows(): 91 | document_lists.append(self.extract_document_fn(row)) 92 | # latencies.append(row["latencies"]) 93 | 94 | # Put documents into prompt template 95 | augmented_prompts = [] 96 | is_chat = self.llm_experiment_cls._is_chat() 97 | input_arg_name = "messages" if is_chat else "prompt" 98 | for doc in document_lists: 99 | for prompt_or_msg in self.llm_arguments[input_arg_name]: 100 | augmented_prompts.append(_generate_doc_prompt(doc, prompt_or_msg, is_chat)) 101 | 102 | # Pass documents into LLM 103 | self.llm_arguments[input_arg_name]: list[str] = augmented_prompts 104 | self.experiment = self.llm_experiment_cls(**self.llm_arguments) 105 | 106 | # Run the LLM experiment 107 | self.experiment.run() 108 | 109 | # Add "query text" (i.e. the prompt used to retrieve documents from the vector DB) 110 | # to the final results table here 111 | retrieval_n_rows = len(self.vector_db_experiment.full_df) 112 | query_metadata = [ 113 | self.extract_query_metadata_fn(row) for _, row in self.vector_db_experiment.full_df.iterrows() 114 | ] 115 | final_n_row = len(self.full_df) 116 | 117 | self.partial_df["retrieval_metadata"] = [query_metadata[i % retrieval_n_rows] for i in range(final_n_row)] 118 | self.full_df["retrieval_metadata"] = self.partial_df["retrieval_metadata"] 119 | 120 | def visualize(self) -> None: 121 | if self.experiment is None: 122 | self.run() 123 | self.experiment.visualize() 124 | -------------------------------------------------------------------------------- /prompttools/harness/utility.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | def is_interactive() -> bool: 9 | r""" 10 | Used to determine if we are in a jupyter notebook, which 11 | determines how we present the visualizations. 
12 | """ 13 | import __main__ as main 14 | 15 | return not hasattr(main, "__file__") 16 | -------------------------------------------------------------------------------- /prompttools/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from .logger import Logger, add_feedback 9 | 10 | 11 | __all__ = [ 12 | "Logger", 13 | "add_feedback", 14 | ] 15 | -------------------------------------------------------------------------------- /prompttools/mock/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/mock/__init__.py -------------------------------------------------------------------------------- /prompttools/mock/mock_data/images/19th_century_wombat_gentleman.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/mock/mock_data/images/19th_century_wombat_gentleman.png -------------------------------------------------------------------------------- /prompttools/mock/mock_data/images/Just_a_fruit_basket.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/mock/mock_data/images/Just_a_fruit_basket.png -------------------------------------------------------------------------------- /prompttools/playground/README.md: -------------------------------------------------------------------------------- 1 | ## `prompttools` Playground 2 | 3 | If you are interested to have experiment with a UI rather than a notebook, the playground allows you to do that! 4 | You can: 5 | - Evaluate different instructions (system prompts) 6 | - Try different prompt templates 7 | - Compare across models (e.g. GPT-4 vs. local LLaMA 2) 8 | 9 |
10 | 11 |
12 | 13 | To launch the playground locally, clone the git repo and run the following script with streamlit: 14 | 15 | ``` 16 | git clone https://github.com/hegelai/prompttools.git 17 | cd prompttools && pip install -r prompttools/playground/requirements.txt 18 | streamlit run prompttools/playground/playground.py 19 | ``` 20 | 21 | Similar to the notebook examples, all the executions and calls to LLM services happen within your local machines, 22 | `prompttools` do not forward your requests or log your information. 23 | -------------------------------------------------------------------------------- /prompttools/playground/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /prompttools/playground/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from prompttools.experiment import LlamaCppExperiment 9 | from prompttools.experiment import OpenAIChatExperiment 10 | from prompttools.experiment import OpenAICompletionExperiment 11 | from prompttools.experiment import AnthropicCompletionExperiment 12 | from prompttools.experiment import GooglePaLMCompletionExperiment 13 | from prompttools.experiment import HuggingFaceHubExperiment 14 | from prompttools.experiment import ReplicateExperiment 15 | 16 | ENVIRONMENT_VARIABLE = { 17 | "Replicate": "REPLICATE_API_TOKEN", 18 | "OpenAI Chat": "OPENAI_API_KEY", 19 | "OpenAI Completion": "OPENAI_API_KEY", 20 | "Anthropic": "ANTHROPIC_API_KEY", 21 | "Google PaLM": "GOOGLE_PALM_API_KEY", 22 | "HuggingFace Hub": "HUGGINGFACEHUB_API_TOKEN", 23 | } 24 | 25 | EXPERIMENTS = { 26 | "LlamaCpp Chat": LlamaCppExperiment, 27 | "OpenAI Chat": OpenAIChatExperiment, 28 | "OpenAI Completion": OpenAICompletionExperiment, 29 | "Anthropic": AnthropicCompletionExperiment, 30 | "Google PaLM": GooglePaLMCompletionExperiment, 31 | "HuggingFace Hub": HuggingFaceHubExperiment, 32 | "Replicate": ReplicateExperiment, 33 | } 34 | 35 | MODES = ("Instruction", "Prompt Template", "Model Comparison") 36 | 37 | MODEL_TYPES = ( 38 | "OpenAI Chat", 39 | "OpenAI Completion", 40 | "Anthropic", 41 | "Google PaLM", 42 | "LlamaCpp Chat", 43 | "LlamaCpp Completion", 44 | "HuggingFace Hub", 45 | "Replicate", 46 | ) 47 | 48 | OPENAI_CHAT_MODELS = ( 49 | "gpt-3.5-turbo", 50 | "gpt-3.5-turbo-16k", 51 | "gpt-3.5-turbo-0613", 52 | "gpt-3.5-turbo-16k-0613", 53 | "gpt-3.5-turbo-0301", 54 | "gpt-4", 55 | "gpt-4-0613", 56 | "gpt-4-32k", 57 | "gpt-4-32k-0613", 58 | "gpt-4-0314", 59 | "gpt-4-32k-0314", 60 | ) 61 | 62 | OPENAI_COMPLETION_MODELS = ("text-davinci-003", "text-davinci-002", "code-davinci-002") 63 | -------------------------------------------------------------------------------- /prompttools/playground/data_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 
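# Quick illustration of the template rendering performed by ``render_prompts`` below.
# The template and variables are made-up values for demonstration only.
#
#   render_prompts(
#       ["Tell me a {{adjective}} fact about {{topic}}."],
#       [{"adjective": "surprising", "topic": "wombats"}],
#   )
#   # -> ["Tell me a surprising fact about wombats."]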
6 | 7 | 8 | import os 9 | import jinja2 10 | import streamlit as st 11 | 12 | from prompttools.selector.prompt_selector import PromptSelector 13 | from prompttools.playground.constants import ENVIRONMENT_VARIABLE, EXPERIMENTS 14 | 15 | 16 | def render_prompts(templates, vars): 17 | prompts = [] 18 | for template in templates: 19 | for var_set in vars: 20 | environment = jinja2.Environment() 21 | jinja_template = environment.from_string(template) 22 | prompts.append(jinja_template.render(**var_set)) 23 | return prompts 24 | 25 | 26 | @st.cache_data 27 | def load_data( 28 | model_type, 29 | model, 30 | instructions, 31 | user_inputs, 32 | temperature=0.0, 33 | top_p=1, 34 | max_tokens=None, 35 | frequency_penalty=0.0, 36 | presence_penalty=0.0, 37 | api_key=None, 38 | ): 39 | if api_key: 40 | os.environ[ENVIRONMENT_VARIABLE[model_type]] = api_key 41 | selectors = [PromptSelector(instruction, user_input) for instruction in instructions for user_input in user_inputs] 42 | 43 | experiment = None 44 | if model_type == "LlamaCpp Chat": 45 | call_params = dict(temperature=[temperature]) 46 | experiment = EXPERIMENTS[model_type]([model], selectors, call_params=call_params) 47 | elif model_type in {"OpenAI Chat", "OpenAI Completion"}: 48 | experiment = EXPERIMENTS[model_type]( 49 | [model], 50 | selectors, 51 | temperature=[temperature], 52 | top_p=[top_p], 53 | max_tokens=[max_tokens], 54 | frequency_penalty=[frequency_penalty], 55 | presence_penalty=[presence_penalty], 56 | ) 57 | elif model_type == "HuggingFace Hub": 58 | experiment = EXPERIMENTS[model_type]([model], selectors, temperature=[temperature]) 59 | elif model_type == "Anthropic": 60 | experiment = EXPERIMENTS[model_type]([model], selectors, temperature=[temperature]) 61 | elif model_type == "Google PaLM": 62 | experiment = EXPERIMENTS[model_type]([model], selectors, temperature=[temperature]) 63 | elif model_type == "Replicate": 64 | input_kwargs = {"prompt": selectors, 65 | "temperature": [temperature]} 66 | model_specific_kwargs = {model: {}} 67 | experiment = EXPERIMENTS[model_type]([model], input_kwargs, model_specific_kwargs) 68 | 69 | return experiment.to_pandas_df(True, True) 70 | 71 | 72 | @st.cache_data 73 | def run_multiple( 74 | model_types, 75 | models, 76 | instructions, 77 | prompts, 78 | openai_api_key=None, 79 | anthropic_api_key=None, 80 | google_api_key=None, 81 | hf_api_key=None, 82 | replicate_api_key=None, 83 | ): 84 | import os 85 | 86 | if openai_api_key: 87 | os.environ["OPENAI_API_KEY"] = openai_api_key 88 | if anthropic_api_key: 89 | os.environ["ANTHROPIC_API_KEY"] = anthropic_api_key 90 | if google_api_key: 91 | os.environ["GOOGLE_PALM_API_KEY"] = google_api_key 92 | if hf_api_key: 93 | os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_api_key 94 | if replicate_api_key: 95 | os.environ["REPLICATE_API_TOKEN"] = replicate_api_key 96 | dfs = [] 97 | for i in range(len(models)): 98 | # TODO Support temperature and other parameters 99 | selectors = [] 100 | if i + 1 in instructions: 101 | selectors = [PromptSelector(instructions[i + 1], prompt) for prompt in prompts] 102 | if model_types[i] == "Replicate": 103 | input_kwargs = {"prompt": selectors} 104 | model_specific_kwargs = {models[i]: {}} 105 | experiment = EXPERIMENTS[model_types[i]]([models[i]], input_kwargs, model_specific_kwargs) 106 | else: 107 | experiment = EXPERIMENTS[model_types[i]]([models[i]], selectors) 108 | else: 109 | if model_types[i] == "Replicate": 110 | input_kwargs = {"prompt": prompts} 111 | model_specific_kwargs = {models[i]: {}} 112 | 
experiment = EXPERIMENTS[model_types[i]]([models[i]], input_kwargs, model_specific_kwargs) 113 | else: 114 | experiment = EXPERIMENTS[model_types[i]]([models[i]], prompts) 115 | dfs.append(experiment.to_pandas_df(True, True)) 116 | return dfs 117 | -------------------------------------------------------------------------------- /prompttools/playground/packages.txt: -------------------------------------------------------------------------------- 1 | pkg-config -------------------------------------------------------------------------------- /prompttools/playground/requirements.txt: -------------------------------------------------------------------------------- 1 | prompttools 2 | jinja2 3 | huggingface_hub 4 | llama-cpp-python 5 | anthropic 6 | pyperclip 7 | google-generativeai 8 | replicate -------------------------------------------------------------------------------- /prompttools/prompttest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/prompttest/__init__.py -------------------------------------------------------------------------------- /prompttools/prompttest/error/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/prompttest/error/__init__.py -------------------------------------------------------------------------------- /prompttools/prompttest/error/failure.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from prompttools.prompttest.threshold_type import ThresholdType 8 | 9 | 10 | class PromptTestSetupException(Exception): 11 | r""" 12 | An exception to throw when something goes wrong with the prompt test setup 13 | """ 14 | 15 | pass 16 | 17 | 18 | def log_failure(metric_name, threshold, actual, threshold_type): 19 | r""" 20 | Prints the test results to the console. 21 | """ 22 | print( 23 | "Test failed: " 24 | + metric_name 25 | + "\nThreshold: " 26 | + (" " * (len("Test failed") - len("Threshold") + 1)) 27 | + str(threshold) 28 | + "\nActual: " 29 | + (" " * (len("Test failed") - len("Actual") + 1)) 30 | + str(actual) 31 | + "\nType: " 32 | + (" " * (len("Test failed") - len("Type") + 1)) 33 | + str("Minimum" if threshold_type is ThresholdType.MINIMUM else "Maximum") 34 | ) 35 | print("-" * (len("Test failed: " + metric_name) + 2)) 36 | -------------------------------------------------------------------------------- /prompttools/prompttest/prompttest.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 
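# Usage sketch for the ``prompttest`` decorator defined below. The evaluation function,
# prompt, and completion function are placeholders for illustration, not part of this module.
#
#   def contains_expected(prompt, response, metadata, expected=None):
#       return 1.0 if expected and expected.lower() in response.lower() else 0.0
#
#   @prompttest(
#       metric_name="contains_expected",
#       eval_fn=contains_expected,
#       prompts=["Who was the first U.S. president?"],
#       expected=["George Washington"],
#   )
#   def call_model(prompt: str) -> str:
#       ...  # call an LLM of your choice and return its text response
#
#   if __name__ == "__main__":
#       main()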
6 | 7 | from typing import Callable, List, Optional 8 | from functools import wraps 9 | import logging 10 | 11 | from .threshold_type import ThresholdType 12 | from .error.failure import PromptTestSetupException 13 | from .runner.runner import run_prompttest 14 | 15 | TESTS_TO_RUN = [] 16 | 17 | 18 | def prompttest( 19 | metric_name: str, 20 | eval_fn: Callable, 21 | prompts: List[str], 22 | threshold: float = 1.0, 23 | threshold_type: ThresholdType = ThresholdType.MINIMUM, 24 | expected: Optional[List[str]] = None, 25 | ): 26 | r""" 27 | Creates a decorator for prompt tests, which can annotate evaluation functions. 28 | This enables developers to create a prompt test suite from their evaluations. 29 | """ 30 | 31 | def prompttest_decorator(completion_fn: Callable): 32 | @wraps(completion_fn) 33 | def runs_test(): 34 | results = [completion_fn(prompt) for prompt in prompts] 35 | return run_prompttest( 36 | metric_name, 37 | eval_fn, 38 | threshold, 39 | threshold_type, 40 | prompts, 41 | results, 42 | expected=expected, 43 | ) 44 | 45 | TESTS_TO_RUN.append(runs_test) 46 | return runs_test 47 | 48 | return prompttest_decorator 49 | 50 | 51 | def main(): 52 | logging.getLogger().setLevel(logging.WARNING) 53 | print("Running " + str(len(TESTS_TO_RUN)) + " test(s)") 54 | failures = int(sum([test() for test in TESTS_TO_RUN])) 55 | if failures == 0: 56 | print("All " + str(len(TESTS_TO_RUN)) + " test(s) passed!") 57 | exit(0) 58 | else: 59 | print("Tests failed: " + str(failures)) 60 | exit(1) 61 | -------------------------------------------------------------------------------- /prompttools/prompttest/runner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/prompttest/runner/__init__.py -------------------------------------------------------------------------------- /prompttools/prompttest/runner/runner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from collections import defaultdict 8 | from typing import Callable, Dict, List, Optional, Type 9 | import logging 10 | 11 | from prompttools.prompttest.threshold_type import ThresholdType 12 | from prompttools.prompttest.error.failure import log_failure 13 | from prompttools.experiment import Experiment 14 | from prompttools.prompttest.error.failure import PromptTestSetupException 15 | 16 | 17 | class PromptTestRunner: 18 | r""" 19 | Base class for prompt test runners. Please use the subclass instead.s 20 | """ 21 | 22 | def __init__(self): 23 | self.ran = defaultdict(bool) 24 | self.experiments = dict() 25 | 26 | def run(self, *args, **kwargs) -> str: 27 | r""" 28 | Runs the test if it has not already been run. 29 | """ 30 | key = str(args) 31 | if self.ran[key]: 32 | return key 33 | self.experiments[key] = self._get_experiment(*args, **kwargs) 34 | self.experiments[key].run() 35 | self.ran[key] = True 36 | return key 37 | 38 | def evaluate( 39 | self, 40 | key: str, 41 | metric_name: str, 42 | eval_fn: Callable, 43 | expected: Optional[str] = None, 44 | ) -> None: 45 | r""" 46 | Evaluates the test results using the given ``eval_fn``. 
47 | """ 48 | self.experiments[key].evaluate(metric_name, eval_fn, expected=expected) 49 | 50 | def visualize(self, key: str) -> None: 51 | r""" 52 | Evaluates the test results using the given ``eval_fn``. 53 | """ 54 | self.experiments[key].visualize() 55 | 56 | def scores(self, key): 57 | r""" 58 | Returns the scores for the underlying experiment at the 59 | given key. 60 | """ 61 | return self.experiments[key].scores 62 | 63 | @staticmethod 64 | def _get_experiment( 65 | experiment: Type[Experiment], 66 | model_name: str, 67 | prompts: List[str], 68 | model_args: Dict[str, object], 69 | ) -> Experiment: 70 | return experiment([model_name], prompts, **{k: [v] for k, v in model_args}) 71 | 72 | 73 | prompt_test_runner = PromptTestRunner() 74 | 75 | 76 | def run_prompttest( 77 | metric_name: str, 78 | eval_fn: Callable, 79 | threshold: float, 80 | threshold_type: ThresholdType, 81 | prompts: List[str], 82 | results: List[str], 83 | expected: Optional[List[str]], 84 | ) -> int: 85 | """ 86 | Runs the prompt test evaluation. 87 | """ 88 | scores = [] 89 | for i, result in enumerate(results): 90 | if expected: 91 | score = eval_fn(prompts[i], result, metadata={}, expected=expected[i]) 92 | else: 93 | score = eval_fn(prompts[i], result, metadata={}) 94 | scores.append(score) 95 | if not scores: 96 | logging.error("Something went wrong during testing. Make sure your API keys are set correctly.") 97 | raise PromptTestSetupException 98 | for score in scores: 99 | if not (score <= threshold if threshold_type == ThresholdType.MAXIMUM else score >= threshold): 100 | log_failure(metric_name, threshold, score, threshold_type) 101 | return 1 102 | return 0 103 | -------------------------------------------------------------------------------- /prompttools/prompttest/threshold_type.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from enum import Enum 8 | 9 | 10 | class ThresholdType(Enum): 11 | r""" 12 | Defines the types of thresholds a user can specify for their test case. 13 | """ 14 | 15 | MINIMUM = 1 16 | MAXIMUM = 2 17 | -------------------------------------------------------------------------------- /prompttools/requests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hegelai/prompttools/2446cc9e629fef0a82553ec338c20a203f0688b3/prompttools/requests/__init__.py -------------------------------------------------------------------------------- /prompttools/requests/request_queue.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | from typing import Callable, Dict, List, Tuple 9 | from queue import Queue, Empty 10 | from time import perf_counter 11 | import threading 12 | import openai 13 | import logging 14 | 15 | from prompttools.requests.retries import retry_decorator 16 | 17 | 18 | class RequestQueue: 19 | r""" 20 | A generic queue for processing requests in the `prompttools` library. 21 | It can be used to handle and time requests to any LLM asynchronously. 
22 | """ 23 | 24 | def __init__(self): 25 | self.data_queue = Queue() 26 | self.is_running = True 27 | self.worker_thread = threading.Thread(target=self._process_queue, daemon=True) 28 | self.worker_thread.start() 29 | self.request_args: list[dict[str, object]] = [] 30 | self.request_results: list[dict[str, object]] = [] 31 | self.request_latencies: list[float] = [] 32 | 33 | def _process_queue(self) -> None: 34 | while self.is_running: 35 | try: 36 | fn, args = self.data_queue.get(timeout=0.2) 37 | self._do_task(fn, args) 38 | self.data_queue.task_done() 39 | except Empty: 40 | continue 41 | 42 | def _do_task(self, fn: Callable, args: Dict[str, object]) -> None: 43 | try: 44 | # TODO: For the streamlit app, we need to set the api key this way. 45 | # Ideally, OpenAI should be able to use the env var. 46 | if "OPENAI_API_KEY" in os.environ: 47 | openai.api_key = os.environ["OPENAI_API_KEY"] 48 | res = self._run(fn, args) 49 | self.request_args.append(args) 50 | self.request_results.append(res[0]) 51 | self.request_latencies.append(res[1]) 52 | # TODO: If we get an unexpected error here, the queue will hang 53 | except openai.AuthenticationError: 54 | logging.error("Authentication error. Skipping request.") 55 | 56 | @retry_decorator 57 | def _run(self, fn: Callable, args: Dict[str, object]) -> Tuple[Dict[str, object], float]: 58 | start = perf_counter() 59 | result = fn(**args) 60 | return result, perf_counter() - start 61 | 62 | def shutdown(self) -> None: 63 | r""" 64 | Stops the worker thread from executed and joins it. 65 | """ 66 | self.data_queue.join() 67 | self.is_running = False 68 | # TODO: If we are hanging and interrupt, this line will 69 | # have the following error: TypeError: 'NoneType' object is not callable 70 | self.worker_thread.join() 71 | 72 | def __del__(self) -> None: 73 | self.shutdown() 74 | 75 | def enqueue(self, callable: Callable, args: Dict[str, object]) -> None: 76 | r""" 77 | Adds another request to the queue. 78 | """ 79 | self.data_queue.put((callable, args)) 80 | 81 | def get_input_args(self) -> List[Dict[str, object]]: 82 | r""" 83 | Joins the queue and gets input args that lead to the result. 84 | """ 85 | self.data_queue.join() 86 | return self.request_args 87 | 88 | def get_results(self) -> List[Dict[str, object]]: 89 | r""" 90 | Joins the queue and gets results. 91 | """ 92 | self.data_queue.join() 93 | return self.request_results 94 | 95 | def get_latencies(self) -> List[float]: 96 | r""" 97 | Joins the queue and gets latencies. 98 | """ 99 | self.data_queue.join() 100 | return self.request_latencies 101 | -------------------------------------------------------------------------------- /prompttools/requests/retries.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from tenacity import ( 8 | before_sleep_log, 9 | retry, 10 | retry_if_exception_type, 11 | stop_after_attempt, 12 | wait_exponential, 13 | ) 14 | import openai 15 | import logging 16 | 17 | 18 | def generate_retry_decorator(wait_lower_bound: int = 3, wait_upper_bound: int = 12, max_retry_attempts: int = 5): 19 | r""" 20 | Creates a retry decorator that can be used for requests. It looks for specific exceptions and waits for 21 | certain about of time before retrying. This improves the reliability of the request queue. 
22 | 23 | Args: 24 | wait_lower_bound (int): lower bound to the wait time before retry, defaults to 3. 25 | wait_upper_bound (int): upper bound to the wait time before retry, defaults to 12. 26 | max_retry_attempts (int): maximum number of retries before stopping, defaults to 5. 27 | """ 28 | return retry( 29 | # For the `i`th attempt, wait 2^i seconds before retrying 30 | # with lower and upper bound of [3s, 12s]. 31 | wait=wait_exponential(multiplier=1, min=wait_lower_bound, max=wait_upper_bound), 32 | stop=stop_after_attempt(max_retry_attempts), 33 | reraise=True, 34 | retry=( # Retry for these specific exceptions 35 | retry_if_exception_type(openai.APIConnectionError) 36 | | retry_if_exception_type(openai.APIError) 37 | | retry_if_exception_type(openai.RateLimitError) 38 | | retry_if_exception_type(openai.APIStatusError) 39 | | retry_if_exception_type(openai.APIConnectionError) 40 | | retry_if_exception_type(openai.APIResponseValidationError) 41 | | retry_if_exception_type(openai.APITimeoutError) 42 | ), 43 | before_sleep=before_sleep_log(logging.getLogger(__name__), logging.WARNING), 44 | ) 45 | 46 | 47 | retry_decorator = generate_retry_decorator() 48 | -------------------------------------------------------------------------------- /prompttools/selector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | -------------------------------------------------------------------------------- /prompttools/selector/prompt_selector.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | try: 8 | from anthropic import HUMAN_PROMPT, AI_PROMPT 9 | except ImportError: 10 | HUMAN_PROMPT, AI_PROMPT = None, None 11 | 12 | 13 | GENERIC_TEMPLATE = """INSTRUCTION: 14 | {instruction} 15 | PROMPT: 16 | {user_input} 17 | RESPONSE: 18 | """ 19 | 20 | PALM_TEMPLATE = """{instruction} 21 | 22 | {user_input} 23 | """ 24 | 25 | LLAMA_TEMPLATE = """[INST] <> 26 | {instruction} 27 | < 28 | {user_input} [/INST] 29 | """ 30 | 31 | ANTHROPIC_TEMPLATE = """{HUMAN_PROMPT}{instruction} 32 | {user_input} 33 | {AI_PROMPT}""" 34 | 35 | 36 | class PromptSelector: 37 | r""" 38 | An abstraction for rendering the same prompt 39 | for different models, e.g. 
OpenAI Chat models 40 | and Llama models 41 | """ 42 | 43 | def __init__(self, instruction: str, user_input: object): 44 | self.instruction = instruction 45 | self.user_input = user_input 46 | 47 | def for_openai_chat(self): 48 | return [ 49 | {"role": "system", "content": self.instruction}, 50 | {"role": "user", "content": self.user_input}, 51 | ] 52 | 53 | def for_openai_completion(self): 54 | return GENERIC_TEMPLATE.format(instruction=self.instruction, user_input=self.user_input) 55 | 56 | def for_huggingface_hub(self): 57 | return GENERIC_TEMPLATE.format(instruction=self.instruction, user_input=self.user_input) 58 | 59 | def for_llama(self): 60 | return LLAMA_TEMPLATE.format(instruction=self.instruction, user_input=self.user_input) 61 | 62 | def for_anthropic(self): 63 | return ANTHROPIC_TEMPLATE.format( 64 | HUMAN_PROMPT=HUMAN_PROMPT, instruction=self.instruction, user_input=self.user_input, AI_PROMPT=AI_PROMPT 65 | ) 66 | 67 | def for_palm(self): 68 | return PALM_TEMPLATE.format(instruction=self.instruction, user_input=self.user_input) 69 | 70 | def for_music_gen(self): 71 | return GENERIC_TEMPLATE.format(instruction=self.instruction, user_input=self.user_input) 72 | -------------------------------------------------------------------------------- /prompttools/sentry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | # Sentry collects crash reports and performance numbers 9 | # It is possible to turn off data collection using an environment variable named "SENTRY_OPT_OUT" 10 | import sentry_sdk 11 | 12 | import os 13 | import platform 14 | import uuid 15 | import hashlib 16 | from .version import __version__ 17 | 18 | 19 | SENTRY_DSN = "https://43fbb5a3a556ca0a879f5a08ce805d87@o4505656408211456.ingest.sentry.io/4505656412667904" 20 | 21 | # Get a random token based on the machine uuid 22 | token = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest() 23 | 24 | 25 | def find_certifi_path(): 26 | try: 27 | import certifi 28 | 29 | return os.path.join(os.path.dirname(certifi.__file__), "cacert.pem") 30 | except Exception: 31 | pass 32 | return None 33 | 34 | 35 | def filter_info(event, _hint): 36 | # Remove personal info 37 | try: 38 | event["modules"] = None 39 | event["extra"] = None 40 | event["server_name"] = None 41 | except Exception: 42 | pass 43 | return event 44 | 45 | 46 | def init_sentry(): 47 | if "SENTRY_OPT_OUT" not in os.environ: 48 | if platform.system() == "Darwin": 49 | # Fix CA certificate issue on latest MAC models 50 | path = find_certifi_path() 51 | if path is not None: 52 | if "SSL_CERT_FILE" not in os.environ: 53 | os.environ["SSL_CERT_FILE"] = path 54 | if "REQUESTS_CA_BUNDLE" not in os.environ: 55 | os.environ["REQUESTS_CA_BUNDLE"] = path 56 | 57 | sentry_sdk.init( 58 | dsn=SENTRY_DSN, 59 | release=__version__, 60 | traces_sample_rate=0.01, 61 | include_local_variables=False, 62 | send_default_pii=False, 63 | attach_stacktrace=False, 64 | before_send=filter_info, 65 | include_source_context=False, 66 | # the rate at which transaction and performance data is sampled for profiling purposes 67 | profiles_sample_rate=0.0, 68 | ) 69 | try: 70 | filename = os.path.join(os.environ.get("HOME", "/tmp"), ".token") 71 | if platform.system() == "Windows": 72 | filename = os.path.join(os.environ.get("USERPROFILE", "c:\\"), ".token") 73 | with 
open(filename, "w") as f: 74 | f.write(token) 75 | except Exception: 76 | pass 77 | 78 | sentry_sdk.capture_message("Initializing prompttools", "info") 79 | -------------------------------------------------------------------------------- /prompttools/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from . import autoeval, expected, validate_json, validate_python, similarity 9 | from .autoeval import autoeval_binary_scoring 10 | from .autoeval_from_expected import autoeval_from_expected_response 11 | from .autoeval_scoring import autoeval_scoring 12 | from .autoeval_with_docs import autoeval_with_documents 13 | from .chunk_text import chunk_text 14 | from .expected import compute_similarity_against_model 15 | from .moderation import apply_moderation 16 | from .ranking_correlation import ranking_correlation 17 | from .similarity import semantic_similarity, cos_similarity 18 | from .validate_json import validate_json_response 19 | from .validate_python import validate_python_response 20 | 21 | __all__ = [ 22 | "autoeval", 23 | "autoeval_binary_scoring", 24 | "autoeval_from_expected_response", 25 | "autoeval_scoring", 26 | "autoeval_with_documents", 27 | "chunk_text", 28 | "compute_similarity_against_model", 29 | "expected", 30 | "apply_moderation", 31 | "ranking_correlation", 32 | "semantic_similarity", 33 | "cos_similarity", 34 | "similarity", 35 | "validate_json", 36 | "validate_json_response", 37 | "validate_python", 38 | "validate_python_response", 39 | ] 40 | -------------------------------------------------------------------------------- /prompttools/utils/autoeval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | from typing import Dict 10 | import openai 11 | import pandas.core.series 12 | import jinja2 13 | from .error import PromptToolsUtilityError 14 | 15 | EVALUATION_SYSTEM_PROMPT = """ 16 | Determine whether or not the response is following directions. 17 | Your answer should either be "RIGHT" if the response follows directions, 18 | or "WRONG" if the model is not following directions. 19 | """ 20 | 21 | EVALUATION_USER_TEMPLATE = """ 22 | PROMPT: {{prompt}} 23 | RESPONSE: {{response}} 24 | ANSWER: 25 | """ 26 | 27 | 28 | def _get_messages(prompt: str, response: str): 29 | environment = jinja2.Environment() 30 | template = environment.from_string(EVALUATION_USER_TEMPLATE) 31 | user_message = template.render({"prompt": prompt, "response": response}) 32 | return [ 33 | {"role": "system", "content": EVALUATION_SYSTEM_PROMPT}, 34 | {"role": "user", "content": user_message}, 35 | ] 36 | 37 | 38 | def compute(prompt: str, response: str, model: str = "gpt-4") -> float: 39 | r""" 40 | Uses a high quality chat model, like GPT-4, to automatically evaluate a given 41 | prompt/response pair. Outputs can be 0 or 1. 42 | 43 | Args: 44 | prompt (str): The input prompt. 45 | response (str): The model response. 46 | model (str): The OpenAI chat model to use for generating an expected response. 47 | Defaults to GPT-4. 
48 | """ 49 | if not os.environ["OPENAI_API_KEY"]: 50 | raise PromptToolsUtilityError 51 | evaluation = openai.chat.completions.create(model=model, messages=_get_messages(prompt, response)) 52 | return 1.0 if "RIGHT" in evaluation.choices[0].message.content else 0.0 53 | 54 | 55 | def evaluate(prompt: str, response: str, _metadata: Dict) -> float: 56 | r""" 57 | Uses auto-evaluation to score the model response with "gpt-4" as the judge, returning 0.0 or 1.0. 58 | 59 | Args: 60 | prompt (str): The input prompt. 61 | response (str): The model response. 62 | metadata (str): Not used. 63 | """ 64 | return compute(prompt, response) 65 | 66 | 67 | def autoeval_binary_scoring( 68 | row: pandas.core.series.Series, 69 | prompt_column_name: str, 70 | response_column_name: str = "response", 71 | ) -> float: 72 | r""" 73 | Uses auto-evaluation to score the model response with "gpt-4" as the judge, returning 0.0 or 1.0. 74 | 75 | Args: 76 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 77 | metrics, etc). 78 | prompt_column_name (str): name of the column that contains the input prompt 79 | response_column_name (str): name of the column that contains the model's response, defaults to ``"response"`` 80 | """ 81 | return compute(row[prompt_column_name], row[response_column_name]) 82 | -------------------------------------------------------------------------------- /prompttools/utils/autoeval_from_expected.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | import openai 10 | import jinja2 11 | import pandas 12 | from .error import PromptToolsUtilityError 13 | 14 | EVALUATION_SYSTEM_PROMPT = """ 15 | You are a grader evaluating responses to math questions. 16 | Given the PROMPT and EXPECTED, evaluate the ACTUAL answer. 17 | The ACTUAL answer should be the same as the EXPECTED. 18 | You should grade the response as either RIGHT or WRONG. 19 | If the ACTUAL answer is the same as the EXPECTED, mark it RIGHT. 20 | Otherwise, mark it WRONG. 21 | """ 22 | 23 | EVALUATION_USER_TEMPLATE = """ 24 | PROMPT: {{prompt}} 25 | EXPECTED: {{expected}} 26 | ACTUAL: {{actual}} 27 | ANSWER: 28 | """ 29 | 30 | 31 | def _get_messages(prompt: str, expected: str, response: str): 32 | environment = jinja2.Environment() 33 | template = environment.from_string(EVALUATION_USER_TEMPLATE) 34 | user_message = template.render({"prompt": prompt, "expected": expected, "actual": response}) 35 | return [ 36 | {"role": "system", "content": EVALUATION_SYSTEM_PROMPT}, 37 | {"role": "user", "content": user_message}, 38 | ] 39 | 40 | 41 | def compute(prompt: str, expected: str, response: str, model: str = "gpt-4") -> float: 42 | r""" 43 | Uses a high quality chat model, like GPT-4, to automatically evaluate a given 44 | prompt/response pair. Outputs can be 0 or 1. 45 | 46 | Args: 47 | prompt (str): The input prompt. 48 | response (str): The model response. 49 | model (str): The OpenAI chat model to use for generating an expected response. 50 | Defaults to GPT-4. 
51 | """ 52 | if not os.environ["OPENAI_API_KEY"]: 53 | raise PromptToolsUtilityError("Missing API key for evaluation.") 54 | evaluation = openai.chat.completions.create(model=model, messages=_get_messages(prompt, expected, response)) 55 | return 1.0 if "RIGHT" in evaluation.choices[0].message.content else 0.0 56 | 57 | 58 | def evaluate(prompt: str, response: str, metadata: dict, expected: str) -> float: 59 | r""" 60 | Uses auto-evaluation to score the model response. 61 | """ 62 | return compute(prompt, expected, response) 63 | 64 | 65 | def autoeval_from_expected_response( 66 | row: pandas.core.series.Series, expected: str, prompt_column_name: str, response_column_name: str = "response" 67 | ): 68 | prompt = row[prompt_column_name] 69 | response = row[response_column_name] 70 | return compute(prompt, expected, response) 71 | -------------------------------------------------------------------------------- /prompttools/utils/autoeval_scoring.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | import pandas.core.series 10 | import jinja2 11 | 12 | try: 13 | import anthropic 14 | except ImportError: 15 | anthropic = None 16 | 17 | 18 | AUTO_EVAL_PROMPT_TEMPLATE = """ 19 | {{HUMAN_PROMPT}} Given the fact {{fact}} 20 | 21 | Evaluate the following Answer on a scale from 1 - 7. Please only respond with an integer from 1 - 7 with no other text. 22 | Lower score means the answer is factually wrong, higher score means the answer is correct. A medium score for 23 | uncertain but not wrong. 24 | 25 | Answer: {{model_answer}} 26 | 27 | {{AI_PROMPT}} 28 | """ 29 | 30 | 31 | def _generate_auto_eval_prompt(fact: str, model_answer: str): 32 | environment = jinja2.Environment() 33 | template = environment.from_string(AUTO_EVAL_PROMPT_TEMPLATE) 34 | auto_eval_prompt = template.render( 35 | { 36 | "HUMAN_PROMPT": anthropic.HUMAN_PROMPT, 37 | "AI_PROMPT": anthropic.AI_PROMPT, 38 | "fact": fact, 39 | "model_answer": model_answer, 40 | } 41 | ) 42 | return auto_eval_prompt 43 | 44 | 45 | def compute(fact: str, model_answer: str, model: str = "claude-2") -> float: 46 | r""" 47 | Uses a high quality chat model, like claude-2, to automatically score a given 48 | fact/response pair. Output should be an integer ranging from 1 - 7. 49 | 50 | Args: 51 | fact (str): The fact (truth). The auto-eval model will judge how close the ``response`` is 52 | from this fact (truth). 53 | model_answer (str): The model response. 54 | model (str): The model that will be judging how close is the response from the truth. 55 | Defaults to Claude 2. 56 | """ 57 | if not os.environ["ANTHROPIC_API_KEY"]: 58 | raise RuntimeError("Missing API key for evaluation.") 59 | client = anthropic.Anthropic(api_key=os.environ["ANTHROPIC_API_KEY"]) 60 | completion_response = client.completions.create( 61 | max_tokens_to_sample=100, model=model, prompt=_generate_auto_eval_prompt(fact, model_answer) 62 | ) 63 | return int(completion_response.completion) 64 | 65 | 66 | def autoeval_scoring(row: pandas.core.series.Series, expected: str, response_column_name: str = "response") -> float: 67 | r""" 68 | Uses auto-evaluation to score the model response. 69 | 70 | Args: 71 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 72 | metrics, etc). 
73 | expected (str): the expected response 74 | response_column_name (str): name of the column that contains the model's response, defaults to ``"response"`` 75 | """ 76 | if anthropic is None: 77 | raise ModuleNotFoundError( 78 | "Package `anthropic` is required to be installed to use this evaluation method. " 79 | "Please use `pip install anthropic` to install the package" 80 | ) 81 | return compute(fact=expected, model_answer=row[response_column_name]) 82 | -------------------------------------------------------------------------------- /prompttools/utils/autoeval_with_docs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | import openai 10 | import pandas.core.series 11 | import jinja2 12 | from .error import PromptToolsUtilityError 13 | 14 | 15 | EVALUATION_SYSTEM_PROMPT = """ 16 | Using the provided documents, determine whether or not the response is accurate. 17 | Your answer should be an integer rating from 0 to 10, with 0 being extremely inaccurate 18 | and 10 being perfectly accurate. Only an integer should be returned in the response. 19 | """ 20 | 21 | EVALUATION_USER_TEMPLATE = """ 22 | DOCUMENTS: 23 | {{documents}} 24 | 25 | RESPONSE: {{response}} 26 | ANSWER: 27 | """ 28 | 29 | 30 | def _get_messages(documents: list[str], response: str): 31 | environment = jinja2.Environment() 32 | template = environment.from_string(EVALUATION_USER_TEMPLATE) 33 | user_message = template.render({"documents": "\n".join(documents), "response": response}) 34 | return [ 35 | {"role": "system", "content": EVALUATION_SYSTEM_PROMPT}, 36 | {"role": "user", "content": user_message}, 37 | ] 38 | 39 | 40 | def compute(documents: list[str], response: str, model: str = "gpt-4") -> float: 41 | r""" 42 | Uses a high quality chat model, like GPT-4, to judge how accurate the given response is 43 | against the provided documents. Outputs an integer score from 0 to 10. 44 | 45 | Args: 46 | documents (list[str]): documents to provide relevant context for the model to judge 47 | response (str): the model response to be judged 48 | model (str): The OpenAI chat model that acts as the judge. Defaults to GPT-4. 49 | """ 50 | if not os.environ["OPENAI_API_KEY"]: 51 | raise PromptToolsUtilityError 52 | evaluation = openai.chat.completions.create(model=model, messages=_get_messages(documents, response)) 53 | score_text = evaluation.choices[0].message.content 54 | return int(score_text) 55 | 56 | 57 | def autoeval_with_documents( 58 | row: pandas.core.series.Series, 59 | documents: list[str], 60 | response_column_name: str = "response", 61 | ) -> float: 62 | r""" 63 | Given a list of documents, score whether the model response is accurate with "gpt-4" as the judge, 64 | returning an integer score from 0 to 10. 65 | 66 | Args: 67 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 68 | metrics, etc).
69 | documents (list[str]): documents to provide relevant context for the model to judge 70 | response_column_name (str): name of the column that contains the model's response, defaults to ``"response"`` 71 | """ 72 | return compute(documents, row[response_column_name]) 73 | -------------------------------------------------------------------------------- /prompttools/utils/chunk_text.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | def chunk_text(text: str, max_chunk_length: int) -> list[str]: 9 | r""" 10 | Given a long string paragraph of text and a chunk max length, returns chunks of texts where each chunk's 11 | length is smaller than the max length, without breaking up individual words (separated by space). 12 | 13 | Args: 14 | text (str): source text to be chunked 15 | max_chunk_length (int): maximum length of a chunk 16 | """ 17 | 18 | words = text.split() 19 | chunks = [] 20 | current_chunk = "" 21 | 22 | for word in words: 23 | if len(current_chunk) + len(word) + 1 <= max_chunk_length: 24 | if current_chunk: 25 | current_chunk += " " 26 | current_chunk += word 27 | else: 28 | chunks.append(current_chunk) 29 | current_chunk = word 30 | 31 | if current_chunk: 32 | chunks.append(current_chunk) 33 | 34 | return chunks 35 | -------------------------------------------------------------------------------- /prompttools/utils/error.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | class PromptToolsUtilityError(Exception): 9 | r""" 10 | An exception to throw when something goes wrong with the prompttools utility. 11 | """ 12 | 13 | pass 14 | -------------------------------------------------------------------------------- /prompttools/utils/expected.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | import openai 10 | import pandas.core.series 11 | from .error import PromptToolsUtilityError 12 | from . import similarity 13 | 14 | 15 | def compute(prompt: str, model: str = "gpt-4") -> str: 16 | r""" 17 | Computes the expected result of a given prompt by using a high 18 | quality LLM, like GPT-4. 19 | 20 | Args: 21 | prompt (str): The input prompt. 22 | model (str): The OpenAI chat model to use for generating an expected response. 23 | Defaults to GPT-4. 24 | """ 25 | if not os.environ["OPENAI_API_KEY"]: 26 | raise PromptToolsUtilityError 27 | response = openai.chat.completions.create( 28 | model=model, 29 | messages=[ 30 | {"role": "user", "content": prompt}, 31 | ], 32 | ) 33 | return response.choices[0].message.content 34 | 35 | 36 | def evaluate(prompt: str, response: str, model: str = "gpt-4") -> str: 37 | r""" 38 | Computes the similarity of a given response to the expected result 39 | generated from a high quality LLM (by default GPT-4) using the same prompt. 40 | 41 | Args: 42 | prompt (str): The input prompt. 43 | response (str): The model response. 
44 | model (str): The OpenAI chat model to use for generating an expected response. 45 | Defaults to GPT-4. 46 | """ 47 | expected_response = compute(prompt, model) 48 | return similarity.compute(response, expected_response) 49 | 50 | 51 | def compute_similarity_against_model( 52 | row: pandas.core.series.Series, 53 | prompt_column_name: str, 54 | model: str = "gpt-4", 55 | response_column_name: str = "response", 56 | ) -> str: 57 | r""" 58 | Computes the similarity of a given response to the expected result 59 | generated from a high quality LLM (by default GPT-4) using the same prompt. 60 | 61 | Args: 62 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 63 | metrics, etc). 64 | prompt_column_name (str): name of the column that contains the input prompt 65 | model (str): name of the model that will serve as the judge 66 | response_column_name (str): name of the column that contains the model's response, defaults to ``"response"`` 67 | """ 68 | 69 | expected_response = compute(row[prompt_column_name], model) 70 | return similarity.compute(row[response_column_name], expected_response) 71 | -------------------------------------------------------------------------------- /prompttools/utils/moderation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import openai 9 | import pandas 10 | from typing import Optional, Union 11 | 12 | 13 | def apply_moderation( 14 | row: pandas.core.series.Series, 15 | text_col_name: str = "response", 16 | moderation_model: str = "text-moderation-latest", 17 | category_names: Optional[list[str]] = None, 18 | category_score_names: Optional[list[str]] = None, 19 | ) -> Union[bool, dict]: 20 | r""" 21 | Uses OpenAI's moderation API to determine whether the text complies with OpenAI's usage policies. 22 | 23 | Args: 24 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 25 | metrics, etc). 26 | text_col_name (str): column name of text to be moderated 27 | moderation_model (str): name of the OpenAI moderation model, defaults to ``"text-moderation-latest"`` 28 | category_names (Optional[list[str]]): specify the names of category flags to extract from the response and 29 | be added as column(s) in the row, optional. (e.g. ``["harassment", "violence"]``) 30 | category_score_names (Optional[list[str]]): specify the names of category scores to extract from the response 31 | and be added as column(s) in the row, optional. (e.g. ``["harassment", "violence"]``) 32 | 33 | Returns: 34 | A boolean flag (of whether the input violates policies), or a dict with various topic specific flags/scores. 
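Example (illustrative; mirrors the ``experiment.evaluate`` pattern documented elsewhere in this library): ``experiment.evaluate("moderation", apply_moderation, category_names=["harassment", "violence"])`` would add per-category flag columns plus a ``moderation_flag`` column to the results.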
35 | """ 36 | text = row[text_col_name] 37 | 38 | moderation_response = openai.moderations.create(model=moderation_model, input=text) 39 | flagged = moderation_response.results[0].flagged 40 | res = {} 41 | if category_names: 42 | category_flags = moderation_response.results[0].categories.model_dump() 43 | for c in category_names: 44 | res[c] = category_flags[c] 45 | if category_score_names: 46 | category_scores = moderation_response.results[0].category_scores.model_dump() 47 | for c in category_score_names: 48 | res[f"{c}_score"] = category_scores[c] 49 | if category_names or category_score_names: 50 | res["moderation_flag"] = flagged 51 | return res 52 | else: 53 | return flagged 54 | -------------------------------------------------------------------------------- /prompttools/utils/ranking_correlation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | try: 9 | import scipy.stats as stats 10 | except ImportError: 11 | stats = None 12 | import pandas 13 | 14 | 15 | def ranking_correlation( 16 | row: pandas.core.series.Series, expected_ranking: list, ranking_column_name: str = "top doc ids" 17 | ) -> float: 18 | r""" 19 | A simple test that compares the expected ranking for a given query with the actual ranking produced 20 | by the embedding function being tested. 21 | 22 | Args: 23 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 24 | metrics, etc). 25 | expected_ranking (list): the expected list of ranking to compare 26 | ranking_column_name (str): the column name of the actual ranking produced by the model, 27 | defaults to ``"top doc ids"`` 28 | 29 | Example: 30 | >>> EXPECTED_RANKING_LIST = [ 31 | >>> ["id1", "id3", "id2"], 32 | >>> ["id2", "id3", "id1"], 33 | >>> ["id1", "id3", "id2"], 34 | >>> ["id2", "id3", "id1"], 35 | >>> ] 36 | >>> experiment.evaluate("ranking_correlation", ranking_correlation, expected_ranking=EXPECTED_RANKING_LIST) 37 | """ 38 | if stats is None: 39 | raise ModuleNotFoundError( 40 | "Package `SciPy` is required to be installed to use this evaluation method." 41 | "Please use `pip install scipy` to install the package" 42 | ) 43 | actual_ranking = row[ranking_column_name] 44 | if len(expected_ranking) == 1 and len(actual_ranking) == 1: 45 | return 1.0 if expected_ranking == actual_ranking else -1.0 46 | correlation, _ = stats.spearmanr(actual_ranking, expected_ranking) 47 | return correlation 48 | -------------------------------------------------------------------------------- /prompttools/utils/validate_json.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | from typing import Callable, Dict, List, Optional 9 | import pandas.core.series 10 | import json 11 | import re 12 | 13 | KEY_EXTRACTION_REGEX = r'"([^"]+?)"\s*:' 14 | 15 | 16 | def strip_outer_brackets(text: str) -> str: 17 | r""" 18 | Removes all chars outside the first '{' and the last '}'. Intended to be a pre-processing 19 | step prior to parsing a string as JSON. 
20 | 21 | Args: 22 | text(str): the text to process 23 | """ 24 | first_brace = text.find("{") 25 | last_brace = text.rfind("}") 26 | return text[first_brace : last_brace + 1] 27 | 28 | 29 | def sample_pre_process_fn(text: str): 30 | r""" 31 | An example pre-processing that you may use before attempting to parse a string as JSON. 32 | This function removes all chars outside the first '{' and the last '}'. Then, 33 | it removes ``"\\n"``. 34 | 35 | This function should be modified depending on your LLM's output. 36 | 37 | Args: 38 | text(str): the text to process 39 | """ 40 | text = strip_outer_brackets(text) 41 | text = text.replace("\\n", "") 42 | return text 43 | 44 | 45 | def validate(text: str, pre_process_fn: Optional[Callable] = None): 46 | r""" 47 | Validates that the generated text is JSON. 48 | 49 | Args: 50 | text (str): The generated text, which should be valid JSON. 51 | pre_process_fn (Callable[str, str]): a function to pre-process the text response from the LLM before attempting 52 | to parse the string as JSON. Look at ``validate_json.sample_pre_process_fn`` as an example. 53 | """ 54 | if pre_process_fn: 55 | text = pre_process_fn(text) 56 | try: 57 | json.loads(text) 58 | except ValueError: 59 | return 0.0 60 | return 1.0 61 | 62 | 63 | def validate_keys(text: str, valid_keys: List[str]): 64 | r""" 65 | Guarantees that all keys in the generated JSON are valid. 66 | 67 | Args: 68 | text (str): The generated text, which should be valid JSON. 69 | valid_keys (List[str]): A list of valid keys which may appear in the JSON. 70 | """ 71 | keys = re.findall(KEY_EXTRACTION_REGEX, text) 72 | for key in keys: 73 | if key not in valid_keys: 74 | return 0.0 75 | return 1.0 76 | 77 | 78 | def validate_json_response(row: pandas.core.series.Series, response_column_name: str = "response") -> float: 79 | r""" 80 | Validate whether ``response`` string is in a valid JSON format. 81 | 82 | Args: 83 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 84 | metrics, etc). 85 | response_column_name (str): name of the column that contains the model's response, defaults to ``"response"`` 86 | """ 87 | return validate(row[response_column_name]) 88 | 89 | 90 | def evaluate(prompt: str, response: str, metadata: Dict) -> float: 91 | r""" 92 | Validate whether ``response`` string is in a valid JSON format. 93 | 94 | Args: 95 | prompt (str): Not used. 96 | response (str): the string that will be validated 97 | metadata (dict): Not used. 98 | """ 99 | return validate(response) 100 | -------------------------------------------------------------------------------- /prompttools/utils/validate_python.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | import os 9 | from typing import Dict 10 | import pandas.core.series 11 | from .error import PromptToolsUtilityError 12 | 13 | try: 14 | from pylint import epylint as lint 15 | except ImportError: 16 | lint = None 17 | 18 | PROMPTTOOLS_TMP = "prompttools_tmp.py" 19 | 20 | 21 | def validate(text: str): 22 | r""" 23 | Validates that the generated text is python. 24 | 25 | Args: 26 | text (str): The generated text, which should be valid python. 27 | """ 28 | if lint is None: 29 | raise RuntimeError( 30 | "Our built-in `validate_python` function requires pylint<3.0.
Please use a custom eval function." 31 | "Feel free to open a GitHub issue or PR." 32 | ) 33 | if os.path.isfile(PROMPTTOOLS_TMP): 34 | raise PromptToolsUtilityError 35 | with open(PROMPTTOOLS_TMP, "w") as f: 36 | f.write(text) 37 | pylint_stdout, _ = lint.py_run(PROMPTTOOLS_TMP, return_std=True) 38 | os.remove(PROMPTTOOLS_TMP) 39 | return 0.0 if "error" in pylint_stdout.getvalue() else 1.0 40 | 41 | 42 | def validate_python_response(row: pandas.core.series.Series, response_column_name: str = "response") -> float: 43 | r""" 44 | Validate whether ``response`` string follows Python's syntax. 45 | 46 | Args: 47 | row (pandas.core.series.Series): A row of data from the full DataFrame (including input, model response, other 48 | metrics, etc). 49 | response_column_name (str): name of the column that contains the model's response, defaults to ``"response"`` 50 | """ 51 | return validate(row[response_column_name]) 52 | 53 | 54 | def evaluate(prompt: str, response: str, metadata: Dict) -> float: 55 | r""" 56 | Validate whether ``response`` string follows Python's syntax. 57 | 58 | Args: 59 | prompt (str): Not used. 60 | response (str): the string that will be validated 61 | metadata (dict): Not used. 62 | """ 63 | return validate(response) 64 | -------------------------------------------------------------------------------- /prompttools/version.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.0.46a0+5a80732" 2 | git_version = "5a807328435d269d7ed17b53f86283e116e08244" 3 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "prompttools" 7 | version = "0.0.46" 8 | authors = [ 9 | { name="Hegel AI", email="team@hegel-ai.com" }, 10 | ] 11 | description = "Tools for LLM prompt testing and experimentation" 12 | readme = "README.md" 13 | requires-python = ">=3.10" 14 | classifiers = [ 15 | "Programming Language :: Python :: 3", 16 | "License :: OSI Approved :: Apache Software License", 17 | "Operating System :: OS Independent", 18 | ] 19 | 20 | dynamic = ["dependencies", "license"] 21 | 22 | [project.urls] 23 | "Homepage" = "https://github.com/hegelai/prompttools" 24 | "Bug Tracker" = "https://github.com/hegelai/prompttools" 25 | 26 | [tool.setuptools.dynamic] 27 | dependencies = {file = ["requirements.txt"]} 28 | 29 | [tool.black] 30 | line-length = 120 31 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | openai 2 | tenacity 3 | tabulate 4 | pandas 5 | jinja2 6 | jupyterlab 7 | ipywidgets 8 | pylint 9 | sentry-sdk>=1.23.0 10 | -------------------------------------------------------------------------------- /scripts/create_comment.py: -------------------------------------------------------------------------------- 1 | from prompttools.experiment import OpenAIChatExperiment 2 | from prompttools.selector.prompt_selector import PromptSelector 3 | 4 | PROMPTTOOLS_MD_TMP = "markdown.md" 5 | 6 | selectors = [ 7 | PromptSelector("You are a helpful assistant.", "Is 17077 a prime number?"), 8 | PromptSelector("You are a math tutor.", "Is 17077 a prime number?"), 9 | ] 10 | models = ["gpt-3.5-turbo", "gpt-4"] 11 | temperatures = [0.0] 12 | openai_experiment = 
OpenAIChatExperiment(models, selectors, temperature=temperatures) 13 | openai_experiment.run() 14 | 15 | markdown = openai_experiment.to_markdown() 16 | with open(PROMPTTOOLS_MD_TMP, "w") as f: 17 | f.write(markdown) 18 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import distutils.command.clean 4 | import os 5 | import shutil 6 | import subprocess 7 | 8 | from pathlib import Path 9 | 10 | from setuptools import find_packages, setup 11 | 12 | ROOT_DIR = Path(__file__).parent.resolve() 13 | 14 | 15 | def _get_requirements(): 16 | """Get dependency requirements from `requirements.txt`.""" 17 | req_list = [] 18 | with Path("requirements.txt").open("r") as f: 19 | for line in f: 20 | req = line.strip() 21 | if len(req) == 0 or req.startswith("#"): 22 | continue 23 | req_list.append(req) 24 | return req_list 25 | 26 | 27 | def _get_version(): 28 | """Get package version.""" 29 | # with open(os.path.join(ROOT_DIR, "version.txt")) as f: 30 | # version = f.readline().strip() 31 | version = "0.0.46a0" 32 | 33 | sha = "Unknown" 34 | try: 35 | sha = subprocess.check_output(["git", "rev-parse", "HEAD"], cwd=str(ROOT_DIR)).decode("ascii").strip() 36 | except Exception: 37 | pass 38 | 39 | os_build_version = os.getenv("BUILD_VERSION") 40 | if os_build_version: 41 | version = os_build_version 42 | elif sha != "Unknown": 43 | version += "+" + sha[:7] 44 | 45 | return version, sha 46 | 47 | 48 | def _export_version(version, sha): 49 | version_path = ROOT_DIR / "prompttools" / "version.py" 50 | with open(version_path, "w") as f: 51 | f.write(f"__version__ = '{version}'\n") 52 | f.write(f"git_version = {repr(sha)}\n") 53 | 54 | 55 | requirements = _get_requirements() 56 | 57 | 58 | class Clean(distutils.command.clean.clean): 59 | def run(self): 60 | # Run default behavior first 61 | distutils.command.clean.clean.run(self) 62 | 63 | # Remove prompttools extension 64 | def remove_extension(pattern): 65 | for path in (ROOT_DIR / "prompttools").glob(pattern): 66 | print(f"removing extension '{path}'") 67 | path.unlink() 68 | 69 | for ext in ["so", "dylib", "pyd"]: 70 | remove_extension("**/*." 
+ ext) 71 | 72 | # Remove build directory 73 | build_dirs = [ 74 | ROOT_DIR / "build", # Remove build 75 | ROOT_DIR / "prompttools.egg-info", # Remove egg metadata 76 | ] 77 | for path in build_dirs: 78 | if path.exists(): 79 | print(f"removing '{path}' (and everything under it)") 80 | shutil.rmtree(str(path), ignore_errors=True) 81 | 82 | 83 | if __name__ == "__main__": 84 | VERSION, SHA = _get_version() 85 | # TODO: Exporting the version here breaks `python -m build` 86 | # _export_version(VERSION, SHA) 87 | 88 | print("-- Building version " + VERSION) 89 | 90 | setup( 91 | # Metadata 92 | name="prompttools", 93 | version=VERSION, 94 | description="Tools for prompts.", 95 | long_description=Path("README.md").read_text(encoding="utf-8"), 96 | long_description_content_type="text/markdown", 97 | url="https://github.com/hegelai/prompttools", 98 | author="Hegel AI", 99 | author_email="steve@hegel-ai.com, kevin@hegel-ai.com", 100 | license="Proprietary", 101 | install_requires=requirements, 102 | python_requires=">=3.10", 103 | classifiers=[ 104 | "Intended Audience :: Developers", 105 | "Intended Audience :: Science/Research", 106 | "Operating System :: MacOS :: MacOS X", 107 | "Operating System :: Microsoft :: Windows", 108 | "Programming Language :: Python :: 3.10", 109 | "Programming Language :: Python :: 3.11", 110 | "Programming Language :: Python :: Implementation :: CPython", 111 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 112 | ], 113 | # Package Info 114 | packages=find_packages(exclude=["test*", "examples*", "build*"]), 115 | zip_safe=False, 116 | cmdclass={ 117 | "clean": Clean, 118 | }, 119 | ) 120 | -------------------------------------------------------------------------------- /test/app.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | r""" 9 | App for local testing of logger 10 | """ 11 | 12 | from flask import Flask, request 13 | import time 14 | 15 | app = Flask(__name__) 16 | 17 | 18 | @app.route("/", methods=["POST"]) 19 | def process_request(): 20 | time.sleep(0.1) 21 | data = request.json 22 | print(f"Request received and processed {data}.") 23 | return "", 200 24 | 25 | 26 | if __name__ == "__main__": 27 | app.run(debug=True) 28 | -------------------------------------------------------------------------------- /test/requirements.txt: -------------------------------------------------------------------------------- 1 | sentence_transformers 2 | -------------------------------------------------------------------------------- /test/test_experiment.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from prompttools.experiment import ( 4 | LlamaCppExperiment, 5 | HuggingFaceHubExperiment, 6 | OpenAIChatExperiment, 7 | OpenAICompletionExperiment, 8 | ) 9 | 10 | 11 | class TestExperiment(TestCase): 12 | # TODO: Currently, it only ensures importing is correct. 13 | # Add unit tests to verify initialization. 
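# A minimal sketch of one such initialization test (illustrative only; the constructor
# arguments below are assumptions modeled on scripts/create_comment.py, and a real test
# should mock the underlying API client before being enabled):
#
#     def test_openai_chat_experiment_init(self):
#         messages = [[{"role": "user", "content": "Is 17077 a prime number?"}]]
#         experiment = OpenAIChatExperiment(["gpt-3.5-turbo"], messages, temperature=[0.0])
#         self.assertIsNotNone(experiment)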
14 | def test_llama_cpp_experiment(self): 15 | pass 16 | 17 | def test_hugging_face_experiment(self): 18 | pass 19 | 20 | def test_openai_chat_experiment(self): 21 | pass 22 | 23 | def test_openai_completion_experiment(self): 24 | pass 25 | -------------------------------------------------------------------------------- /test/test_harness.py: -------------------------------------------------------------------------------- 1 | from unittest import TestCase 2 | 3 | from prompttools.harness import ( 4 | ChatHistoryExperimentationHarness, 5 | ChatModelComparisonHarness, 6 | PromptTemplateExperimentationHarness, 7 | SystemPromptExperimentationHarness, 8 | ) 9 | 10 | 11 | class TestHarness(TestCase): 12 | # TODO: Currently, it only ensures importing is correct. 13 | # Add unit tests to verify initialization. 14 | def test_chat_history_exp_harness(self): 15 | pass 16 | 17 | def test_chat_model_exp_harness(self): 18 | pass 19 | 20 | def test_prompt_template_exp_harness(self): 21 | pass 22 | 23 | def test_system_prompt_exp_harness(self): 24 | pass 25 | -------------------------------------------------------------------------------- /test/test_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Hegel AI, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code's license can be found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | 8 | if False: # Skipping this in CI 9 | import openai 10 | import prompttools.logger # noqa: F401 Importing this line will monkey-patch `openai.chat.completions.create` 11 | 12 | 13 | r""" 14 | Example of using `prompttools.logger`. 15 | 16 | All you need to do is call `import prompttools.logger` to start logging. 17 | You can optionally add `hegel_model` to your call (as seen below). This will associate 18 | this call with a specific name in the logs. 19 | 20 | The OpenAI call is unchanged, it executes normally between your machine and OpenAI's server. 21 | 22 | Note: 23 | You should have "HEGELAI_API_KEY" and "OPENAI_API_KEY" loaded into `os.environ`. 24 | """ 25 | 26 | if __name__ == "__main__": 27 | if False: # Skipping this in CI 28 | for i in range(1): 29 | messages = [ 30 | {"role": "user", "content": f"What is 1 + {i}?"}, 31 | ] 32 | 33 | # `hegel_model` is an optional argument that allows you to tag your call with a specific name 34 | # Logging still works without this argument 35 | # The rest of the OpenAI call happens as normal between your machine and OpenAI's server 36 | openai_response = openai.chat.completions.create( 37 | model="gpt-3.5-turbo", messages=messages, hegel_model="Math Model" 38 | ) 39 | print(f"{openai_response = }") 40 | 41 | print("End") 42 | -------------------------------------------------------------------------------- /version.txt: -------------------------------------------------------------------------------- 1 | 0.0.46a0 2 | --------------------------------------------------------------------------------